# Text Processing for Indonesia Election 2019 Sentiment

## Load Dataset

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from deep_translator import GoogleTranslator
import emoji
from kbbi import KBBI

In [None]:
# Read the raw tweets and drop the 'Unnamed: 0' column
# (presumably an index that was saved with the CSV - confirm).
df = pd.read_csv('./tweet.csv').drop(columns='Unnamed: 0')
print(df.shape)

(1815, 2)


In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,sentimen,tweet
0,negatif,Kata @prabowo Indonesia tidak dihargai bangsa ...
1,netral,"Batuan Langka, Tasbih Jokowi Hadiah dari Habib..."
2,netral,"Di era Jokowi, ekonomi Indonesia semakin baik...."
3,positif,"Bagi Sumatera Selatan, Asian Games berdampak p..."
4,negatif,Negara kita ngutang buat bngun infrastruktur y...


In [None]:
# Count missing values per column.
df.isnull().sum()

sentimen    0
tweet       0
dtype: int64

In [None]:
# Count fully duplicated rows.
df.duplicated().sum()

0

In [None]:
# Summarise the columns, their dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1815 entries, 0 to 1814
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentimen  1815 non-null   object
 1   tweet     1815 non-null   object
dtypes: object(2)
memory usage: 28.5+ KB


In [None]:
# Show how many tweets fall into each sentiment class.
df['sentimen'].value_counts()

sentimen
positif    612
netral     607
negatif    596
Name: count, dtype: int64

## Preprocessing

### Count Changes Function

Fungsi ini digunakan untuk mengetahui berapa banyak perubahan yang dilakukan untuk tiap proses

In [None]:
def count_changes(initial_df, df_after_step, file_name=None):
    """Report which rows a preprocessing step modified.

    Args:
        initial_df: Frame snapshot taken before the step; must contain a
            'tweet' column and be labelled identically to ``df_after_step``.
        df_after_step: Frame after the step has been applied.
        file_name: Optional base name (without extension). When given, the
            changed rows are written to ``./Text-Preprocessing/<file_name>.csv``.

    Returns:
        A ``(count, changes)`` tuple: the number of changed rows and a frame
        holding those rows with the original text in 'before' and the
        processed text in 'after'.
    """
    import os  # local import keeps the function self-contained in the notebook

    # True for every row where at least one column differs between the frames.
    has_changes_mask = (initial_df != df_after_step).any(axis=1)

    # rename() returns a new frame, so adding 'after' never touches initial_df.
    changes = initial_df.loc[has_changes_mask].rename(columns={'tweet': 'before'})
    changes['after'] = df_after_step.loc[has_changes_mask, 'tweet']

    count = len(changes)

    if file_name:
        save_dir = './Text-Preprocessing/'
        # Create the output folder on first use instead of failing in to_csv.
        os.makedirs(save_dir, exist_ok=True)
        save_path = save_dir + file_name + '.csv'
        changes.to_csv(save_path, index=True)

    return count, changes

### String Parsing

In [None]:
# Keep a pre-step snapshot so count_changes can diff it against the result.
initial_df = df.copy()
# Force every tweet to a Python string before any text processing.
df['tweet'] = df['tweet'].astype(str)
count, changes = count_changes(initial_df, df, 'string_parsing')
print("Banyak perubahan pada step ini = " , count)

Banyak perubahan pada step ini =  0


### Split Hashtag

#PemiluDamaiSekali -> Pemilu Damai Sekali

In [None]:
def split_camel_case_hashtag(text):
    """Expand camel-case hashtags, e.g. ``#PemiluDamai`` -> ``Pemilu Damai``.

    Text without any '#' is returned untouched. Otherwise the text is split
    on whitespace, every token that starts with '#' loses all of its '#'
    characters and gets a space inserted at each lowercase-to-uppercase
    boundary, and the tokens are re-joined with single spaces.
    """
    if '#' not in text:
        return text

    def _expand(token):
        # Only tokens that begin with '#' are treated as hashtags.
        if not token.startswith('#'):
            return token
        return re.sub(r'([a-z])([A-Z])', r'\1 \2', token.replace('#', ''))

    return ' '.join(_expand(token) for token in text.split())

In [None]:
# Snapshot the frame, expand camel-case hashtags, then log how many rows changed.
initial_df = df.copy()
df['tweet'] = df['tweet'].apply(split_camel_case_hashtag)
count, changes = count_changes(initial_df, df, 'split_camel_case_hashtag')
print("Banyak perubahan pada step ini = " , count)

Banyak perubahan pada step ini =  553


### Lowercasing

In [None]:
# Keep a pre-step snapshot so count_changes can diff it against the result.
initial_df = df.copy()
# Lowercase every tweet.
df['tweet'] = df['tweet'].str.lower()
count, changes = count_changes(initial_df, df, 'lower_casing')
print("Banyak perubahan pada step ini = " , count)

Banyak perubahan pada step ini =  1754


### Remove URL

In [None]:
def clean_URLs(text):
    """Delete URL-like tokens from ``text`` and trim surrounding whitespace.

    A match is an optional ``http://``/``https://`` scheme, an optional
    ``www.``, a ``name.tld`` pair and any trailing non-space characters.
    NOTE: because scheme and ``www.`` are optional, any ``word.word`` token
    (two or more letters after the dot) is removed as well.
    """
    url_regex = re.compile(r"(https?://)?(www\.)?([a-zA-Z0-9-]+\.[a-zA-Z]{2,})(\S*)")
    without_urls = url_regex.sub("", text)
    return without_urls.strip()

In [None]:
# Snapshot the frame, strip URLs, then log how many rows changed.
initial_df = df.copy()
df['tweet'] = df['tweet'].apply(clean_URLs)
count, changes = count_changes(initial_df, df, 'clean_url')
print("Banyak perubahan pada step ini = " , count)

Banyak perubahan pada step ini =  626


### Remove HTML Tags

In [None]:
def remove_html(text):
    """Remove every ``<...>`` span (shortest match, within one line) from ``text``."""
    return re.sub('<.*?>', '', text)

In [None]:
# Snapshot the frame, strip HTML tags, then log how many rows changed.
initial_df = df.copy()
df['tweet'] = df['tweet'].apply(remove_html)
count, changes = count_changes(initial_df, df, 'remove_html')
print("Banyak perubahan pada step ini = " , count)

Banyak perubahan pada step ini =  0


### Remove Numeric

"1"  atau " 01" -> satu dan "2" atau "02" -> dua, selain itu hapus 

In [None]:
def remove_numeric(text):
    """Spell out standalone candidate numbers and delete all other digits.

    Standalone "1"/"01" become "satu" and standalone "2"/"02" become "dua".
    Every remaining digit is removed.

    The replacement is done with a word-boundary regex rather than
    ``str.replace`` so that digits inside longer numbers are not rewritten
    (previously "2019" turned into "satu" because it contains "01").

    Args:
        text: The tweet text to clean.

    Returns:
        The text with the two candidate numbers spelled out and all other
        digits removed.
    """
    spelled_out = {'1': 'satu', '2': 'dua'}

    # \b on both sides guarantees the token is exactly 1, 01, 2 or 02.
    text = re.sub(r'\b0?([12])\b', lambda match: spelled_out[match.group(1)], text)

    # Drop remaining standalone numbers, then any digits glued to letters.
    text = re.sub(r'\b\d+\b', '', text)
    text = re.sub('[0-9]+', '', text)
    return text

In [None]:
# Snapshot the frame, handle numbers, then log how many rows changed.
initial_df = df.copy()
df['tweet'] = df['tweet'].apply(remove_numeric)
count, changes = count_changes(initial_df, df, 'remove_numeric')
print("Banyak perubahan pada step ini = " , count)

Banyak perubahan pada step ini =  757


### Remove String Emoticon

String Emoticon merupakan emoticon wajah menggunakan string = :) , :( , ;| dan lainnya

In [None]:
def remove_string_emoticons(text):
    """Delete text emoticons such as ``:)``, ``:-(``, ``;D`` or ``=P``.

    An emoticon is ``:``, ``;`` or ``=``, an optional ``-``, then one of
    ``)``, ``(``, ``D`` or ``P`` (uppercase only).
    """
    return re.sub(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', '', text)

In [None]:
# Snapshot the frame, strip text emoticons, then log how many rows changed.
initial_df = df.copy()
df['tweet'] = df['tweet'].apply(remove_string_emoticons)
count, changes = count_changes(initial_df, df, 'remove_string_emoticon')
print("Banyak perubahan pada step ini = " , count)

Banyak perubahan pada step ini =  12


### Remove Punctuation

Semua tanda baca kecuali tanda petik satu diganti dengan spasi, sedangkan tanda petik satu dihapus tanpa diganti spasi

jum'at -> jumat

In [None]:
# ASCII punctuation marks from the standard library; echoed for inspection.
punctuations = string.punctuation
punctuations

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
# Maps every ASCII punctuation mark except the apostrophe to a single space.
# Built once here instead of on every call.
_PUNCTUATION_TO_SPACE = str.maketrans(
    {mark: ' ' for mark in string.punctuation if mark != "'"}
)


def clean_punctuations(text):
    """Replace punctuation with spaces and drop apostrophes.

    Apostrophes are deleted without leaving a gap ("jum'at" -> "jumat");
    every other ``string.punctuation`` mark becomes a space. Runs of spaces
    are collapsed last, so a free-standing apostrophe no longer leaves a
    double space behind.

    Args:
        text: The tweet text to clean.

    Returns:
        The text without punctuation and without repeated spaces. Leading
        and trailing single spaces are kept.
    """
    # Remove apostrophes first so the final collapse sees the gaps they leave.
    text = text.replace("'", '')
    text = text.translate(_PUNCTUATION_TO_SPACE)
    return re.sub(' +', ' ', text)

In [None]:
# Snapshot the frame, strip punctuation, then log how many rows changed.
initial_df = df.copy()
df['tweet'] = df['tweet'].apply(clean_punctuations)
count, changes = count_changes(initial_df, df, 'clean_punctuations')
print("Banyak perubahan pada step ini = " , count)

Banyak perubahan pada step ini =  1742


### Extract Emoji

https://carpedm20.github.io/emoji/

https://github.com/jolicode/emoji-search/blob/main/synonyms/cldr-emoji-annotation-synonyms-id.txt

In [None]:
# Load the emoji table; it must provide 'emoji' and 'meaning' columns.
emoji_df = pd.read_csv('./Utils/cldr_emoji.csv')

# Map each emoji string to its meaning for the lookups done in extract_emoji.
emoji_meaning_dict = dict(zip(emoji_df['emoji'], emoji_df['meaning']))

In [None]:
def extract_emoji(text):
    """Replace known emoji with their meaning and drop unknown emoji.

    If ``emoji.demojize`` leaves ``text`` unchanged, the text is returned
    as is. Otherwise the text is tokenised into whole words, single
    whitespace characters and single other characters. Tokens found in
    ``emoji_meaning_dict`` are swapped for their meaning, tokens that
    ``emoji.demojize`` would alter are discarded, and everything else is
    kept. The pieces are concatenated without adding separators.
    """
    if emoji.demojize(text) == text:
        return text

    pieces = []
    for piece in re.findall(r'\b\w+\b|\s|.', text):
        if piece in emoji_meaning_dict:
            # Known emoji: substitute its meaning.
            pieces.append(emoji_meaning_dict[piece])
            continue
        # Keep only tokens that are not emoji; unknown emoji are dropped.
        if emoji.demojize(piece) == piece:
            pieces.append(piece)

    return ''.join(pieces)



In [None]:
# Snapshot the frame, translate emoji, then log how many rows changed.
initial_df = df.copy()
df['tweet'] = df['tweet'].apply(extract_emoji)
count, changes = count_changes(initial_df, df, 'extract_emoji')
print("Banyak perubahan pada step ini = " , count)

Banyak perubahan pada step ini =  48


In [None]:
# Inspect the change log saved by the emoji step.
# It is loaded into its own variable: rebinding `df` here would replace the
# working tweets frame with a before/after log that has no 'tweet' column,
# which breaks every later preprocessing step.
import pandas as pd
emoji_changes_df = pd.read_csv('./Text-Preprocessing/extract_emoji.csv')
emoji_changes_df

Unnamed: 0.1,Unnamed: 0,sentimen,before,after
0,8,netral,iya bener aa kita mantap kan pilihan ke pemimp...,iya bener aa kita mantap kan pilihan ke pemimp...
1,42,netral,gameof thrones winter is coming pidato jokowi ...,gameof thrones winter is coming pidato jokowi ...
2,172,positif,liat video ini netes juga air mata kita doakan...,liat video ini netes juga air mata kita doakan...
3,256,netral,rb kl anda bl di agen langsung malah dapet rb ...,rb kl anda bl di agen langsung malah dapet rb ...
4,259,positif,memiliki pemikiran yang cemerlang untuk kemaju...,memiliki pemikiran yang cemerlang untuk kemaju...
5,270,netral,gas normal aja kg rb bbm aman saja malah setar...,gas normal aja kg rb bbm aman saja malah setar...
6,315,positif,siapapun pilihan pak kiai saya ttp menghargai ...,siapapun pilihan pak kiai saya ttp menghargai ...
7,330,netral,jalannya debat ini jawaban jokowi kepada sandi...,jalannya debat ini jawaban jokowi kepada sandi...
8,408,positif,insyaallah sudah paham mbah maksud saya apakah...,insyaallah sudah paham mbah maksud saya apakah...
9,460,positif,hahaha takdelah maâ€™aruf amin yang jadi calon...,hahaha takdelah maâ€aruf amin yang jadi calon ...


### Remove Special Character

Example = ¦¬‡ðÿ

In [None]:
def remove_special_characters(text):
    """Keep only the words made up entirely of lowercase ``a-z`` letters.

    The text is split on whitespace; any word containing another character
    is dropped as a whole, and the survivors are joined with single spaces.
    """
    lowercase_only = re.compile(r'^[a-z]+$')
    return ' '.join(token for token in text.split() if lowercase_only.match(token))

In [None]:
# Snapshot the frame, drop words with special characters, then log how many rows changed.
initial_df = df.copy()
df['tweet'] = df['tweet'].apply(remove_special_characters)
count, changes = count_changes(initial_df, df, 'remove_special_characters')
print("Banyak perubahan pada step ini = " , count)

Banyak perubahan pada step ini =  1111


### Remove 3 Repeating Character

Karakter yang berulang minimal 3 kali berturut-turut diringkas menjadi satu karakter

yaaaa -> ya

In [None]:
def clean_repeating_characters(text):
    """Collapse any character repeated three or more times in a row to one."""
    run_of_three_or_more = re.compile(r'(.)\1{2,}')
    return run_of_three_or_more.sub(r'\1', text)

In [None]:
# Snapshot the frame, collapse repeated characters, then log how many rows changed.
initial_df = df.copy()
df['tweet'] = df['tweet'].apply(clean_repeating_characters)
count, changes = count_changes(initial_df, df, 'clean_repeating_characters')
print("Banyak perubahan pada step ini = " , count)

Banyak perubahan pada step ini =  84


### Remove Single Character

apa kabar q -> apa kabar

In [None]:
def clean_single_alphabet(text):
    """Remove standalone single-character words, e.g. "apa kabar q" -> "apa kabar".

    When no such word exists the text is returned exactly as given.
    Otherwise the single characters are deleted, whitespace runs are
    collapsed to one space and the result is trimmed.
    """
    stripped, removed = re.subn(r'\b\w\b', '', text)
    if removed == 0:
        # Nothing matched: leave the text (including its spacing) untouched.
        return text
    return re.sub(r'\s+', ' ', stripped).strip()

In [None]:
# Snapshot the frame, drop single-character words, then log how many rows changed.
initial_df = df.copy()
df['tweet'] = df['tweet'].apply(clean_single_alphabet)
count, changes = count_changes(initial_df, df, 'clean_single_alphabet')
print("Banyak perubahan pada step ini = " , count)

Banyak perubahan pada step ini =  138


### Informal Words

In [None]:
# Accumulates every word that find_informal_word does not find in kbbi_df.
informal_words = []

# Reference table for the lookup (presumably the KBBI dictionary word list - confirm).
kbbi_df = pd.read_csv('./Utils/kbbi_data.csv')

In [None]:
def find_informal_word(sentence):
    """Record the words of ``sentence`` that are absent from ``kbbi_df``.

    Each whitespace-separated word that does not occur in any cell of
    ``kbbi_df`` is appended to the module-level ``informal_words`` list
    (duplicates included). The sentence itself is returned unchanged.
    """
    # NOTE(review): every membership test scans all of kbbi_df.values;
    # a precomputed set would likely be much faster - confirm before changing.
    missing = [token for token in sentence.split() if token not in kbbi_df.values]
    informal_words.extend(missing)
    return sentence

In [None]:
# Scan every tweet for words missing from kbbi_df; the tweet text is left unchanged.
df['tweet'] = df['tweet'].apply(find_informal_word)

In [None]:
# Deduplicate the collected informal words and sort them alphabetically.
informal_words_df = (
    pd.DataFrame({'words': informal_words})
    .drop_duplicates()
    .sort_values(by='words')
)

# Write the word list to disk without the index column.
informal_words_df.to_csv('./Text-Preprocessing/informal_words.csv', index=False)

### Language Translation

English ke Indonesia tidak dilakukan karena terlalu banyak kata yang malah jadi salah makna

In [None]:
# from nltk.corpus import words

# nltk.download('words')

# translator = GoogleTranslator(source='en', target='id')

# # Dictionary to save translated words
# translation_dict = {'english': [], 'bahasa': []}

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [None]:
# def translate_english(text):
#     result = text
#     for word in text.split():
#         if word in words.words():
#             translation = translator.translate(text=word).lower()
#             # Save the original word and its translation
#             translation_dict['english'].append(word)
#             translation_dict['bahasa'].append(translation)
#             result = result.replace(word, translation)
#     return result

In [None]:
# initial_df = df.copy()
# df['tweet'] = df['tweet'].apply(lambda x : translate_english(x))
# count, changes = count_changes(initial_df, df, 'translate_english')
# print("Banyak perubahan pada step ini = " , count)

In [None]:
# # Convert dictionary to DataFrame
# translation_df = pd.DataFrame.from_dict(translation_dict)

# # Remove duplicates
# translation_df = translation_df.drop_duplicates()

# # Sort by the column (English)
# translation_df = translation_df.sort_values(by='english')

# # Save DataFrame to CSV
# translation_df.to_csv('./Text-Preprocessing/translation_dict.csv', index=False)

### POS Tagging

In [None]:
# Tally how many stopwords fall under each POS tag (nlp_id stopword composition).
# NOTE(review): `PosTag` is not imported anywhere in the visible source, and
# `stopwords` here is the NLTK corpus object imported at the top - confirm both
# before running; the recorded output of this cell ends in a ModuleNotFoundError.
postagger = PosTag()
postag_sw = postagger.get_pos_tag(" ".join(stopwords))

dct = {}
for word, tag in postag_sw:
    # Start unseen tags at zero, then count this occurrence.
    dct[tag] = dct.get(tag, 0) + 1
dct

  from .autonotebook import tqdm as notebook_tqdm
2024-03-18 13:01:03.994149: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-18 13:01:04.127357: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-03-18 13:01:04.127394: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-03-18 13:01:04.149367: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-18 13:01:0

ModuleNotFoundError: No module named 'flair.data_fetcher'

### Stemming

In [None]:
!pip install Sastrawi


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Build a Sastrawi stemmer through its factory; `stemmer` is what stemming() calls.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
def stemming(text):
    """Return ``text`` as stemmed by the module-level Sastrawi ``stemmer``."""
    stemmed_text = stemmer.stem(text)
    return stemmed_text

In [None]:
# Snapshot the frame, stem every tweet, then log how many rows changed.
initial_df = df.copy()
df['tweet'] = df['tweet'].apply(stemming)
count, changes = count_changes(initial_df, df, 'stemming')
print("Banyak perubahan pada step ini = " , count)

Banyak perubahan pada step ini =  1690


### Remove stopwords 

In [None]:
# Fetch the NLTK stopword corpus.
nltk.download('stopwords')

# Indonesian stopwords held in a set for membership tests.
stop_words = set(stopwords.words('indonesian'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Check apakah suatu kata ada pada stopword atau tidak

In [None]:
# Edit this list to probe other words.
stopword_to_check = [ "tidak", "bukan", "belum", "enggak", "entah", "mungkin", "ragu", "seharusnya", "sebaiknya", "sebelumnya"]

# Report for each word whether it is part of the stopword set.
for i in stopword_to_check:
    suffix = ' termasuk stopword' if i in stop_words else ' tidak termasuk stopword'
    print(i + suffix)

tidak termasuk stopword
bukan termasuk stopword
belum termasuk stopword
enggak termasuk stopword
entah termasuk stopword
mungkin termasuk stopword
ragu tidak termasuk stopword
seharusnya termasuk stopword
sebaiknya termasuk stopword
sebelumnya termasuk stopword


Stopword tidak dilakukan karena terdapat beberapa kata yang termasuk stopword yang akan menghilangkan konteks analisis sentimen apabila dihapus 

In [None]:
# def clean_stopwords(text):
#     return " ".join([word for word in str(text).split() if word not in stop_words])

In [None]:
# initial_df = df.copy()
# df['tweet'] = df['tweet'].apply(lambda x : clean_stopwords(x))
# count, changes = count_changes(initial_df, df, 'stop_word')
# print("Banyak perubahan pada step ini = " , count)

Banyak perubahan pada step ini =  1797


### Hasil Akhir Preprocessing Tweet 

In [None]:
# Display the preprocessed frame.
df

Unnamed: 0,sentimen,tweet
0,negatif,kata prabowo indonesia tidak dihargai bangsa a...
1,netral,batuan langka tasbih jokowi hadiah dari habib ...
2,netral,di era jokowi ekonomi indonesia semakin baik i...
3,positif,bagi sumatera selatan asian games berdampak pd...
4,negatif,negara kita ngutang buat bngun infrastruktur y...
...,...,...
1810,netral,negarawan sejati sll bangga dan mengedepankan ...
1811,netral,satu hrs ceramah di damai indonesiaku dua pere...
1812,netral,mari bangun bangsa dgn mendukung perekonomian ...
1813,netral,bantu majukan perekonomian bangsa bersama pak ...


In [None]:
# Persist the cleaned tweets without the index column.
df.to_csv('./Preprocessed/cleaned_data.csv', index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=2b3c5800-c216-4f08-93af-5173ca1bb328' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>