In [1]:
import pandas as pd
import re #regex library
import nltk
import string
# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist

In [2]:
# Load the raw tweet dataset from the Excel workbook into a DataFrame.
# NOTE(review): the rendered frame shows 'Unnamed: 0.1' and 'Unnamed: 0'
# columns, which look like index columns persisted by earlier exports —
# confirm whether they are needed.
TWEET_DATA = pd.read_excel('fixbanget.xlsx')
# Bare last expression: renders the frame as the cell output.
TWEET_DATA

Unnamed: 0.1,Unnamed: 0,created_at,username,text,label
0,0,2022-11-22,dollfaceAngie88,Selalu menyenangkan naik gojek :),1
1,1,2022-11-22,465O5,Pelayanan GOJEK jelek _-,0


In [3]:
##case folding
# Lower-case every tweet; the 'text' column is overwritten, so the original
# casing is no longer available after this cell.
TWEET_DATA['text'] = TWEET_DATA['text'].str.lower()
TWEET_DATA

Unnamed: 0.1,Unnamed: 0,created_at,username,text,label
0,0,2022-11-22,dollfaceAngie88,selalu menyenangkan naik gojek :),1
1,1,2022-11-22,465O5,pelayanan gojek jelek _-,0


In [4]:
## symbol removal
def remove_tweet_special(text):
    """Strip Twitter-specific noise from a single tweet.

    Steps, in order:
      1. Literal escape sequences that survived as text (a backslash followed
         by ``t``, ``n`` or ``u``) become a space; any remaining backslash is
         dropped.
      2. Every non-ASCII character (emoji, CJK, ...) is replaced by ``?``.
      3. Mentions, hashtags and complete URLs are removed and the whitespace
         around them is collapsed to single spaces.
      4. A dangling ``http://`` / ``https://`` left by a truncated URL is
         replaced by a space.

    Args:
        text: The tweet text as a ``str``.

    Returns:
        The cleaned text. Step 4 may leave extra spaces behind; whitespace is
        only normalised by step 3.
    """
    # These are the two-character sequences backslash+letter, not real control
    # characters; real tabs/newlines are handled by the split() further down.
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    # 'replace' substitutes one '?' per character that cannot be encoded.
    text = text.encode('ascii', 'replace').decode('ascii')
    # Raw string avoids invalid-escape warnings. Handles and hashtags may
    # contain underscores, so match \w+ instead of [A-Za-z0-9]+, which used to
    # leave fragments such as "_name" behind.
    text = ' '.join(re.sub(r"([@#]\w+)|(\w+://\S+)", " ", text).split())
    # A scheme with nothing after it is not matched by the URL pattern above.
    return text.replace("http://", " ").replace("https://", " ")
                
# Run the tweet-noise cleanup on every row; the 'text' column is overwritten.
TWEET_DATA['text'] = TWEET_DATA['text'].apply(remove_tweet_special)
#remove number
def remove_number(text):
    """Return ``text`` with every run of digits deleted (nothing is inserted)."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)

# Strip digits from every tweet; the 'text' column is overwritten.
TWEET_DATA['text'] = TWEET_DATA['text'].apply(remove_number)

#remove punctuation
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Deletion table: each punctuation character maps to None.
    drop_table = str.maketrans("", "", string.punctuation)
    cleaned = text.translate(drop_table)
    return cleaned

# Strip punctuation from every tweet; the 'text' column is overwritten.
TWEET_DATA['text'] = TWEET_DATA['text'].apply(remove_punctuation)

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    """Trim whitespace from both ends of ``text``; inner spacing is untouched."""
    trimmed = text.lstrip().rstrip()
    return trimmed

# Trim leading/trailing whitespace on every tweet; the 'text' column is overwritten.
TWEET_DATA['text'] = TWEET_DATA['text'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with each run of whitespace (spaces, tabs, newlines)
        replaced by exactly one space. Leading/trailing runs become a single
        space rather than being removed.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    return re.sub(r'\s+', ' ', text)

# Collapse repeated whitespace in every tweet; the 'text' column is overwritten.
TWEET_DATA['text'] = TWEET_DATA['text'].apply(remove_whitespace_multiple)

# remove single char
def remove_singl_char(text):
    """Remove stand-alone single ASCII letters and tidy the leftover spacing.

    A letter counts as stand-alone when it has a word boundary on both sides
    (e.g. the ``d`` in ``"d gojek"``). Deleting such letters leaves the
    spaces that surrounded them, so the result is re-normalised: whitespace
    runs are collapsed to one space and both ends are trimmed. Without this
    the function would reintroduce double/leading/trailing spaces.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without isolated single letters and without redundant
        whitespace.
    """
    without_singles = re.sub(r"\b[a-zA-Z]\b", "", text)
    return re.sub(r"\s+", " ", without_singles).strip()

# Drop isolated single letters from every tweet; the 'text' column is overwritten.
TWEET_DATA['text'] = TWEET_DATA['text'].apply(remove_singl_char)
# Parse each 'created_at' value on its own and keep only its date part.
TWEET_DATA['created_at'] = TWEET_DATA['created_at'].apply(lambda a: pd.to_datetime(a).date())
# Bare last expression: renders the cleaned frame as the cell output.
TWEET_DATA

Unnamed: 0.1,Unnamed: 0,created_at,username,text,label
0,0,2022-11-22,dollfaceAngie88,selalu menyenangkan naik gojek,1
1,1,2022-11-22,465O5,pelayanan gojek jelek,0


In [5]:
# Work on a copy so the steps below leave TWEET_DATA untouched.
df_preprocessed = TWEET_DATA.copy()
# 'created_at' is removed from the working frame.
df_preprocessed = df_preprocessed.drop(columns=['created_at'])
df_preprocessed.head()

Unnamed: 0.1,Unnamed: 0,username,text,label
0,0,dollfaceAngie88,selalu menyenangkan naik gojek,1
1,1,465O5,pelayanan gojek jelek,0


In [6]:
# Render the working frame (TWEET_DATA without 'created_at').
df_preprocessed

Unnamed: 0.1,Unnamed: 0,username,text,label
0,0,dollfaceAngie88,selalu menyenangkan naik gojek,1
1,1,465O5,pelayanan gojek jelek,0


In [7]:
# tokenization
def word_tokenize_wrapper(text):
    """Tokenize one tweet with NLTK's ``word_tokenize``.

    Thin wrapper so the tokenizer can be passed to ``Series.apply``; it is
    called with default arguments.

    Args:
        text: The cleaned tweet text.

    Returns:
        The result of ``word_tokenize(text)``; the cell output shows a list
        of word strings per tweet.
    """
    return word_tokenize(text)

# Store each tweet's tokens in a new 'tweet_tokens' column.
df_preprocessed['tweet_tokens'] = df_preprocessed['text'].apply(word_tokenize_wrapper)

# Preview the first rows of the tokenized column.
print('Tokenizing Result : \n') 
print(df_preprocessed['tweet_tokens'].head())
print('\n\n\n')
# TWEET_DATA

Tokenizing Result : 

0    [selalu, menyenangkan, naik, gojek]
1              [pelayanan, gojek, jelek]
Name: tweet_tokens, dtype: object






In [8]:
# NLTK calc frequency distribution
def freqDist_wrapper(text):
    """Build an NLTK ``FreqDist`` for one tweet.

    Args:
        text: Despite the name, the caller in this notebook passes the
            tweet's token list (the 'tweet_tokens' column), not a string.

    Returns:
        ``FreqDist(text)``.
    """
    return FreqDist(text)

# One FreqDist per tweet, stored in a new 'tweet_tokens_fdist' column.
df_preprocessed['tweet_tokens_fdist'] = df_preprocessed['tweet_tokens'].apply(freqDist_wrapper)
print('Frequency Tokens : \n') 
# most_common() turns each FreqDist into (token, count) pairs for display.
print(df_preprocessed['tweet_tokens_fdist'].head().apply(lambda x : x.most_common()))
# TWEET_DATA

Frequency Tokens : 

0    [(selalu, 1), (menyenangkan, 1), (naik, 1), (g...
1             [(pelayanan, 1), (gojek, 1), (jelek, 1)]
Name: tweet_tokens_fdist, dtype: object


In [9]:
from nltk.corpus import stopwords

# ----------------------- get stopwords from the NLTK corpus ----------------------------
# start from NLTK's Indonesian stopword list
list_stopwords = stopwords.words('indonesian')


# ---------------------------- manually added stopwords ---------------------------------
# append informal/abbreviated words on top of the NLTK list
# ('rt' appears twice; harmless because the list is turned into a set below)
# NOTE(review): '&amp' contains punctuation, which the cleaning cell strips from
# the text column — presumably this entry can never match a token; confirm.
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 
                       'kalo', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't', 
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah'])

# ----------------------- add stopwords from a txt file ---------------------------------
# read the stopword file with pandas into a single 'stopwords' column
txt_stopword = pd.read_csv("stopwords.txt", names= ["stopwords"], header = None)

# split the first row on spaces and append the words to the list
# NOTE(review): only row 0 is used — assumes stopwords.txt holds all words on
# one space-separated line; confirm against the file.
list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))

# ---------------------------------------------------------------------------------------

# convert the list to a set (removes duplicates, fast membership tests)
list_stopwords = set(list_stopwords)


# remove stopwords from the token list
def stopwords_removal(words):
    """Return the tokens of ``words`` that are not in ``list_stopwords``.

    Order and duplicates of the surviving tokens are preserved.
    """
    kept = []
    for token in words:
        if token in list_stopwords:
            continue
        kept.append(token)
    return kept

# Tokens with stopwords filtered out go into a new 'tweet_tokens_WSW' column.
df_preprocessed['tweet_tokens_WSW'] = df_preprocessed['tweet_tokens'].apply(stopwords_removal) 


print(df_preprocessed['tweet_tokens_WSW'].head())

0        [menyenangkan, gojek]
1    [pelayanan, gojek, jelek]
Name: tweet_tokens_WSW, dtype: object


In [10]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
def _get_stemmer():
    """Build the Sastrawi stemmer on first use and reuse it afterwards."""
    if not hasattr(_get_stemmer, "_instance"):
        _get_stemmer._instance = StemmerFactory().create_stemmer()
    return _get_stemmer._instance


def stemming(dokumen):
    """Stem every usable token of one document with the Sastrawi stemmer.

    The stemmer is created once and shared across calls; building a new
    factory and stemmer for every single token (as before) repeats the same
    set-up work for each word of each tweet.

    Args:
        dokumen: Iterable of token strings for one tweet.

    Returns:
        A list with the stem of each token, in order. Empty tokens and tokens
        containing non-ASCII characters are skipped, so the result can be
        shorter than the input.
    """
    stemmer = _get_stemmer()
    return [stemmer.stem(kata) for kata in dokumen if kata and kata.isascii()]
# Stemmed tokens go into a new column. The column name is spelled 'steeming'
# (sic); it is kept as-is because the exported file carries this name.
df_preprocessed['steeming'] = df_preprocessed['tweet_tokens_WSW'].apply(stemming)
print(df_preprocessed['steeming'].head())

0          [senang, gojek]
1    [layan, gojek, jelek]
Name: steeming, dtype: object


In [11]:
# Render the frame with all derived columns (tokens, frequencies, stopword-free tokens, stems).
df_preprocessed

Unnamed: 0.1,Unnamed: 0,username,text,label,tweet_tokens,tweet_tokens_fdist,tweet_tokens_WSW,steeming
0,0,dollfaceAngie88,selalu menyenangkan naik gojek,1,"[selalu, menyenangkan, naik, gojek]","{'selalu': 1, 'menyenangkan': 1, 'naik': 1, 'g...","[menyenangkan, gojek]","[senang, gojek]"
1,1,465O5,pelayanan gojek jelek,0,"[pelayanan, gojek, jelek]","{'pelayanan': 1, 'gojek': 1, 'jelek': 1}","[pelayanan, gojek, jelek]","[layan, gojek, jelek]"


In [12]:
# Save the preprocessed dataset to an Excel workbook.
# NOTE(review): to_excel is called with its defaults, which also writes the
# index as a column — presumably the origin of the 'Unnamed: 0' columns seen
# when the input was loaded; confirm before changing, consumers may rely on it.
df_preprocessed.to_excel("okefix.xlsx")  