## Import Libraries

In [2]:
import pandas as pd
import numpy as np
import re
import string

In [3]:
# !pip install nltk
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

## Read File

In [4]:
# Load only the comment text and its sentiment label from the raw export.
# A former `dataset['content'].str.encode('ascii', 'ignore')` statement was
# removed here: its result was never assigned, so it had no effect.
dataset = pd.read_excel("assets/nadiem_comment_2.xlsx", usecols=['content', 'sentiment'])
dataset.head(1)

Unnamed: 0,sentiment,content
0,neutral,Paa bales DM pa saya mau nanya soal paket bela...


## Drop NaN

In [5]:
# Row/column count before missing values are dropped, for comparison with the next cell.
dataset.shape

(1451, 2)

In [6]:
# Drop every row that has a missing value in any column; this mutates `dataset` in place.
dataset.dropna(inplace=True)
dataset.shape  # shape after the drop

(1451, 2)

In [7]:
# Renumber the index in place; drop=True discards the old index instead of keeping it as a column.
dataset.reset_index(drop=True, inplace=True)
dataset.head(1)

Unnamed: 0,sentiment,content
0,neutral,Paa bales DM pa saya mau nanya soal paket bela...


## Cleansing

In [8]:
def remove_unique_character(text):
    """Strip noise from one raw comment and return the cleaned text.

    Steps, in order:
      1. tabs, newlines and literal ``\\u`` sequences become spaces;
      2. non-ASCII characters (emoji, CJK, ...) are replaced by ``?``, which
         is removed later together with the rest of the punctuation;
      3. mentions, hashtags and complete URLs are replaced by a space;
      4. leftover ``http://`` / ``https://`` prefixes are replaced by a space;
      5. digits and ASCII punctuation are deleted;
      6. stand-alone single letters are deleted;
      7. whitespace is collapsed to single spaces and trimmed.

    Args:
        text: The raw comment as a ``str``.

    Returns:
        The cleaned string; it has no leading, trailing or repeated whitespace.
    """
    # Remove tab, new line, and literal backslash-u residue.
    text = text.replace('\t', " ").replace('\n', " ").replace(r'\u', ' ')
    # Replace non-ASCII characters with '?', which the punctuation step deletes.
    text = text.encode('ascii', 'replace').decode('ascii')
    # Remove mentions, hashtags and links. \w (not just [A-Za-z0-9]) is used so
    # that handles containing underscores are removed as a whole.
    text = re.sub(r"([@#]\w+)|(\w+://\S+)", " ", text)
    # Remove incomplete URLs.
    text = text.replace("http://", " ").replace("https://", " ")
    # Remove numbers.
    text = re.sub(r"\d+", "", text)
    # Remove punctuation.
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Remove single characters. This must run before whitespace is normalized,
    # otherwise each removed letter leaves a double/leading/trailing space behind.
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    # Collapse runs of whitespace and trim both ends.
    return " ".join(text.split())

In [9]:
# Run the cleansing function over every raw comment and keep the result in its own column.
dataset['cleansing'] = dataset['content'].map(remove_unique_character)
dataset.head(1)

Unnamed: 0,sentiment,content,cleansing
0,neutral,Paa bales DM pa saya mau nanya soal paket bela...,Paa bales DM pa saya mau nanya soal paket bela...


## Case Folding

In [10]:
# Lower-case the cleansed text; the custom stopword entries used later are all lower-case.
dataset['case_folding'] = dataset['cleansing'].str.lower()
dataset.head(1)

Unnamed: 0,sentiment,content,cleansing,case_folding
0,neutral,Paa bales DM pa saya mau nanya soal paket bela...,Paa bales DM pa saya mau nanya soal paket bela...,paa bales dm pa saya mau nanya soal paket bela...


## Tokenization

In [11]:
def word_tokenize_wrapper(text):
    """Split one text string into tokens by delegating to ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Tokenize every case-folded comment; each cell of the new column holds a list of tokens.
dataset['tokenization'] = dataset['case_folding'].map(word_tokenize_wrapper)
dataset.head(1)

Unnamed: 0,sentiment,content,cleansing,case_folding,tokenization
0,neutral,Paa bales DM pa saya mau nanya soal paket bela...,Paa bales DM pa saya mau nanya soal paket bela...,paa bales dm pa saya mau nanya soal paket bela...,"[paa, bales, dm, pa, saya, mau, nanya, soal, p..."


## Normalization

In [12]:
# ----------- normalization lookup table ------------
# The first column of the spreadsheet holds the term to look up, the second
# column holds its replacement.
normalizad_word = pd.read_excel(r"assets/key_norm.xlsx")
normalizad_word_dict = {}

# Read the two columns positionally with .iloc instead of row[0] / row[1]:
# integer keys on a label-indexed row rely on a deprecated positional fallback,
# and zip over the columns avoids building one Series per row via iterrows().
for source_term, replacement in zip(normalizad_word.iloc[:, 0], normalizad_word.iloc[:, 1]):
    # setdefault keeps the first mapping when a term occurs more than once.
    normalizad_word_dict.setdefault(source_term, replacement)

def normalized_term(document):
    """Return a new list where each token found in ``normalizad_word_dict`` is
    replaced by its mapped value; tokens without a mapping are kept as they are."""
    normalized_tokens = []
    for token in document:
        normalized_tokens.append(normalizad_word_dict.get(token, token))
    return normalized_tokens


In [13]:
# Replace each token that has an entry in the normalization dictionary.
dataset['normalization'] = dataset['tokenization'].map(normalized_term)
dataset.head(2)

Unnamed: 0,sentiment,content,cleansing,case_folding,tokenization,normalization
0,neutral,Paa bales DM pa saya mau nanya soal paket bela...,Paa bales DM pa saya mau nanya soal paket bela...,paa bales dm pa saya mau nanya soal paket bela...,"[paa, bales, dm, pa, saya, mau, nanya, soal, p...","[paa, balas, dm, pa, saya, mau, bertanya, soal..."
1,positive,Berikan kesejahteraan buat guru honorer pak......,Berikan kesejahteraan buat guru honorer pak An...,berikan kesejahteraan buat guru honorer pak an...,"[berikan, kesejahteraan, buat, guru, honorer, ...","[berikan, kesejahteraan, buat, guru, honorer, ..."


## Stopword

In [14]:
from nltk.corpus import stopwords
# ----------------------- load stopwords from the NLTK corpus -------------------------------
# Indonesian stopword list from NLTK; the slice below previews its first five entries.
list_stopwords = stopwords.words('indonesian')
list_stopwords[:5]

['ada', 'adalah', 'adanya', 'adapun', 'agak']

In [15]:
# ---------------------------- Custom stopwords ------------------------------------
# Rebuild the list from the NLTK corpus before appending the custom entries so
# this cell is idempotent: re-running it neither appends the extras a second
# time nor fails after `list_stopwords` has been rebound to a set.
list_stopwords = list(stopwords.words('indonesian'))

# Append additional informal / abbreviated stopwords.
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo',
                       'kalo', 'amp', 'biar', 'bikin', 'bilang',
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't',
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah', 'man'])

In [16]:
# Convert to a set (also drops duplicate entries); the stopword filter below only does membership tests.
list_stopwords = set(list_stopwords)

# Remove stopwords from a list of tokens.
def stopwords_removal(words):
    """Return the tokens of ``words`` that are not in ``list_stopwords``, in their original order."""
    kept = []
    for token in words:
        if token in list_stopwords:
            continue
        kept.append(token)
    return kept

# Filter the stopwords out of every normalized token list.
dataset['stopword'] = dataset['normalization'].map(stopwords_removal)
dataset.head(2)

Unnamed: 0,sentiment,content,cleansing,case_folding,tokenization,normalization,stopword
0,neutral,Paa bales DM pa saya mau nanya soal paket bela...,Paa bales DM pa saya mau nanya soal paket bela...,paa bales dm pa saya mau nanya soal paket bela...,"[paa, bales, dm, pa, saya, mau, nanya, soal, p...","[paa, balas, dm, pa, saya, mau, bertanya, soal...","[paa, balas, dm, pa, paket, belajar, telkomsel]"
1,positive,Berikan kesejahteraan buat guru honorer pak......,Berikan kesejahteraan buat guru honorer pak An...,berikan kesejahteraan buat guru honorer pak an...,"[berikan, kesejahteraan, buat, guru, honorer, ...","[berikan, kesejahteraan, buat, guru, honorer, ...","[kesejahteraan, guru, honorer, angkat, pns, be..."


## Stemming

In [17]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter

In [18]:
# Build the Sastrawi stemmer once; the stemming function below reuses this single instance.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [19]:
def get_stemmed_term(document):
    """Stem each token of ``document`` with the module-level ``stemmer`` and return the stems as a list."""
    stems = []
    for token in document:
        stems.append(stemmer.stem(token))
    return stems

# Stem every stopword-filtered token list; the call goes through the `.swifter` accessor instead of a plain `.apply`.
dataset['stemmed'] = dataset['stopword'].swifter.apply(get_stemmed_term)
dataset.head(5)

Pandas Apply:   0%|          | 0/1451 [00:00<?, ?it/s]

Unnamed: 0,sentiment,content,cleansing,case_folding,tokenization,normalization,stopword,stemmed
0,neutral,Paa bales DM pa saya mau nanya soal paket bela...,Paa bales DM pa saya mau nanya soal paket bela...,paa bales dm pa saya mau nanya soal paket bela...,"[paa, bales, dm, pa, saya, mau, nanya, soal, p...","[paa, balas, dm, pa, saya, mau, bertanya, soal...","[paa, balas, dm, pa, paket, belajar, telkomsel]","[paa, balas, dm, pa, paket, ajar, telkomsel]"
1,positive,Berikan kesejahteraan buat guru honorer pak......,Berikan kesejahteraan buat guru honorer pak An...,berikan kesejahteraan buat guru honorer pak an...,"[berikan, kesejahteraan, buat, guru, honorer, ...","[berikan, kesejahteraan, buat, guru, honorer, ...","[kesejahteraan, guru, honorer, angkat, pns, be...","[sejahtera, guru, honorer, angkat, pns, tahap,..."
2,neutral,Pak sekolah tatap muka pak😃,Pak sekolah tatap muka pak,pak sekolah tatap muka pak,"[pak, sekolah, tatap, muka, pak]","[pak, sekolah, tatap, muka, pak]","[sekolah, tatap, muka]","[sekolah, tatap, muka]"
3,neutral,@putiageng25 cari beasiswa aja. Rajin2 cari in...,cari beasiswa aja Rajin cari info di website p...,cari beasiswa aja rajin cari info di website p...,"[cari, beasiswa, aja, rajin, cari, info, di, w...","[cari, beasiswa, saja, rajin, cari, informasi,...","[cari, beasiswa, rajin, cari, informasi, situs...","[cari, beasiswa, rajin, cari, informasi, situs..."
4,positive,@sahabathistoria capek2 kuliah tapi kerja sesu...,capek kuliah tapi kerja sesuai progessi gak di...,capek kuliah tapi kerja sesuai progessi gak di...,"[capek, kuliah, tapi, kerja, sesuai, progessi,...","[capek, kuliah, tapi, kerja, sesuai, progessi,...","[capek, kuliah, kerja, sesuai, progessi, dihar...","[capek, kuliah, kerja, sesuai, progessi, harga..."


## Export

In [52]:
# Load the previously exported batch without its first column
# (presumably the index column written by an earlier to_excel call; verify).
cleansing_1 = pd.read_excel('assets/cleansing.xlsx').iloc[:, 1:]
# Fix the misspelled label value.
cleansing_1['lable'] = cleansing_1['lable'].replace({'nrgatif': 'negatif'})
cleansing_1.shape

(1515, 8)

**Drop NaN**

In [55]:
# Drop rows of the previously exported batch that have a missing value in any column (in place).
cleansing_1.dropna(inplace=True)
cleansing_1.shape

(1508, 8)

In [62]:
# Number of rows per label in the previously exported batch.
cleansing_1.lable.value_counts()

negatif    825
netral     399
positif    284
Name: lable, dtype: int64

In [59]:
# Rename 'sentiment' to 'lable' so the column name matches the one used by `cleansing_1`.
dataset.rename(columns={'sentiment': 'lable'}, inplace=True)

In [64]:
# Translate the English label values to the Indonesian spellings found in `cleansing_1`.
dataset['lable'] = dataset['lable'].replace({
    'neutral': 'netral',
    'positive': 'positif',
    'negative': 'negatif',
})

In [68]:
# Stack the two labelled batches row-wise (axis=0); each frame keeps its own row index here.
df_join = pd.concat([dataset, cleansing_1], axis=0)
df_join.shape

(2959, 8)

In [69]:
# Drop rows with a missing value in any column of the merged frame (in place).
df_join.dropna(inplace=True)
df_join.shape

(2959, 8)

In [71]:
# Remove rows whose 'content' value repeats an earlier row (in place).
df_join.drop_duplicates('content', inplace=True)

In [73]:
# Renumber the merged frame's index in place and discard the old index values.
df_join.reset_index(drop=True, inplace=True)

In [74]:
# Write the merged, de-duplicated frame to an Excel file.
df_join.to_excel('assets/cleansing_1.xlsx')

In [190]:
import json
# Serialize `dataset` as a list of per-row records, indented by 4 spaces.
# NOTE(review): this exports `dataset`, not the merged `df_join` that is written to Excel — confirm that is intended.
with open('assets/cleansing.json', 'w') as file:
    json.dump(dataset.to_dict(orient='records'), file, indent=4)