In [71]:
import pickle
import pandas as pd
import re
import string
import unicodedata
import nltk
from bs4 import BeautifulSoup
from emo_unicode import UNICODE_EMO, EMOTICONS
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [73]:
class TextPreprocessing:
    def __init__(self, text="test"):
        self.text = text

    def lowercase(self):
        """Convert to lowercase"""
        self.text = str(self.text).lower()
        self.text = self.text.strip()
        return self

    def strip_html(self):
        """Stopword removal"""
        soup = BeautifulSoup(self.text, "html.parser")
        self.text = soup.get_text()
        return self

    def remove_url(self):
        """Remove URL (http/https/www) or custom URL"""
        self.text = re.sub(r"https?://\S+|www\.\S+", "", self.text)
        return self

    def remove_email(self):
        """Remove email"""
        self.text = re.sub("\S*@\S*\s?", "", self.text)
        return self

    def remove_between_square_brackets(self):
        """Remove string beetwen square brackets []"""
        self.text = re.sub("\[[^]]*\]", "", self.text)
        return self

    def remove_numbers(self):
        """Remove numbers"""
        self.text = re.sub("[-+]?[0-9]+", "", self.text)
        return self

    def remove_emoji(self):
        """Remove emoji, e.g 😜😀 """
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002500-\U00002BEF"  # chinese char
            "]+",
            flags=re.UNICODE,
        )
        self.text = emoji_pattern.sub(r"", self.text)
        return self

    def remove_emoticon(self):
        """Remove emoticon, e.g :-)"""
        emoticon_pattern = re.compile(u"(" + u"|".join(k for k in EMOTICONS) + u")")
        self.text = emoticon_pattern.sub(r"", self.text)
        return self
    
    def convert_emoji(self):
        """Convert emoji to word"""
        for emoji in UNICODE_EMO:
            self.text = self.text.replace(
                emoji,
                "_".join(UNICODE_EMO[emoji].replace(",", "").replace(":", "").split()),
            )
        return self

    def convert_emoticon(self):
        """Convert emoticon to word"""
        for emoticon in EMOTICONS:
            self.text = re.sub(
                u"(" + emoticon + ")",
                "_".join(EMOTICONS[emoticon].replace(",", "").split()),
                self.text,
            )
        return self

    def remove_punctuation(self):
        """Remove punctuation"""
        self.text = re.sub(r"[^\w\s]", "", self.text)
        return self

    def remove_non_ascii(self):
        """Remove non-ascii character"""
        self.text = (
            unicodedata.normalize("NFKD", self.text)
            .encode("ascii", "ignore")
            .decode("utf-8", "ignore")
        )
        return self

    def normalize_word(self):
        """Normalize slang world"""
        normal_word_path = pd.read_csv("C:/Users/ASUS/TA01/00_data/key_norm.csv")

        self.text = " ".join(
            [
                normal_word_path[normal_word_path["singkat"] == word]["hasil"].values[0]
                if (normal_word_path["singkat"] == word).any()
                else word
                for word in self.text.split()
            ]
        )
        return self

    def stemming(self):
        """Stemming for Bahasa with Sastrawi"""
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()

        self.text = stemmer.stem(self.text)
        return self

    def tokenize(self):
        """Tokenize words"""
        self.words = nltk.word_tokenize(self.text)
        return self

    def stopwords_removal(self):
        """Stopword removal"""
        stopword = stopwords.words("indonesian")
        more_stopword = [
            "assalamualaikum", "wr", "wb", "pak",
            "bu", "selamat", "siang", "pagi",
            "sore", "malam", "saya",
            "terimakasih", "terima",
            "kasih", "kepada", "bpk",
            "ibu", "mohon", "tolong",
            "maaf", "dear", "wassalamualaikum", "regards", ]  # add more stopword to default corpus
        stop_factory = stopword + more_stopword
        stop_factory.remove('tak')
        
        clean_words = []
        for word in self.words:
            if word not in stop_factory:
                clean_words.append(word)
        self.words = clean_words  
        return self

    def join_words(self):
        """Jonin all words"""
        self.words = " ".join(self.words)
        return self
    
    def do_all(self, text):
        """Do all text preprocessing process"""  # or custom process
        self.text = text
        self = self.lowercase()
        self = self.strip_html()
        self = self.remove_url()
        self = self.remove_email()
        self = self.remove_between_square_brackets()
        self = self.remove_numbers()
        self = self.remove_emoticon()
        self = self.remove_emoji()
        self = self.convert_emoticon()
        self = self.convert_emoji()
        self = self.remove_punctuation()
        self = self.remove_non_ascii()
        self = self.normalize_word()
        self = self.stemming()
        self = self.tokenize()
        self = self.stopwords_removal()
        self = self.join_words()
        return self.words

In [74]:
data_path = 'C:/Users/ASUS/TA01/01_data_analysis/01_pickle/01_data_training.pickle'

with open(data_path, 'rb') as data_training:
    data = pickle.load(data_training)

data

Unnamed: 0,keluhan,bagian,id,keluhan_length
0,Dear BAA Mohon di bantu untuk merubah status a...,BAA,1,253
1,"Dear BAA Telkom University,\nSaya sebagai sala...",BAA,1,202
2,Mohon maaf saya haidar mau komplain pada saat ...,BAA,1,213
3,"permisi saya mau komplain, biaya pendidikan sa...",BAA,1,186
4,"""Assalamualaikum wr wb.Maaf pak, saya sudah me...",BAA,1,366
...,...,...,...,...
788,"Assalamualaikum Wr. Wb Mohon maaf sebelumnya, ...",LABORAN,1,957
789,gaji asisten untuk FRI tidak sebanding (jika d...,LABORAN,1,71
790,"Selamat siang, maaf mangganggu. Saya Yusrin da...",LABORAN,1,184
791,"maaf sebelumnya, saya olyvia fransiska dari te...",LABORAN,1,222


In [75]:
tp = TextPreprocessing() # load module text preprocessing

data['clean_keluhan'] = data['keluhan'].apply(tp.do_all)

In [76]:
import time
import dask.dataframe as dd
from dask.multiprocessing import get

tp = TextPreprocessing() # load module text preprocessing

def dask_this(data):
    data['clean_keluhan'] = data['keluhan'].apply(tp.do_all)
    return data

ddata = dd.from_pandas(data, npartitions=10)

try:
    start_time = time.time()
    data = ddata.map_partitions(dask_this).compute(scheduler='processes', num_workers=10)
except:
    print('Text preprocessing failed !')
else:
    data.to_csv('C:/Users/ASUS/TA01/00_data/clean_data_training.csv', encoding='utf-8')
    print('Text preprocessing success !')
    print('Elapsed time:', time.time() - start_time, 'seconds')
finally:
    print('\nFinish')

Text preprocessing success !
Elapsed time: 38.818790912628174 seconds

Finish


In [77]:
columns = ['clean_keluhan', 'bagian']
data = data[columns]

with open('02_pickle/02_clean_data.pickle', 'wb') as output:
    pickle.dump(data, output)

In [78]:
data

Unnamed: 0,clean_keluhan,bagian
0,baa bantu rubah status akademik an rezza rijki...,BAA
1,baa telkom university salah alumni lihat data ...,BAA
2,haidar komplain bayar bank bni bank milik tera...,BAA
3,permisi komplain biaya didik lunas status biay...,BAA
4,wbmaaf cancel ksm dosen wali salah input mk am...,BAA
...,...,...
788,fakultas ilmu terap liburtanggal merah weekend...,LABORAN
789,gaji asisten fri banding dibandingin fakultas,LABORAN
790,mangganggu yusrin lab magics fte lampu toilet ...,LABORAN
791,olyvia fransiska teknik telekomunikasi keluh u...,LABORAN
