In [103]:
import pickle
import pandas as pd
import re
import string
import unicodedata
import nltk
from bs4 import BeautifulSoup
from emo_unicode import UNICODE_EMO, EMOTICONS
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [104]:
class TextPreprocessing:
    def __init__(self, text="test"):
        self.text = text

    def lowercase(self):
        """Convert to lowercase"""
        self.text = self.text.lower()
        self.text = self.text.strip()
        return self

    def strip_html(self):
        """Stopword removal"""
        soup = BeautifulSoup(self.text, "lxml")
        self.text = soup.get_text()
        return self

    def remove_url(self):
        """Remove URL (http/https/www) or custom URL"""
        self.text = re.sub(r"https?://\S+|www\.\S+", "", self.text)
        self.text = re.sub(r"pic.twitter.com\S+", "", self.text)  # custom for twitter
        return self

    def remove_email(self):
        """Remove email"""
        self.text = re.sub("\S*@\S*\s?", "", self.text)
        return self

    def remove_between_square_brackets(self):
        """Remove string beetwen square brackets []"""
        self.text = re.sub("\[[^]]*\]", "", self.text)
        return self

    def remove_numbers(self):
        """Remove numbers"""
        self.text = re.sub("[-+]?[0-9]+", "", self.text)
        return self

    def remove_emoji(self):
        """Remove emoji, e.g 😜😀 """
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002500-\U00002BEF"  # chinese char
            "]+",
            flags=re.UNICODE,
        )
        self.text = emoji_pattern.sub(r"", self.text)
        return self

    def remove_emoticon(self):
        """Remove emoticon, e.g :-)"""
        emoticon_pattern = re.compile(u"(" + u"|".join(k for k in EMOTICONS) + u")")
        self.text = emoticon_pattern.sub(r"", self.text)
        return self

    def convert_emoji(self):
        """Convert emoji to word"""
        for emoji in UNICODE_EMO:
            self.text = self.text.replace(
                emoji,
                "_".join(UNICODE_EMO[emoji].replace(",", "").replace(":", "").split()),
            )
        return self

    def convert_emoticon(self):
        """Convert emoticon to word"""
        for emoticon in EMOTICONS:
            self.text = re.sub(
                u"(" + emoticon + ")",
                "_".join(EMOTICONS[emoticon].replace(",", "").split()),
                self.text,
            )
        return self

    def remove_punctuation(self):
        """Remove punctuation"""
        self.text = re.sub(r"[^\w\s]", "", self.text)
        return self

    def remove_non_ascii(self):
        """Remove non-ascii character"""
        self.text = (
            unicodedata.normalize("NFKD", self.text)
            .encode("ascii", "ignore")
            .decode("utf-8", "ignore")
        )
        return self

    def normalize_word(self):
        """Normalize slang world"""
        normal_word_path = pd.read_csv("C:/Users/ASUS/TA01/key_norm.csv")

        self.text = " ".join(
            [
                normal_word_path[normal_word_path["singkat"] == word]["hasil"].values[0]
                if (normal_word_path["singkat"] == word).any()
                else word
                for word in self.text.split()
            ]
        )
        return self

    def stemming(self):
        """Stemming for Bahasa with Sastrawi"""
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()

        self.text = stemmer.stem(self.text)
        return self

    def tokenize(self):
        """Tokenize words"""
        self.words = nltk.word_tokenize(self.text)
        return self

    def stopwords_removal(self):
        """Stopword removal"""
        stopword = stopwords.words("indonesian")
        more_stopword = [
            "assalamualaikum", "wr", "wb", "pak",
            "bu", "selamat", "siang", "pagi",
            "sore", "malam", "saya",
            "terimakasih", "terima",
            "kasih", "kepada", "bpk",
            "ibu", "mohon", "tolong",
            "maaf", "dear", "wassalamualaikum", "regards", ]  # add more stopword to default corpus
        stop_factory = stopword + more_stopword

        clean_words = []
        for word in self.words:
            if word not in stop_factory:
                clean_words.append(word)
        self.words = clean_words
        return self

    def join_words(self):
        """Jonin all words"""
        self.words = " ".join(self.words)
        return self

    def do_all(self, text):
        """Do all text preprocessing process"""  # or custom process
        self.text = text
        self = self.lowercase()
        self = self.strip_html()
        self = self.remove_url()
        self = self.remove_email()
        self = self.remove_between_square_brackets()
        self = self.remove_numbers()
        self = self.remove_emoticon()
        self = self.remove_emoji()
        self = self.convert_emoticon()
        self = self.convert_emoji()
        self = self.remove_punctuation()
        self = self.remove_non_ascii()
        self = self.normalize_word()
        self = self.stemming()
        self = self.tokenize()
        self = self.stopwords_removal()
        self = self.join_words()
        return self.words

In [105]:
data_path = 'C:/Users/ASUS/TA01/01_pickle/01_data_training.pickle'

with open(data_path, 'rb') as data_training:
    data = pickle.load(data_training)

data.head(10)

Unnamed: 0,keluhan,bagian,id,keluhan_length
0,"Dear BAA,Mohon di bantu untuk merubah satatus ...",BAA,1,254.0
1,"Dear BAA Telkom University,\nSaya sebagai sala...",BAA,1,202.0
2,Mohon maaf saya haidar mau komplain pada saat ...,BAA,1,213.0
3,"permisi saya mau komplain, biaya pendidikan sa...",BAA,1,186.0
4,"""Assalamualaikum wr wb.Maaf pak, saya sudah me...",BAA,1,366.0
5,"Assalamualaikum pak/bu, saya Rizqillah Zahra (...",BAA,1,443.0
6,"""Saya lupa melakukan cetak nilai akhir (KHS), ...",BAA,1,180.0
7,saya ingin mencetak transkip nilai akhir tapi ...,BAA,1,111.0
8,"""Selamat siang, ;\nSaya Aulia Fiya Maulida Mah...",BAA,1,563.0
9,saya ingin mencetak transkip nilai akhir tapi ...,BAA,1,111.0


In [113]:
import time
import dask.dataframe as dd
from dask.multiprocessing import get

tp = TextPreprocessing() # load module text preprocessing

def dask_this(data):
    data['clean_keluhan'] = data['keluhan'].apply(tp.do_all)
    return data

ddata = dd.from_pandas(data, npartitions=10)

try:
    start_time = time.time()
    data = ddata.map_partitions(dask_this).compute(scheduler='processes', num_workers=10)
except:
    print('Text preprocessing failed !')
else:
    data.to_csv('C:/Users/ASUS/TA01/clean_data_training.csv', encoding='utf-8')
    print('Text preprocessing success !')
    print('Elapsed time:', time.time() - start_time, 'seconds')

Text preprocessing failed !
