# Import Libraries

In [24]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Text Preprocessing
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.text import Tokenizer
from IPython.display import clear_output
from imblearn.over_sampling import SMOTE
from nlp_id.lemmatizer import Lemmatizer
from nltk.tokenize import word_tokenize
from googletrans import Translator
from nltk.corpus import stopwords
from collections import Counter
from string import punctuation

# Report
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Modelling
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Utility
from pandas import read_csv
from pandas import DataFrame
import pickle, re, json, time
import numpy as np
from requests import get
from io import StringIO
import nltk, re, json, csv
from tqdm import tqdm
tqdm.pandas()
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/yelf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Load Dataset

In [25]:
# Read the raw review export from disk and confirm which file was loaded.
dataset_path = './pubg_dataset.csv'
review = read_csv(dataset_path)
loaded_message = "\"{}\" is loaded succesfully".format(dataset_path)
print(loaded_message)

# Working frame: starts with only the raw 'content' column; each preprocessing
# step below adds its result as a new column next to it.
content_column = review['content']
df = DataFrame(content_column)
df.head()

"./pubg_dataset.csv" is loaded succesfully


Unnamed: 0,content
0,"Tencent, saya bukan player pro,. Tapi kami sec..."
1,Bug setelah update. 1 . Turun pesawat ga kebag...
2,"Keseluruhan game nya udah bagus, apalagi setel..."
3,"Pubg setelah update bnyak aja masalahnya, sepe..."
4,Game bagus...tapi masih banyak bug...masih per...


# Features Extraction

## Function Declaration

### Text Cleaning

In [26]:
# REMOVE SPECIAL CHARACTERS & CASE FOLDING
def cleaning(text):
    """Strip noise from a raw review and return it lowercased.

    Removes mentions, hashtags, retweet markers, links, digits and
    punctuation, then collapses runs of spaces.

    Args:
        text: Raw review text. It is converted with ``str()`` first, so a
            non-string value is cleaned as its string representation.

    Returns:
        The cleaned text, lowercased, with words separated by single spaces.
    """
    text = re.sub(r'@[A-Za-z0-9]+', '', str(text))              # drop @mentions
    text = re.sub(r'#[A-Za-z0-9]+', '', text)                   # drop #hashtags
    # Drop the retweet marker. The leading word boundary keeps words that
    # merely end in "RT" (e.g. "SMART ", "START ") from being truncated and
    # glued onto the following word.
    text = re.sub(r'\bRT\s', '', text)
    text = re.sub(r"http\S+", '', text)                         # drop links
    text = re.sub(r'[0-9]+', '', text)                          # drop digits
    text = re.sub(r"[,.;@#?!&$]+\ *", " ", text)                # common punctuation becomes a space so words stay separated
    text = re.sub(r'[^\w\s]', '', text)                         # drop anything that is not a word character or whitespace
    text = text.replace('\n', ' ')                              # newlines become spaces
    text = text.translate(str.maketrans('', '', punctuation))   # drop remaining ASCII punctuation (notably "_", which \w keeps)
    text = text.strip(' ')                                      # trim spaces at both ends
    text = re.sub(r' +', ' ', text)                             # collapse repeated spaces
    return text.lower()

In [27]:
# Apply the cleaning step to every raw review; progress_apply shows a tqdm bar.
print("Proses Cleaning     : START")
cleaned_reviews = df['content'].progress_apply(cleaning)
df['text_clean'] = cleaned_reviews
print("Proses Cleaning     : DONE\n")

# Spot-check: print one seeded (reproducible) row before and after the step.
sample = df.sample(n=1, random_state=1)
before_text = sample['content'].values[0]
after_text = sample['text_clean'].values[0]
print("Printing sample...")
print("Text Original :", before_text)
print("Text Output   :", after_text)

Proses Cleaning     : START


100%|██████████| 20000/20000 [00:00<00:00, 29732.86it/s]

Proses Cleaning     : DONE

Printing sample...
Text Original : Pubg mobile sekarang sering nge bug mulai dari voice chat gak keluar suara (gak fungsi), Bug masuk ke permainan, sampai bug server tidak merespon... Ini mengganggu kami par pemain.. Tolong segera diperbaiki
Text Output   : pubg mobile sekarang sering nge bug mulai dari voice chat gak keluar suara gak fungsi bug masuk ke permainan sampai bug server tidak merespon ini mengganggu kami par pemain tolong segera diperbaiki





### Fixing Slangwords

In [28]:
# NORMALIZE SLANG WORDS (replace each slang word with its standard form)
_SLANGWORDS_PATH = './resources/slangwords/slangwords_dict.json'
_slangwords_cache = None


def _load_slangwords():
    """Return the slang dictionary, reading the JSON file only on first use."""
    global _slangwords_cache
    # Compare against None (not truthiness) so an empty dictionary is cached too.
    if _slangwords_cache is None:
        with open(_SLANGWORDS_PATH, mode='r', encoding='utf-8') as file:
            _slangwords_cache = json.load(file)
    return _slangwords_cache


def fixSlangwords(text):
    """Replace slang words in ``text`` with their standard form.

    Each whitespace-separated word is lowercased and looked up in the slang
    dictionary; words without an entry are kept (lowercased) as they are.
    The dictionary file is parsed once and reused, instead of being re-opened
    for every row the function is applied to.

    Args:
        text: Text whose words are separated by whitespace.

    Returns:
        The lowercased words, slang replaced, joined by single spaces.

    Raises:
        FileNotFoundError: If the slang dictionary file is missing on the
            first call.
    """
    slangwords = _load_slangwords()
    lowered_words = (word.lower() for word in text.split())
    return ' '.join(slangwords.get(word, word) for word in lowered_words)

In [29]:
# Normalize slang in the cleaned text; progress_apply shows a tqdm bar.
print("Proses Slangword    : START")
normalized_reviews = df['text_clean'].progress_apply(fixSlangwords)
df['text_slangwords'] = normalized_reviews
print("Proses Slangword    : DONE\n")

# Spot-check: print one seeded (reproducible) row before and after the step.
sample = df.sample(n=1, random_state=1)
before_text = sample['text_clean'].values[0]
after_text = sample['text_slangwords'].values[0]
print("Printing sample...")
print("Text Original :", before_text)
print("Text Output   :", after_text)

Proses Slangword    : START


100%|██████████| 20000/20000 [01:39<00:00, 200.37it/s]

Proses Slangword    : DONE

Printing sample...
Text Original : pubg mobile sekarang sering nge bug mulai dari voice chat gak keluar suara gak fungsi bug masuk ke permainan sampai bug server tidak merespon ini mengganggu kami par pemain tolong segera diperbaiki
Text Output   : pubg mobile sekarang sering nge bug mulai dari voice chat tidak keluar suara tidak fungsi bug masuk ke permainan sampai bug server tidak merespon ini mengganggu kami par pemain tolong segera diperbaiki





### Remove Stopwords

In [30]:
# REMOVE STOPWORDS
_STOPWORDS_PATH = './resources/stopwords/stopwords_dict.json'
_stopwords_cache = None


def _load_stopwords():
    """Return the union of all stopword sources, built only on first use."""
    global _stopwords_cache
    if _stopwords_cache is None:
        with open(_STOPWORDS_PATH, mode='r', encoding='utf-8') as file:
            custom_stopwords = json.load(file)
        # set() takes the elements of a JSON array or the keys of a JSON
        # object, which matches what the `in` test saw on the raw value.
        combined = set(custom_stopwords)
        combined.update(stopwords.words('indonesian'))
        combined.update(StopWordRemoverFactory().get_stop_words())
        _stopwords_cache = frozenset(combined)
    return _stopwords_cache


def stopwordsRemove(text):
    """Drop every stopword from ``text``.

    A word is removed when its lowercase form appears in the project's
    stopword JSON file, NLTK's Indonesian stopword list, or Sastrawi's
    stopword list. The three sources are merged into one set that is built
    once and reused, instead of being rebuilt for every row.

    Args:
        text: Text whose words are separated by whitespace.

    Returns:
        The remaining words, in their original casing and order, joined by
        single spaces.
    """
    all_stopwords = _load_stopwords()
    kept_words = [word for word in text.split() if word.lower() not in all_stopwords]
    return ' '.join(kept_words)

In [31]:
# Remove stopwords from the slang-normalized text; progress_apply shows a tqdm bar.
print("Proses Stopword     : START")
filtered_reviews = df['text_slangwords'].progress_apply(stopwordsRemove)
df['text_stopwords'] = filtered_reviews
print("Proses Stopword     : DONE\n")

# Spot-check: print one seeded (reproducible) row before and after the step.
sample = df.sample(n=1, random_state=1)
before_text = sample['text_slangwords'].values[0]
after_text = sample['text_stopwords'].values[0]
print("Printing sample...")
print("Text Original :", before_text)
print("Text Output   :", after_text)

Proses Stopword     : START


100%|██████████| 20000/20000 [01:22<00:00, 241.80it/s]

Proses Stopword     : DONE

Printing sample...
Text Original : pubg mobile sekarang sering nge bug mulai dari voice chat tidak keluar suara tidak fungsi bug masuk ke permainan sampai bug server tidak merespon ini mengganggu kami par pemain tolong segera diperbaiki
Text Output   : pubg mobile bug voice chat suara fungsi bug masuk permainan bug server merespon mengganggu par pemain diperbaiki





### Lemmatizing Words

In [32]:
# LEMMATIZING
_lemmatizer_cache = None


def _get_lemmatizer():
    """Return a shared Lemmatizer, constructing it only on first use."""
    global _lemmatizer_cache
    if _lemmatizer_cache is None:
        _lemmatizer_cache = Lemmatizer()
    return _lemmatizer_cache


def lemmatizerWords(text):
    """Lemmatize every word of ``text``.

    Each whitespace-separated word is lowercased and passed to the
    lemmatizer on its own. One Lemmatizer instance is reused across calls
    rather than constructing a new one for every row.

    Args:
        text: Text whose words are separated by whitespace.

    Returns:
        The lemmatized words joined by single spaces.
    """
    # NOTE(review): reuse assumes Lemmatizer.lemmatize keeps no state between
    # calls - confirm against the nlp_id documentation.
    lemmatizer = _get_lemmatizer()
    return ' '.join(lemmatizer.lemmatize(word.lower()) for word in text.split())

In [33]:
# Lemmatize the stopword-filtered text; progress_apply shows a tqdm bar.
print("Proses Lemmatizing  : START")
lemmatized_reviews = df['text_stopwords'].progress_apply(lemmatizerWords)
df['text_lemmatize'] = lemmatized_reviews
print("Proses Lemmatizing  : DONE\n")

# Spot-check: print one seeded (reproducible) row before and after the step.
sample = df.sample(n=1, random_state=1)
before_text = sample['text_stopwords'].values[0]
after_text = sample['text_lemmatize'].values[0]
print("Printing sample...")
print("Text Original :", before_text)
print("Text Output   :", after_text)

Proses Lemmatizing  : START


100%|██████████| 20000/20000 [2:32:18<00:00,  2.19it/s]  

Proses Lemmatizing  : DONE

Printing sample...
Text Original : pubg mobile bug voice chat suara fungsi bug masuk permainan bug server merespon mengganggu par pemain diperbaiki
Text Output   : pubg mobile bug voice chat suara fungsi bug masuk main bug server merespon ganggu par main baik





In [35]:
# Preview the first five rows with every preprocessing column side by side.
df.head(n=5)

Unnamed: 0,content,text_clean,text_slangwords,text_stopwords,text_lemmatize
0,"Tencent, saya bukan player pro,. Tapi kami sec...",tencent saya bukan player pro tapi kami secara...,tencent saya bukan player profesional tapi kam...,tencent player profesional mengeluh update bug...,tencent player profesional keluh update bug fr...
1,Bug setelah update. 1 . Turun pesawat ga kebag...,bug setelah update turun pesawat ga kebagian p...,bug setelah update turun pesawat tidak kebagia...,bug update turun pesawat kebagian parasut kara...,bug update turun pesawat bagi parasut karakter...
2,"Keseluruhan game nya udah bagus, apalagi setel...",keseluruhan game nya udah bagus apalagi setela...,keseluruhan game nya sudah bagus apalagi setel...,game bagus update grafik makinn mantap kekecew...,game bagus update grafik makinn mantap kecewa ...
3,"Pubg setelah update bnyak aja masalahnya, sepe...",pubg setelah update bnyak aja masalahnya seper...,pubg setelah update banyak saja masalahnya sep...,pubg update bug map pas game player suka jalan...,pubg update bug map pas game player suka jalan...
4,Game bagus...tapi masih banyak bug...masih per...,game bagus tapi masih banyak bug masih perlu d...,game bagus tapi masih banyak bug masih perlu d...,game bagus bug diperbaiki mode tdm bug player ...,game bagus bug baik mode tdm bug player arena ...


### Save Dataset

In [34]:
# Persist all preprocessing columns; the row index is not written to the file.
preprocessed_output_path = './pubg_preprocessing_new.csv'
df.to_csv(preprocessed_output_path, index=False)

## Load Dataset preprocessing

In [None]:
# Reload a preprocessed dataset from disk, replacing the in-memory frame.
# NOTE(review): this reads 'pubg_preprocessing.csv', while the save cell above
# writes 'pubg_preprocessing_new.csv' - confirm which file is intended.
preprocessed_input_path = './pubg_preprocessing.csv'
df = read_csv(preprocessed_input_path)
df.head()

## Create Words Dictionary

In [None]:
# Counter words function declaration
def counter_word(texts):
    """Count how often each whitespace-separated word occurs across texts.

    Entries that are not strings are skipped. This matters for a column
    reloaded from CSV, where a row that was an empty string comes back as
    NaN and has no ``split`` method.

    Args:
        texts: An iterable of strings, such as a pandas Series or a list.

    Returns:
        A ``collections.Counter`` mapping each word to its total count.
    """
    count = Counter()
    for text in texts:
        if not isinstance(text, str):
            continue
        count.update(text.split())
    return count

# Column that feeds both the word counter and the tokenizer.
# NOTE(review): the preprocessing cells above create 'text_slangwords' and
# 'text_lemmatize', not 'text_slang', and the earlier comment here mentioned
# text_lemmatize - confirm which column the loaded CSV is expected to have.
source_texts = df['text_slang']

# Count word frequencies; the vocabulary size is the number of distinct words.
counter = counter_word(source_texts)
num_unique_words = len(counter)     # 20993 per the original author's note
# counter.most_common(5)
print("\nUnique words : ", num_unique_words)

# Fit a Keras tokenizer on the same column, sized to the vocabulary.
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(source_texts)

# Word-to-index mapping learned by the tokenizer.
word_index = tokenizer.word_index

# Save the word index as JSON.
with open('./word_dictionary.json', mode='w') as file:
    json.dump(word_index, file, indent=4)

# Save the word counts as CSV.
word_df = DataFrame(counter.items(), columns=["Word", "Count"])
word_df.head()  # result is discarded: not the last expression of the cell
word_df.to_csv('./word_dictionary.csv', index=False)