# 1. Import Libraries

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Preprocession Text
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from IPython.display import clear_output
from imblearn.over_sampling import SMOTE
from nlp_id.lemmatizer import Lemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from transformers import pipeline

# Report
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Modelling
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Utility
from datetime import datetime
from pandas import read_csv
from pandas import DataFrame
import pickle, re, json
import numpy as np
from requests import get
from io import StringIO
import nltk, re, json, csv
from tqdm import tqdm
tqdm.pandas()
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/yelf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# 2. Features Extraction

## 2.1 Load Dataset

In [2]:
dataset_path = './pubg_dataset.csv'
review = read_csv(dataset_path)
print("\"{}\" is loaded succesfully".format(dataset_path))

# Create New Dataframe to keep every text preprocessing result
df = DataFrame(review['content'])
df.head()

"./pubg_dataset.csv" is loaded succesfully


Unnamed: 0,content
0,"Tencent, saya bukan player pro,. Tapi kami sec..."
1,Bug setelah update. 1 . Turun pesawat ga kebag...
2,"Keseluruhan game nya udah bagus, apalagi setel..."
3,"Pubg setelah update bnyak aja masalahnya, sepe..."
4,Game bagus...tapi masih banyak bug...masih per...


## 2.2 Function Declaration

### 2.2.1 Text Cleaning

In [3]:
# REMOVE SPECIAL CHARACTERS & CASE FOLDING
def cleaning(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', str(text))              # menghapus mention
    text = re.sub(r'#[A-Za-z0-9]+', '', text)                   # menghapus hashtag
    text = re.sub(r'RT[\s]', '', text)                          # menghapus RT
    text = re.sub(r"http\S+", '', text)                         # menghapus link
    text = re.sub(r'[0-9]+', '', text)                          # menghapus angka
    text = re.sub(r"[,.;@#?!&$]+\ *", " ", text)                # mengganti tanda baca dengan spasi
    text = re.sub(r'[^\w\s]', '', text)                         # menghapus karakter selain huruf dan angka
    text = text.replace('\n', ' ')                              # mengganti baris baru dengan spasi
    text = text.translate(str.maketrans('', '', punctuation))   # menghapus semua tanda baca
    text = text.strip(' ')                                      # menghapus karakter spasi dari kiri dan kanan teks
    text = re.sub(r' +', ' ', text)                             # Remove double space
    return text.lower()

In [4]:
print("Proses Cleaning     : START")
df['text_clean'] = df['content'].progress_apply(cleaning)
print("Proses Cleaning     : DONE\n")

sample = df.sample(n=1, random_state=1)
print("Printing sample...")
print("Text Original :", sample['content'].values[0])
print("Text Output   :", sample['text_clean'].values[0])

Proses Cleaning     : START


100%|██████████| 20000/20000 [00:00<00:00, 27414.21it/s]

Proses Cleaning     : DONE

Printing sample...
Text Original : Pubg mobile sekarang sering nge bug mulai dari voice chat gak keluar suara (gak fungsi), Bug masuk ke permainan, sampai bug server tidak merespon... Ini mengganggu kami par pemain.. Tolong segera diperbaiki
Text Output   : pubg mobile sekarang sering nge bug mulai dari voice chat gak keluar suara gak fungsi bug masuk ke permainan sampai bug server tidak merespon ini mengganggu kami par pemain tolong segera diperbaiki





### 2.2.2 Fixing Slangwords

In [5]:
# REMOVE SLANG WORDS
def fixSlangwords(text):
    words = text.split()
    fixed_words = []
    with open('./resources/slangwords/slangwords_dict.json', mode='r') as file:
        slangwords = json.load(file)
        
    for word in words:
        if word.lower() in slangwords:
            fixed_words.append(slangwords[word.lower()])
        else:
            fixed_words.append(word.lower())
    
    return ' '.join(fixed_words)

In [6]:
print("Proses Slangword    : START")
df['text_slangwords'] = df['text_clean'].progress_apply(fixSlangwords)
print("Proses Slangword    : DONE\n")

sample = df.sample(n=1, random_state=1)
print("Printing sample...")
print("Text Original :", sample['text_clean'].values[0])
print("Text Output   :", sample['text_slangwords'].values[0])

Proses Slangword    : START


100%|██████████| 20000/20000 [01:37<00:00, 205.30it/s]

Proses Slangword    : DONE

Printing sample...
Text Original : pubg mobile sekarang sering nge bug mulai dari voice chat gak keluar suara gak fungsi bug masuk ke permainan sampai bug server tidak merespon ini mengganggu kami par pemain tolong segera diperbaiki
Text Output   : pubg mobile sekarang sering nge bug mulai dari voice chat tidak keluar suara tidak fungsi bug masuk ke permainan sampai bug server tidak merespon ini mengganggu kami para pemain tolong segera diperbaiki





### 2.2.3 Remove Stopwords

In [12]:
# REMOVE STOPWORDS
def stopwordsRemove(text):
    words = text.split()
    fixed_words = []
    # factory = StopWordRemoverFactory()
    # stopwords_sastrawi = factory.get_stop_words()
    stopwords_indonesia = set(stopwords.words('indonesian'))
    stopwords_english = set(stopwords.words('english'))
    with open('./resources/stopwords/stopwords_dict.json', mode='r') as file:
        stopwords_dict = json.load(file)
        
    fixed_words = [word for word in words if word.lower() not in stopwords_dict]
    fixed_words = [word for word in fixed_words if word.lower() not in stopwords_english]
    fixed_words = [word for word in fixed_words if word.lower() not in stopwords_indonesia]
    # fixed_words = [word for word in fixed_words if word.lower() not in stopwords_sastrawi]
    
    return ' '.join(fixed_words)

In [13]:
print("Proses Stopword     : START")
df['text_stopwords'] = df['text_slangwords'].progress_apply(stopwordsRemove)
print("Proses Stopword     : DONE\n")

sample = df.sample(n=1, random_state=1)
print("Printing sample...")
print("Text Original :", sample['text_slangwords'].values[0])
print("Text Output   :", sample['text_stopwords'].values[0])

Proses Stopword     : START


100%|██████████| 20000/20000 [01:15<00:00, 263.47it/s]

Proses Stopword     : DONE

Printing sample...
Text Original : pubg mobile sekarang sering nge bug mulai dari voice chat tidak keluar suara tidak fungsi bug masuk ke permainan sampai bug server tidak merespon ini mengganggu kami para pemain tolong segera diperbaiki
Text Output   : pubg mobile bug voice chat suara fungsi bug masuk permainan bug server merespon mengganggu pemain tolong diperbaiki





### 2.2.4. Lemmatizing Words

In [14]:
# LEMMATIZING
def lemmatizerWords(text):
    words = text.split()
    lemmatized_words = []
    lemmatizer  = Lemmatizer()
    lemmatized_words.append(lemmatizer.lemmatize(word.lower()) for word in words)
    return ' '.join(lemmatized_words[0])

In [15]:
print("Proses Lemmatizing  : START")
df['text_lemmatize'] = df['text_stopwords'].progress_apply(lemmatizerWords)
print("Proses Lemmatizing  : DONE\n")

sample = df.sample(n=1, random_state=1)
print("Printing sample...")
print("Text Original :", sample['text_stopwords'].values[0])
print("Text Output   :", sample['text_lemmatize'].values[0])

Proses Lemmatizing  : START


100%|██████████| 20000/20000 [2:32:29<00:00,  2.19it/s]  

Proses Lemmatizing  : DONE

Printing sample...
Text Original : pubg mobile bug voice chat suara fungsi bug masuk permainan bug server merespon mengganggu pemain tolong diperbaiki
Text Output   : pubg mobile bug voice chat suara fungsi bug masuk main bug server merespon ganggu main tolong baik





## 2.3 Labeling

In [20]:
lexicon_positive, lexicon_negative = {}, {}

response = get(url='https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_positive.csv')
def create_dict(url, dictType):
    response = get(url=url)
    if response.status_code == 200:
        reader = csv.reader(StringIO(response.text), delimiter=',')
        if dictType == 'positive':
            print("Fetching {} lexicon data".format(dictType))
            for row in reader:
                lexicon_positive[row[0]] = int(row[1])
        else:
            print("Fetching {} lexicon data".format(dictType))
            for row in reader:
                lexicon_negative[row[0]] = int(row[1])
    else:
        print("Failed to fetch lexicon data")

create_dict('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_positive.csv', "positive")
create_dict('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_negative.csv', "negative")

def sentiment_analysis_lexicon_indonesia(text):
    score = 0
    text = text.split()
    for word in text:
        if (word in lexicon_positive):
            score = score + lexicon_positive[word]

    for word in text:
        if (word in lexicon_negative):
            score = score + lexicon_negative[word]

    return score, "positive" if score > 0 else "negative" if score < 0 else "neutral"

Fetching positive lexicon data
Fetching negative lexicon data


In [52]:
results = df['text_lemmatize'].progress_apply(sentiment_analysis_lexicon_indonesia)
results = list(zip(*results))
df['polarity_score'] = results[0]
df['sentiment'] = results[1]

100%|██████████| 19997/19997 [00:00<00:00, 207982.96it/s]


In [56]:
df.to_csv("./pubg_preprocessing.csv", index=False)
df.head(5)

Unnamed: 0,content,text_clean,text_slangwords,text_stopwords,text_lemmatize,polarity_score,sentiment
0,"Tencent, saya bukan player pro,. Tapi kami sec...",tencent saya bukan player pro tapi kami secara...,tencent saya bukan player profesional tapi kam...,tencent player profesional langsung mengeluh u...,tencent player profesional langsung keluh upda...,3,positive
1,Bug setelah update. 1 . Turun pesawat ga kebag...,bug setelah update turun pesawat ga kebagian p...,bug setelah update turun pesawat tidak kebagia...,bug update turun pesawat kebagian parasut kara...,bug update turun pesawat bagi parasut karakter...,-28,negative
2,"Keseluruhan game nya udah bagus, apalagi setel...",keseluruhan game nya udah bagus apalagi setela...,keseluruhan game nya sudah bagus apalagi setel...,game bagus update grafik makinn mantap kekecew...,game bagus update grafik makinn mantap kecewa ...,-4,negative
3,"Pubg setelah update bnyak aja masalahnya, sepe...",pubg setelah update bnyak aja masalahnya seper...,pubg setelah update banyak saja masalahnya sep...,pubg update bug map pas game player suka jalan...,pubg update bug map pas game player suka jalan...,-5,negative
4,Game bagus...tapi masih banyak bug...masih per...,game bagus tapi masih banyak bug masih perlu d...,game bagus tapi masih banyak bug masih perlu d...,game bagus bug diperbaiki mode tdm bug player ...,game bagus bug baik mode tdm bug player arena ...,-4,negative


## 2.4 TFIDF and UPSAMPLING

In [2]:
df = read_csv("./pubg_preprocessing.csv")

In [3]:
X, y = df["text_lemmatize"], df["sentiment"]

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X)

# Save tfidf vectorizer
with open('./resources/tfidfVectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf, file)
    
features = DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())
print("Features Shape : ", features.shape)
print("\nSebelum Upsampling\n", y.value_counts())

# UPSAMPLING data
smote = SMOTE()
x_over, y_over = smote.fit_resample(X_tfidf, y)
print("\nSetelah Upsampling\n", y_over.value_counts())

Features Shape :  (19997, 17401)

Sebelum Upsampling
 sentiment
negative    10841
positive     8196
neutral       960
Name: count, dtype: int64

Setelah Upsampling
 sentiment
positive    10841
negative    10841
neutral     10841
Name: count, dtype: int64


# 3. Model Training

## 3.1 Support Vector Machine | Split Data 70:30

In [4]:
X_train, X_test, y_train, y_test = train_test_split(x_over, y_over, test_size=0.3, random_state=42)     # splitting features
print("\nData Testing : ", len(y_train))
print("Data Testing : ", len(y_test))


Data Testing :  22766
Data Testing :  9757


In [5]:
svc_models = SVC(kernel='rbf', random_state=42)

search_space = {
    'C' : [2],
    'gamma' : [1.5]
}

grid_search_svm = GridSearchCV(
                estimator = svc_models, 
                param_grid = search_space,
                cv=5,
                verbose=4)


svc_model = grid_search_svm.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END ....................C=2, gamma=1.5;, score=0.938 total time= 2.6min
[CV 2/5] END ....................C=2, gamma=1.5;, score=0.942 total time= 2.6min
[CV 3/5] END ....................C=2, gamma=1.5;, score=0.942 total time= 2.6min
[CV 4/5] END ....................C=2, gamma=1.5;, score=0.942 total time= 2.5min
[CV 5/5] END ....................C=2, gamma=1.5;, score=0.947 total time= 2.5min


In [6]:
svc_model = grid_search_svm.best_estimator_
y_pred = svc_model.predict(X_test)

# cetak 10 hasil prediksi
print("Aktual   :", np.array(y_test)[:10])
print("Prediksi :", y_pred[:10])

# akurasi
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"\nAkurasi model SVM : {accuracy:.2f}%")

print("\n", classification_report(y_test, y_pred))

Aktual   : ['positive' 'neutral' 'neutral' 'positive' 'positive' 'neutral' 'negative'
 'negative' 'negative' 'negative']
Prediksi : ['positive' 'neutral' 'neutral' 'positive' 'positive' 'neutral' 'positive'
 'negative' 'negative' 'negative']

Akurasi model SVM : 94.85%

               precision    recall  f1-score   support

    negative       0.91      0.94      0.93      3316
     neutral       1.00      0.98      0.99      3209
    positive       0.94      0.92      0.93      3232

    accuracy                           0.95      9757
   macro avg       0.95      0.95      0.95      9757
weighted avg       0.95      0.95      0.95      9757



In [7]:
# save model
with open("./models/svm_model.pkl", mode="wb") as file:
    pickle.dump(svc_model, file)

## 3.2 Neural  Network | Split Data 60:40

In [8]:
X_train, X_test, y_train, y_test = train_test_split(x_over, y_over, test_size=0.4, random_state=42)     # splitting features
print("\nData Testing : ", len(y_train))
print("Data Testing : ", len(y_test))


Data Testing :  19513
Data Testing :  13010


In [9]:
mlp_models = MLPClassifier(random_state=42)

search_space_mlp = {
    "hidden_layer_sizes" : [200], 
    "activation" : ['tanh', 'relu'], 
    "solver" : ['adam'], 
    "learning_rate" : ['adaptive'] 
}

grid_search_mlp = GridSearchCV(
                    estimator= mlp_models,
                    param_grid= search_space_mlp,
                    cv=3,
                    verbose=4
)

grid_search_mlp.fit(X_train, y_train)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV 1/3] END activation=tanh, hidden_layer_sizes=200, learning_rate=adaptive, solver=adam;, score=0.899 total time= 5.9min
[CV 2/3] END activation=tanh, hidden_layer_sizes=200, learning_rate=adaptive, solver=adam;, score=0.906 total time= 6.1min
[CV 3/3] END activation=tanh, hidden_layer_sizes=200, learning_rate=adaptive, solver=adam;, score=0.899 total time= 5.6min
[CV 1/3] END activation=relu, hidden_layer_sizes=200, learning_rate=adaptive, solver=adam;, score=0.903 total time= 5.9min
[CV 2/3] END activation=relu, hidden_layer_sizes=200, learning_rate=adaptive, solver=adam;, score=0.906 total time= 6.0min
[CV 3/3] END activation=relu, hidden_layer_sizes=200, learning_rate=adaptive, solver=adam;, score=0.903 total time= 5.4min


In [11]:
mlp_model = grid_search_mlp.best_estimator_
y_pred_mlp = mlp_model.predict(X_test)

print("Aktual   : ", np.array(y_pred_mlp[:10]))
print("Prediksi : ", y_pred_mlp[:10])

mlp_accuracy = accuracy_score(y_test, y_pred_mlp) * 100
print(f"\nAkurasi Model MLP : {mlp_accuracy:.2f}%")

print("\n", classification_report(y_test, y_pred_mlp))

Aktual   :  ['positive' 'neutral' 'neutral' 'positive' 'positive' 'neutral' 'negative'
 'negative' 'negative' 'neutral']
Prediksi :  ['positive' 'neutral' 'neutral' 'positive' 'positive' 'neutral' 'negative'
 'negative' 'negative' 'neutral']

Akurasi Model MLP : 92.41%

               precision    recall  f1-score   support

    negative       0.96      0.87      0.91      4404
     neutral       0.88      1.00      0.94      4316
    positive       0.94      0.90      0.92      4290

    accuracy                           0.92     13010
   macro avg       0.93      0.92      0.92     13010
weighted avg       0.93      0.92      0.92     13010



In [13]:
# save model
with open("./models/mlp_model.pkl", mode="wb") as file:
    pickle.dump(mlp_model, file)

## 3.3 : Extreme Gradient Boosting | Split Data 80:20

In [14]:
X_train, X_test, y_train, y_test = train_test_split(x_over, y_over, test_size=0.2, random_state=42)     # splitting features
print("\nData Testing : ", len(y_train))
print("Data Testing : ", len(y_test))


Data Testing :  26018
Data Testing :  6505


In [14]:
xgb_model = GradientBoostingClassifier(random_state=42, n_estimators=1500, learning_rate=0.2, max_depth=10)
xgb_model.fit(X_train, y_train)

In [15]:
# xgb_model = grid_search_xgb.best_estimator_
y_pred_xgb = xgb_model.predict(X_test)

print("Aktual   : ", np.array(y_test[:10]))
print("Prediksi : ", y_pred_xgb[:10])

xgb_accuracy = accuracy_score(y_test, y_pred_xgb) * 100
print(f"\nAkurasi Model XGB: {xgb_accuracy:.2f}%")

print("\n", classification_report(y_test, y_pred_xgb))

Aktual   :  ['positive' 'neutral' 'neutral' 'positive' 'positive' 'neutral' 'negative'
 'negative' 'negative' 'negative']
Prediksi :  ['positive' 'neutral' 'neutral' 'negative' 'positive' 'neutral' 'positive'
 'negative' 'negative' 'negative']

Akurasi Model XGB: 92.64%

               precision    recall  f1-score   support

    negative       0.90      0.90      0.90      2181
     neutral       0.97      0.98      0.97      2109
    positive       0.91      0.90      0.91      2215

    accuracy                           0.93      6505
   macro avg       0.93      0.93      0.93      6505
weighted avg       0.93      0.93      0.93      6505



# 4. Inference

In [9]:
def preprocessingText(text):
    fixed_words = []
    lemmatized_words = []
    lemmatizer  = Lemmatizer()
    
    stopwords_indonesia = set(stopwords.words('indonesian'))
    stopwords_english = set(stopwords.words('english'))
    
    with open('./resources/slangwords/slangwords_dict.json', mode='r') as file:
        slangwords = json.load(file)
    with open('./resources/stopwords/stopwords_dict.json', mode='r') as file:
        stopwords_dict = json.load(file)
        
    text = re.sub(r'@[A-Za-z0-9]+', '', str(text))              # menghapus mention
    text = re.sub(r'#[A-Za-z0-9]+', '', text)                   # menghapus hashtag
    text = re.sub(r'RT[\s]', '', text)                          # menghapus RT
    text = re.sub(r"http\S+", '', text)                         # menghapus link
    text = re.sub(r'[0-9]+', '', text)                          # menghapus angka
    text = re.sub(r"[,.;@#?!&$]+\ *", " ", text)                # mengganti tanda baca dengan spasi
    text = re.sub(r'[^\w\s]', '', text)                         # menghapus karakter selain huruf dan angka
    text = text.replace('\n', ' ')                              # mengganti baris baru dengan spasi
    text = text.translate(str.maketrans('', '', punctuation))   # menghapus semua tanda baca
    text = text.strip(' ')                                      # menghapus karakter spasi dari kiri dan kanan teks
    text = re.sub(r' +', ' ', text)
    text = text.lower()
    
    words = text.split()
    for word in words:
        if word.lower() in slangwords:
            fixed_words.append(slangwords[word.lower()])
        else:
            fixed_words.append(word.lower())

        
    fixed_words = [word for word in fixed_words if word.lower() not in stopwords_dict]
    fixed_words = [word for word in fixed_words if word.lower() not in stopwords_english]
    fixed_words = [word for word in fixed_words if word.lower() not in stopwords_indonesia]
    
    lemmatized_words.append(lemmatizer.lemmatize(word.lower()) for word in fixed_words)
    
    return ' '.join(lemmatized_words[0])

In [11]:
def inferenceTfidf(text, model):
    with open('./resources/tfidfVectorizer.pkl', 'rb') as file:
        tfV = pickle.load(file)
        
    text_pre = preprocessingText(text)
    x = tfV.transform([text_pre]).toarray()
    
    if model == 'svm':
        model_name = "Support Vector Machine"
        pred = svc_model.predict(x)
    elif model == 'xgb':
        model_name = "Extreme Gradient Boosting"
        pred = xgb_model.predict(x)
    elif model == 'mlp':
        model_name = "Multi Layer Perceptron"
        pred = mlp_model.predict(x)
    
    return print(f"Model      : {model_name}\nText       : \"{text}\"\nSentimeent : \033[1m{pred[0]}\nPreprocess : {text_pre}")

In [52]:
# Inference or Testing
inferenceTfidf("Update terbarunya asik dan juga banyak fitur baru", "xgb")

Model      : Extreme Gradient Boosting
Text       : "Update terbarunya asik dan juga banyak fitur baru"
Sentimeent : [1mpositive
