# Library

In [1]:
# UPSAMPLING
from imblearn.over_sampling import SMOTE

# Preprocession Text
from sklearn.feature_extraction.text import TfidfVectorizer
from nlp_id.lemmatizer import Lemmatizer
from nltk.tokenize import word_tokenize
from string import punctuation

# Report
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Modelling
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Utility
from pandas import read_csv
from pandas import DataFrame
import pickle, re, json
import numpy as np
from requests import get
from io import StringIO

import nltk, re, json, csv

# Dataset

In [2]:
# LOAD dataset
dataset_path = './dataset_minecraft.csv'
minecraft_df = read_csv(dataset_path)
print("\"{}\" is loaded succesfully".format(dataset_path))

"./dataset_minecraft.csv" is loaded succesfully


In [19]:
minecraft_df.head(1)

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,9d03bd81-f407-44e2-af1d-f12aac9806e3,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,"Gameplay sudah bagus, tapi ada sedikit bug pad...",1,1632,1.21.2.02,2024-07-26 18:44:38,,,1.21.2.02


# Features Extraction

In [10]:
def preProcessing(text):
    fixed_slangwords = []
    fix_stopwords = []
    stopwordsDict =[]
    
    text = re.sub(r'@[A-Za-z0-9]+', '', str(text))
    text = re.sub(r'#[A-Za-z0-9]+', '', text)
    text = re.sub(r'RT[\s]', '', text)
    text = re.sub(r"http\S+", '', text)
    text = re.sub(r'[0-9]+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.replace('\n', ' ')
    text = text.translate(str.maketrans('', '', punctuation))
    text = text.strip(' ')
    text = text.lower()

    with open('./resource/slangwords_dict.txt', 'r') as file :
        slangwords = json.loads(file.readline())
    words = text.split()
    
    for word in words:
        if word.lower() in slangwords: fixed_slangwords.append(slangwords[word.lower()])
        else : fixed_slangwords.append(word)
    text = ' '.join(fixed_slangwords)
    
    text = word_tokenize(text)
    with open('./resource/stopwords.txt', 'r') as file:
        stopwords = file.readlines()
    for word in stopwords:
        word = word.replace('\n', '')
        stopwordsDict.append(word)
    for txt in text:
        if txt not in stopwordsDict: fix_stopwords.append(txt)
    text = fix_stopwords

    lemmatizer = Lemmatizer()
    text = [lemmatizer.lemmatize(word.lower()) for word in text]

    return ' '.join(word for word in text)

In [3]:
# REMOVE SPECIAL CHARACTERS & CASE FOLDING
def cleaning(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', str(text)) # menghapus mention
    text = re.sub(r'#[A-Za-z0-9]+', '', text) # menghapus hashtag
    text = re.sub(r'RT[\s]', '', text) # menghapus RT
    text = re.sub(r"http\S+", '', text) # menghapus link
    text = re.sub(r'[0-9]+', '', text) # menghapus angka
    text = re.sub(r'[^\w\s]', '', text) # menghapus karakter selain huruf dan angka
    text = text.replace('\n', ' ') # mengganti baris baru dengan spasi
    text = text.translate(str.maketrans('', '', punctuation)) # menghapus semua tanda baca
    text = text.strip(' ') # menghapus karakter spasi dari kiri dan kanan teks
    return text.lower()

# REMOVE SLANG WORDS
def fixSlangwords(text):
    with open('./slangwords.txt', 'r') as file :
        slangwords = file.readlines()
        
    for slang in slangwords:
        slang = slang.replace('\n', '')
        slang = slang.split('\t')
        
    words = text.split()
    fixed_words = []
    for word in words:
        if word.lower() in slangwords:
            fixed_words.append(slang[word.lower()])
        else :
            fixed_words.append(word)
    return ' '.join(fixed_words)

# TOKENIZING
def tokenizer(text):
    return word_tokenize(text)

# REMOVE STOPWORDS
def stopwordsRemove(text):
    with open('./stopwords.txt', 'r') as file:
        stopwords = file.readlines()
        
    stopwordsDict =[]
    for word in stopwords:
        word = word.replace('\n', '')
        stopwordsDict.append(word)

    fix_words = []
    for txt in text:
        if txt not in stopwordsDict:
            fix_words.append(txt)
    return fix_words

# LEMMATIZING
def lemmatizerWord(text):
    lemmatizer = Lemmatizer()
    return [lemmatizer.lemmatize(word.lower()) for word in text]

# Final
def toSentence(text):
    return ' '.join(word for word in text)

## Apply to Dataset

In [22]:
print("Proses Cleaning     : START")
minecraft_df['text_clean'] = minecraft_df.content.apply(cleaning)
print("Proses Cleaning     : DONE\n")

Proses Cleaning     : START
Proses Cleaning     : DONE



In [23]:
print("Proses Slangword    : START")
minecraft_df['text_slangwords'] = minecraft_df['text_clean'].apply(fixSlangwords)
print("Proses Slangword    : DONE\n")

Proses Slangword    : START
Proses Slangword    : DONE



In [24]:
print("Proses Tokenizing   : START")
minecraft_df['text_tokenizingText'] = minecraft_df['text_slangwords'].apply(tokenizer)
print("Proses Tokenizing   : DONE\n")

Proses Tokenizing   : START
Proses Tokenizing   : DONE



In [25]:
print("Proses Stopword     : START")
minecraft_df['text_stopword'] = minecraft_df['text_tokenizingText'].apply(stopwordsRemove)
print("Proses Stopword     : DONE\n")

Proses Stopword     : START
Proses Stopword     : DONE



In [26]:
print("Proses Lemmatizing     : START")
minecraft_df['text_lemmatizing'] = minecraft_df['text_stopword'].apply(lemmatizerWord)
print("Proses Lemmatizing     : DONE\n")

Proses Lemmatizing     : START
Proses Lemmatizing     : DONE



In [27]:
print("Proses Final        : START")
minecraft_df['text_akhir'] = minecraft_df['text_lemmatizing'].apply(toSentence)
print("Proses Final        : DONE\n")

Proses Final        : START
Proses Final        : DONE



In [28]:
lexicon_positive, lexicon_negative = {}, {}

response = get(url='https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_positive.csv')
def create_dict(url, dictType):
    response = get(url=url)
    if response.status_code == 200:
        reader = csv.reader(StringIO(response.text), delimiter=',')
        if dictType == 'positive':
            print("Fetching {} lexicon data".format(dictType))
            for row in reader:
                lexicon_positive[row[0]] = int(row[1])
        else:
            print("Fetching {} lexicon data".format(dictType))
            for row in reader:
                lexicon_negative[row[0]] = int(row[1])
    else:
        print("Failed to fetch lexicon data")

create_dict('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_positive.csv', "positive")
create_dict('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_negative.csv', "negative")

Fetching positive lexicon data
Fetching negative lexicon data


In [29]:
def sentiment_analysis_lexicon_indonesia(text):
    score = 0

    for word in text:
        if (word in lexicon_positive):
            score = score + lexicon_positive[word]

    for word in text:
        if (word in lexicon_negative):
            score = score + lexicon_negative[word]

    sentiment=''

    if (score > 0) :sentiment = 'positive'
    elif (score < 0) :sentiment = 'negative'
    else: sentiment = 'neutral'

    return score, sentiment

In [30]:
results = minecraft_df['text_lemmatizing'].apply(sentiment_analysis_lexicon_indonesia)
results = list(zip(*results))
minecraft_df['polarity_score'] = results[0]
minecraft_df['sentiment'] = results[1]

In [4]:
# minecraft_df.to_csv('dataset_minecraft_preprocessed.csv', index=False)
minecraft_df = read_csv('./dataset_minecraft_preprocessed.csv')
minecraft_df.dropna(subset=['text_akhir'], inplace=True)

## TFIDF and UPSAMPLING

In [5]:
X = minecraft_df['text_akhir']
y = minecraft_df['sentiment']


tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X)

# Save tfidf vectorizer
with open('../tfidf/tfidfVectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf, file)

features = DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())
print("Features Shape : ", features.shape)
print("\nSebelum Upsampling\n", y.value_counts())

# UPSAMPLING data
smote = SMOTE()
x_over, y_over = smote.fit_resample(X_tfidf, y)
print("\nSetelah Upsampling\n", y_over.value_counts())

X_train, X_test, y_train, y_test = train_test_split(x_over, y_over, test_size=0.2, random_state=42)     # splitting features
print("\nData Testing : ", len(y_train))
print("Data Testing : ", len(y_test))

Features Shape :  (14996, 13980)

Sebelum Upsampling
 sentiment
positive    7080
negative    6318
neutral     1598
Name: count, dtype: int64

Setelah Upsampling
 sentiment
negative    7080
positive    7080
neutral     7080
Name: count, dtype: int64

Data Testing :  16992
Data Testing :  4248


# Model Training

### Skema 1 : Support Vector Machine

In [33]:
svc_models = SVC(kernel='rbf', random_state=42)

search_space = {
    'C' : [2],
    'gamma' : [1.1, 1.3, 1.5]
}

grid_search_svm = GridSearchCV(
                estimator = svc_models, 
                param_grid = search_space,
                cv=5,
                verbose=4)

svc_model = grid_search_svm.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END ....................C=2, gamma=1.1;, score=0.911 total time=  20.4s
[CV 2/5] END ....................C=2, gamma=1.1;, score=0.911 total time=  19.9s
[CV 3/5] END ....................C=2, gamma=1.1;, score=0.904 total time=  20.3s
[CV 4/5] END ....................C=2, gamma=1.1;, score=0.903 total time=  20.0s
[CV 5/5] END ....................C=2, gamma=1.1;, score=0.910 total time=  19.6s
[CV 1/5] END ....................C=2, gamma=1.3;, score=0.907 total time=  20.9s
[CV 2/5] END ....................C=2, gamma=1.3;, score=0.910 total time=  20.4s
[CV 3/5] END ....................C=2, gamma=1.3;, score=0.901 total time=  19.6s
[CV 4/5] END ....................C=2, gamma=1.3;, score=0.901 total time=  19.5s
[CV 5/5] END ....................C=2, gamma=1.3;, score=0.907 total time=  19.7s
[CV 1/5] END ....................C=2, gamma=1.5;, score=0.900 total time=  21.8s
[CV 2/5] END ....................C=2, gamma=1.5;,

In [34]:
svc_model = grid_search_svm.best_estimator_
y_pred = svc_model.predict(X_test)

# cetak 10 hasil prediksi
print("Aktual   :", np.array(y_test)[:10])
print("Prediksi :", y_pred[:10])

# akurasi
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"\nAkurasi model SVM : {accuracy:.2f}%")

print("\n", classification_report(y_test, y_pred))

Aktual   : ['positive' 'neutral' 'neutral' 'negative' 'neutral' 'negative' 'neutral'
 'positive' 'neutral' 'positive']
Prediksi : ['positive' 'positive' 'positive' 'negative' 'negative' 'negative'
 'neutral' 'positive' 'neutral' 'positive']

Akurasi model SVM : 92.42%

               precision    recall  f1-score   support

    negative       0.89      0.94      0.91      1390
     neutral       0.98      0.91      0.94      1403
    positive       0.91      0.93      0.92      1455

    accuracy                           0.92      4248
   macro avg       0.93      0.92      0.92      4248
weighted avg       0.93      0.92      0.92      4248



### Skema 2 : Extreme Gradient Boosting

In [35]:
xgb_models = GradientBoostingClassifier(random_state=42)

search_space_xgb = {
    "n_estimators" : [500, 1000],
    "learning_rate" : [0.1],
    "max_depth" : [6, 9]
}

grid_search_xgb = GridSearchCV(
                    estimator= xgb_models,
                    param_grid= search_space_xgb,
                    cv=2,
                    verbose=4
)

grid_search_xgb.fit(X_train, y_train)

Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV 1/2] END learning_rate=0.1, max_depth=6, n_estimators=500;, score=0.821 total time= 2.2min
[CV 2/2] END learning_rate=0.1, max_depth=6, n_estimators=500;, score=0.818 total time= 2.1min
[CV 1/2] END learning_rate=0.1, max_depth=6, n_estimators=1000;, score=0.831 total time= 4.3min
[CV 2/2] END learning_rate=0.1, max_depth=6, n_estimators=1000;, score=0.825 total time= 4.2min
[CV 1/2] END learning_rate=0.1, max_depth=9, n_estimators=500;, score=0.824 total time= 2.9min
[CV 2/2] END learning_rate=0.1, max_depth=9, n_estimators=500;, score=0.823 total time= 2.9min
[CV 1/2] END learning_rate=0.1, max_depth=9, n_estimators=1000;, score=0.832 total time= 5.8min
[CV 2/2] END learning_rate=0.1, max_depth=9, n_estimators=1000;, score=0.829 total time= 5.6min


In [36]:
xgb_model = grid_search_xgb.best_estimator_
y_pred_xgb = xgb_model.predict(X_test)

print("Aktual   : ", np.array(y_test[:10]))
print("Prediksi : ", y_pred_xgb[:10])

xgb_accuracy = accuracy_score(y_test, y_pred_xgb) * 100
print(f"\nAkurasi Model XGB: {xgb_accuracy:.2f}%")

print("\n", classification_report(y_test, y_pred_xgb))

Aktual   :  ['positive' 'neutral' 'neutral' 'negative' 'neutral' 'negative' 'neutral'
 'positive' 'neutral' 'positive']
Prediksi :  ['positive' 'positive' 'neutral' 'negative' 'neutral' 'negative' 'neutral'
 'positive' 'positive' 'positive']

Akurasi Model XGB: 86.72%

               precision    recall  f1-score   support

    negative       0.88      0.83      0.86      1390
     neutral       0.85      0.94      0.89      1403
    positive       0.88      0.84      0.86      1455

    accuracy                           0.87      4248
   macro avg       0.87      0.87      0.87      4248
weighted avg       0.87      0.87      0.87      4248



### Skema 3 : Neural  Network

In [6]:
mlp_models = MLPClassifier(random_state=42)

search_space_mlp = {
    "hidden_layer_sizes" : [200, 256, 300], 
    "activation" : ['tanh', 'relu'], 
    "solver" : ['adam'], 
    "learning_rate" : ['adaptive'] 
}

grid_search_mlp = GridSearchCV(
                    estimator= mlp_models,
                    param_grid= search_space_mlp,
                    cv=2,
                    verbose=4
)

grid_search_mlp.fit(X_train, y_train)

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV 1/2] END activation=tanh, hidden_layer_sizes=200, learning_rate=adaptive, solver=adam;, score=0.846 total time= 3.3min
[CV 2/2] END activation=tanh, hidden_layer_sizes=200, learning_rate=adaptive, solver=adam;, score=0.848 total time= 3.1min
[CV 1/2] END activation=tanh, hidden_layer_sizes=256, learning_rate=adaptive, solver=adam;, score=0.847 total time= 4.0min
[CV 2/2] END activation=tanh, hidden_layer_sizes=256, learning_rate=adaptive, solver=adam;, score=0.851 total time= 3.9min
[CV 1/2] END activation=tanh, hidden_layer_sizes=300, learning_rate=adaptive, solver=adam;, score=0.845 total time= 7.0min
[CV 2/2] END activation=tanh, hidden_layer_sizes=300, learning_rate=adaptive, solver=adam;, score=0.851 total time= 7.5min
[CV 1/2] END activation=relu, hidden_layer_sizes=200, learning_rate=adaptive, solver=adam;, score=0.850 total time= 2.6min
[CV 2/2] END activation=relu, hidden_layer_sizes=200, learning_rate=adaptive, s

In [9]:
mlp_model = grid_search_mlp.best_estimator_
y_pred_mlp = mlp_model.predict(X_test)

print("Aktual   : ", np.array(y_pred_mlp[:10]))
print("Prediksi : ", y_pred_mlp[:10])

mlp_accuracy = accuracy_score(y_test, y_pred_mlp) * 100
print(f"\nAkurasi Model MLP : {mlp_accuracy:.2f}")

print("\n", classification_report(y_test, y_pred_mlp))

Aktual   :  ['positive' 'neutral' 'neutral' 'negative' 'neutral' 'negative' 'positive'
 'positive' 'neutral' 'positive']
Prediksi :  ['positive' 'neutral' 'neutral' 'negative' 'neutral' 'negative' 'positive'
 'positive' 'neutral' 'positive']

Akurasi Model MLP : 90.09

               precision    recall  f1-score   support

    negative       0.95      0.87      0.91      1396
     neutral       0.83      0.98      0.90      1422
    positive       0.95      0.85      0.90      1430

    accuracy                           0.90      4248
   macro avg       0.91      0.90      0.90      4248
weighted avg       0.91      0.90      0.90      4248



# Inference or Testing

In [49]:
def inferenceTfidf(text, model):
    with open('../tfidf/tfidfVectorizer.pkl', 'rb') as file:
        tfV = pickle.load(file)
        
    text_pre = preprocessingText(text)
    x = tfV.transform([text_pre]).toarray()
    
    if model == 'svm':
        model_name = "Support Vector Machine"
        pred = svc_model.predict(x)
    elif model == 'xgb':
        model_name = "Extreme Gradient Boosting"
        pred = xgb_model.predict(x)
    elif model == 'mlp':
        model_name = "Multi Layer Perceptron"
        pred = mlp_model.predict(x)
    
    return print(f"Model      : {model_name}\nText       : \"{text}\"\nSentimeent : \033[1m{pred[0]}")

In [50]:
def preprocessingText (text):
    x = cleaning(text)
    x = fixSlangwords(x)
    x = tokenizer(x)
    x = stopwordsRemove(x)
    x = lemmatizerWord(x)
    return toSentence(x)

In [52]:
# Inference or Testing
inferenceTfidf("Update terbarunya asik dan juga banyak fitur baru", "xgb")

Model      : Extreme Gradient Boosting
Text       : "Update terbarunya asik dan juga banyak fitur baru"
Sentimeent : [1mpositive
