# Library

In [None]:
# UPSAMPLING
from imblearn.over_sampling import SMOTE

# Preprocession Text
from sklearn.feature_extraction.text import TfidfVectorizer
from nlp_id.lemmatizer import Lemmatizer
from nltk.tokenize import word_tokenize
from string import punctuation

# Report
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Modelling
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Utility
from pandas import read_csv
from pandas import DataFrame
import pickle, re, json
import numpy as np

# Dataset

In [9]:
# LOAD dataset
dataset_path = './dataset/dataset_minecraft.csv'
df = read_csv(dataset_path)
df = df.dropna()
df = df.drop_duplicates()
print("\"{}\" is loaded succesfully".format(dataset_path))

dataset = df['Text Clean'].to_numpy()
label = df['Sentiment'].to_numpy()

"./dataset/dataset_minecraft.csv" is loaded succesfully


# Features Extraction

In [10]:
def preProcessing(text):
    fixed_slangwords = []
    fix_stopwords = []
    stopwordsDict =[]
    
    text = re.sub(r'@[A-Za-z0-9]+', '', str(text))
    text = re.sub(r'#[A-Za-z0-9]+', '', text)
    text = re.sub(r'RT[\s]', '', text)
    text = re.sub(r"http\S+", '', text)
    text = re.sub(r'[0-9]+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.replace('\n', ' ')
    text = text.translate(str.maketrans('', '', punctuation))
    text = text.strip(' ')
    text = text.lower()

    with open('./resource/slangwords_dict.txt', 'r') as file :
        slangwords = json.loads(file.readline())
    words = text.split()
    
    for word in words:
        if word.lower() in slangwords: fixed_slangwords.append(slangwords[word.lower()])
        else : fixed_slangwords.append(word)
    text = ' '.join(fixed_slangwords)
    
    text = word_tokenize(text)
    with open('./resource/stopwords.txt', 'r') as file:
        stopwords = file.readlines()
    for word in stopwords:
        word = word.replace('\n', '')
        stopwordsDict.append(word)
    for txt in text:
        if txt not in stopwordsDict: fix_stopwords.append(txt)
    text = fix_stopwords

    lemmatizer = Lemmatizer()
    text = [lemmatizer.lemmatize(word.lower()) for word in text]

    return ' '.join(word for word in text)

## TFIDF and UPSAMPLING

In [11]:
X = df['Text Clean']
y = df['Sentiment']

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X)

features = DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())
print("Features Shape : ", features.shape)
print("\nSebelum Upsampling\n", y.value_counts())

# UPSAMPLING data
smote = SMOTE()
x_over, y_over = smote.fit_resample(X_tfidf, y)
print("\nSetelah Upsampling\n", y_over.value_counts())

X_train, X_test, y_train, y_test = train_test_split(x_over, y_over, test_size=0.2, random_state=42)     # splitting features
print("\nData Testing : ", len(y_train))
print("Data Testing : ", len(y_test))

Features Shape :  (14637, 13998)

Sebelum Upsampling
 Sentiment
positive    7034
negative    6122
neutral     1481
Name: count, dtype: int64

Setelah Upsampling
 Sentiment
negative    7034
positive    7034
neutral     7034
Name: count, dtype: int64

Data Testing :  16881
Data Testing :  4221


# Model Training

### Skema 1 : Support Vector Machine

In [15]:
svc_models = SVC(kernel='rbf', random_state=42)

search_space = {
    'C' : [2],
    'gamma' : [1.1, 1.3, 1.5]
}

grid_search_svm = GridSearchCV(
                estimator = svc_models, 
                param_grid = search_space,
                cv=5,
                verbose=4)

svc_model = grid_search_svm.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END ....................C=2, gamma=1.1;, score=0.904 total time=  35.6s
[CV 2/5] END ....................C=2, gamma=1.1;, score=0.910 total time=  33.1s
[CV 3/5] END ....................C=2, gamma=1.1;, score=0.900 total time=  32.2s
[CV 4/5] END ....................C=2, gamma=1.1;, score=0.901 total time=  34.1s
[CV 5/5] END ....................C=2, gamma=1.1;, score=0.902 total time=  33.5s
[CV 1/5] END ....................C=2, gamma=1.3;, score=0.901 total time=  36.2s
[CV 2/5] END ....................C=2, gamma=1.3;, score=0.907 total time=  34.7s
[CV 3/5] END ....................C=2, gamma=1.3;, score=0.898 total time=  34.5s
[CV 4/5] END ....................C=2, gamma=1.3;, score=0.898 total time=  35.0s
[CV 5/5] END ....................C=2, gamma=1.3;, score=0.902 total time=  36.4s
[CV 1/5] END ....................C=2, gamma=1.5;, score=0.896 total time=  43.9s
[CV 2/5] END ....................C=2, gamma=1.5;,

In [16]:
svc_model = grid_search_svm.best_estimator_
y_pred = svc_model.predict(X_test)

# cetak 10 hasil prediksi
print("Aktual   :", np.array(y_test)[:10])
print("Prediksi :", y_pred[:10])

# akurasi
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"\nAkurasi model SVM : {accuracy:.2f}%")

print("\n", classification_report(y_test, y_pred))

Aktual   : ['negative' 'neutral' 'positive' 'neutral' 'negative' 'negative' 'negative' 'negative' 'negative' 'neutral']
Prediksi : ['negative' 'neutral' 'positive' 'negative' 'negative' 'negative' 'negative' 'negative' 'negative' 'negative']

Akurasi model SVM : 92.28%

               precision    recall  f1-score   support

    negative       0.90      0.92      0.91      1415
     neutral       0.98      0.91      0.94      1408
    positive       0.89      0.94      0.92      1398

    accuracy                           0.92      4221
   macro avg       0.92      0.92      0.92      4221
weighted avg       0.92      0.92      0.92      4221



### Skema 2 : Extreme Gradient Boosting

In [21]:
xgb_models = GradientBoostingClassifier(random_state=42)

search_space_xgb = {
    "n_estimators" : [500, 1000],
    "learning_rate" : [0.1],
    "max_depth" : [6, 9]
}

grid_search_xgb = GridSearchCV(
                    estimator= xgb_models,
                    param_grid= search_space_xgb,
                    cv=2,
                    verbose=4
)

grid_search_xgb.fit(X_train, y_train)

Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV 1/2] END learning_rate=0.1, max_depth=6, n_estimators=500;, score=0.817 total time= 2.9min
[CV 2/2] END learning_rate=0.1, max_depth=6, n_estimators=500;, score=0.817 total time= 3.0min
[CV 1/2] END learning_rate=0.1, max_depth=6, n_estimators=1000;, score=0.823 total time= 5.7min
[CV 2/2] END learning_rate=0.1, max_depth=6, n_estimators=1000;, score=0.828 total time= 5.4min
[CV 1/2] END learning_rate=0.1, max_depth=9, n_estimators=500;, score=0.821 total time= 3.8min
[CV 2/2] END learning_rate=0.1, max_depth=9, n_estimators=500;, score=0.823 total time= 3.7min
[CV 1/2] END learning_rate=0.1, max_depth=9, n_estimators=1000;, score=0.826 total time= 7.5min
[CV 2/2] END learning_rate=0.1, max_depth=9, n_estimators=1000;, score=0.829 total time= 7.1min


In [22]:
xgb_model = grid_search_xgb.best_estimator_
y_pred_xgb = xgb_model.predict(X_test)

print("Aktual   : ", np.array(y_test[:10]))
print("Prediksi : ", y_pred_xgb[:10])

xgb_accuracy = accuracy_score(y_test, y_pred_xgb) * 100
print(f"\nAkurasi Model XGB: {xgb_accuracy:.2f}%")

print("\n", classification_report(y_test, y_pred_xgb))

Aktual   :  ['negative' 'neutral' 'positive' 'neutral' 'negative' 'negative' 'negative' 'negative' 'negative' 'neutral']
Prediksi :  ['negative' 'neutral' 'positive' 'neutral' 'positive' 'neutral' 'negative' 'neutral' 'negative' 'negative']

Akurasi Model XGB: 88.04%

               precision    recall  f1-score   support

    negative       0.90      0.84      0.87      1415
     neutral       0.87      0.94      0.90      1408
    positive       0.88      0.86      0.87      1398

    accuracy                           0.88      4221
   macro avg       0.88      0.88      0.88      4221
weighted avg       0.88      0.88      0.88      4221



### Skema 3 : Neural  Network

In [23]:
mlp_models = MLPClassifier(random_state=42)

search_space_mlp = {
    "hidden_layer_sizes" : [150, 200, 250, 300], 
    "activation" : ['tanh'], 
    "solver" : ['adam'], 
    "learning_rate" : ['adaptive'] 
}

grid_search_mlp = GridSearchCV(
                    estimator= mlp_models,
                    param_grid= search_space_mlp,
                    cv=2,
                    verbose=4
)

grid_search_mlp.fit(X_train, y_train)

Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV 1/2] END activation=tanh, hidden_layer_sizes=150, learning_rate=adaptive, solver=adam;, score=0.850 total time= 2.6min
[CV 2/2] END activation=tanh, hidden_layer_sizes=150, learning_rate=adaptive, solver=adam;, score=0.851 total time= 2.3min
[CV 1/2] END activation=tanh, hidden_layer_sizes=200, learning_rate=adaptive, solver=adam;, score=0.851 total time= 3.3min
[CV 2/2] END activation=tanh, hidden_layer_sizes=200, learning_rate=adaptive, solver=adam;, score=0.849 total time= 3.1min
[CV 1/2] END activation=tanh, hidden_layer_sizes=250, learning_rate=adaptive, solver=adam;, score=0.847 total time= 4.5min
[CV 2/2] END activation=tanh, hidden_layer_sizes=250, learning_rate=adaptive, solver=adam;, score=0.850 total time= 5.0min
[CV 1/2] END activation=tanh, hidden_layer_sizes=300, learning_rate=adaptive, solver=adam;, score=0.849 total time= 8.4min
[CV 2/2] END activation=tanh, hidden_layer_sizes=300, learning_rate=adaptive, so

In [24]:
mlp_model = grid_search_mlp.best_estimator_
y_pred_mlp = mlp_model.predict(X_test)

print("Aktual   : ", np.array(y_pred_mlp[:10]))
print("Prediksi : ", y_pred_mlp[:10])

mlp_accuracy = accuracy_score(y_test, y_pred_mlp) * 100
print(f"\nAkurasi Model MLP : {mlp_accuracy:.2f}")

print("\n", classification_report(y_test, y_pred_mlp))

Aktual   :  ['negative' 'neutral' 'positive' 'neutral' 'positive' 'negative' 'negative' 'negative' 'negative' 'neutral']
Prediksi :  ['negative' 'neutral' 'positive' 'neutral' 'positive' 'negative' 'negative' 'negative' 'negative' 'neutral']

Akurasi Model MLP : 90.93

               precision    recall  f1-score   support

    negative       0.96      0.87      0.92      1415
     neutral       0.83      0.99      0.91      1408
    positive       0.95      0.86      0.91      1398

    accuracy                           0.91      4221
   macro avg       0.92      0.91      0.91      4221
weighted avg       0.92      0.91      0.91      4221



# Inference or Testing

In [26]:
def inferenceTfidf(text, model):
    with open('./tfidf/tfidfVectorizer.pkl', 'rb') as file:
        tfV = pickle.load(file)
        
    text_pre = preProcessing(text)
    x = tfV.transform([text_pre]).toarray()
    
    if model == 'svm':
        model_name = "Support Vector Machine"
        pred = svc_model.predict(x)
    elif model == 'xgb':
        model_name = "Extreme Gradient Boosting"
        pred = xgb_model.predict(x)
    elif model == 'mlp':
        model_name = "Multi Layer Perceptron"
        pred = mlp_model.predict(x)
    
    return print(f"Model      : {model_name}\nText       : \"{text}\"\nSentimeent : \033[1m{pred[0]}")

In [27]:
# Inference or Testing
inferenceTfidf("Update terbarunya asik dan juga banyak fitur baru", "mlp")

Model      : Multi Layer Perceptron
Text       : "Update terbarunya asik dan juga banyak fitur baru"
Sentimeent : [1mpositive
