# Library

In [7]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam
from keras.models import Sequential
from keras.callbacks import Callback
from keras import layers
from keras.regularizers import l2
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from collections import Counter
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from pandas import read_csv

import numpy as np
import pandas as pd

In [4]:
# LOAD dataset
df = read_csv('./dataset_minecraft.csv')
df = df.dropna()
print(df['Sentiment'].value_counts())
df.head(1)

Sentiment
positive    7159
negative    6219
neutral     1618
Name: count, dtype: int64


Unnamed: 0,Text Clean,Sentiment
0,gameplay bagus bug controlernya controler kere...,negative


In [30]:
dataset = df['Text Clean'].apply(lambda x: [x])
label = df['Sentiment']

In [12]:
label.value_counts()

Sentiment
positive    7159
negative    6219
neutral     1618
Name: count, dtype: int64

In [35]:
label

0        negative
1        negative
2        positive
3        negative
4        negative
           ...   
14995    positive
14996    positive
14997    positive
14998    negative
14999    negative
Name: Sentiment, Length: 14996, dtype: object

### SMOTE

# Model Traning

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

x = df['Text Clean']
y = df['Sentiment']

tfidf = TfidfVectorizer()
x_tfidf = tfidf.fit_transform(x)

# features_df = pd.DataFrame(x_tfidf.toarray(), columns=tfidf.get_feature_names_out())
# features_df

X_train, X_test, y_train, y_test = train_test_split(x_tfidf, y, test_size=0.2, random_state=42)

In [38]:
smote = SMOTE(sampling_strategy='minority')
x_over, y_over = smote.fit_resample(x_tfidf, y)

In [39]:
new_df = pd.DataFrame(list(zip(x_over, y_over)), columns=['features', 'label'])
new_df['label'].value_counts()

label
positive    7159
neutral     7159
negative    6219
Name: count, dtype: int64

# Feature Extraction

In [40]:
# Pisahkan atribut dengan label
dataset = df['Text Clean'].to_numpy()
label = df.Sentiment.to_numpy()
####################################################################################

# UNIQUE WORDS
def counter_word(texts):
    count = Counter()
    for text in texts.values:
        for word in text.split():
            count[word] += 1
    return count

counter = counter_word(df['Text Clean'])
num_unique_words = len(counter) # 14000
print("Unique words : ", num_unique_words)
####################################################################################

# TOKENIZING
# tokenizing dataset
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(dataset)
print(tokenizer.word_index)
####################################################################################

# FEATURE ENCODER
# dictionary for word
word_index = tokenizer.word_index
# turn dataset to sequence
dataset_seq = tokenizer.texts_to_sequences(dataset)
print("\n===Feature Encoder Test=== ")
print("Text Original : ", dataset[6])
print("Text Encoded  : ", dataset_seq[6])
####################################################################################

# FEATURE DECODER
# Flip word dictionary (key, value)
reverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])
def decode(sequence):
    return " ".join([reverse_word_index.get(idx, "?") for idx in sequence])
decoded_text = decode(dataset_seq[6])
print("\n===Feature Decoder Test=== ")
print("Text Sequence : ", dataset_seq[6])
print("Text Decoded  : ", decoded_text)
####################################################################################

# LABEL ENCODE
encoder = OneHotEncoder(sparse_output=False)
label_one_hot = encoder.fit_transform(label.reshape(-1, 1))
print("\n===Label Encoder Test=== ")
print(label_one_hot[0])
print("\n===Label Decoder Test=== ")
print(encoder.get_feature_names_out())
####################################################################################

# WORDS EMBEDDING
word2vec = Word2Vec(
                vector_size=100, 
                window=5,           
                min_count=1,        # minimal words to process
                workers=12          # NUM of CPU threads that are gonna to use
)
word2vec.build_vocab(review, progress_per=1000)
word2vec.train(review, total_examples=word2vec.corpus_count, epochs=word2vec.epochs)
word2vec.save('./word2vec-minecraft.model')
print("\n===Word Emdedding Test=== ")
word2vec.wv.most_similar("bagus")
# word2vec.wv

Unique words :  14000

===Feature Encoder Test=== 
Text Original :  game nya bagus serutapi freeze kamera detik menggangu survivalsaya bikin lag frame drop freeze kamera baik cepat baik
Text Encoded  :  [1, 2, 3, 1548, 132, 466, 143, 347, 4905, 39, 42, 83, 78, 132, 466, 6, 133, 6]

===Feature Decoder Test=== 
Text Sequence :  [1, 2, 3, 1548, 132, 466, 143, 347, 4905, 39, 42, 83, 78, 132, 466, 6, 133, 6]
Text Decoded  :  game nya bagus serutapi freeze kamera detik menggangu survivalsaya bikin lag frame drop freeze kamera baik cepat baik

===Label Encoder Test=== 
[1. 0. 0.]

===Label Decoder Test=== 
['x0_negative' 'x0_neutral' 'x0_positive']

===Word Emdedding Test=== 


[('cobain', 0.970811665058136),
 ('gamenya', 0.9653555154800415),
 ('kotakkotak', 0.958956778049469),
 ('sumpah', 0.9573140740394592),
 ('nya', 0.9573035836219788),
 ('mes', 0.9571770429611206),
 ('grafik', 0.9548986554145813),
 ('geme', 0.9542639255523682),
 ('banget', 0.9531177878379822),
 ('pixel', 0.9522908329963684)]

In [57]:
x = word2vec.wv
y = label_one_hot
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
 
# Membuat objek model Naive Bayes (Bernoulli Naive Bayes)
naive_bayes = BernoulliNB()
 
# Melatih model Naive Bayes pada data pelatihan
naive_bayes.fit(X_train.toarray(), y_train)
 
# Prediksi sentimen pada data pelatihan dan data uji
y_pred_train_nb = naive_bayes.predict(X_train.toarray())
y_pred_test_nb = naive_bayes.predict(X_test.toarray())
 
# Evaluasi akurasi model Naive Bayes
accuracy_train_nb = accuracy_score(y_pred_train_nb, y_train)
accuracy_test_nb = accuracy_score(y_pred_test_nb, y_test)
 
# Menampilkan akurasi
print('Naive Bayes - accuracy_train:', accuracy_train_nb)
print('Naive Bayes - accuracy_test:', accuracy_test_nb)

ValueError: Found input variables with inconsistent numbers of samples: [13657, 14996]