# Library

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam
from keras.models import Sequential
from keras.callbacks import Callback
from keras import layers
from keras.regularizers import l2
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from pandas import read_csv

import pandas as pd
import numpy as np
np.set_printoptions(linewidth=np.inf)

# Dataset

In [None]:
# LOAD dataset
# Read the CSV, then discard rows containing missing values and rows that are
# exact duplicates of an earlier row.
dataset_path = './dataset_minecraft.csv'
df = read_csv(dataset_path).dropna().drop_duplicates()
print("\"{}\" is loaded succesfully".format(dataset_path))

# Expose the text column and the sentiment column as NumPy arrays.
dataset = df['Text Clean'].to_numpy()
label = df['Sentiment'].to_numpy()

# Feature Extraction

### Dataset

In [None]:
# MAX WORD LENGTH
# Number of whitespace-separated tokens in every text; the largest count is
# the length all sequences are padded to further down.
word_len = [len(text.split()) for text in dataset]
max_word_length = max(word_len)
print("Max Word Length :\033[1m", max_word_length)
####################################################################################

# UNIQUE WORDS
def counter_word(texts):
    """Count how often each whitespace-separated token occurs in ``texts``.

    Args:
        texts: Object exposing ``.values`` (e.g. a pandas Series) whose
            elements are strings.

    Returns:
        A ``collections.Counter`` mapping each token to its number of
        occurrences across all texts.
    """
    count = Counter()
    for text in texts.values:
        # Counter.update adds one per occurrence for every token in the list.
        count.update(text.split())
    return count

# Vocabulary size = number of distinct whitespace-separated tokens.
counter = counter_word(df['Text Clean'])
num_unique_words = len(counter)  # author's note: 14000
print("\nUnique words :\033[1m", num_unique_words)
####################################################################################

# TOKENIZING
# Fit the Keras tokenizer on the raw texts.
# NOTE(review): Keras' Tokenizer only keeps word indices below `num_words`,
# so the rarest word may be dropped from the encoded sequences - confirm
# whether `num_unique_words + 1` (with a matching Embedding input_dim) is wanted.
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(dataset)
####################################################################################

# FEATURE ENCODER
# word -> integer id mapping learned by the tokenizer
word_index = tokenizer.word_index
# every text becomes a list of integer ids
dataset_seq = tokenizer.texts_to_sequences(dataset)
print("\n===Feature Encoder Test=== ")
print("Text Original : ", dataset[6])
print("Text Encoded  : ", dataset_seq[6])
####################################################################################

# FEATURE DECODER
# Inverted dictionary: integer id -> word
reverse_word_index = {idx: word for word, idx in word_index.items()}
def decode(sequence):
    """Turn a sequence of token ids back into a space-separated string.

    Ids that are missing from ``reverse_word_index`` are rendered as "?".
    """
    words = (reverse_word_index.get(token_id, "?") for token_id in sequence)
    return " ".join(words)
# Round-trip check: decode the encoded form of sample 6 back to text.
sample_seq = dataset_seq[6]
decoded_text = decode(sample_seq)
print("\n===Feature Decoder Test=== ")
print("Text Sequence : ", sample_seq)
print("Text Decoded  : ", decoded_text)
####################################################################################

# Padding
# Bring every sequence to max_word_length; zeros are inserted (and, if needed,
# tokens removed) at the front of the sequence.
dataset_padded = pad_sequences(
    dataset_seq,
    maxlen=max_word_length,
    padding="pre",
    truncating="pre",
)
print("\nPadding Test : ")
print("Text Ori   : ", dataset[6])
print("Text Token : ", dataset_seq[6])
print("Token Pad  : ", dataset_padded[6])
####################################################################################

### Oversampling

In [None]:
# Oversample the under-represented sentiment classes with SMOTE.
# The seed makes the synthetic rows reproducible between runs; without it the
# seeded train/test split further down still sees different data every run.
# NOTE(review): SMOTE interpolates between rows of padded token ids, so the
# synthetic rows are numeric blends of ids rather than real sentences - confirm
# this is intended.
# NOTE(review): oversampling happens before the train/test split, so synthetic
# rows can end up in the evaluation set - consider resampling the training
# portion only.
smote = SMOTE(random_state=42)
x_over, y_over = smote.fit_resample(dataset_padded, label)

In [None]:
# BEFORE OVERSAMPLING
# Rows per sentiment class in the cleaned dataframe.
df['Sentiment'].value_counts()

In [None]:
# AFTER OVERSAMPLING
# Pair every resampled feature row with its label, then count rows per class.
resampled_pairs = list(zip(x_over, y_over))
new_df = pd.DataFrame(resampled_pairs, columns=['features', 'label'])
new_df['label'].value_counts()

### Label

In [None]:
# LABEL ENCODE
# One-hot encode the oversampled sentiment labels. The encoder expects a 2-D
# column, hence the reshape to (-1, 1).
encoder = OneHotEncoder(sparse_output=False)
label_one_hot = encoder.fit_transform(y_over.reshape(-1, 1))
print("\n===Label Encoder Test=== ")
print(encoder.get_feature_names_out())
# Print the label that produced row 0 of the encoding. Reading it from y_over
# (the array that was encoded) avoids the label-based lookup df['Sentiment'][0],
# which raises KeyError when index label 0 was removed by dropna().
print("Onehot \"{}\" : {}".format(y_over[0], label_one_hot[0]))

### Splitting

In [98]:
# Show the feature/label matrix shapes, then hold out 20% of the rows for
# evaluation using a fixed seed.
print(x_over.shape, label_one_hot.shape)
X_train, X_test, y_train, y_test = train_test_split(
    x_over,
    label_one_hot,
    test_size=0.2,
    random_state=42,
)

(21102, 88) (21102, 3)


# Model

In [None]:
# DEFINE CALLBACK
class myCallback(Callback):
    """Stop training once train and validation accuracy both reach 92%."""

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch metrics and request a stop when both hit 0.92.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Dict of metric values for the epoch; may be None.
        """
        logs = logs or {}
        accuracy = logs.get('accuracy')
        val_accuracy = logs.get('val_accuracy')
        # A metric can be absent (e.g. no validation data was supplied);
        # comparing None with a float would raise TypeError.
        if accuracy is None or val_accuracy is None:
            return
        if accuracy >= 0.92 and val_accuracy >= 0.92:
            # Keras attaches the model being trained as `self.model`; setting
            # its `stop_training` flag ends fit() after this epoch.
            self.model.stop_training = True
            print('\nModel telah mencapai akurasi 92%')

In [None]:
# Feature Extraction
# Embedding layer: one 88-dimensional vector per token id.
wordEmbedding = layers.Embedding(
    input_dim=num_unique_words,
    output_dim=88,
    input_length=max_word_length,
)

# Classifier: embedding -> single bidirectional LSTM -> 3-way softmax.
# Disabled alternatives that used to be listed here: a stacked BiLSTM
# (88 -> 64 -> 32 -> 16 units, same dropout/L2 settings) and a Dense head of
# 64/32/16 ReLU units with Dropout 0.5/0.5/0.8.
lstm = Sequential()
lstm.add(wordEmbedding)
lstm.add(
    layers.Bidirectional(
        layers.LSTM(88, dropout=0.2, kernel_regularizer=l2(0.01))
    )
)
lstm.add(layers.Dense(3, activation='softmax'))

lstm.summary()

In [None]:
# Compile with categorical cross-entropy (the labels are one-hot vectors) and
# the Adam optimizer, tracking accuracy.
lstm.compile(loss=categorical_crossentropy,
             optimizer=Adam(learning_rate=0.00001),
             metrics=['accuracy'])

# Train for up to 500 epochs; myCallback ends training early once its accuracy
# threshold is met. Keras documents `callbacks` as a list of Callback
# instances, so the single callback is wrapped in a list.
history_lstm = lstm.fit(X_train, y_train,
                        validation_data=(X_test, y_test),
                        batch_size=64,
                        epochs=500,
                        callbacks=[myCallback()])