# Import Library

In [None]:
import os
# Set TensorFlow's native log-level variable before TensorFlow is imported.
# NOTE(review): '3' is presumably meant to silence INFO/WARNING/ERROR output
# from the C++ backend — confirm against the TensorFlow version in use.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam
from keras.models import Sequential
from keras.callbacks import Callback
from keras import layers

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from collections import Counter
from pandas import read_csv

import numpy as np
import pandas as pd

# Data Preprocessing

### loading dataset

In [None]:
# Read the CSV file and print how many rows each sentiment class has.
df = pd.read_csv("dataset_pubg.csv")
# print(df.shape, end="\n\n")
print(df["Sentiment"].value_counts())
df.head()

# Separate the input texts from their labels
dataset = df["Text Clean"].to_numpy()
label = df["Sentiment"].to_numpy()

### Count unique words

In [None]:
# count unique words
def counter_word(texts):
    """Count how often each whitespace-separated word occurs in ``texts``.

    Args:
        texts: Iterable of strings, e.g. a pandas Series or a plain list.

    Returns:
        A ``collections.Counter`` mapping each word to its occurrence count.
    """
    # Iterate the container directly rather than through ``.values`` so that
    # lists, tuples and generators are accepted as well as pandas Series.
    return Counter(word for text in texts for word in text.split())

# Build the word-frequency table for the cleaned text column; the number of
# distinct words found is kept as the vocabulary size.
counter = counter_word(df['Text Clean'])
num_unique_words = len(counter)
# Bare expression: displays the value when this is the last line of a notebook cell.
num_unique_words
# counter.most_common(5)

### Tokenizing

In [None]:
# tokenizing dataset
# NOTE(review): Keras documents that Tokenizer keeps only the `num_words - 1`
# most frequent words, so passing the raw unique-word count may drop the
# rarest word from the generated sequences — confirm whether `+ 1` is wanted
# (the Embedding `input_dim` would then have to grow by one as well).
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(dataset)

# word -> integer index mapping learned by the tokenizer
word_index = tokenizer.word_index

# convert every text into its sequence of word indices
dataset_seq = tokenizer.texts_to_sequences(dataset)

### Padding

In [None]:
# Every sequence is padded or truncated at its end to exactly this many tokens.
max_word_length = 120
dataset_padded = pad_sequences(
    dataset_seq,
    maxlen=max_word_length,
    padding="post",
    truncating="post",
)

In [None]:
# Show the sample at index 10 at each preprocessing stage:
# original text, token indices, and padded token indices.
print("Text Ori   : ", dataset[10])
print("Text Token : ", dataset_seq[10])
print("Token Pad  : ", dataset_padded[10])

### Decoder

In [None]:
# Invert the tokenizer vocabulary so that indices map back to words
reverse_word_index = {idx: word for word, idx in word_index.items()}

def decode(sequence):
    """Turn a sequence of word indices back into a space-separated string.

    Indices that are missing from ``reverse_word_index`` are rendered as "?".
    """
    words = (reverse_word_index.get(idx, "?") for idx in sequence)
    return " ".join(words)

# Round-trip the sample at index 10 to check the mapping
decoded_text = decode(dataset_seq[10])
print("Text Sequence : ", dataset_seq[10])
print("Text Decoded  : ", decoded_text)

In [None]:
# One-hot encode the labels; they are reshaped into a single column first
encoder = OneHotEncoder(sparse_output=False)
label_one_hot = encoder.fit_transform(np.reshape(label, (-1, 1)))

### Data Split

In [None]:
# Hold out 10% of the samples; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    dataset_padded,
    label_one_hot,
    test_size=0.1,
    random_state=42,
)

# Modelling

### define callback

In [None]:
class myCallback(Callback):
    """Stop training once training and validation accuracy both reach 92%."""

    def on_epoch_end(self, epoch, logs=None):
        """Check the finished epoch's metrics and request a stop at the target.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric dictionary supplied by Keras; may be ``None``.
        """
        logs = logs or {}
        accuracy = logs.get('accuracy')
        val_accuracy = logs.get('val_accuracy')

        # Either metric can be absent (e.g. training without validation data);
        # the threshold cannot be evaluated then, so keep training.
        if accuracy is None or val_accuracy is None:
            return

        if accuracy >= 0.92 and val_accuracy >= 0.92:
            # Keras exposes the model being trained as ``self.model``; the
            # former ``self.ltm`` attribute does not exist and raised
            # AttributeError exactly when the target was reached.
            self.model.stop_training = True
            print('\nModel telah mencapai akurasi 92%')

### Skema 2 : LSTM

In [None]:
# Embedding layer: maps each token index to a 120-dimensional vector
wordEmbedding = layers.Embedding(
    input_dim=num_unique_words,
    output_dim=120,
    input_length=max_word_length,
)

# Four stacked bidirectional LSTMs (the first three return full sequences),
# followed by a dropout-regularised dense head ending in a 3-way softmax.
lstm = Sequential([
    wordEmbedding,
    *(
        layers.Bidirectional(
            layers.LSTM(
                units,
                return_sequences=True,
                dropout=0.2,
                kernel_regularizer=l2(0.01),
            )
        )
        for units in (120, 64, 32)
    ),
    layers.Bidirectional(
        layers.LSTM(16, dropout=0.2, kernel_regularizer=l2(0.01))
    ),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(16, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(3, activation='softmax'),
])

lstm.summary()

In [None]:
# Configure training: Adam with a small step size, categorical cross-entropy
# on the one-hot labels, and accuracy as the reported metric.
lstm.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss=categorical_crossentropy,
    metrics=['accuracy'],
)

In [None]:
# Train for up to 500 epochs; X_test/y_test are used as the validation data
# and myCallback is consulted at the end of every epoch.
history_lstm = lstm.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=64,
    epochs=500,
    # Model.fit documents `callbacks` as a list of Callback instances.
    callbacks=[myCallback()],
)