# Import Libraries

In [6]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.regularizers import l2
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam
from keras.models import Sequential
from keras.callbacks import Callback
from keras import layers

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from collections import Counter
from pandas import read_csv

import numpy as np
import pandas as pd

# Data Preprocessing

In [7]:
# Load the dataset CSV, then preview the first rows and the per-class counts
df = pd.read_csv("dataset_pubg.csv")
print("Dataframe :\n", df.head())
print("\nLabel Spec :\n", df["Sentiment"].value_counts())

# Separate the text feature from its sentiment label (both as NumPy arrays)
label = df["Sentiment"].to_numpy()
dataset = df["Text Clean"].to_numpy()

# Build a word-frequency table for the corpus
def counter_word(texts):
    """Count how often each whitespace-separated word occurs across all texts.

    Args:
        texts: Object exposing a ``.values`` iterable of strings (this notebook
            passes a pandas DataFrame column).

    Returns:
        collections.Counter mapping each word to its number of occurrences.
    """
    return Counter(word for text in texts.values for word in text.split())

# Vocabulary size = number of distinct whitespace-separated words in the corpus
counter = counter_word(df["Text Clean"])
num_unique_words = len(counter)     # 20993 on the recorded run
print("\nUnique words : ", num_unique_words)

# Tokenizing
# Fit the tokenizer on the raw texts, capped at the vocabulary size counted above.
# NOTE(review): Keras' Tokenizer presumably keeps only word ids < num_words, so the
# rarest word may be dropped from the sequences -- confirm against the Tokenizer docs.
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(dataset)

# word -> integer id lookup learned by the tokenizer
word_index = tokenizer.word_index

# Convert every text into its list of integer word ids
dataset_seq = tokenizer.texts_to_sequences(dataset)

# Sanity check: show one sample before and after encoding
print("\nEncoder Test : ")
print("Text Original : ", dataset[6])
print("Text Encoded  : ", dataset_seq[6])

# Padding
# Pad (and truncate) at the end of each sequence so every sample has the same length
max_word_length = 120
dataset_padded = pad_sequences(
    dataset_seq,
    maxlen=max_word_length,
    padding="post",
    truncating="post",
)

# Sanity check: the same sample as raw text, as token ids, and after padding
print("Padding Test : ")
print("Text Ori   : ", dataset[10])
print("Text Token : ", dataset_seq[10])
print("Token Pad  : ", dataset_padded[10])

# Decoder
# Invert the tokenizer vocabulary so integer ids map back to their words
reverse_word_index = {idx: word for word, idx in word_index.items()}

def decode(sequence):
    """Turn a sequence of word ids back into a space-separated string.

    Ids that are not present in ``reverse_word_index`` are rendered as "?".
    """
    words = (reverse_word_index.get(idx, "?") for idx in sequence)
    return " ".join(words)

# Sanity check: decode one tokenized sample back to text
decoded_text = decode(dataset_seq[10])
print("\nDecoder Test : ")
print("Text Sequence : ", dataset_seq[10])
print("Text Decoded  : ", decoded_text)

# One-hot encode the sentiment labels; the encoder expects a 2-D column, hence the new axis
encoder = OneHotEncoder(sparse_output=False)
label_one_hot = encoder.fit_transform(label[:, np.newaxis])

# Data Split
# Hold out 10% of the samples; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    dataset_padded,
    label_one_hot,
    random_state=42,
    test_size=0.1,
)

Dataframe :
                                           Text Clean Sentiment
0  kecewa capek capek bunuh musuh headshot bertur...  negative
1  plisss pembaruan download ulang seharian downl...  negative
2  taii update jam ehh ngeleg musuh didepan mati ...  negative
3  bagus sayang gk hp ram rendah tolong update ku...  negative
4  login pasword email sinyalnya bagus sudahnya p...  negative

Label Spec :
 Sentiment
negative    3418
positive    3418
neutral     3418
Name: count, dtype: int64

Unique words :  20993

Encoder Test : 
Text Original :  pubg mobile aplikasi bagus menyukai aplikasi iniakan main tolong hilangkan bugkarna bug menganggu tolong tencent games
Text Encoded  :  [5, 31, 97, 7, 618, 97, 7011, 6, 2, 383, 7012, 4, 1023, 2, 10, 105]
Padding Test : 
Text Ori   :  dear tencent mohon menyesuaikan level pemain game level rendah match bertemu player level profesional mohon level sesuaikan match
Text Token :  [280, 10, 13, 1157, 251, 54, 1, 251, 250, 132, 491, 12, 251, 214, 13, 25

# Modelling

### Define Callback

In [8]:
class myCallback(Callback):
    """Stop training once training and validation accuracy both reach 92%.

    The stop is requested by setting ``stop_training`` on ``self.model``, the
    model Keras attaches to the callback. The previous code set it on
    ``self.ltm``, an attribute that is never defined, so the callback raised
    ``AttributeError`` at the exact moment the target was reached.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch metrics and request an early stop when the target is met.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric dictionary for the epoch; expected to contain
                'accuracy' and 'val_accuracy'. May be None.
        """
        # Avoid a shared mutable default and tolerate logs=None
        logs = logs or {}
        train_acc = logs.get('accuracy')
        val_acc = logs.get('val_accuracy')

        # A missing metric cannot satisfy the threshold; comparing None with a
        # float would raise TypeError, so just keep training.
        if train_acc is None or val_acc is None:
            return

        if train_acc >= 0.92 and val_acc >= 0.92:
            self.model.stop_training = True
            print('\nModel telah mencapai akurasi 92%')

### Skema 2 : LSTM

In [9]:
# Feature Extraction
# Embedding: one 120-dimensional vector per word id, for inputs of max_word_length tokens
wordEmbedding = layers.Embedding(
    input_dim=num_unique_words,
    output_dim=120,
    input_length=max_word_length,
)

# Stacked bidirectional LSTM classifier.
# Every BiLSTM uses dropout 0.2 and L2(0.01) kernel regularisation; all but the
# last return the full sequence so the next recurrent layer can consume it.
# (A second 120-unit BiLSTM and a Dense(32) + Dropout(0.5) stage were tried
# earlier and are currently disabled.)
lstm = Sequential(
    [wordEmbedding]
    + [
        layers.Bidirectional(
            layers.LSTM(
                units,
                return_sequences=keep_sequence,
                dropout=0.2,
                kernel_regularizer=l2(0.01),
            )
        )
        for units, keep_sequence in [(120, True), (64, True), (32, True), (16, False)]
    ]
    + [
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(16, activation='relu'),
        layers.Dropout(0.8),
        # Three output units, one per sentiment class
        layers.Dense(3, activation='softmax'),
    ]
)

lstm.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 120, 120)          2519160   
                                                                 
 bidirectional_4 (Bidirecti  (None, 120, 240)          231360    
 onal)                                                           
                                                                 
 bidirectional_5 (Bidirecti  (None, 120, 128)          156160    
 onal)                                                           
                                                                 
 bidirectional_6 (Bidirecti  (None, 120, 64)           41216     
 onal)                                                           
                                                                 
 bidirectional_7 (Bidirecti  (None, 32)                10368     
 onal)                                                

In [10]:
# Compile with categorical cross-entropy (the labels are one-hot encoded)
# and Adam with a very small learning rate.
lstm.compile(
    loss=categorical_crossentropy,
    optimizer=Adam(learning_rate=0.00001),
    metrics=['accuracy'],
)

# Train for up to 500 epochs; myCallback requests an early stop once both
# accuracies reach its threshold. `callbacks` is documented as a list of
# Callback instances, so the callback is wrapped in a list rather than passed bare.
# NOTE(review): the held-out test split is also used as validation data here,
# so it influences when training stops -- consider a separate validation split.
history_lstm = lstm.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    batch_size=8,
    epochs=500,
    callbacks=[myCallback()],
)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
   1/1154 [..............................] - ETA: 48s - loss: 1.0964 - accuracy: 0.5000

KeyboardInterrupt: 