# Import Library

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam
from keras import models
from keras import layers

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from collections import Counter
from pandas import read_csv

import numpy as np

2024-07-25 15:58:16.668071: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Data Preprocessing

### loading dataset

In [2]:
# Read the review dataset, then show its dimensions and the class balance
# of the sentiment column before previewing the first rows.
df = read_csv("dataset_pubg.csv")
print(f"{df.shape}\n")
print(df.Sentiment.value_counts())
df.head()

(15000, 3)

Sentiment
negative    8119
positive    5868
neutral     1013
Name: count, dtype: int64


Unnamed: 0,Text Original,Text Clean,Sentiment
0,Tidak suka bermain event atau gameplay mode te...,suka bermain event gameplay mode tema menguran...,positive
1,Update terbaru update yang paling kacau selama...,update terbaru update kacau main pubg pengatur...,negative
2,Saya heran kenapa orang2 pada punya masalah ky...,heran orang frame drop render ngebug sinyal ng...,negative
3,"gamenya bagus banget, tapi saya punya sedikit ...",gamenya bagus banget keluhan gamenya berat mai...,negative
4,"Setelah update, grafik menuru walaupun sudah d...",update grafik menuru setingan blur frame rate ...,neutral


In [3]:
# Extract the cleaned review text (model input) and the sentiment column
# (model target) from the DataFrame as NumPy arrays, then show both.
dataset, label = df['Text Clean'].to_numpy(), df['Sentiment'].to_numpy()
print(dataset, label, sep="\n")

['suka bermain event gameplay mode tema mengurangi pengalaman bermain kompetitif suka visi misi pubg realistis banding game fps event bertema menambah kapasitas internal penuh berpengaruh pengalaman bermain bug frame rate turun event rumah coba fungsinya peperangan event tema'
 'update terbaru update kacau main pubg pengaturan tombol pulak tulisan tombol layar save nyaman main pubg penyakit game pubg lag frame ganggu ganggu diperbaiki player menikmati permainan adil'
 'heran orang frame drop render ngebug sinyal ngelag main pubg mobile gitu sinyal frame drop okelah game online pas hp kentang emang frame drop parah sinyal ngelag pas upgrade hp begituan ngeluh pubg mobile buruk optimalisasinya fix hp tuh kentang'
 ...
 'oppo nglekbisa perbaiki berat gamex inimaaf tambahansetelah kali dimain gk masuk lagu tulisan eror jaringan pdahal jaringan full tolong'
 'update gameplay ngefreeze mobile emulator tolong optimalisasi'
 'bot tolong nerf ngeselin banget sekarat bot nge heal player langsung

### Count unique words

In [4]:
# count unique words
def counter_word(texts):
    """Count occurrences of every whitespace-separated word in ``texts``.

    Parameters
    ----------
    texts : iterable of str
        Any iterable that yields strings: a pandas Series, a NumPy array,
        a list, a generator, ...

    Returns
    -------
    collections.Counter
        Maps each word to the number of times it occurs across all texts.
        ``len()`` of the result is the number of distinct words.

    Raises
    ------
    AttributeError
        If an element has no ``split`` method (e.g. a missing value).
    """
    count = Counter()
    # Iterate the container directly rather than through ``.values`` so that
    # plain Python iterables are accepted as well as pandas objects.
    for text in texts:
        count.update(text.split())
    return count

# Word-frequency table over the cleaned text column; the number of keys is
# the vocabulary size used further down the notebook.
counter = counter_word(df.loc[:, 'Text Clean'])
num_unique_words = len(counter.keys())
num_unique_words
# (counter.most_common(5) lists the five most frequent words)

28316

### Tokenizing

In [5]:
# Build the tokenizer on the cleaned reviews.
# NOTE(review): per the Keras docs, `num_words` keeps only word indices
# strictly below that value, so the rarest word may be dropped when encoding.
# Confirm this is acceptable.
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(dataset)

# integer-encode every review
dataset_seq = tokenizer.texts_to_sequences(dataset)

# word -> integer index lookup learned during fitting
word_index = tokenizer.word_index

### Padding

In [6]:
# Every encoded review is brought to exactly `max_word_length` tokens:
# shorter ones get trailing padding, longer ones lose their tail.
max_word_length = 80
dataset_padded = pad_sequences(
    dataset_seq,
    maxlen=max_word_length,
    padding="post",
    truncating="post",
)

In [7]:
# Show one review at each preprocessing stage: raw text, token ids, padded ids.
print(f"Text Ori   :  {dataset[10]}")
print(f"Text Token :  {dataset_seq[10]}")
print(f"Token Pad  :  {dataset_padded[10]}")

Text Ori   :  bermain event kendala bug loting bug bug tolong perbaiki kendala update bug senjata reload backpack senjata loot suka bug berulang ulang
Text Token :  [17, 30, 167, 2, 3045, 2, 2, 3, 8, 167, 4, 2, 41, 480, 3732, 41, 540, 24, 2, 492, 86]
Token Pad  :  [  17   30  167    2 3045    2    2    3    8  167    4    2   41  480
 3732   41  540   24    2  492   86    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0]


### Decoder

In [8]:
# Invert the tokenizer vocabulary: integer index -> word
reverse_word_index = {idx: word for word, idx in word_index.items()}

def decode(sequence):
    """Turn a sequence of token ids back into a space-separated string.

    Ids that are missing from ``reverse_word_index`` are rendered as "?".
    """
    words = (reverse_word_index.get(idx, "?") for idx in sequence)
    return " ".join(words)

# Round-trip check: decode one encoded review and show it next to its ids.
decoded_text = decode(dataset_seq[10])
print(f"Text Sequence :  {dataset_seq[10]}")
print(f"Text Decoded  :  {decoded_text}")

Text Sequence :  [17, 30, 167, 2, 3045, 2, 2, 3, 8, 167, 4, 2, 41, 480, 3732, 41, 540, 24, 2, 492, 86]
Text Decoded  :  bermain event kendala bug loting bug bug tolong perbaiki kendala update bug senjata reload backpack senjata loot suka bug berulang ulang


### Data Split

In [9]:
# The model ends in a 3-unit softmax trained with categorical cross-entropy,
# which needs one-hot targets; `label` holds raw strings, so encode it first.
# The fitted encoder is kept so predictions can be mapped back to class names
# through `label_encoder.categories_`.
label_encoder = OneHotEncoder()
label_onehot = label_encoder.fit_transform(label.reshape(-1, 1)).toarray()

# Stratify on the original string labels so both partitions keep the same
# class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_padded,
    label_onehot,
    test_size=0.2,
    stratify=label,
    random_state=42,
)

# Modelling

### Feature Extraction

In [None]:
# Stand-alone embedding layer: one 32-dimensional vector per vocabulary index.
wordEmbedding = layers.Embedding(
    input_dim=num_unique_words,
    output_dim=32,
    input_length=max_word_length,
)

### Skema 2 : LSTM

In [None]:
# Sentiment classifier: token ids -> embeddings -> LSTM -> 3-way softmax.
lstm = models.Sequential()

# Declare the input shape with an explicit Input layer instead of the
# deprecated `input_length` argument of Embedding.
lstm.add(layers.Input(shape=(max_word_length,)))

# Sequences are post-padded with 0, so mask index 0: without the mask the
# LSTM keeps stepping over the padding and its final state is computed after
# a long run of pad tokens rather than at the last real word.
lstm.add(layers.Embedding(num_unique_words, 32, mask_zero=True))

lstm.add(layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# One output unit per sentiment class.
lstm.add(layers.Dense(3, activation='softmax'))

# lstm.summary()

In [None]:
# Training configuration: Adam optimiser, categorical cross-entropy
# objective, accuracy reported as the metric.
opt = Adam(learning_rate=0.001)
loss = categorical_crossentropy
lstm.compile(
    optimizer=opt,
    loss=loss,
    metrics=['accuracy'],
)

In [None]:
# Train on the training partition for five epochs and keep the returned
# history object.
history_lstm = lstm.fit(
    x=X_train,
    y=y_train,
    epochs=5,
)