# LSTM Challange Platinum Binar

In [1]:
#Library
import numpy as np
import pandas as pd
import re 

from sklearn.model_selection import train_test_split #split data train vs test

from tensorflow.keras.preprocessing.text import Tokenizer #token generator
from tensorflow.keras.utils import pad_sequences,to_categorical #penyaman panjang array

from imblearn.over_sampling import SMOTE #kalau kondisi datanya overfit

from tensorflow.keras.models import Sequential #cara kerja ml secara berurut(serial)
from tensorflow.keras.layers import Embedding, Dense, LSTM #layer pada fungsi ML
from tensorflow.keras.callbacks import EarlyStopping #untuk stop ML kalo udah gk nemu titik loss lebih rendah
from sklearn.metrics import precision_recall_fscore_support, accuracy_score #hasil kemampuan machine learning

from sklearn.preprocessing import LabelEncoder #transalate label menjadi indeks (untuk kasus lebih dari 2 variabel)
from tensorflow.keras.utils import to_categorical #Membuat array 1 dimensi(dari hasil label encoder) jadi array 2 dimensi

import tensorflow as tf #ini buat GPU TF

In [2]:
#Penggunaan GPU untuk TF
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [3]:
#Data Frame
df = pd.read_csv('train_data.csv')
df.columns = ['kalimat','HS']
print(df)

                                                kalimat        HS
0     tempat yang nyaman untuk berkumpul dengan tema...  positive
1     memang banyak bacot sih , omongan doang gede b...  negative
2     buat yang berkunjung ke bandung , yang ingin m...  positive
3     restoran menyajikan makanan khas sunda yang en...  positive
4     kalau travelling ke bandung , wajib makan bata...  positive
...                                                 ...       ...
9895  warung nasi ampera memiliki konsep rumah makan...  positive
9896  mbak della sangat baik dan ramah , makanna nya...  positive
9897  suasana nya sangat romantis jika makan malam d...  positive
9898  masyarakat tidak kecewa jika dipimpin oleh jok...  positive
9899  mau itu pak ridwan kamil atau pak dedi mulyadi...  positive

[9900 rows x 2 columns]


In [4]:
#Fungsi cleansing data 
def preprocessing_text(text):
    text = text.strip()
    text = text.lower()
    text = text.replace("\\n"," ")
    text = re.sub(r"(\s)(\1+)",r"\1",text)
    text = text.replace("rt","")
    text = text.replace("user ","")
    text = text.replace(" user","")
    text = re.sub(r"([a-z])(\1{3,})",r"\1\1",text)
    text = re.sub(r"(\\x)([a-z0-9]{2})",r"",text)
    text = text.replace("\\x8","")
    text = text.strip()
    
    return text

In [5]:
#Cleansing Data Tweet
df['cleaned_kalimat'] = df.kalimat.apply(lambda x:preprocessing_text(x))

In [6]:
#Split Data Training vs Data Validation
X_train, X_val, y_train, y_val = train_test_split(df['cleaned_kalimat'],df['HS'], random_state=0, test_size=0.2)

In [7]:
#Tokenizer buat mecah kalimat jadi kata - kata 
max_features = 10000
tknzr = Tokenizer(num_words=max_features,
                  filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                  split=' ', lower=True
                 )

In [8]:
print(X_train)

5281             banyak keluarga saya yang bekerja di bjb
2929    cebong memang cetek otak nya kelamaan berendem...
5167    anjirlah . saya tidak suka kamu karena menurut...
5255    hop hop the bubble drinks di mal bandung indah...
3078    saya muak dengan keputusan offside yang sebetu...
                              ...                        
9225    memang islam itu agama tidak bermoral cacat lonte
4859    setelah pulang kerja , saya bersama seorang te...
3264    sudirman said mengelaborasi sejumlah isu yang ...
9845    bermula saat rapat di bandung , pukul 09.00 di...
2732    fpi belum bubar ya . jika terus-terus buat rus...
Name: cleaned_kalimat, Length: 7920, dtype: object


In [9]:
#Tokenisasi kalimat 
tknzr.fit_on_texts(X_train) # Tokenisasi
X_train = tknzr.texts_to_sequences(X_train)# Proses token di translate jadi indeks
X_train = pad_sequences(X_train, maxlen=64)# Proses penyamanaan banyak indeks dalam suatu array
y_train = pd.get_dummies(y_train).values

X_val = tknzr.texts_to_sequences(X_val)
X_val = pad_sequences(X_val, maxlen=64)
y_val = pd.get_dummies(y_val).values

In [10]:
# save tokenizer
import pickle

with open('tknzr.pickle', 'wb') as handle:
    pickle.dump(tknzr, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("tknzr.pickle has created!")

tknzr.pickle has created!


In [11]:
# #Fixing Overfitting data
smote = SMOTE() #dipake untuk kalo label data jomplang biar disamain 
X_train, y_train = smote.fit_resample(X_train, y_train)

In [12]:
#Setting Model Machine Learning LSTM
max_nb_words = tknzr.num_words
embed_dim = 64

model = Sequential()
model.add(Embedding(max_nb_words,embed_dim,input_length=64))
model.add(LSTM(64))
model.add(Dense(32, activation='relu'))
model.add(Dense(3, activation='softmax'))

In [13]:
#Compile Model Machine Learning LSTM
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy']
             )
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 64, 64)            640000    
                                                                 
 lstm (LSTM)                 (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 32)                2080      
                                                                 
 dense_1 (Dense)             (None, 3)                 99        
                                                                 
Total params: 675,203
Trainable params: 675,203
Non-trainable params: 0
_________________________________________________________________
None


In [14]:
#Config toggle untuk menjalankan library to_categorical
tf.config.run_functions_eagerly(True) 

In [15]:
#Validasi Model
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)

history = model.fit(X_train, 
                    y_train, 
                    epochs=10, 
                    batch_size=64, 
                    validation_data=(X_val, y_val),
                    verbose=1,
                    callbacks=[es])

Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 4: early stopping


In [16]:
#Mengetest model yang sudah belajar dengan Test Data
y_pred = model.predict(X_val)



In [17]:
#Cara Menilai Positive/Negative/Neutral(menggunakan pembulatan hasil)
y_pred = np.argmax(y_pred,axis=1)

#Transform array HS menjadi indeks(0/1/2)
y_true = np.argmax(y_val,axis=1)

In [18]:
print(y_pred)
print(y_true)

[0 0 2 ... 2 2 0]
[2 0 2 ... 2 2 0]


In [19]:
#Hasil Model
print(accuracy_score(y_pred=y_pred, y_true=y_true))
print(precision_recall_fscore_support(y_pred=y_pred, y_true=y_true, average='macro'))

0.8131313131313131
(0.7581753656916236, 0.7907112168292848, 0.770769066827682, None)


In [20]:
# tes teks
def test(kalimat):
    input_kalimat = [kalimat]
    input_kalimat = tknzr.texts_to_sequences(input_kalimat)
    input_kalimat = pad_sequences(input_kalimat, maxlen=64)
    
    hasil = model.predict(input_kalimat)
    hasil = hasil.argmax(axis=1)

    
    # konversi nilai prediksi menjadi label sentimen
    labels = {0: "negative", 1: "neutral", 2: "positive"}
    hasil = labels[hasil[0]]
    return hasil

In [21]:
#Sampel kalimat yang mau ditest
Sampel = "makan ini enak sekali"

In [22]:
test(Sampel)





'positive'

In [23]:
#Simpan model
model.save('model_lstm.h5')
print("Model has created!")

Model has created!


In [24]:
# load model
from keras.models import load_model
model_LSTM = load_model("model_lstm.h5")
print("berhasil")

berhasil


In [25]:
import pickle
file = open("tknzr.pickle","rb")
tknzr = pickle.load(file)
file.close()

print("Berhasil")

Berhasil


In [26]:
# tes file
def test_file_nn(kalimat):
    input_kalimat = df['text_clean']
    input_kalimat = tknzr.texts_to_sequences(input_kalimat)
    input_kalimat = pad_sequences(input_kalimat, maxlen=max_features)
    
    hasil = model_LSTM.predict(input_kalimat)
    hasil = hasil.argmax(axis=1)

    
    # konversi nilai prediksi menjadi label sentimen
    labels = {0: "negative", 1: "neutral", 2: "positive"}
    df["label_prediksi"] = [labels[hasil] for pred in hasil]
    return df