## **Import Library**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import pickle

## **Load Data**

In [8]:
df = pd.read_csv('final_stress_dataset.csv')

In [9]:
df.head()

Unnamed: 0,Text,Label,text_stemmed
0,"Barang sudah diterima nih kak, makasih yaa",Positive,barang terima kak terima kasih yaa
1,"Gampang dibawa-bawa, terlalu imut ukurannya",Positive,gampang dibawabawa imut ukur
2,LANGGANAN ??????????,Positive,langgan
3,"bagus, pengiriman cepet banget bakal jadi lang...",Positive,bagus kirim cepat banget langgan
4,Kartu bekerja dengan baik begitupun sellernya ...,Positive,kartu sellernya sigap


In [10]:
df.info()

print("\nMissing values:\n", df.isnull().sum())
print("\nData Duplikat:", df.duplicated().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11795 entries, 0 to 11794
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Text          11795 non-null  object
 1   Label         11795 non-null  object
 2   text_stemmed  11758 non-null  object
dtypes: object(3)
memory usage: 276.6+ KB

Missing values:
 Text             0
Label            0
text_stemmed    37
dtype: int64

Data Duplikat: 0


In [11]:
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)

In [12]:
print("\nMissing values:\n", df.isnull().sum())
print("\nData Duplikat:", df.duplicated().sum())


Missing values:
 Text            0
Label           0
text_stemmed    0
dtype: int64

Data Duplikat: 0


## **Modelling**
**Binary Classification dengan LSTM**

In [13]:
# Load dan Encode Label
df['Label_encoded'] = LabelEncoder().fit_transform(df['Label'])

In [14]:
# Tokenization
max_words = 5000
max_len = 100

tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(df['text_stemmed'])

X = tokenizer.texts_to_sequences(df['text_stemmed'])
X = pad_sequences(X, maxlen=max_len)

y = df['Label_encoded'].values

In [15]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Build LSTM Model
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [17]:
# Training
early_stop = EarlyStopping(monitor='val_loss', patience=3)

history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stop]
)

Epoch 1/10
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 153ms/step - accuracy: 0.6767 - loss: 0.6010 - val_accuracy: 0.8496 - val_loss: 0.3636
Epoch 2/10
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 155ms/step - accuracy: 0.8714 - loss: 0.3113 - val_accuracy: 0.8555 - val_loss: 0.3490
Epoch 3/10
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 148ms/step - accuracy: 0.9200 - loss: 0.2142 - val_accuracy: 0.8613 - val_loss: 0.3527
Epoch 4/10
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 149ms/step - accuracy: 0.9389 - loss: 0.1655 - val_accuracy: 0.8470 - val_loss: 0.3996
Epoch 5/10
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 160ms/step - accuracy: 0.9451 - loss: 0.1514 - val_accuracy: 0.8470 - val_loss: 0.4424


In [18]:
# Evaluasi
loss, accuracy = model.evaluate(X_test, y_test)
print(f'\nTest Accuracy: {accuracy*100:.2f}%')

[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.8599 - loss: 0.3838

Test Accuracy: 85.46%


In [19]:
model.save('model_lstm_stress.h5')
with open('tokenizer_stress.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)



In [20]:
model.save('model_lstm_stress.keras')

In [21]:
import json

with open("tokenizer.json", "w") as f:
    json.dump(tokenizer.word_index, f)