In [122]:
import string
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [123]:
data1 = pd.read_csv('data_1_2024_labeled.csv')
data2 = pd.read_csv('data_2_2024_labeled.csv')
data3 = pd.read_csv('data_3_2024_labeled.csv')
data4 = pd.read_csv('data_4_2024_labeled.csv')

In [124]:
data = pd.concat([data1, data2, data3, data4], ignore_index=True)

In [125]:
data.label.value_counts()

label
Positif                                                                                                                                                                                                                                                                                                                                                                                                              5704
Negatif                                                                                                                                                                                                                                                                                                                                                                                                              3309
Netral                                                                                                                                                                        

In [126]:
data.label = data.label.replace({
    'positif':'Positif', 
    ' positif':'Positif',
    'System: Positif' : 'Positif',
    'System: Negatif' : 'Negatif',
    'System: Netral' : 'Netral',
    'Netral.' : 'Netral',
    'Negatif.' : 'Negatif',
})

In [127]:
data = data[data.label.isin(['Positif', 'Negatif'])]

In [128]:
data.judul = data.judul.str.replace('\n', '')

In [129]:
def noise_removal(words):
    words=words.translate(str.maketrans('','',string.punctuation+string.digits))
    words=words.strip()
    return words

In [130]:
data.judul = data.judul.apply(noise_removal)

In [131]:
data.judul = data.judul.str.lower()

In [132]:
data.head()

Unnamed: 0,judul,label
0,mcdonalds gugat gerakan boikot produk israel d...,Negatif
2,rokok elektrik resmi kena pajak harga vape cs ...,Negatif
3,ingat transmart diskon setiap hari,Positif
4,cadangan minyak ri mau habis skk migas buka suara,Negatif
5,jasa sewa kano laris manis diserbu pelancong s...,Positif


In [133]:
labelencoder = LabelEncoder()
data['label'] = labelencoder.fit_transform(data['label'])

In [134]:
X_train, X_test, y_train, y_test = train_test_split(data['judul'], data['label'], test_size=0.2, random_state=42)

In [135]:
tokenizer = Tokenizer(num_words=10000, oov_token= "<OOV>")
tokenizer.fit_on_texts(X_train)

In [136]:
word_index = tokenizer.word_index

In [137]:
training_sequences = tokenizer.texts_to_sequences(X_train)
training_padded = pad_sequences(training_sequences, maxlen=120, padding='post', truncating='post')

In [138]:
testing_sequences = tokenizer.texts_to_sequences(X_test)
testing_padded = pad_sequences(testing_sequences, maxlen=120, padding='post', truncating='post')

In [139]:
training_padded = np.array(training_padded)
training_labels = np.array(y_train)
testing_padded = np.array(testing_padded)
testing_labels = np.array(y_test)

In [140]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(10000, 16, input_length=120),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])



In [141]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [142]:
num_epochs = 10
history = model.fit(training_padded, 
                    training_labels, 
                    epochs=num_epochs, 
                    validation_data=(testing_padded, testing_labels), 
                    verbose=1)

Epoch 1/10
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6237 - loss: 0.6589 - val_accuracy: 0.6137 - val_loss: 0.6649
Epoch 2/10
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6454 - loss: 0.6483 - val_accuracy: 0.6137 - val_loss: 0.6625
Epoch 3/10
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6423 - loss: 0.6460 - val_accuracy: 0.6137 - val_loss: 0.6544
Epoch 4/10
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6425 - loss: 0.6311 - val_accuracy: 0.7791 - val_loss: 0.6318
Epoch 5/10
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6884 - loss: 0.5843 - val_accuracy: 0.6478 - val_loss: 0.5631
Epoch 6/10
[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7452 - loss: 0.5148 - val_accuracy: 0.7495 - val_loss: 0.4994
Epoch 7/10
[1m228/228[0m 

In [143]:
# Evaluate the model on the training data
training_loss, training_accuracy = model.evaluate(training_padded, training_labels, verbose=0)
print(f"Final Training Accuracy: {training_accuracy}")

# Evaluate the model on the testing data
testing_loss, testing_accuracy = model.evaluate(testing_padded, testing_labels, verbose=0)
print(f"Final Testing Accuracy: {testing_accuracy}")


Final Training Accuracy: 0.8829187750816345
Final Testing Accuracy: 0.8104395866394043
