In [13]:
# import library
import pandas as pd
import re
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [17]:
df = pd.read_csv("comments_ytb.csv")
print(df.head)

<bound method NDFrame.head of                              cid  \
0     UgyNjfLelzZmWt3BbUJ4AaABAg   
1     UgxlftxyeXmlcqs1qIR4AaABAg   
2     UgxIwYZWZO3uUzZK4_d4AaABAg   
3     UgzvIdzmR5LcZGETqCV4AaABAg   
4     Ugzm2xQSGSgSd4lnAvp4AaABAg   
...                          ...   
5675  UgwuYn8_ViXWbsNxu0d4AaABAg   
5676  UgyBj5azu5cbIak0uT54AaABAg   
5677  UgxLtQxL76mMb5KkHpF4AaABAg   
5678  UgzJKxNsyyE0Y0oEUCB4AaABAg   
5679  Ugx2ie52FxdBPw-P6AB4AaABAg   

                                                   text  time  \
0                                         Oktober 2025üòÇ  1 Â§©Ââç   
1                                                     üòÆ  1 Â§©Ââç   
2                                              hai kakk  3 Â§©Ââç   
3                                                 Absen  3 Â§©Ââç   
4     Ka kaka tau ga aku satuhari enam kali nonton v...  6 Â§©Ââç   
...                                                 ...   ...   
5675                                                  ü

In [18]:
def clean_text(text):
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r"http\S+|www\.\S+", "", text)        # hapus URL
    text = re.sub(r"[^a-z0-9\s]", " ", text)           # sisakan alfanumerik + spasi
    text = re.sub(r"\s+", " ", text).strip()           # normalize spasi
    return text

df['clean_text'] = df['text'].apply(clean_text)

In [19]:
# Simple rule-based pelabelan
positive_words = ["bagus", "keren", "mantap", "suka", "terbaik", "hebat", "love"]
negative_words = ["jelek", "buruk", "benci", "parah", "gak suka", "hate", "tidak suka", "gakbagus"]

def simple_sentiment(text):
    score = 0
    for w in positive_words:
        if w in text:
            score += 1
    for w in negative_words:
        if w in text:
            score -= 1
    if score > 0:
        return "positive"
    elif score < 0:
        return "negative"
    else:
        return "neutral"

df['label'] = df['clean_text'].apply(simple_sentiment)
print("\nDistribusi label:")
print(df['label'].value_counts())


Distribusi label:
label
neutral     5380
positive     272
negative      28
Name: count, dtype: int64


In [20]:
# Encode label (numeric) dan one-hot
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])
num_classes = len(le.classes_)
y = df['label_encoded'].values
y_cat = to_categorical(y, num_classes=num_classes)

# Mapping index -> label
index_to_label = {i: lbl for i, lbl in enumerate(le.classes_)}

In [22]:
#  Skema 1: TF-IDF + MLP
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(df['clean_text']).toarray()

X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y_cat, test_size=0.2, random_state=42, stratify=y
)

model_tfidf = Sequential([
    Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])
model_tfidf.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print("\n=== Training TF-IDF + MLP ===")
history1 = model_tfidf.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.1, verbose=2)

test_loss1, test_acc1 = model_tfidf.evaluate(X_test, y_test, verbose=0)
print(f"TF-IDF MLP Test Accuracy: {test_acc1*100:.2f}%")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



=== Training TF-IDF + MLP ===
Epoch 1/10
64/64 - 3s - 41ms/step - accuracy: 0.9362 - loss: 0.4312 - val_accuracy: 0.9538 - val_loss: 0.1946
Epoch 2/10
64/64 - 1s - 20ms/step - accuracy: 0.9464 - loss: 0.1768 - val_accuracy: 0.9538 - val_loss: 0.1677
Epoch 3/10
64/64 - 1s - 20ms/step - accuracy: 0.9464 - loss: 0.1238 - val_accuracy: 0.9538 - val_loss: 0.1550
Epoch 4/10
64/64 - 1s - 21ms/step - accuracy: 0.9631 - loss: 0.0827 - val_accuracy: 0.9626 - val_loss: 0.1454
Epoch 5/10
64/64 - 1s - 22ms/step - accuracy: 0.9880 - loss: 0.0430 - val_accuracy: 0.9648 - val_loss: 0.1639
Epoch 6/10
64/64 - 2s - 34ms/step - accuracy: 0.9922 - loss: 0.0243 - val_accuracy: 0.9648 - val_loss: 0.1692
Epoch 7/10
64/64 - 2s - 24ms/step - accuracy: 0.9932 - loss: 0.0186 - val_accuracy: 0.9670 - val_loss: 0.1752
Epoch 8/10
64/64 - 2s - 37ms/step - accuracy: 0.9944 - loss: 0.0130 - val_accuracy: 0.9648 - val_loss: 0.1722
Epoch 9/10
64/64 - 1s - 21ms/step - accuracy: 0.9941 - loss: 0.0116 - val_accuracy: 0.967

In [23]:
# Skema 2: CountVectorizer + MLP
cv = CountVectorizer(max_features=5000)
X_cv = cv.fit_transform(df['clean_text']).toarray()

X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X_cv, y_cat, test_size=0.2, random_state=42, stratify=y
)

model_cv = Sequential([
    Dense(256, activation='relu', input_shape=(X_train2.shape[1],)),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])
model_cv.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print("\n=== Training CountVectorizer + MLP ===")
history2 = model_cv.fit(X_train2, y_train2, epochs=10, batch_size=64, validation_split=0.1, verbose=2)

test_loss2, test_acc2 = model_cv.evaluate(X_test2, y_test2, verbose=0)
print(f"CountVec MLP Test Accuracy: {test_acc2*100:.2f}%")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



=== Training CountVectorizer + MLP ===
Epoch 1/10
64/64 - 4s - 58ms/step - accuracy: 0.9374 - loss: 0.4865 - val_accuracy: 0.9538 - val_loss: 0.2119
Epoch 2/10
64/64 - 2s - 27ms/step - accuracy: 0.9479 - loss: 0.1625 - val_accuracy: 0.9538 - val_loss: 0.1656
Epoch 3/10
64/64 - 1s - 22ms/step - accuracy: 0.9724 - loss: 0.0951 - val_accuracy: 0.9538 - val_loss: 0.1703
Epoch 4/10
64/64 - 1s - 21ms/step - accuracy: 0.9873 - loss: 0.0478 - val_accuracy: 0.9692 - val_loss: 0.1562
Epoch 5/10
64/64 - 1s - 22ms/step - accuracy: 0.9912 - loss: 0.0283 - val_accuracy: 0.9670 - val_loss: 0.1717
Epoch 6/10
64/64 - 1s - 22ms/step - accuracy: 0.9944 - loss: 0.0161 - val_accuracy: 0.9736 - val_loss: 0.1868
Epoch 7/10
64/64 - 1s - 22ms/step - accuracy: 0.9971 - loss: 0.0145 - val_accuracy: 0.9714 - val_loss: 0.1899
Epoch 8/10
64/64 - 2s - 28ms/step - accuracy: 0.9978 - loss: 0.0097 - val_accuracy: 0.9670 - val_loss: 0.2047
Epoch 9/10
64/64 - 2s - 33ms/step - accuracy: 0.9985 - loss: 0.0070 - val_accura

In [25]:
# Skema 3: Tokenizer + BiLSTM
max_words = 10000
maxlen = 100

tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(df['clean_text'])
sequences = tokenizer.texts_to_sequences(df['clean_text'])
padded = pad_sequences(sequences, maxlen=maxlen, padding='post')

X_train3, X_test3, y_train3, y_test3 = train_test_split(
    padded, y_cat, test_size=0.2, random_state=42, stratify=y
)

model_lstm = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=maxlen),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax')
])
model_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print("\n=== Training Tokenizer + BiLSTM ===")
history3 = model_lstm.fit(X_train3, y_train3, epochs=10, batch_size=64, validation_split=0.1, verbose=2)

test_loss3, test_acc3 = model_lstm.evaluate(X_test3, y_test3, verbose=0)
print(f"BiLSTM Test Accuracy: {test_acc3*100:.2f}%")


=== Training Tokenizer + BiLSTM ===
Epoch 1/10




64/64 - 20s - 311ms/step - accuracy: 0.9303 - loss: 0.3273 - val_accuracy: 0.9538 - val_loss: 0.1836
Epoch 2/10
64/64 - 21s - 323ms/step - accuracy: 0.9464 - loss: 0.2075 - val_accuracy: 0.9538 - val_loss: 0.1725
Epoch 3/10
64/64 - 23s - 365ms/step - accuracy: 0.9599 - loss: 0.1344 - val_accuracy: 0.9714 - val_loss: 0.1191
Epoch 4/10
64/64 - 15s - 237ms/step - accuracy: 0.9890 - loss: 0.0469 - val_accuracy: 0.9714 - val_loss: 0.1335
Epoch 5/10
64/64 - 15s - 228ms/step - accuracy: 0.9922 - loss: 0.0323 - val_accuracy: 0.9758 - val_loss: 0.1312
Epoch 6/10
64/64 - 16s - 250ms/step - accuracy: 0.9939 - loss: 0.0247 - val_accuracy: 0.9780 - val_loss: 0.1403
Epoch 7/10
64/64 - 14s - 224ms/step - accuracy: 0.9946 - loss: 0.0194 - val_accuracy: 0.9758 - val_loss: 0.1248
Epoch 8/10
64/64 - 14s - 226ms/step - accuracy: 0.9949 - loss: 0.0183 - val_accuracy: 0.9780 - val_loss: 0.1558
Epoch 9/10
64/64 - 21s - 322ms/step - accuracy: 0.9954 - loss: 0.0139 - val_accuracy: 0.9780 - val_loss: 0.1725
Epo

In [26]:
# Inference untuk ketiga skema
label_map = {0: "negative", 1: "neutral", 2: "positive"}
komentar_uji = [
    "Saya suka sekali video ini, sangat bermanfaat!",
    "Videonya jelek dan tidak jelas.",
    "Saya baru nonton, belum bisa komentar."
]

# Skema 1
print("\n=== Inference Skema 1: TF-IDF + MLP ===")
y_pred1_prob = model_tfidf.predict(X_test)
y_pred1 = np.argmax(y_pred1_prob, axis=1)
y_true1 = np.argmax(y_test, axis=1)
df_pred1 = pd.DataFrame({
    "Teks": df['clean_text'].iloc[:10],
    "Actual": [label_map[i] for i in y_true1[:10]],
    "Prediksi": [label_map[i] for i in y_pred1[:10]]
})
print(df_pred1)



=== Inference Skema 1: TF-IDF + MLP ===
[1m36/36[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 5ms/step
                                                Teks    Actual Prediksi
0                                       oktober 2025   neutral  neutral
1                                                      neutral  neutral
2                                           hai kakk   neutral  neutral
3                                              absen   neutral  neutral
4  ka kaka tau ga aku satuhari enam kali nonton v...  positive  neutral
5                                         2 okt 2025   neutral  neutral
6                                     hai saya hulky   neutral  neutral
7                                     september 2025   neutral  neutral
8  ngobrol2 tentang nenek2 manggarai temen kantor...   neutral  neutral
9  sekarang lebih serem stasiun manggarai apalagi...   neutral  neutral


In [27]:
# Skema 2
print("\n=== Inference Skema 2: CountVectorizer + MLP ===")
y_pred2_prob = model_cv.predict(X_test2)
y_pred2 = np.argmax(y_pred2_prob, axis=1)
y_true2 = np.argmax(y_test2, axis=1)
df_pred2 = pd.DataFrame({
    "Teks": df['clean_text'].iloc[:10],
    "Actual": [label_map[i] for i in y_true2[:10]],
    "Prediksi": [label_map[i] for i in y_pred2[:10]]
})
print(df_pred2)


=== Inference Skema 2: CountVectorizer + MLP ===
[1m36/36[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 10ms/step
                                                Teks    Actual Prediksi
0                                       oktober 2025   neutral  neutral
1                                                      neutral  neutral
2                                           hai kakk   neutral  neutral
3                                              absen   neutral  neutral
4  ka kaka tau ga aku satuhari enam kali nonton v...  positive  neutral
5                                         2 okt 2025   neutral  neutral
6                                     hai saya hulky   neutral  neutral
7                                     september 2025   neutral  neutral
8  ngobrol2 tentang nenek2 manggarai temen kantor...   neutral  neutral
9  sekarang lebih serem stasiun manggarai apalagi...   neutral  neutral


In [28]:
# Skema 3
print("\n=== Inference Skema 3: Tokenizer + BiLSTM ===")
def predict_sentiment_lstm(text):
    text_clean = clean_text(text)
    seq = tokenizer.texts_to_sequences([text_clean])
    padded_seq = pad_sequences(seq, maxlen=maxlen, padding='post')
    pred = model_lstm.predict(padded_seq)
    label_idx = pred.argmax(axis=1)[0]
    return label_map[label_idx]

for komentar in komentar_uji:
    hasil = predict_sentiment_lstm(komentar)
    print(f"Teks: {komentar}")
    print(f"Prediksi Sentimen: {hasil}\n")



=== Inference Skema 3: Tokenizer + BiLSTM ===
[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 350ms/step
Teks: Saya suka sekali video ini, sangat bermanfaat!
Prediksi Sentimen: positive

[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 50ms/step
Teks: Videonya jelek dan tidak jelas.
Prediksi Sentimen: neutral

[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 48ms/step
Teks: Saya baru nonton, belum bisa komentar.
Prediksi Sentimen: neutral

