In [None]:
!pip install gensim



In [163]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')

import gensim
from gensim.models import Word2Vec

df = pd.read_excel("data label.xlsx")
texts = df["data"].astype(str)
labels = df["label"]

stemmer = PorterStemmer()
def stem_text(text):
    tokens = word_tokenize(text.lower())
    stems = [stemmer.stem(w) for w in tokens]
    return " ".join(stems)

df["clean"] = texts.apply(stem_text)

X = df["clean"]
y = labels

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# TF-IDF

In [None]:
vectorizer_tfidf = TfidfVectorizer()
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_test_tfidf = vectorizer_tfidf.transform(X_test)

model_dt = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=50,
    min_samples_split=5,
    random_state=42
)

model_dt.fit(X_train_tfidf, y_train)

pred1 = model_dt.predict(X_test_tfidf)
acc1 = accuracy_score(y_test, pred1)

print("TF-IDF + Decision Tree")
print("Akurasi:", acc1)
print("\nClassification Report:")
print(classification_report(y_test, pred1, digits=4))

TF-IDF + Decision Tree
Akurasi: 0.7

Classification Report:
              precision    recall  f1-score   support

     negatif     0.9375    0.7500    0.8333        20
      netral     0.5714    0.3636    0.4444        11
     positif     0.5926    0.8421    0.6957        19

    accuracy                         0.7000        50
   macro avg     0.7005    0.6519    0.6578        50
weighted avg     0.7259    0.7000    0.6955        50



In [None]:
vectorizer_tfidf = TfidfVectorizer(
    ngram_range=(1, 2),      # unigram + bigram
    sublinear_tf=True,       # TF-IDF lebih stabil
    min_df=2,                # hilangkan kata terlalu jarang
    max_features=10000       # batasi fitur besar supaya tidak overfitting
)

X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_test_tfidf = vectorizer_tfidf.transform(X_test)

model_nb = ComplementNB()
model_nb.fit(X_train_tfidf, y_train)

pred1 = model_nb.predict(X_test_tfidf)
acc1 = accuracy_score(y_test, pred1)

print("TF-IDF + Naive Bayes")
print("Akurasi:", acc1)
print("\nClassification Report:")
print(classification_report(y_test, pred1, digits=4))

TF-IDF + Naive Bayes
Akurasi: 0.82

Classification Report:
              precision    recall  f1-score   support

     negatif     0.7500    0.9000    0.8182        20
      netral     0.8750    0.6364    0.7368        11
     positif     0.8889    0.8421    0.8649        19

    accuracy                         0.8200        50
   macro avg     0.8380    0.7928    0.8066        50
weighted avg     0.8303    0.8200    0.8180        50



In [None]:
vectorizer_tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    sublinear_tf=True,
    min_df=2,
    max_df=0.95,
    max_features=15000
)

X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_test_tfidf = vectorizer_tfidf.transform(X_test)

# Logistic Regression Tuning
model_lr = LogisticRegression(
    C=2.0,
    max_iter=1000,
    solver='liblinear',
    class_weight='balanced',
    n_jobs=-1
)

model_lr.fit(X_train_tfidf, y_train)

pred3 = model_lr.predict(X_test_tfidf)
acc3 = accuracy_score(y_test, pred3)

print("TF-IDF + Logistic Regression")
print("Akurasi:", acc3)
print("\nClassification Report:")
print(classification_report(y_test, pred3, digits=4))

TF-IDF + Logistic Regression
Akurasi: 0.8

Classification Report:
              precision    recall  f1-score   support

     negatif     0.7200    0.9000    0.8000        20
      netral     0.8333    0.4545    0.5882        11
     positif     0.8947    0.8947    0.8947        19

    accuracy                         0.8000        50
   macro avg     0.8160    0.7498    0.7610        50
weighted avg     0.8113    0.8000    0.7894        50





In [None]:
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# SVM Model
model_svm = SVC(kernel="linear")       # kernel linear paling umum utk teks
model_svm.fit(X_train_tfidf, y_train)

pred = model_svm.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, pred)

print("TF-IDF + SVM")
print("Akurasi:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, pred))

TF-IDF + SVM
Akurasi: 0.8

Classification Report:
              precision    recall  f1-score   support

     negatif       0.72      0.90      0.80        20
      netral       1.00      0.55      0.71        11
     positif       0.84      0.84      0.84        19

    accuracy                           0.80        50
   macro avg       0.85      0.76      0.78        50
weighted avg       0.83      0.80      0.80        50



# BoW

In [None]:
vectorizer_bow = CountVectorizer(
    ngram_range=(1,2),   # unigram + bigram â†’ akurasi naik
    min_df=2,            # buang kata sangat jarang
    max_df=0.95          # buang kata terlalu sering
)
X_train_bow = vectorizer_bow.fit_transform(X_train)
X_test_bow = vectorizer_bow.transform(X_test)

model_nb = MultinomialNB(alpha=0.3)   # alpha dituning untuk akurasi lebih tinggi
model_nb.fit(X_train_bow, y_train)

pred1 = model_nb.predict(X_test_bow)
acc1 = accuracy_score(y_test, pred1)

print("BoW + Naive Bayes")
print("Akurasi:", acc1)
print("\nClassification Report:")
print(classification_report(y_test, pred1, digits=4))

BoW + Naive Bayes
Akurasi: 0.76

Classification Report:
              precision    recall  f1-score   support

     negatif     0.7391    0.8500    0.7907        20
      netral     0.8333    0.4545    0.5882        11
     positif     0.7619    0.8421    0.8000        19

    accuracy                         0.7600        50
   macro avg     0.7781    0.7156    0.7263        50
weighted avg     0.7685    0.7600    0.7497        50



In [None]:
vectorizer_bow = CountVectorizer(
    ngram_range=(1,2),
    min_df=2,
    max_df=0.95,
)

X_train_bow = vectorizer_bow.fit_transform(X_train)
X_test_bow = vectorizer_bow.transform(X_test)

model_dt = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=50,
    min_samples_split=5,
    random_state=42
)

model_dt.fit(X_train_bow, y_train)

pred1 = model_dt.predict(X_test_bow)
acc1 = accuracy_score(y_test, pred1)

print("BoW + Decision Tree")
print("Akurasi:", acc1)
print("\nClassification Report:")
print(classification_report(y_test, pred1, digits=4))

BoW + Decision Tree
Akurasi: 0.7

Classification Report:
              precision    recall  f1-score   support

     negatif     0.8667    0.6500    0.7429        20
      netral     0.7143    0.4545    0.5556        11
     positif     0.6071    0.8947    0.7234        19

    accuracy                         0.7000        50
   macro avg     0.7294    0.6664    0.6739        50
weighted avg     0.7345    0.7000    0.6943        50



In [None]:
vectorizer_bow = CountVectorizer(
    ngram_range=(1,2),
    min_df=2,
    max_df=0.95,
)

X_train_bow = vectorizer_bow.fit_transform(X_train)
X_test_bow = vectorizer_bow.transform(X_test)

model_svm = SVC(kernel="linear")
model_svm.fit(X_train_bow, y_train)

pred2 = model_svm.predict(X_test_bow)
acc2 = accuracy_score(y_test, pred2)

print("BoW + SVM")
print("Akurasi:", acc2)
print("\nClassification Report:")
print(classification_report(y_test, pred2))

BoW + SVM
Akurasi: 0.74

Classification Report:
              precision    recall  f1-score   support

     negatif       0.94      0.75      0.83        20
      netral       0.83      0.45      0.59        11
     positif       0.61      0.89      0.72        19

    accuracy                           0.74        50
   macro avg       0.79      0.70      0.71        50
weighted avg       0.79      0.74      0.74        50



In [None]:
vectorizer_bow = CountVectorizer(
    ngram_range=(1,2),
    min_df=2,
    max_df=0.95,
)

X_train_bow = vectorizer_bow.fit_transform(X_train)
X_test_bow = vectorizer_bow.transform(X_test)

model_lr = LogisticRegression(
    max_iter=300,
    C=2.0,
    penalty='l2',
    solver='liblinear',
    class_weight='balanced',
    random_state=42
)
model_lr.fit(X_train_bow, y_train)

pred3 = model_lr.predict(X_test_bow)
acc3 = accuracy_score(y_test, pred3)

print("BoW + Logistic Regression")
print("Akurasi:", acc3)
print("\nClassification Report:")
print(classification_report(y_test, pred3, digits=4))


BoW + Logistic Regression
Akurasi: 0.76

Classification Report:
              precision    recall  f1-score   support

     negatif     0.8824    0.7500    0.8108        20
      netral     0.8571    0.5455    0.6667        11
     positif     0.6538    0.8947    0.7556        19

    accuracy                         0.7600        50
   macro avg     0.7978    0.7301    0.7443        50
weighted avg     0.7900    0.7600    0.7581        50



# Word2vec

In [164]:
# Tokenisasi kalimat
sentences = [text.split() for text in X_train]

# Training Word2Vec
w2v_size = 100  # dimensi vektor
w2v_model = Word2Vec(
    sentences,
    vector_size=w2v_size,
    window=5,
    min_count=1,
    workers=4,
    epochs=100
)

# Fungsi untuk membuat representasi dokumen (average Word2Vec)
def get_avg_w2v(docs, w2v_model, size):
    vecs = []
    for doc in docs:
        words = doc.split()
        word_vecs = []
        for word in words:
            if word in w2v_model.wv:
                word_vecs.append(w2v_model.wv[word])
        if len(word_vecs) > 0:
            vecs.append(np.mean(word_vecs, axis=0))
        else:
            vecs.append(np.zeros(size))
    return np.array(vecs)

# Representasi dokumen
X_train_w2v = get_avg_w2v(X_train, w2v_model, w2v_size)
X_test_w2v = get_avg_w2v(X_test, w2v_model, w2v_size)

# SVM Model
model_svm = SVC(kernel="linear")
model_svm.fit(X_train_w2v, y_train)

pred = model_svm.predict(X_test_w2v)

accuracy = accuracy_score(y_test, pred)

print("Word2Vec + SVM")
print("Akurasi:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, pred))


Word2Vec + SVM
Akurasi: 0.56

Classification Report:
              precision    recall  f1-score   support

     negatif       0.50      0.85      0.63        20
      netral       0.00      0.00      0.00        11
     positif       0.69      0.58      0.63        19

    accuracy                           0.56        50
   macro avg       0.40      0.48      0.42        50
weighted avg       0.46      0.56      0.49        50



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
