<a href="https://colab.research.google.com/github/wedingdong/sentiment-analysis-chatgpt-indobert-tfidf/blob/main/UAS_NLP_TF_IDF_KO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tahap 1: Setup & Impor **Library**

In [None]:
#MENGINSTALL LIBRARY YANG DIBUTUHKAN
!pip install scikit-learn Sastrawi pandas gradio --quiet

#MELAKUKAN IMPORT
import os
import re

import gradio as gr
import numpy as np
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import StratifiedGroupKFold, train_test_split
from sklearn.utils import resample


# Tahap 2: Load Data Mentah

In [None]:
# Mount Google Drive and locate the raw review dataset.
from google.colab import drive
drive.mount('/content/drive')
FILEPATH = "/content/drive/MyDrive/UAS_NLP/Datasets_ulasan_chatgpt_BIndo.csv"
assert os.path.exists(FILEPATH), "File tidak ditemukan!"

# Malformed CSV rows are skipped; rows without review text or score are dropped.
# NOTE(review): latin1 can decode any byte sequence, so a UTF-8 file would load
# without error but with garbled non-ASCII characters - confirm the real encoding.
df = (
    pd.read_csv(FILEPATH, on_bad_lines='skip', encoding='latin1')
    .dropna(subset=['content', 'score'])
)
print("Jumlah data setelah cleaning:", len(df))

def is_valid_score(x):
    """Return True when ``x`` is a short value that can be parsed as a number.

    A value is rejected when its string form is longer than 5 characters
    or when ``float(x)`` cannot convert it.

    Args:
        x: Raw cell value from the ``score`` column (any type).

    Returns:
        bool: True if the value is acceptable as a score, False otherwise.
    """
    if len(str(x)) > 5:
        return False
    try:
        float(x)
    except (TypeError, ValueError):
        # Only conversion failures mean "not a score"; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        return False
    return True

# Keep only the rows whose score passed validation, as an independent copy.
df = df.loc[df['score'].map(is_valid_score)].copy()
# Convert the 'score' column to a numeric dtype.
df['score'] = pd.to_numeric(df['score'])

print("Jumlah data setelah filtering skor:", len(df))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Jumlah data setelah cleaning: 13197
Jumlah data setelah filtering skor: 13194



# Tahap 3: Labeling & Balancing

In [None]:
# Sentiment lexicons read by hybrid_label.
positive_keywords = [
    "bagus", "keren", "mantap", "membantu", "top", "suka", "memuaskan",
    "works well", "mantab", "terbaik", "baik", "luar biasa", "recommended",
    "cepat", "responsive", "berguna", "helpful", "menyenangkan",
]
negative_keywords = [
    "buruk", "jelek", "error", "eror", "lemot", "lag", "kecewa", "gak bisa",
    "gabisa", "tidak bisa", "parah", "tidak puas", "masalah", "sampah",
    "payah", "crash", "hang", "ngefreeze", "mengecewakan", "tidak berfungsi",
]
# NOTE(review): hybrid_label below only reads the positive and negative lists;
# neutral_keywords is defined but not referenced there.
neutral_keywords = [
    "biasa saja", "biasa aja", "lumayan", "cukup", "oke lah", "standar", "normal",
    "ya begitu", "ya gitu", "so-so", "biasa", "tidak terlalu", "netral", "oke",
]

# Lightweight text normalization used by the keyword-based labeling rules.
# (The function used to be defined twice with identical bodies; one copy is kept.)
def clean_text_hybrid(text):
    """Lowercase ``text`` and reduce it to ``a-z``, digits and single spaces.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        str: Lowercased text where every character outside ``[a-z0-9 ]`` has
        been replaced by a space, runs of whitespace collapsed to one space,
        and leading/trailing whitespace removed.
    """
    text = str(text).lower()
    text = re.sub(r"[^a-z0-9 ]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

def hybrid_label(content, score):
    """Derive a sentiment class from a review's score and its text.

    Args:
        content: Review text; normalized with ``clean_text_hybrid``.
        score: Numeric rating of the review.

    Returns:
        int: 0 (negative), 1 (neutral) or 2 (positive). The score decides the
        class (<= 2 -> 0, == 3 -> 1, otherwise 2) unless the text contains both
        a positive and a negative keyword, in which case the result is 1.
    """
    # Score-based class, used whenever the lexicons do not conflict.
    label_score = 0 if score <= 2 else (1 if score == 3 else 2)

    normalized = clean_text_hybrid(content)
    # Matching is plain substring containment, so a keyword also matches inside
    # a longer word (e.g. "lag" is found in "lagi").
    has_positive = any(word in normalized for word in positive_keywords)
    has_negative = any(word in normalized for word in negative_keywords)

    # Conflict rule: positive + negative wording -> neutral.
    return 1 if (has_positive and has_negative) else label_score

# Label every review. hybrid_label needs both the text and the score,
# so it is applied row by row (axis=1).
df['label'] = df.apply(
    lambda row: hybrid_label(row['content'], row['score']),
    axis=1,
)

df_clean = df[['content', 'label']].copy()

print("Distribusi label sebelum balancing:")
print(df_clean['label'].value_counts())

# Random oversampling: every class is brought up to the size of the largest one
# by sampling its rows with replacement.
# NOTE(review): the copies created here must stay on one side of any later
# train/test split, otherwise the same review is both trained on and evaluated on.
target_count = df_clean['label'].value_counts().max()

balanced_parts = []
for label_value in df_clean['label'].unique():
    class_rows = df_clean[df_clean['label'] == label_value]
    if len(class_rows) != target_count:
        class_rows = resample(
            class_rows,
            replace=True,
            n_samples=target_count,
            random_state=42,
        )
    balanced_parts.append(class_rows)

# Concatenate the per-class frames, then shuffle the rows reproducibly.
df_resampled = pd.concat(balanced_parts, ignore_index=True).sample(frac=1, random_state=42)

print("Distribusi label setelah balancing:")
print(df_resampled['label'].value_counts())

Distribusi label sebelum balancing:
label
2    12308
1      450
0      436
Name: count, dtype: int64
Distribusi label setelah balancing:
label
1    12308
0    12308
2    12308
Name: count, dtype: int64


# Tahap 4: Pra-pemrosesan Teks (Cleaning, Stopword, Stemming)



In [None]:
# Sastrawi stopword list and stemmer used by clean_text.
factory_stop = StopWordRemoverFactory()

# These four words are taken out of the stopword set so that clean_text
# does not delete them from the reviews.
stopwords = set(factory_stop.get_stop_words()).difference(
    ["tidak", "kurang", "tanpa", "belum"]
)
stemmer = StemmerFactory().create_stemmer()

def clean_text(text):
    """Run the full preprocessing pipeline on one review.

    Steps, in order: remove URLs, remove digits, replace every remaining
    non-letter / non-whitespace character with a space, lowercase, drop the
    words found in ``stopwords``, stem with ``stemmer``, and collapse whitespace.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        str: The cleaned, stemmed text (possibly empty).
    """
    text = str(text)
    # The patterns are raw strings, so a single backslash is the regex escape.
    # The previous doubled backslashes matched a literal "\" character, which
    # meant URLs were never removed and whitespace was never collapsed here.
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = text.lower()

    # Stopword removal.
    text = ' '.join(word for word in text.split() if word not in stopwords)
    # Stemming.
    text = stemmer.stem(text)

    return re.sub(r'\s+', ' ', text).strip()

# Run the full preprocessing on every row of the oversampled frame;
# the resulting 'content_clean' column is the text later passed to TF-IDF.
df_resampled['content_clean'] = df_resampled['content'].apply(clean_text)

# Tahap 5: Train-Test Split

In [None]:
# Separate the cleaned review text (features) from the sentiment labels.
X = df_resampled['content_clean'].values
y = df_resampled['label'].values

# df_resampled was oversampled with replacement, so the same review appears many
# times. A plain random split puts copies of one review in both partitions and
# inflates every test metric. Grouping by the cleaned text keeps all copies of a
# review on the same side, while stratification keeps the class proportions
# similar in train and test.
# Using one of 5 folds as the test set gives roughly an 80:20 train:test split.
splitter = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
train_idx, test_idx = next(splitter.split(X, y, groups=X))

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Guard against leakage: no cleaned text may occur in both partitions.
assert set(X_train).isdisjoint(X_test), "Train/test overlap detected"

# Tahap 6: TF-IDF Vectorization

In [None]:
# Unigram + bigram TF-IDF features, capped at 6000 terms.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=6000,
)
# Fit the vectorizer on the training texts only, then vectorize them.
X_train_vec = tfidf.fit_transform(X_train)
# Vectorize the test texts with the already-fitted vectorizer (no refitting).
X_test_vec = tfidf.transform(X_test)


# Tahap 7: Training Model ML dengan smoothing

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the TF-IDF features; alpha is its smoothing parameter.
clf_nb = MultinomialNB(alpha=1.0).fit(X_train_vec, y_train)

# Predictions for the held-out test vectors.
y_pred_nb = clf_nb.predict(X_test_vec)

print("=== Hasil Training TF IDF ===")
print(
    classification_report(
        y_test,
        y_pred_nb,
        target_names=["Negatif", "Netral", "Positif"],
    )
)



=== Hasil Training TF IDF ===
              precision    recall  f1-score   support

     Negatif       0.88      0.87      0.87      2461
      Netral       0.93      0.79      0.85      2462
     Positif       0.73      0.85      0.79      2462

    accuracy                           0.84      7385
   macro avg       0.85      0.84      0.84      7385
weighted avg       0.85      0.84      0.84      7385



# Tahap 8: Evaluasi Model

In [None]:
from sklearn.metrics import f1_score

# Predictions of the trained model for the test data.
y_pred = clf_nb.predict(X_test_vec)

# Class ids used by this notebook: 0 = Negatif, 1 = Netral, 2 = Positif.
# They are passed explicitly so the report rows and the per-class F1 array stay
# aligned with these ids even if a class is missing from y_test or y_pred.
print("=== Laporan Klasifikasi Sentimen ===")
print(
    classification_report(
        y_test,
        y_pred,
        labels=[0, 1, 2],
        target_names=["Negatif", "Netral", "Positif"],
    )
)

# Overall accuracy, shown as a percentage.
akurasi = accuracy_score(y_test, y_pred)
print(f"Akurasi Keseluruhan: {akurasi * 100:.2f}%")

# One F1 value per class, in the order given by `labels`.
f1_per_kelas = f1_score(y_test, y_pred, labels=[0, 1, 2], average=None)

# Position 1 corresponds to label 1 (Netral) because `labels` fixes the order.
f1_netral = f1_per_kelas[1]
print(f"F1-Score Netral: {f1_netral * 100:.2f}%")

=== Laporan Klasifikasi Sentimen ===
              precision    recall  f1-score   support

     Negatif       0.88      0.87      0.87      2461
      Netral       0.93      0.79      0.85      2462
     Positif       0.73      0.85      0.79      2462

    accuracy                           0.84      7385
   macro avg       0.85      0.84      0.84      7385
weighted avg       0.85      0.84      0.84      7385

Akurasi Keseluruhan: 83.55%
F1-Score Netral: 85.14%


# Tahap 9: Deploy Gradio UI

In [None]:
# Class id -> label text shown in the UI.
label_map = {0: "Negatif", 1: "Netral", 2: "Positif"}

# Positive / negative keyword lists used by predict_sentiment.
# These assignments replace the lists of the same names defined earlier in the
# notebook, so hybrid_label would use these versions if it were called again.
# Fixed typo in the negative list: "todak suka" -> "tidak suka".
# NOTE(review): matching is by substring, so a negated phrase such as
# "tidak bagus" also contains the positive keyword "bagus" - confirm that
# treating such text as a positive/negative conflict is intended.
positive_keywords = [
    "bagus", "keren", "mantap", "membantu", "top", "suka", "memuaskan",
    "kerja bagus", "mantab", "terbaik", "baik", "luar biasa", "recommended",
    "cepat", "responsive", "berguna", "helpful", "menyenangkan",
    "tidak jelek", "tidak buruk",
]
negative_keywords = [
    "buruk", "jelek", "error", "eror", "lemot", "lag", "kecewa", "gak bisa",
    "gabisa", "tidak bisa", "parah", "tidak puas", "masalah", "sampah",
    "payah", "crash", "hang", "ngefreeze", "mengecewakan", "tidak berfungsi",
    "tidak bagus", "tidak suka",
]

def predict_sentiment(text):
    """Classify one review for the Gradio UI.

    Args:
        text: Review text entered by the user.

    Returns:
        str: ``"Hasil: <label> (Skor: <id>)"`` on success, or a message starting
        with ``"ERROR"`` when the input is empty or the model pipeline raises.
    """
    # Guard clause: nothing to classify.
    if not text or str(text).strip() == "":
        return "ERROR: Masukkan teks ulasan terlebih dahulu."

    # Lexicon check on lightly normalized text (substring matching).
    normalized = clean_text_hybrid(text)
    has_positive = any(word in normalized for word in positive_keywords)
    has_negative = any(word in normalized for word in negative_keywords)

    # Hybrid override: positive and negative wording together -> neutral (1),
    # without consulting the model.
    if has_positive and has_negative:
        pred_score = 1
        pred_label = label_map[pred_score]
        return f"Hasil: {pred_label} (Skor: {pred_score})"

    # No conflict: full preprocessing -> TF-IDF -> Naive Bayes prediction.
    try:
        features = tfidf.transform([clean_text(text)])
        pred_score = clf_nb.predict(features)[0]
        pred_label = label_map[pred_score]
        return f"Hasil: {pred_label} (Skor: {pred_score})"
    except Exception as e:
        # Reported as text so the UI shows a message instead of failing.
        return f"ERROR Pemrosesan: Terjadi kesalahan. Pastikan semua tahap di-run. ({e})"


# Build the Gradio interface: one multi-line text input, plain-text output,
# with predict_sentiment as the callback.
iface = gr.Interface(
    fn=predict_sentiment,
    inputs=gr.Textbox(placeholder="Tulis ulasan...", lines=3),
    outputs="text",
    description="Model menggabungkan TF-IDF dan logika kamus konflik (Positif+Negatif=Netral) dan prediksi Naive Bayes.",
    title="Analisis Sentimen (TF-IDF)",
)

# Start the web UI.
iface.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://95f27b4ada645a76d7.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


