In [None]:
# ============================================================================
# ANALISIS SENTIMEN KEBIJAKAN REDENOMINASI RUPIAH - KOMENTAR YOUTUBE
# Menggunakan: Logistic Regression, SVM, KNN, Random Forest, Decision Tree, Neural Network
# ============================================================================

# STEP 1: INSTALASI DAN IMPORT LIBRARY
# ============================================================================
!pip install google-api-python-client
!pip install sastrawi
!pip install wordcloud
!pip install scikit-learn
!pip install tensorflow
!pip install pandas numpy matplotlib seaborn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
warnings.filterwarnings('ignore')

# Library untuk scraping YouTube
from googleapiclient.discovery import build

# Library preprocessing
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Library Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

# Library evaluasi
from sklearn.metrics import (classification_report, confusion_matrix,
                            accuracy_score, precision_score, recall_score,
                            f1_score, roc_auc_score, roc_curve)

# Library visualisasi
from wordcloud import WordCloud

Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m209.7/209.7 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sastrawi
Successfully installed sastrawi-1.0.1


In [None]:
# Cell 2: import & load
import pandas as pd, re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from tqdm import tqdm
tqdm.pandas()

df = pd.read_csv('/content/dataset_komentar_purbaya_redenominasi.csv')  # upload file ke Colab terlebih dahulu
print("Shape awal:", df.shape)
print("Kolom:", df.columns.tolist())


Shape awal: (1174, 9)
Kolom: ['author', 'comment', 'commentsCount', 'pageUrl', 'replyCount', 'title', 'type', 'videoID', 'voteCount']


In [None]:
df

Unnamed: 0,author,comment,commentsCount,pageUrl,replyCount,title,type,videoID,voteCount
0,@Bennix,SEGERA isi Petisi Mendukung purbaya REDENOMINA...,6358,https://youtu.be/GRA-0D9PojY?si=RNrPNibbLbVH9BRT,339,"REDENOMINASI PURBAYA! Uang Rp1000 Jadi Rp1, Ri...",comment,,1000
1,@SamsFirdaus-lz3it,Kasian duit2 para koruptor nggak akan berlaku ...,6358,https://youtu.be/GRA-0D9PojY?si=RNrPNibbLbVH9BRT,0,"REDENOMINASI PURBAYA! Uang Rp1000 Jadi Rp1, Ri...",comment,,0
2,@Arew-hv1nk,gaskeuunnn..pa purbaya,6358,https://youtu.be/GRA-0D9PojY?si=RNrPNibbLbVH9BRT,0,"REDENOMINASI PURBAYA! Uang Rp1000 Jadi Rp1, Ri...",comment,,0
3,@BaritoputerariauGreatsupporter,"2 RUPIAH setara Rp.2.000,- 5 RUPIAH setara Rp....",6358,https://youtu.be/GRA-0D9PojY?si=RNrPNibbLbVH9BRT,0,"REDENOMINASI PURBAYA! Uang Rp1000 Jadi Rp1, Ri...",comment,,1
4,@sicimolgemol,Jangan lupa kasih tau sanak saudara teman atau...,6358,https://youtu.be/GRA-0D9PojY?si=RNrPNibbLbVH9BRT,0,"REDENOMINASI PURBAYA! Uang Rp1000 Jadi Rp1, Ri...",comment,,0
...,...,...,...,...,...,...,...,...,...
1169,@keluargabahagia5033,Biyar koruptor2 duitnya keluar semua,6358,https://youtu.be/GRA-0D9PojY?si=RNrPNibbLbVH9BRT,0,"REDENOMINASI PURBAYA! Uang Rp1000 Jadi Rp1, Ri...",comment,,0
1170,@keluargabahagia5033,Saya setuju lah redenominasi lagian juga saya...,6358,https://youtu.be/GRA-0D9PojY?si=RNrPNibbLbVH9BRT,0,"REDENOMINASI PURBAYA! Uang Rp1000 Jadi Rp1, Ri...",comment,,0
1171,@Rebahaners241,Gue malah lebih aneh sama yang belum paham ten...,6358,https://youtu.be/GRA-0D9PojY?si=RNrPNibbLbVH9BRT,0,"REDENOMINASI PURBAYA! Uang Rp1000 Jadi Rp1, Ri...",comment,,0
1172,@chesterlpf,1dollar AS jadi berapa? Uang kita jadi berharg...,6358,https://youtu.be/GRA-0D9PojY?si=RNrPNibbLbVH9BRT,0,"REDENOMINASI PURBAYA! Uang Rp1000 Jadi Rp1, Ri...",comment,,0


In [None]:
# Jumlah total & missing text
print("Total rows:", len(df))
print("Jumlah text kosong:", df['comment'].isna().sum())
print("Jumlah text terisi:", df['comment'].notna().sum())

# Distribusi stars
print("\ncomment (comment):")
print(df['comment'].value_counts().sort_index())

# Tampilkan beberapa contoh teks
print("\nContoh beberapa teks (kolom 'comment'):")
display(df['comment'].dropna().sample(min(10, df['comment'].dropna().shape[0]), random_state=42).reset_index(drop=True))

Total rows: 1174
Jumlah text kosong: 0
Jumlah text terisi: 1174

comment (comment):
comment
"Anjing2 peliharaan koruptor" üòÇ...suka sekaliüëç.\nDr dulu mau dilakukan pemerintah selalu mendapatkan sandungan dr "pengusaha anjing" dn politisi peliharaan koruptor ...tp kali berani üëç                     1
# Save Purbaya #                                                                                                                                                                                             1
#KitaSetujuRedenominasi                                                                                                                                                                                      1
,‚ù§‚ù§‚ù§‚ù§                                                                                                                                                                                                        1
0:33 bukanya pak purbaya mentri keuangan ya üòÖ               

Unnamed: 0,comment
0,Dulu 2010 pernah denger dari guru ada wacana r...
1,Setuju banget bang
2,"kalau saldo di rekening bagaimana, apa berkura..."
3,gw sebagai programmer bank biasanya ngetes qri...
4,Sebelum redenominasi harga barang Rp 50.000\nS...
5,ü§£ü§£ü§£üòÇüòÅMANTAAP BANG GAYA BAHASA KAU. SAYA SUKA C...
6,Pak benix saya jadi percaya uang yg ditemukan ...
7,dan secara tiba2 harga emas meningkat karena b...
8,Redominasi bisa dilakukan uang lama bisa di gu...
9,Koruptor ketar ketir üòÇ


In [None]:
# Kita fokus pada baris yang berisi comment (ulasan)
df_text = df[df['comment'].notna()].copy().reset_index(drop=True)
print("Rows with comment:", df_text.shape[0])


Rows with comment: 1174


In [None]:
# Preprocessing (cleaning + tokenizing + stemming Bahasa Indonesia)
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Stopword list sederhana (kamu bisa tambahkan kata lain sesuai kebutuhan)
STOPWORDS = set([
    'yang','dan','di','ke','dari','ini','itu','atau','pun','jika','karena','dengan','sebagai','pada','adalah','saya',
    'kamu','nya','untuk','akan','apa','tidak','ga','gak','yg','rt','dgn','jd','sih','ya','oh','adalah','si','kami',
    'anda','juga','tadi','itu','saja','lebih','cukup','lagi','lagi','ad','tdk'
])

def clean_text(s: str) -> str:
    s = str(s)
    s = s.lower()
    s = re.sub(r'http\S+',' ', s)     # remove urls
    s = re.sub(r'@\w+',' ', s)        # mentions
    s = re.sub(r'[^0-9a-z\s]', ' ', s) # keep alnum + spaces
    s = re.sub(r'\s+', ' ', s).strip()
    return s

def tokenize_and_stem(s: str):
    s_clean = clean_text(s)
    toks = [t for t in s_clean.split() if t not in STOPWORDS]
    # lakukan stemming (Sastrawi)
    toks_stem = [stemmer.stem(t) for t in toks]
    return toks_stem

# terapk
df_text['clean'] = df_text['comment'].astype(str).progress_apply(clean_text)
df_text['tokens'] = df_text['comment'].astype(str).progress_apply(tokenize_and_stem)

# tampil beberapa contoh hasil preprocessing
display(df_text[['comment','clean','tokens']].head(8))


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1174/1174 [00:00<00:00, 42960.70it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1174/1174 [04:22<00:00,  4.48it/s]


Unnamed: 0,comment,clean,tokens
0,SEGERA isi Petisi Mendukung purbaya REDENOMINA...,segera isi petisi mendukung purbaya redenomina...,"[segera, isi, petisi, dukung, purbaya, redenom..."
1,Kasian duit2 para koruptor nggak akan berlaku ...,kasian duit2 para koruptor nggak akan berlaku ...,"[kasi, duit2, para, koruptor, nggak, laku, sim..."
2,gaskeuunnn..pa purbaya,gaskeuunnn pa purbaya,"[gaskeuunnn, pa, purbaya]"
3,"2 RUPIAH setara Rp.2.000,- 5 RUPIAH setara Rp....",2 rupiah setara rp 2 000 5 rupiah setara rp 5 ...,"[2, rupiah, tara, rp, 2, 000, 5, rupiah, tara,..."
4,Jangan lupa kasih tau sanak saudara teman atau...,jangan lupa kasih tau sanak saudara teman atau...,"[jangan, lupa, kasih, tau, sanak, saudara, tem..."
5,Bantu share aja bang. Biar masyarakat pada pinter,bantu share aja bang biar masyarakat pada pinter,"[bantu, share, aja, bang, biar, masyarakat, pi..."
6,Sangat setuju kalu bisa segera redenominasi bi...,sangat setuju kalu bisa segera redenominasi bi...,"[sangat, tuju, kalu, bisa, segera, redenominas..."
7,"1 RUPIAH sama dengan Rp.1.000,- ,50 Cent + 50...",1 rupiah sama dengan rp 1 000 50 cent 50 cent ...,"[1, rupiah, sama, rp, 1, 000, 50, cent, 50, ce..."


In [None]:
df_text.drop(columns=['author', 'commentsCount', 'pageUrl','replyCount', 'title', 'type', 'videoID', 'voteCount'], inplace=True)


In [None]:
# Buat DataFrame contoh
data = {'author', 'commentsCount', 'pageUrl', 'replyCount', 'title', 'type', 'videoID', }
df_text = pd.DataFrame(data)

# Hapus kolom 'kolom_b' dan 'kolom_c'
df_text.drop(columns=['author', 'pageUrl', 'replycount', 'title', 'type', 'videoID'], inplace=True)

# Tampilkan DataFrame setelah dihapus
print(df)

KeyError: "['author', 'pageUrl', 'replycount', 'title', 'type', 'videoID'] not found in axis"

In [None]:
OUTFILE = '2Preprocesssing_dataset_komentar_purbaya_redenominasi.csv'
df_text.to_csv(OUTFILE, index=False)
print("Saved preprocessed file to:", OUTFILE)


Saved preprocessed file to: 2Preprocesssing_dataset_komentar_purbaya_redenominasi.csv


In [None]:
df_text = pd.read_csv('/content/2Preprocesssing_dataset_komentar_purbaya_redenominasi.csv')

In [None]:
df


Unnamed: 0,0
0,author
1,title
2,replyCount
3,type
4,commentsCount
5,pageUrl
6,videoID


In [None]:
# Daftar keyword (belum distem) ‚Äî akan distem supaya cocok dengan tokens
ASPECT_KEYWORDS_RAW = {
    'kata positif': [
        'setuju', 'bagus', 'baik', 'mendukung', 'positif', 'tepat',
                'benar', 'bijak', 'cerdas', 'hebat', 'suka', 'senang',
                'mantap', 'oke', 'siap', 'maju', 'untung', 'efektif',
                'menguntungkan', 'praktis', 'mudah', 'jelas', 'cemerlang'
    ],
    'kata negatif': [
        'tidak', 'jangan', 'tolak', 'buruk', 'salah', 'gagal',
                'jelek', 'bodoh', 'kacau', 'rusak', 'bingung', 'ribet',
                'susah', 'sulit', 'bahaya', 'ancaman', 'merugikan',
                'masalah', 'khawatir', 'takut', 'menolak', 'rugi', 'ngawur'
    ],

}

# Stem keywords supaya cocok dengan tokens yang sudah di-stem
def stem_keyword_list(word_list):
    out = set()
    for w in word_list:
        # break multi-word -> stem per token
        parts = str(w).lower().split()
        stemmed_parts = [stemmer.stem(p) for p in parts if p not in STOPWORDS]
        if len(stemmed_parts) == 0:
            continue
        out.add(" ".join(stemmed_parts))
    return out

ASPECT_KEYWORDS = {k: stem_keyword_list(v) for k,v in ASPECT_KEYWORDS_RAW.items()}

# tampil ringkasan keyword (setelah stemming)
for a,kw in ASPECT_KEYWORDS.items():
    print(f"{a}: {len(kw)} keywords. Sample:", list(kw)[:12])


kata positif: 22 keywords. Sample: ['bagus', 'mantap', 'untung', 'mudah', 'siap', 'positif', 'oke', 'hebat', 'praktis', 'cemerlang', 'jelas', 'benar']
kata negatif: 20 keywords. Sample: ['jelek', 'buruk', 'tolak', 'salah', 'rusak', 'bingung', 'rugi', 'khawatir', 'masalah', 'ancam', 'jangan', 'kacau']


In [None]:
# Fungsi deteksi aspek (cocokkan kata kunci yang sudah distem)
def detect_aspects_from_tokens(tokens):
    s = set(tokens)
    found = []
    for aspect, kwset in ASPECT_KEYWORDS.items():
        # check any keyword in tokens; keyword bisa multi-token (gabungan) - kita check if all parts present in contiguous not necessary,
        # we do simple containment of any keyword tokens in the token set or phrase in joined string
        joined = " ".join(tokens)
        matched = False
        for kw in kwset:
            # jika keyword multi-word (mis. 'gedung baru'), cek apakah phrase muncul
            if " " in kw:
                if kw in joined:
                    matched = True
                    break
            else:
                if kw in s:
                    matched = True
                    break
        if matched:
            found.append(aspect)
    return found

# Fungsi sentiment sederhana berdasarkan lexicon + negasi
def sentiment_rule_from_tokens(tokens):
    pos = 0
    neg = 0
    for i,t in enumerate(tokens):
        # Check for multi-word lexicon by looking ahead a bit (max 3 tokens phrase)
        # Build small sliding phrases
        for L in (3,2,1):
            if i+L <= len(tokens):
                phrase = " ".join(tokens[i:i+L])
                if phrase in POS_WORDS:
                    # check previous token for negation
                    if i>0 and tokens[i-1] in NEGATION_WORDS:
                        neg += 1
                    else:
                        pos += 1
                if phrase in NEG_WORDS:
                    if i>0 and tokens[i-1] in NEGATION_WORDS:
                        pos += 1
                    else:
                        neg += 1
    if pos - neg >= 1:
        return 'positif'
    elif neg - pos >= 1:
        return 'negatif'
    else:
        return 'netral'


In [None]:
# STEP 8: VISUALISASI HASIL
# ============================================================================

def plot_model_comparison(results):
    """Visualisasi perbandingan performa model"""

    models = list(results.keys())
    metrics = ['accuracy', 'precision', 'recall', 'f1_score']

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    axes = axes.ravel()

    for idx, metric in enumerate(metrics):
        values = [results[model][metric] for model in models]

        axes[idx].barh(models, values, color='skyblue')
        axes[idx].set_xlabel('Score')
        axes[idx].set_title(f'{metric.upper().replace("_", " ")}')
        axes[idx].set_xlim([0, 1])

        # Tambahkan nilai di bar
        for i, v in enumerate(values):
            axes[idx].text(v, i, f' {v:.4f}', va='center')

    plt.tight_layout()
    plt.show()

def plot_confusion_matrices(results, y_test):
    """Plot confusion matrix untuk semua model"""

    n_models = len(results)
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    axes = axes.ravel()

    for idx, (name, result) in enumerate(results.items()):
        cm = confusion_matrix(y_test, result['y_pred'])

        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                   xticklabels=['Negatif', 'Positif'],
                   yticklabels=['Negatif', 'Positif'],
                   ax=axes[idx])

        axes[idx].set_title(f'Confusion Matrix - {name}')
        axes[idx].set_ylabel('Actual')
        axes[idx].set_xlabel('Predicted')

    plt.tight_layout()
    plt.show()

def plot_roc_curves(results, X_test, y_test):
    """Plot ROC curves untuk semua model"""

    plt.figure(figsize=(10, 8))

    for name, result in results.items():
        model = result['model']

        # Dapatkan probabilitas
        if hasattr(model, 'predict_proba'):
            y_proba = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, 'decision_function'):
            y_proba = model.decision_function(X_test)
        else:
            continue

        # Hitung ROC
        fpr, tpr, _ = roc_curve(y_test, y_proba)
        auc = roc_auc_score(y_test, y_proba)

        plt.plot(fpr, tpr, label=f'{name} (AUC = {auc:.4f})')

    plt.plot([0, 1], [0, 1], 'k--', label='Random')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves - All Models')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()


In [None]:
# STEP 9: MAIN PIPELINE - EKSEKUSI LENGKAP
# ============================================================================

def run_complete_analysis(df):
    """
    Pipeline lengkap analisis sentimen

    Parameter:
    - df: DataFrame dengan kolom 'text' berisi komentar
    """

    print("="*70)
    print("ANALISIS SENTIMEN KEBIJAKAN REDENOMINASI RUPIAH")
    print("="*70)

    # 1. Preprocessing
    print("\n[1/9] Preprocessing data...")
    df['cleaned_text'] = df['text'].apply(cleaning_text)
    df['processed_text'] = df['cleaned_text'].apply(preprocess_text)
    df['sentiment'] = df['processed_text'].apply(label_sentiment_lexicon)
    df = df[df['processed_text'].str.strip() != '']

    print(f"‚úì Data berhasil diproses: {len(df)} komentar")
    print(f"  - Positif: {sum(df['sentiment'] == 1)}")
    print(f"  - Negatif: {sum(df['sentiment'] == 0)}")

    # 2. EDA
    print("\n[2/9] Exploratory Data Analysis...")
    perform_eda(df)

    # 3. Feature Extraction
    print("\n[3/9] Feature Extraction dengan TF-IDF...")
    X, y, vectorizer = create_features(df, method='tfidf', max_features=1000)
    print(f"‚úì Fitur berhasil diekstrak: {X.shape[1]} fitur")

    # 4. Split Data
    print("\n[4/9] Splitting data...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"‚úì Train: {len(X_train)}, Test: {len(X_test)}")

    # 5. Training Models
    print("\n[5/9] Training 6 models...")
    results = train_all_models(X_train, X_test, y_train, y_test)

    # 6. Hyperparameter Tuning
    print("\n[6/9] Hyperparameter tuning (opsional)...")
    # best_rf, best_svm = hyperparameter_tuning(X_train, y_train)

    # 7. Model Comparison
    print("\n[7/9] Membuat visualisasi perbandingan model...")
    plot_model_comparison(results)

    # 8. Confusion Matrices
    print("\n[8/9] Membuat confusion matrices...")
    plot_confusion_matrices(results, y_test)

    # 9. ROC Curves
    print("\n[9/9] Membuat ROC curves...")
    plot_roc_curves(results, X_test, y_test)

    # Summary
    print("\n" + "="*70)
    print("RINGKASAN HASIL")
    print("="*70)

    best_model = max(results.items(), key=lambda x: x[1]['accuracy'])
    print(f"\nüèÜ Model Terbaik: {best_model[0]}")
    print(f"   Accuracy: {best_model[1]['accuracy']:.4f}")
    print(f"   F1-Score: {best_model[1]['f1_score']:.4f}")

    return results, vectorizer, df