In [1]:
# 1. INSTALL DAN IMPORT LIBRARY
!pip install Sastrawi nltk scikit-learn google-play-scraper pandas

import pandas as pd
import numpy as np
import re
import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# download daftar stopword Bahasa Indonesia
nltk.download('stopwords')
from nltk.corpus import stopwords

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Collecting google-play-scraper
  Downloading google_play_scraper-1.2.7-py3-none-any.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading google_play_scraper-1.2.7-py3-none-any.whl (28 kB)
Installing collected packages: Sastrawi, google-play-scraper
Successfully installed Sastrawi-1.0.1 google-play-scraper-1.2.7


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# 2. LOAD DATASET (HASIL SCRAPING)

# Membaca file hasil scraping Tokopedia
df = pd.read_csv('dataset.csv')

# Menampilkan jumlah baris data
print("Jumlah data:", len(df))

# Menampilkan 5 baris pertama
df.head()


Jumlah data: 4000


Unnamed: 0,nama_pengguna,rating,ulasan
0,Ms Arifien,5,👍👍👍👍💥👌
1,karie supriyadi,1,"sampai kesal nunggu refund.sudah sabar nunggu,..."
2,Suyatno Albany,5,nice
3,ferdy gartin,5,keren
4,Karsa Doank,1,"Saya beli pompa air pake gopaylater, angsuran ..."


In [4]:
# 3. CLEANING DASAR

def cleaning_basic(text):
    text = str(text).lower()                 # ubah ke huruf kecil
    text = re.sub(r'http\S+', ' ', text)     # hapus URL
    text = re.sub(r'[^a-z\s]', ' ', text)    # hapus angka dan simbol
    text = re.sub(r'\s+', ' ', text).strip() # hapus spasi berlebih
    return text

# Terapkan ke kolom ulasan
df['clean_ulasan'] = df['ulasan'].apply(cleaning_basic)

# Lihat hasilnya
df[['ulasan', 'clean_ulasan']].head()


Unnamed: 0,ulasan,clean_ulasan
0,👍👍👍👍💥👌,
1,"sampai kesal nunggu refund.sudah sabar nunggu,...",sampai kesal nunggu refund sudah sabar nunggu ...
2,nice,nice
3,keren,keren
4,"Saya beli pompa air pake gopaylater, angsuran ...",saya beli pompa air pake gopaylater angsuran b...


In [5]:
# 4. NORMALISASI KATA TIDAK BAKU / SLANG

kamus_normalisasi = {
    'gk': 'tidak', 'ga': 'tidak', 'nggak': 'tidak', 'gak': 'tidak',
    'bgt': 'banget', 'bener': 'benar', 'nyesel': 'menyesal',
    'rekomen': 'rekomendasi', 'mantul': 'mantap', 'trima': 'terima',
    'smoga': 'semoga', 'kmu': 'kamu', 'udh': 'sudah', 'sdh': 'sudah',
    'tp': 'tapi', 'jd': 'jadi', 'dgn': 'dengan', 'blm': 'belum',
    'trs': 'terus', 'bkn': 'bukan', 'yg': 'yang', 'brg': 'barang'
}

def normalisasi(text):
    return ' '.join([kamus_normalisasi.get(w, w) for w in text.split()])

df['clean_ulasan'] = df['clean_ulasan'].apply(normalisasi)
df[['ulasan', 'clean_ulasan']].head()


Unnamed: 0,ulasan,clean_ulasan
0,👍👍👍👍💥👌,
1,"sampai kesal nunggu refund.sudah sabar nunggu,...",sampai kesal nunggu refund sudah sabar nunggu ...
2,nice,nice
3,keren,keren
4,"Saya beli pompa air pake gopaylater, angsuran ...",saya beli pompa air pake gopaylater angsuran b...


In [6]:
# 5. TOKENISASI, STOPWORD REMOVAL, STEMMING

# Inisialisasi stopwords Bahasa Indonesia
stop_words = set(stopwords.words('indonesian'))

# Inisialisasi stemmer dari library Sastrawi
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Fungsi preprocessing: tokenisasi + hapus stopword + stemming
def preprocess(text):
    tokens = [w for w in text.split() if w not in stop_words]  # tokenisasi & hapus stopword
    stems = [stemmer.stem(w) for w in tokens]                  # stemming
    return ' '.join(stems)

# Terapkan ke kolom teks bersih
df['clean_ulasan'] = df['clean_ulasan'].apply(preprocess)

# Tampilkan contoh hasil
df[['ulasan', 'clean_ulasan']].head()

Unnamed: 0,ulasan,clean_ulasan
0,👍👍👍👍💥👌,
1,"sampai kesal nunggu refund.sudah sabar nunggu,...",kesal nunggu refund sabar nunggu bikin repot s...
2,nice,nice
3,keren,keren
4,"Saya beli pompa air pake gopaylater, angsuran ...",beli pompa air pake gopaylater angsur pompa be...


In [7]:
# 6. LEXICON-BASED LABELING (DARI TEKS, BUKAN RATING)

# Kamus kata positif dan negatif (relevan untuk konteks Tokopedia)
positives = {
    "bagus","mantap","puas","rekomendasi","cepat","murah","nyaman","baik",
    "keren","berfungsi","ori","original","sesuai","rapi","worth","memuaskan",
    "mantapp","top","oke","responsif","ramah","tepat","aman","senang","mantul"
}

negatives = {
    "buruk","kecewa","lemot","lambat","error","parah","jelek","mengecewakan",
    "rusak","telat","bohong","tipu","refund","lama","hang","crash","bug",
    "down","delay","ribet","susah","gagal","macet","lemotnya","tidakfungsi"
}

# Kata penguat (intensifier) dan penyangkal (negator)
intensifier = {"sangat","banget","amat","terlalu"}
negator = {"tidak","tak","nggak","ga","gak","bukan"}

# Fungsi perhitungan skor lexicon
def lexicon_score(tokens_str):
    score, prev = 0, ""
    for t in tokens_str.split():
        s = 1 if t in positives else (-1 if t in negatives else 0)
        # negasi → membalik skor
        if prev in negator and s != 0:
            s = -s
        # penguat → menggandakan skor
        if prev in intensifier and s != 0:
            s *= 2
        score += s
        prev = t
    return score

# Fungsi untuk menentukan label berdasarkan skor total
def label_from_text(tokens_str):
    s = lexicon_score(tokens_str)
    if s > 0:
        return "positif"
    elif s < 0:
        return "negatif"
    else:
        return "netral"

# Terapkan ke seluruh data
df['sentimen'] = df['clean_ulasan'].apply(label_from_text)

# Lihat distribusi hasil label
print(df['sentimen'].value_counts())
df[['ulasan','clean_ulasan','sentimen']].head()

sentimen
netral     2040
positif    1260
negatif     700
Name: count, dtype: int64


Unnamed: 0,ulasan,clean_ulasan,sentimen
0,👍👍👍👍💥👌,,netral
1,"sampai kesal nunggu refund.sudah sabar nunggu,...",kesal nunggu refund sabar nunggu bikin repot s...,negatif
2,nice,nice,netral
3,keren,keren,positif
4,"Saya beli pompa air pake gopaylater, angsuran ...",beli pompa air pake gopaylater angsur pompa be...,negatif


In [8]:
# 7. SPLIT DATA (SETELAH PREPROCESSING LENGKAP)

from sklearn.model_selection import train_test_split

# X = teks bersih; y = label hasil analisis teks (bukan rating!)
X = df['clean_ulasan']
y = df['sentimen']

# Bagi data 80% untuk training, 20% untuk testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Tampilkan jumlah masing-masing
print("Data latih :", len(X_train))
print("Data uji   :", len(X_test))


Data latih : 3200
Data uji   : 800


In [9]:
# 8. EKSTRAKSI FITUR (TF-IDF)

from sklearn.feature_extraction.text import TfidfVectorizer

# Membuat objek TF-IDF dengan maksimal 5000 fitur
vectorizer = TfidfVectorizer(max_features=5000)

# Latih TF-IDF di data training, lalu transformasi data training & testing
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf  = vectorizer.transform(X_test)

# Tampilkan bentuk matriks (baris = jumlah data, kolom = jumlah fitur)
print("Shape TF-IDF:")
print("Train:", X_train_tfidf.shape)
print("Test :", X_test_tfidf.shape)

Shape TF-IDF:
Train: (3200, 747)
Test : (800, 747)


In [10]:
# 9. TRAINING 3 MODEL DAN EVALUASI

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Siapkan model
models = {
    "Logistic Regression": LogisticRegression(max_iter=300),
    "SVM": LinearSVC(),
    "Random Forest": RandomForestClassifier(n_estimators=150, random_state=42)
}

results = {}

# Latih dan uji tiap model
for name, m in models.items():
    m.fit(X_train_tfidf, y_train)                   # Latih model
    pred = m.predict(X_test_tfidf)                  # Prediksi di data uji
    acc = accuracy_score(y_test, pred)              # Hitung akurasi
    results[name] = acc
    print(f"\n=== {name} ===")
    print(f"Akurasi: {acc*100:.2f}%")
    print(classification_report(y_test, pred))

# Pilih model terbaik
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]
print(f"\nModel terbaik: {best_model_name} (akurasi {results[best_model_name]*100:.2f}%)")


=== Logistic Regression ===
Akurasi: 100.00%
              precision    recall  f1-score   support

     negatif       1.00      1.00      1.00       140
      netral       1.00      1.00      1.00       408
     positif       1.00      1.00      1.00       252

    accuracy                           1.00       800
   macro avg       1.00      1.00      1.00       800
weighted avg       1.00      1.00      1.00       800


=== SVM ===
Akurasi: 100.00%
              precision    recall  f1-score   support

     negatif       1.00      1.00      1.00       140
      netral       1.00      1.00      1.00       408
     positif       1.00      1.00      1.00       252

    accuracy                           1.00       800
   macro avg       1.00      1.00      1.00       800
weighted avg       1.00      1.00      1.00       800


=== Random Forest ===
Akurasi: 100.00%
              precision    recall  f1-score   support

     negatif       1.00      1.00      1.00       140
      netral 

In [11]:
# 10. INFERENCE (UJI KALIMAT BARU)

def prediksi_sentimen(teks):
    # Gunakan pipeline preprocessing yang sama seperti sebelumnya
    teks = cleaning_basic(teks)
    teks = normalisasi(teks)
    teks = preprocess(teks)
    X_new = vectorizer.transform([teks])
    return best_model.predict(X_new)[0]

# Uji beberapa contoh kalimat baru
contoh = [
    "Aplikasinya bagus banget dan pengiriman cepat!",
    "Biasa saja, kadang error pas transaksi.",
    "Kecewa banget, aplikasi sering force close."
]

for c in contoh:
    print(f"{c} -> {prediksi_sentimen(c)}")

Aplikasinya bagus banget dan pengiriman cepat! -> positif
Biasa saja, kadang error pas transaksi. -> netral
Kecewa banget, aplikasi sering force close. -> netral


In [12]:
# 11. SIMPAN DATASET BERSIH DAN FILE REQUIREMENTS

# Simpan dataset akhir
df.to_csv('clean_dataset_final.csv', index=False, encoding='utf-8')
print("File 'clean_dataset_final.csv' berhasil disimpan.")

# Buat daftar library yang dipakai
!pip freeze > requirements.txt
print("File 'requirements.txt' berhasil dibuat.")


File 'clean_dataset_final.csv' berhasil disimpan.
File 'requirements.txt' berhasil dibuat.
