# LDA

## IMPORT LIBRARY

In [1]:
# ==============================================
# 1. IMPORT LIBRARY
# ==============================================
import pandas as pd
import gensim
from gensim import corpora
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder


## LOAD DATASET

In [2]:
# ==============================================
# 2. LOAD DATASET
# ==============================================
import pandas as pd

df = pd.read_csv("hasil_preprocessing_berita.csv")

# Kolom yang benar berdasarkan dataset kamu
text_col = 'hasil_preprocessing'   # kolom teks hasil preprocessing
label_col = 'kategori'             # kolom label kategori

# Hapus baris kosong di dua kolom ini
df = df.dropna(subset=[text_col, label_col])

print(df.head())
print("\nJumlah data:", len(df))


                                          isi_berita  \
0  TUBAN,BANGSAONLINE.com- Komisi II DPRD Tuban b...   
1  SURABAYA,BANGSAONLINE.com- Rumah Literasi Digi...   
2  SURABAYA, BANGSAONLINE.com- Baznas RI menyalur...   
3  SURABAYA,BANGSAONLINE.com- Bangunan musala lan...   
4  KOTA KEDIRI, BANGSAONLINE.com- Wakil Wali Kota...   

                                 hasil_preprocessing      kategori  
0  ['tubanbangsaonlinecom', 'komisi', 'ii', 'dprd...         Jatim  
1  ['surabayabangsaonlinecom', 'rumah', 'literasi...         Jatim  
2  ['surabaya', 'bangsaonlinecom', 'baznas', 'ri'...   Jatim Metro  
3  ['surabayabangsaonlinecom', 'bangun', 'musala'...   Jatim Metro  
4  ['kota', 'diri', 'bangsaonlinecom', 'wakil', '...  Jatim Tengah  

Jumlah data: 74


## KONVERSI TEKS PREPROCESSING KE LIST KATA

In [3]:
# ==============================================
# 3. KONVERSI TEKS PREPROCESSING KE LIST KATA
# ==============================================
import ast

def safely_eval_list(x):
    try:
        return ast.literal_eval(x)  # ubah dari string list ke list asli
    except:
        return []  # kalau gagal parsing, jadikan list kosong

df['tokens'] = df[text_col].apply(safely_eval_list)

print("Contoh token:\n", df['tokens'].head())


Contoh token:
 0    [tubanbangsaonlinecom, komisi, ii, dprd, tuban...
1    [surabayabangsaonlinecom, rumah, literasi, dig...
2    [surabaya, bangsaonlinecom, baznas, ri, salur,...
3    [surabayabangsaonlinecom, bangun, musala, lant...
4    [kota, diri, bangsaonlinecom, wakil, wali, kot...
Name: tokens, dtype: object


## BUAT DICTIONARY DAN CORPUS UNTUK LDA

In [4]:
# ==============================================
# 4. BUAT DICTIONARY DAN CORPUS UNTUK LDA
# ==============================================
dictionary = corpora.Dictionary(df['tokens'])
corpus = [dictionary.doc2bow(text) for text in df['tokens']]

print("Jumlah kata unik:", len(dictionary))


Jumlah kata unik: 4138


## TRAIN LDA MODEL (EKSTRAKSI FITUR TOPIK)

In [5]:
# ==============================================
# 5. TRAIN LDA MODEL (EKSTRAKSI FITUR TOPIK)
# ==============================================
num_topics = 10  # jumlah topik bisa diubah
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=10,
    random_state=42
)

# Lihat topik yang dihasilkan
for idx, topic in lda_model.print_topics(num_topics=num_topics, num_words=10):
    print(f"Topik {idx}: {topic}")


Topik 0: 0.019*"tuban" + 0.017*"warga" + 0.010*"pwi" + 0.008*"pt" + 0.007*"rumah" + 0.006*"desa" + 0.006*"indonesia" + 0.006*"hadir" + 0.006*"usaha" + 0.006*"beras"
Topik 1: 0.011*"pizza" + 0.009*"restoran" + 0.008*"pasta" + 0.008*"italia" + 0.008*"santri" + 0.008*"kota" + 0.008*"ubud" + 0.007*"mamdani" + 0.007*"makan" + 0.007*"asep"
Topik 2: 0.013*"angin" + 0.010*"tanah" + 0.008*"cepat" + 0.007*"arah" + 0.007*"wib" + 0.007*"c" + 0.007*"cuaca" + 0.007*"ms" + 0.007*"suhu" + 0.006*"indonesia"
Topik 3: 0.010*"dunia" + 0.010*"asep" + 0.009*"motor" + 0.009*"buku" + 0.009*"masud" + 0.009*"kiai" + 0.008*"laku" + 0.008*"adnan" + 0.008*"curi" + 0.007*"banten"
Topik 4: 0.015*"nu" + 0.015*"gus" + 0.013*"yahya" + 0.010*"diri" + 0.009*"iphone" + 0.008*"pbnu" + 0.008*"ketua" + 0.007*"kiai" + 0.007*"malang" + 0.007*"digital"
Topik 5: 0.017*"rp" + 0.013*"nomor" + 0.013*"nabi" + 0.010*"kg" + 0.010*"sedekah" + 0.010*"maulid" + 0.008*"ingat" + 0.008*"" + 0.007*"saw" + 0.006*"indonesia"
Topik 6: 0.016*"kl

## KONVERSI DISTRIBUSI TOPIK KE FITUR NUMERIK

In [6]:
# ==============================================
# 6. KONVERSI DISTRIBUSI TOPIK KE FITUR NUMERIK
# ==============================================
def get_topic_vector(lda_model, corpus_doc, num_topics):
    topic_dist = [0] * num_topics
    for topic_num, prob in lda_model.get_document_topics(corpus_doc):
        topic_dist[topic_num] = prob
    return topic_dist

lda_features = [get_topic_vector(lda_model, doc, num_topics) for doc in corpus]
X = pd.DataFrame(lda_features)
print("Shape fitur:", X.shape)


Shape fitur: (74, 10)


## ENCODE LABEL DAN SPLIT DATA (VERSI FIXED 20%)

In [7]:
# ==============================================
# 7. ENCODE LABEL DAN SPLIT DATA (VERSI FIXED 20%)
# ==============================================
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

le = LabelEncoder()
y = le.fit_transform(df[label_col])

test_size = 0.2  # tetap 20%

try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )
    print("✅ Split stratified berhasil.")
except ValueError:
    print("⚠️ Stratified split gagal, pakai split biasa.")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

print("Jumlah data latih:", X_train.shape[0])
print("Jumlah data uji:", X_test.shape[0])


⚠️ Stratified split gagal, pakai split biasa.
Jumlah data latih: 59
Jumlah data uji: 15


## LATIH MODEL KLASIFIKASI (LOGISTIC REGRESSION)

In [8]:
# ==============================================
# 8. LATIH MODEL KLASIFIKASI (LOGISTIC REGRESSION)
# ==============================================
model = LogisticRegression(max_iter=300, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


  y_type = type_of_target(y, input_name="y")


## EVALUASI HASIL KLASIFIKASI

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# ==============================================
# 9. EVALUASI HASIL KLASIFIKASI
# ==============================================
print("Akurasi:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Ambil label yang benar-benar muncul di data uji
labels_in_test = sorted(list(set(y_test)))

# Ambil nama kategori yang sesuai label di atas
target_names = [le.classes_[i] for i in labels_in_test]

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, labels=labels_in_test, target_names=target_names))


Akurasi: 0.0

Confusion Matrix:
 [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]]

Classification Repor

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [10]:
pip install pyLDAvis



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## VISUALISASI TOPIK LDA

In [None]:
# ==============================================
# 10. VISUALISASI TOPIK LDA
# ==============================================
# Jalankan hanya jika mau lihat visualisasi topik interaktif
# pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)
