# UTS Klasifikasi LDA Naive Bayes & SVM

In [1]:
!pip install scikit-learn gensim nltk pandas numpy


Defaulting to user installation because normal site-packages is not writeable


In [2]:
import os, re, warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [3]:
# Optional dependency: gensim provides the dictionary and the LDA model.
# HAS_GENSIM records whether the import succeeded.
try:
    import gensim
    from gensim import corpora, models
except Exception:
    HAS_GENSIM = False
else:
    HAS_GENSIM = True

# Optional dependency: Sastrawi provides the stemmer used in preprocessing.
# When it cannot be imported or built, the stemmer is None and the flag is False.
try:
    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
    sastrawi_stemmer = StemmerFactory().create_stemmer()
except Exception:
    sastrawi_stemmer, HAS_SASTRAWI = None, False
else:
    HAS_SASTRAWI = True

In [4]:
# --- Input data ---
CSV_PATH = "Berita.csv"      # CSV file read by the loading cell
MERGE_TITLE = True           # prepend the 'judul' column to 'berita' when both exist

# --- Text preprocessing / dictionary pruning ---
MIN_TOKEN_LEN = 3            # tokens shorter than this are dropped
FILTER_DICT_MIN = 5          # passed as no_below to Dictionary.filter_extremes
FILTER_DICT_MAX_PCT = 0.5    # passed as no_above to Dictionary.filter_extremes

# --- Topic model and evaluation split ---
NUM_TOPICS = 10              # number of LDA topics, i.e. the feature dimension
TEST_SIZE = 0.2              # fraction of documents held out for testing
RANDOM_STATE = 42            # seed shared by the LDA model and the split

# --- Cross-validation (not referenced by the cells visible here) ---
PERFORM_CV = False
CV_FOLDS = 5

In [5]:
# Indonesian function words removed during preprocessing.
ind_stop = set("""
yang dan di ke dari pada untuk dengan atau adalah ini itu sebagai oleh karena agar
bisa juga sangat tidak dalam antara lebih jika tetapi namun per setiap hingga sehingga
memiliki telah baru saja kami kita anda dia mereka seperti ada apa siapa dimana kapan
bagaimana yaitu maupun lain lainnya setelah sebelum sejak masih harus
""".split())

# A small set of English function words that also show up in the articles.
eng_stop = set("the and is in to of a for on that with as by an are was be from".split())

# Combined lookup set used by preprocess_text.
STOPWORDS = {word.lower() for word in ind_stop}.union(eng_stop)

def preprocess_text(s, stem=HAS_SASTRAWI):
    """Normalise one document into a space-separated string of tokens.

    Steps, in order: lowercase; blank out URLs, e-mail-like tokens and every
    character that is not a-z or whitespace; split on whitespace; drop tokens
    shorter than MIN_TOKEN_LEN or present in STOPWORDS; optionally stem the
    remaining tokens. Filtering happens before stemming.

    Parameters
    ----------
    s : object
        The raw text. Anything that is not a ``str`` yields ``""``.
    stem : bool
        Whether to stem tokens. Stemming is applied only if this is truthy
        and ``sastrawi_stemmer`` is not None.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    if not isinstance(s, str):
        return ""

    cleaned = s.lower()
    # Order matters: URLs and e-mails must go before punctuation is stripped.
    for noise_pattern in (r'http\S+|www\.\S+', r'\S+@\S+', r'[^a-z\s]'):
        cleaned = re.sub(noise_pattern, ' ', cleaned)

    kept = []
    for word in cleaned.split():
        if len(word) < MIN_TOKEN_LEN or word in STOPWORDS:
            continue
        kept.append(word)

    if stem and sastrawi_stemmer is not None:
        try:
            kept = list(map(sastrawi_stemmer.stem, kept))
        except Exception:
            # Best effort: on any stemmer failure keep the unstemmed tokens.
            pass
    return " ".join(kept)

In [6]:
# Load the news CSV and build the raw 'text' column used by later cells.
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"File tidak ditemukan: {CSV_PATH}")

df = pd.read_csv(CSV_PATH)
available_cols = set(df.columns)

# The article body is mandatory; the title is merged in only when requested
# and present.
if "berita" not in available_cols:
    raise ValueError("Kolom 'berita' tidak ditemukan.")

article_body = df['berita'].fillna('')
if MERGE_TITLE and "judul" in available_cols:
    df['text'] = df['judul'].fillna('') + ". " + article_body
else:
    df['text'] = article_body

# The label column is required by the classification cells.
if "kategori" not in available_cols:
    raise ValueError("Kolom 'kategori' (label) tidak ditemukan.")

In [7]:
# Clean every document; the result is a space-separated token string per row.
raw_text = df['text'].astype(str)
df['text_clean'] = raw_text.map(preprocess_text)

In [8]:
# Every later cell needs `lda` and `corpus`. Without gensim the original guard
# skipped this cell silently and the notebook then died with a NameError, so
# fail here with an explicit message instead.
if not HAS_GENSIM:
    raise ImportError(
        "Paket 'gensim' tidak tersedia; instal gensim untuk menjalankan pemodelan topik LDA."
    )

# Token lists per document (text_clean is already a space-separated string).
tokenized = [doc.split() for doc in df['text_clean'].tolist()]

# Vocabulary, pruned of rare and overly common terms, capped at 10000 entries.
dictionary = corpora.Dictionary(tokenized)
dictionary.filter_extremes(
    no_below=FILTER_DICT_MIN,
    no_above=FILTER_DICT_MAX_PCT,
    keep_n=10000,
)

# Bag-of-words representation of every document.
corpus = [dictionary.doc2bow(doc) for doc in tokenized]

# NOTE(review): the topic model is fitted on all documents, before the
# train/test split made later in the notebook, so test documents influence
# the features. Confirm this is acceptable for the reported scores.
lda = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    random_state=RANDOM_STATE,
    passes=10,
    alpha='auto',
    eta='auto',
)

In [9]:
# Show the eight highest-weighted words of every topic.
for tid in range(NUM_TOPICS):
    top_words = [word for word, _weight in lda.show_topic(tid, topn=8)]
    print(f"Topic {tid}: ", ", ".join(top_words))

Topic 0:  indonesia, piala, timnas, main, aff, vietnam, dua, sama
Topic 1:  persen, prabowo, menteri, perintah, program, tahun, makan, gizi
Topic 2:  indonesia, latih, main, saya, timnas, kluivert, ahsan, tanding
Topic 3:  red, sparks, megawati, menang, poin, liga, jalan, jakarta
Topic 4:  pesawat, uang, persen, jabat, tahun, orang, air, lapor
Topic 5:  israel, senjata, gaza, gencat, hamas, palestina, serang, sepakat
Topic 6:  bakar, presiden, negara, yoon, trump, orang, perintah, los
Topic 7:  laut, pagar, tangerang, menteri, kpk, nelayan, kkp, ikan
Topic 8:  korban, banjir, orang, warga, polisi, kasus, duga, rumah
Topic 9:  menit, gol, main, gawang, hasil, dua, babak, unggul


In [22]:
def doc2vec(bow):
    """Return the LDA topic distribution of one bag-of-words document.

    Parameters
    ----------
    bow : the document in the bag-of-words form passed to
        ``lda.get_document_topics`` (here: an item of ``corpus``).

    Returns
    -------
    numpy.ndarray of length NUM_TOPICS; position ``i`` holds the probability
    reported for topic ``i`` and stays 0.0 for topics that are not reported.
    """
    # minimum_probability=0.0 asks the model not to drop low-weight topics.
    topic_weights = lda.get_document_topics(bow, minimum_probability=0.0)
    vec = np.zeros(NUM_TOPICS)
    for topic_id, weight in topic_weights:
        vec[topic_id] = weight
    return vec
# Feature matrix: one row of topic probabilities per document.
# (Computed once; the original cell repeated these two statements, which
# doubled the LDA inference cost without changing the result.)
X = np.array([doc2vec(b) for b in corpus])
# Labels as strings. NOTE(review): astype(str) turns a missing label into the
# literal string "nan", which would then be treated as its own class.
y = df['kategori'].astype(str).values

# Guard against a corpus/frame mismatch before the split and model cells.
assert X.shape == (len(df), NUM_TOPICS), f"Unexpected feature shape: {X.shape}"

print("Bentuk matriks X:", X.shape)

# Tabular view of the topic features next to the label and the raw text.
topic_cols = [f"Topic_{i}" for i in range(NUM_TOPICS)]
df_topics = pd.DataFrame(X, columns=topic_cols)
df_topics['kategori'] = y

df_show = pd.concat([df[['text']].reset_index(drop=True), df_topics], axis=1)

print(df_show.head(10))


Bentuk matriks X: (1500, 10)
                                                text   Topic_0   Topic_1  \
0  Airlangga Harap Kenaikan UMP Tingkatkan Daya B...  0.000164  0.998526   
1  PT SIER Beri Penghargaan untuk 50 Tenant Terba...  0.000168  0.896207   
2  Prabowo Bakal Bentuk Kementerian Penerimaan Ne...  0.000195  0.934493   
3  Sinergi Kemenag & BPJS Ketenagakerjaan Lindung...  0.000242  0.980760   
4  Pemerintah Segera Bentuk Satgas PHK Usai Tetap...  0.000200  0.998198   
5  AHY Buka-bukaan Nasib Kelanjutan Pembangunan I...  0.000189  0.998301   
6  Badan Gizi Soal Biaya Makan Gratis Rp10 Ribu: ...  0.000217  0.998048   
7  Zulhas Minta Tambahan Anggaran Rp510 M Demi Ca...  0.000184  0.998345   
8  PLN Akan Uji Coba PLTS IKN 22 Desember. Uji co...  0.181992  0.815390   
9  Profil Jhony Saputra, Anak Haji Isam yang Jadi...  0.000305  0.368914   

    Topic_2   Topic_3   Topic_4   Topic_5   Topic_6   Topic_7   Topic_8  \
0  0.000170  0.000163  0.000156  0.000150  0.000189  0.0001

In [12]:
# Hold out TEST_SIZE of the documents; stratify so both parts keep the same
# class proportions as y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    stratify=y,
    random_state=RANDOM_STATE,
)


In [14]:
# Multinomial Naive Bayes on the topic-probability features.
nb = MultinomialNB().fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

# Compute the metrics first, then print them under the section banner.
nb_accuracy = accuracy_score(y_test, y_pred_nb)
nb_report = classification_report(y_test, y_pred_nb)

print("=== HASIL NAÏVE BAYES ===")
print("Akurasi:", nb_accuracy)
print(nb_report)


=== HASIL NAÏVE BAYES ===
Akurasi: 0.8166666666666667
               precision    recall  f1-score   support

      Ekonomi       0.73      0.91      0.81        75
Internasional       0.88      0.79      0.83        75
     Nasional       0.68      0.59      0.63        75
     Olahraga       0.99      0.99      0.99        75

     accuracy                           0.82       300
    macro avg       0.82      0.82      0.81       300
 weighted avg       0.82      0.82      0.81       300



In [15]:
# Linear-kernel SVM on the topic-probability features.
# The seed comes from the shared RANDOM_STATE constant rather than a second
# hard-coded 42, so changing the seed in the config cell affects every
# seeded step consistently.
svm = SVC(kernel='linear', random_state=RANDOM_STATE)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

print("=== HASIL SVM ===")
print("Akurasi:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

=== HASIL SVM ===
Akurasi: 0.84
               precision    recall  f1-score   support

      Ekonomi       0.73      0.91      0.81        75
Internasional       0.95      0.81      0.88        75
     Nasional       0.74      0.64      0.69        75
     Olahraga       0.96      1.00      0.98        75

     accuracy                           0.84       300
    macro avg       0.85      0.84      0.84       300
 weighted avg       0.85      0.84      0.84       300

