# TF-IDF & Word Embedding

## TF - IDF Vectorizer

In [19]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Konfigurasi Pandas untuk tampilan penuh
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Fungsi untuk membersihkan teks
def clean_text(text):
    """
    Membersihkan teks dengan mengubahnya menjadi huruf kecil,
    menghapus tanda baca, angka, dan spasi berlebih.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)   # hapus tanda baca
    text = re.sub(r'\d+', '', text)       # hapus angka
    text = re.sub(r'\s+', ' ', text).strip()  # hapus spasi berlebih
    return text

# Menggunakan try-except untuk membaca data dari file CSV
try:
    # Membaca seluruh data dari file CSV berita_tempo.csv
    df = pd.read_csv('berita_tempo.csv')
    print("Berhasil memuat seluruh data dari 'berita_tempo.csv'.")
    
    # Gunakan kolom 'isi' sebagai korpus
    corpus = df['isi'].astype(str).tolist()
except FileNotFoundError:
    print("Error: File 'berita_tempo.csv' tidak ditemukan.")
    print("Menggunakan data dummy untuk demonstrasi.")
    corpus = [
        "Pelatih Atletico Madrid, Diego Simeone, mengaku kesal saat dihina fans Liverpool sepanjang pertandingan.",
        "Atletico Madrid takluk dari Liverpool dengan skor 2-3 di Anfield pada lanjutan Liga Champions.",
        "Bicara soal seberapa hebat Lionel Messi, Zinedine Zidane punya sudut pandang berbeda.",
        "Marc Marquez menjalani operasi pada lengannya usai kecelakaan di MotoGP Portugal 2025.",
        "Pebalap Yamaha, Alex Rins akan menggunakan motor M1 yang dikembangkan untuk 2026."
    ]

# Menerapkan pembersihan teks pada korpus
cleaned_corpus = [clean_text(text) for text in corpus]

# Inisialisasi CountVectorizer untuk menghitung Term Frequency (TF)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(cleaned_corpus)
count_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Inisialisasi TfidfTransformer untuk menghitung IDF dan TF-IDF
transformer = TfidfTransformer()

# ---
### 1. Hasil Count Vectorizer (Matriks Term Frequency)
print("---")
print(">>> Count Vectorizer (Matriks Term Frequency):")
print(count_df.to_string())
print("-" * 50)

# ---
### 2. Hasil IDF (Inverse Document Frequency)
transformer.fit(X)
idf_values = transformer.idf_
feature_names = vectorizer.get_feature_names_out()
idf_df = pd.DataFrame({'word': feature_names, 'idf_value': idf_values})
print(">>> IDF (Inverse Document Frequency):")
print(idf_df.to_string())
print("-" * 50)

# ---
### 3. Hasil TF-IDF (Term Frequency - Inverse Document Frequency)
tfidf_matrix = transformer.transform(X)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
print(">>> TF-IDF (Term Frequency - Inverse Document Frequency):")
print(tfidf_df.to_string())
print("-" * 50)

Berhasil memuat seluruh data dari 'berita_tempo.csv'.
---
>>> Count Vectorizer (Matriks Term Frequency):
     nan
0      1
1      1
2      1
3      1
4      1
5      1
6      1
7      1
8      1
9      1
10     1
11     1
12     1
13     1
14     1
15     1
16     1
17     1
18     1
19     1
20     1
21     1
22     1
23     1
24     1
25     1
26     1
27     1
28     1
29     1
30     1
31     1
32     1
33     1
34     1
35     1
36     1
37     1
38     1
39     1
40     1
41     1
42     1
43     1
44     1
45     1
46     1
47     1
48     1
49     1
50     1
51     1
52     1
53     1
54     1
55     1
56     1
57     1
58     1
59     1
60     1
61     1
62     1
63     1
64     1
65     1
66     1
67     1
68     1
69     1
70     1
71     1
72     1
73     1
74     1
75     1
76     1
77     1
78     1
79     1
80     1
81     1
82     1
83     1
84     1
85     1
86     1
87     1
88     1
89     1
90     1
91     1
92     1
93     1
94     1
95     1
96     1
97     1
98  

## Word Embeding

In [27]:
import pandas as pd
import re
import numpy as np
from gensim.models import Word2Vec
import spacy
from transformers import pipeline

# --- Bagian 1: Word Embedding (Pembuatan Representasi Vektor) dengan Gensim ---

# Konfigurasi Pandas dan Numpy untuk tampilan penuh
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
np.set_printoptions(threshold=np.inf)

print(">>> Bagian 1: Membuat Word Embedding dengan Gensim")
print("---")

# Membaca data
try:
    df = pd.read_csv('berita_tempo.csv')
    print("Berhasil memuat seluruh data dari 'berita_tempo.csv'.")
except FileNotFoundError:
    print("Error: File 'berita_tempo.csv' tidak ditemukan.")
    print("Gunakan data dummy untuk demonstrasi.")
    data = {'judul': [
        "Manajemen strategi perusahaan menghadapi persaingan global",
        "Pengaruh kepemimpinan terhadap kinerja organisasi",
        "Analisis keuangan perusahaan berbasis rasio likuiditas"
    ]}
    df = pd.DataFrame(data)

# Gunakan kolom 'judul' sebagai korpus (karena 'isi' NaN semua)
corpus = df['judul'].dropna().astype(str).tolist()

# Pembersihan teks dasar
def clean_text(text):
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Tokenisasi untuk Word2Vec (list of lists)
tokenized_corpus = [clean_text(text).split() for text in corpus if text.strip()]

# Bangun model Word2Vec
embedding_dim = 128
if tokenized_corpus and any(tokenized_corpus):
    word2vec_model = Word2Vec(
        sentences=tokenized_corpus,
        vector_size=embedding_dim,
        window=5,
        min_count=1,
        sg=0  # CBOW, pakai sg=1 kalau mau Skip-gram
    )
    word2vec_model.train(tokenized_corpus, total_examples=word2vec_model.corpus_count, epochs=10)

    words = list(word2vec_model.wv.index_to_key)
    vectors = [word2vec_model.wv[word] for word in words]

    embedding_df = pd.DataFrame(vectors, index=words)
    print("\n>>> Word Embeddings yang sudah terlatih (contoh beberapa kata):")
    print(embedding_df.head(10).to_string())
else:
    print("\nKorpus kosong atau tidak valid, tidak dapat membuat embeddings.")

print("-" * 50)

print("\n>>> Bagian 2: Analisis Sentimen (Fallback Kamus, tanpa PyTorch)")
print("---")

# Kamus kata positif dan negatif (lebih kaya, khusus untuk berita olahraga)
positive_words = {
    "bagus","baik","positif","hebat","menang","sukses","senang","gembira","puas",
    "mantap","unggul","kemenangan","dramatis","permalukan","raih","lolos","ungguli",
    "bangkit","berhasil","gemilang","pecah","unggulan"
}

negative_words = {
    "buruk","jelek","negatif","kalah","gagal","marah","sedih","kecewa","khawatir",
    "rusak","problem","kontroversi","dibatasi","skandal","batal","terpuruk",
    "ancaman","hancur","bentrok","krisis"
}

def simple_sentiment(text):
    text = text.lower()
    pos_count = sum(word in text for word in positive_words)
    neg_count = sum(word in text for word in negative_words)

    if pos_count > neg_count:
        return "POSITIVE", pos_count / (pos_count + neg_count)
    elif neg_count > pos_count:
        return "NEGATIVE", neg_count / (pos_count + neg_count)
    else:
        return "NEUTRAL", 0.0

# Ambil contoh 10 judul pertama
judul_contoh = df['judul'].dropna().head(10).tolist()

for i, teks in enumerate(judul_contoh, start=1):
    label, score = simple_sentiment(teks)
    print(f"Judul {i}: '{teks}'")
    print(f"    Label: {label} -> Skor: {score:.4f}")
    print("-" * 30)

# --- Bagian 3: Deteksi Entitas Bernama (NER dengan spaCy) ---

print("\n>>> Bagian 3: Deteksi Entitas Bernama (Menggunakan spaCy)")
print("---")

try:
    nlp = spacy.load("en_core_web_sm")  # model default bahasa Inggris
    # Ambil contoh teks dari dataset (judul pertama)
    teks_ner = df['judul'].dropna().astype(str).iloc[0] if not df['judul'].dropna().empty else \
               "Apple Inc. founded by Steve Jobs in California, is a major technology company."

    doc = nlp(teks_ner)

    print(f"Teks: {teks_ner[:120]}...\n")
    print("Entitas yang Terdeteksi:")
    for ent in doc.ents:
        print(f"    - {ent.text} ({ent.label_})")

except OSError:
    print("Model spaCy 'en_core_web_sm' tidak ditemukan.")
    print("Silakan install dengan perintah: python -m spacy download en_core_web_sm")
    print("-" * 50)

>>> Bagian 1: Membuat Word Embedding dengan Gensim
---
Berhasil memuat seluruh data dari 'berita_tempo.csv'.

>>> Word Embeddings yang sudah terlatih (contoh beberapa kata):
                0         1         2         3         4         5         6         7         8         9         10        11        12        13        14        15        16        17        18        19        20        21        22        23        24        25        26        27        28        29        30        31        32        33        34        35        36        37        38        39        40        41        42        43        44        45        46        47        48        49        50        51        52        53        54        55        56        57        58        59        60        61        62        63        64        65        66        67        68        69        70        71        72        73        74        75        76        77        78        79        80        

## Klasifikasi

In [28]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
import sys

# --- Fungsi Pembersihan Teks
def clean_text(text):
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# --- Memuat Data dari File CSV
file_path = 'berita_tempo.csv'
try:
    df = pd.read_csv(file_path)
    print(f"Berhasil memuat seluruh data dari file CSV: {file_path}")

    # Periksa apakah kolom 'judul' ada
    if 'judul' not in df.columns:
        print("Error: Kolom 'judul' tidak ditemukan dalam file CSV.")
        sys.exit()

    # Periksa apakah kolom 'kategori' ada
    if 'kategori' not in df.columns:
        print("Error: Kolom 'kategori' tidak ditemukan.")
        sys.exit()

    # --- Filter data valid ---
    df.dropna(subset=['judul'], inplace=True)
    counts = df['kategori'].value_counts()
    valid_labels = counts[counts >= 2].index
    df = df[df['kategori'].isin(valid_labels)]

    corpus = df['judul'].tolist()

    # Label kategori -> integer
    label_to_id = {label: i for i, label in enumerate(df['kategori'].unique())}
    labels = [label_to_id[label] for label in df['kategori']]

    print("Kolom 'kategori' ditemukan. Menggunakan label dari file.")
    print(f"Menggunakan label: {label_to_id}")

except FileNotFoundError:
    print(f"Error: File '{file_path}' tidak ditemukan.")
    sys.exit()

# --- Bersihkan teks
cleaned_corpus = [clean_text(text) for text in corpus]

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_corpus, labels, test_size=0.2,
    random_state=100, stratify=labels
)

print(f"\nJumlah total data: {len(df)}")
print(f"Jumlah data latih: {len(X_train)}")
print(f"Jumlah data uji: {len(X_test)}")

# ======================================================
# 1. KLASIFIKASI MENGGUNAKAN TF-IDF
# ======================================================
print("\n" + "="*50)
print(">>> KLASIFIKASI MENGGUNAKAN TF-IDF")
print("="*50)

vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Contoh representasi
print("\n--- REPRESENTASI DATA DAN KONTEKS TF-IDF ---")
contoh_tfidf = vectorizer.transform([X_train[0]])
fitur_tfidf = vectorizer.get_feature_names_out()
df_contoh_tfidf = pd.DataFrame(contoh_tfidf.T.todense(), index=fitur_tfidf, columns=['Skor TF-IDF'])
df_contoh_tfidf = df_contoh_tfidf[df_contoh_tfidf['Skor TF-IDF'] > 0].sort_values(by='Skor TF-IDF', ascending=False)
print(df_contoh_tfidf.head(20).to_string())

# Logistic Regression
model_tfidf = LogisticRegression(max_iter=1000, C=0.3, class_weight='balanced')
model_tfidf.fit(X_train_tfidf, y_train)

y_pred_tfidf = model_tfidf.predict(X_test_tfidf)
print(f"\nAkurasi TF-IDF: {accuracy_score(y_test, y_pred_tfidf):.2f}")
print("Laporan Klasifikasi TF-IDF:")
print(classification_report(y_test, y_pred_tfidf, zero_division=0))

# ======================================================
# 2. KLASIFIKASI MENGGUNAKAN WORD EMBEDDING (Keras)
# ======================================================
print("\n" + "="*50)
print(">>> KLASIFIKASI MENGGUNAKAN WORD EMBEDDING")
print("="*50)

# Tokenisasi dan padding
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
vocab_size = len(tokenizer.word_index) + 1
max_len = max(len(s.split()) for s in cleaned_corpus)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

embedding_dim = 100

# Build model
model_emb = Sequential()
model_emb.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len, name="embedding_layer"))
model_emb.add(Flatten())
model_emb.add(Dense(128, activation='relu'))
model_emb.add(Dense(len(set(labels)), activation='softmax'))

model_emb.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

model_emb.fit(X_train_padded, np.array(y_train), epochs=10, verbose=1)

# Ambil vektor embedding
embedding_layer = model_emb.get_layer('embedding_layer')
embedding_weights = embedding_layer.get_weights()[0]
word_index = tokenizer.word_index
word_to_vec = {word: embedding_weights[i] for word, i in word_index.items()}

if 'liverpool' in word_to_vec:
    print(f"Vektor untuk 'liverpool':\n{word_to_vec['liverpool'][:10]}...")
else:
    print("Kata 'liverpool' tidak ditemukan di korpus.")

# Evaluasi
loss, accuracy = model_emb.evaluate(X_test_padded, np.array(y_test), verbose=0)
print(f"\nAkurasi Word Embedding: {accuracy:.2f}")

y_pred_probs_emb = model_emb.predict(X_test_padded)
y_pred_emb = np.argmax(y_pred_probs_emb, axis=1)

print("Laporan Klasifikasi Word Embedding:")
print(classification_report(y_test, y_pred_emb, zero_division=0))

Berhasil memuat seluruh data dari file CSV: berita_tempo.csv
Kolom 'kategori' ditemukan. Menggunakan label dari file.
Menggunakan label: {'bola': 0, 'metro': 1, 'dunia': 2}

Jumlah total data: 149
Jumlah data latih: 119
Jumlah data uji: 30

>>> KLASIFIKASI MENGGUNAKAN TF-IDF

--- REPRESENTASI DATA DAN KONTEKS TF-IDF ---
           Skor TF-IDF
ketua         0.373209
perempuan     0.373209
sementara     0.373209
ma            0.343505
ajukan        0.322429
eks           0.281432
gen           0.281432
pm            0.281432
jadi          0.271650
nepal         0.220870

Akurasi TF-IDF: 0.87
Laporan Klasifikasi TF-IDF:
              precision    recall  f1-score   support

           0       0.82      0.90      0.86        10
           1       1.00      0.90      0.95        10
           2       0.80      0.80      0.80        10

    accuracy                           0.87        30
   macro avg       0.87      0.87      0.87        30
weighted avg       0.87      0.87      0.87      



Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.3613 - loss: 1.0948
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.8992 - loss: 1.0070
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 1.0000 - loss: 0.9073
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 1.0000 - loss: 0.7764
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 1.0000 - loss: 0.6094
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 1.0000 - loss: 0.4250
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 1.0000 - loss: 0.2576
Epoch 8/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 1.0000 - loss: 0.1365
Epoch 9/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [