In [None]:
import pandas as pd
import numpy as np

# Load the preprocessed dataset from its CSV file.
file_path = '/content/hasil_preprocessing.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Print the first five rows as a quick sanity check of the columns.
print(data.head(n=5))

                                               title  \
0                       LPPOM MUI | Bogor - Facebook   
1                     LPH Halal Nusantara - Facebook   
2        Info Sertifikasi Halal Indonesia - Facebook   
3                Halal Corridor | Jakarta - Facebook   
4  PERESMIAN ESQ HALAL CENTER ESQ mendukung progr...   

                                                href  \
0           https://www.facebook.com/halalindonesia/   
1        https://www.facebook.com/halalnusantara.id/   
2  https://www.facebook.com/groups/1127431861260327/   
3  https://www.facebook.com/profile.php/?id=61565...   
4  https://www.facebook.com/AryGinanjarAgustian/v...   

                                                body        source  \
0  LPPOM MUI, Bogor, Indonesia. 145,648 likes · 4...  facebook.com   
1  pentingnya sertifikasi halal, tidak hanya dari...  facebook.com   
2  Group ini adalah media silaturahmi antar Pelak...  facebook.com   
3  Jun 1, 2025· 󰟠 Seru-seruan bareng Halal Cor

In [None]:
pip install pandas openpyxl transformers scikit-learn nltk



In [None]:
import pandas as pd
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
import re
import numpy as np
# Download the NLTK stopword corpus if it is not already up to date
nltk.download('stopwords')
from nltk.corpus import stopwords

# ===== STEP 1: LOAD THE DATA AND DROP DUPLICATES =====
# Rows are de-duplicated on the "clean" text column and the index is renumbered.
df = (
    pd.read_csv("hasil_preprocessing.csv")
    .drop_duplicates(subset="clean")
    .reset_index(drop=True)
)

# ===== STEP 2: SENTIMENT ANALYSIS WITH A MULTILINGUAL BERT MODEL =====
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
)

def map_sentiment(label):
    """Translate a star-rating label into an Indonesian sentiment class.

    Args:
        label: Rating label such as "1 star" or "4 stars".

    Returns:
        "Negatif" for "1 star" and "2 stars", "Netral" for "3 stars",
        and "Positif" for every other value.
    """
    if label == "3 stars":
        return "Netral"
    is_low_rating = label == "1 star" or label == "2 stars"
    return "Negatif" if is_low_rating else "Positif"

# Character-level pre-trim: keeps at most the first 512 characters of each text.
# This bounds the input size but is NOT a token limit.
texts = df["clean"].astype(str).str[:512].tolist()

# truncation/max_length are forwarded to the tokenizer, so an input whose
# characters expand to more than 512 tokens (special tokens included, e.g. long
# runs of punctuation) is cut to that length instead of being passed on oversized.
results = sentiment_pipeline(texts, batch_size=16, truncation=True, max_length=512)

# Each result is a dict whose "label" entry is mapped to Negatif/Netral/Positif.
df["Sentimen"] = [map_sentiment(r["label"]) for r in results]

# ===== STEP 3: EXTRACT AT MOST 8 TOPICS =====
_STOPWORD_PATTERN = None  # compiled lazily on first use, then reused for every row


def _get_stopword_pattern():
    """Return the compiled Indonesian stopword regex, building it on the first call."""
    global _STOPWORD_PATTERN
    if _STOPWORD_PATTERN is None:
        # Escape each word so a stopword containing a regex metacharacter
        # is matched literally instead of altering the pattern.
        words = map(re.escape, stopwords.words('indonesian'))
        _STOPWORD_PATTERN = re.compile(r'\b(?:' + '|'.join(words) + r')\b')
    return _STOPWORD_PATTERN


def preprocess(text):
    """Normalise a document before TF-IDF vectorisation.

    The text is lower-cased, every run of digits is removed, and every
    whole-word occurrence of an Indonesian stopword is removed. Surrounding
    whitespace is left in place, so removed words leave gaps behind.

    Args:
        text: The document as a string.

    Returns:
        The normalised string.
    """
    text = re.sub(r'\d+', '', text.lower())
    # Reuse the cached pattern rather than re-reading the stopword list and
    # rebuilding the alternation for every row.
    return _get_stopword_pattern().sub('', text)

# Clean every document, then turn the corpus into a TF-IDF matrix limited to
# the 1000 highest-ranked vocabulary terms.
df["preprocessed"] = df["clean"].astype(str).map(preprocess)
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df["preprocessed"])

# Cluster the documents into k topics (at most 8 topics).
k = 8
# n_init is set explicitly to suppress the warning about it.
model = KMeans(n_clusters=k, n_init=10, random_state=42)
df["Topik"] = model.fit(X).labels_

# Optional: label each topic with its three highest-weighted TF-IDF terms.
# The vocabulary is fetched once instead of once per keyword per cluster.
feature_names = vectorizer.get_feature_names_out()
topik_label = []
for i in range(k):
    idx = (df["Topik"] == i).values
    # Mean TF-IDF weight of each term over the documents in cluster i.
    # np.asarray(...).ravel() flattens the result whether the sparse mean comes
    # back as an np.matrix or as a plain ndarray (.A1 exists only on np.matrix).
    tfidf_mean = np.asarray(X[idx].mean(axis=0)).ravel()
    # argsort is ascending: take the last three indices and reverse them so the
    # strongest term comes first.
    keywords = [feature_names[j] for j in tfidf_mean.argsort()[-3:][::-1]]
    topik_label.append(", ".join(keywords))

# Replace the numeric cluster id with its keyword label.
df["Topik"] = df["Topik"].apply(lambda i: topik_label[i])

# ===== STEP 4: WRITE THE RESULTS TO AN EXCEL FILE =====
# Keep the original columns plus the two derived ones, in this order.
output_columns = ["title", "href", "body", "source", "clean", "Sentimen", "Topik"]
df_final = df[output_columns]
df_final.to_excel(excel_writer="hasil_sentimen_topik.xlsx", sheet_name="Data", index=False)

# ===== STEP 5: PRINT THE SUMMARY =====
# For each derived column, print a heading followed by its value counts.
for heading, column in (
    ("\n📊 Ringkasan Sentimen:", "Sentimen"),
    ("\n📚 Ringkasan Topik:", "Topik"),
):
    print(heading)
    print(df[column].value_counts())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Device set to use cpu



📊 Ringkasan Sentimen:
Sentimen
Negatif    403
Positif     93
Netral      26
Name: count, dtype: int64

📚 Ringkasan Topik:
Topik
halal, indonesia, sertifikasi    138
sertifikasi, halal, umkm         133
produk, halal, bpjph              64
makanan, islam, halal             47
ayam, goreng, widuran             47
restoran, tokyo, jepang           35
bihalal, halal, tradisi           34
bsi, international, expo          24
Name: count, dtype: int64
