In [7]:
import pandas as pd

df = pd.read_csv("../cleandataset/hoax_dataset_merged.csv")
print(df.head())
print(df['label'].value_counts())  # Check class balance

                                             cleaned  label
0  kota dunia rawan risiko lancong saran asuransi...      0
1  rasa fitur corolla cross hybrid gr sport lengk...      0
2  hasil man united vs paok gol diallo hadir poin...      0
3  temu prabowo menkes lapor potensi kerja asingg...      0
4  ras kucing bulu tebal jakarta kompascom salah ...      0
label
1    2993
0    2735
Name: count, dtype: int64


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 words
X = vectorizer.fit_transform(df['cleaned'])  # 'cleaned' = preprocessed text
y = df['label']

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

model = LogisticRegression()
model.fit(X_train, y_train)

# Predict & evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.97      0.96       551
           1       0.98      0.94      0.96       595

    accuracy                           0.96      1146
   macro avg       0.96      0.96      0.96      1146
weighted avg       0.96      0.96      0.96      1146



In [11]:
import joblib

joblib.dump(model, "../models/hoax_model.pkl")
joblib.dump(vectorizer, "../models/vectorizer.pkl")

['../models/vectorizer.pkl']

In [15]:
def get_top_influential_words(text, vectorizer, model, top_n=5):
    vectorized = vectorizer.transform([text])
    coefs = model.coef_[0]  # Logistic regression coefficients
    feature_names = vectorizer.get_feature_names_out()
    
    indices = vectorized.nonzero()[1]  # non-zero indices of features in the text
    word_scores = {feature_names[i]: coefs[i] for i in indices}
    
    # Sort words by absolute coefficient value descending
    sorted_words = sorted(word_scores.items(), key=lambda x: abs(x[1]), reverse=True)
    
    return sorted_words[:top_n]
