In [1]:
import pandas as pd
import joblib
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords
import nltk
from tqdm import tqdm
import sys

In [6]:
# --- !!! IMPORTANT: CONFIGURE THESE VARIABLES !!! ---

# 1. Location of the new, unseen CSV file to classify.
INPUT_CSV_FILE = (
    'G:/My Drive/University Files/5th Semester/Data Science/Project/'
    'IndoHoaxDetector/data/tweets_from_kompascom_20251106_124453.csv'
)

# 2. Column of that CSV which holds the main text.
TEXT_COLUMN_NAME = 'text'

# 3. (Optional) Title column to include alongside the text.
#    Use None when the CSV has no title column.
TITLE_COLUMN_NAME = 'title'

# 4. Destination file for the prediction results.
OUTPUT_CSV_FILE = 'prediction_results_tweets_from_kompascom_20251106_124453.csv'

# ---------------------------------------------------

In [3]:
# --- 1. Setup Preprocessing Tools ---
print("Setting up preprocessing tools...")
try:
    # Probe for the Indonesian stop-word corpus; a LookupError here is treated
    # as "corpus not downloaded yet" and triggers the download below.
    stopwords.words('indonesian')
except LookupError:
    print("NLTK stopwords not found. Downloading...")
    nltk.download('stopwords')

# Build the Sastrawi stemmer once and reuse it for every call (can take a moment)
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Indonesian stop words, held as a set for the membership test in preprocess_text
stop_words = set(stopwords.words('indonesian'))

def preprocess_text(text):
    """
    Clean raw text; documented as the exact cleaning used during training.

    Steps, in order: lowercase, strip URLs, drop every character that is not
    an ASCII letter or whitespace, remove stop words, stem what remains.

    Args:
        text: Value to clean; it is coerced with ``str()`` before processing.

    Returns:
        The kept, stemmed tokens joined by single spaces.
    """
    lowered = str(text).lower()
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', lowered, flags=re.MULTILINE)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_urls)
    # Stop words are filtered out before stemming, so only kept tokens are stemmed.
    return ' '.join(
        stemmer.stem(token)
        for token in letters_only.split()
        if token not in stop_words
    )


Setting up preprocessing tools...


In [4]:
# --- 2. Load Model and Vectorizer ---
print("Loading saved model and vectorizer...")
try:
    # NOTE: joblib artifacts are pickles -- only load files from a trusted source.
    model = joblib.load('logreg_model.pkl')
    vectorizer = joblib.load('tfidf_vectorizer.pkl')
except FileNotFoundError:
    # One of the two artifacts is missing: explain what is expected, then stop.
    print(
        "Error: Could not find 'logreg_model.pkl' or 'tfidf_vectorizer.pkl'.",
        "Make sure these files are in the same folder as this script.",
        sep="\n",
    )
    sys.exit()
else:
    print("Files loaded successfully.")


Loading saved model and vectorizer...
Files loaded successfully.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [7]:
# --- 3. Load and Process New CSV File ---
# Reads INPUT_CSV_FILE into `df`, validates that the text column exists, and
# builds `df['text_to_process']` (optionally prefixed by the title column).
print(f"Loading new data from '{INPUT_CSV_FILE}'...")
try:
    df = pd.read_csv(INPUT_CSV_FILE)
except FileNotFoundError:
    print(f"Error: Input file '{INPUT_CSV_FILE}' not found.")
    sys.exit()

# Check if the required text column exists
if TEXT_COLUMN_NAME not in df.columns:
    print(f"Error: Column '{TEXT_COLUMN_NAME}' not found in your CSV.")
    print(f"Available columns are: {list(df.columns)}")
    sys.exit()

# Create a combined text column for processing
print("Preparing text for preprocessing...")
# Convert to string, then blank out the rows that were missing in the ORIGINAL
# column. Calling astype(str) before fillna('') turns NaN into the literal
# string 'nan', which fillna can no longer see, so 'nan' would leak into the
# text as a real token.
df['text_to_process'] = df[TEXT_COLUMN_NAME].astype(str).where(df[TEXT_COLUMN_NAME].notna(), '')

# Prepend title if it exists and is specified
if TITLE_COLUMN_NAME and TITLE_COLUMN_NAME in df.columns:
    # Same missing-value handling as the text column above.
    df['title_str'] = df[TITLE_COLUMN_NAME].astype(str).where(df[TITLE_COLUMN_NAME].notna(), '')
    df['text_to_process'] = df['title_str'] + ' ' + df['text_to_process']

Loading new data from 'G:/My Drive/University Files/5th Semester/Data Science/Project/IndoHoaxDetector/data/tweets_from_kompascom_20251106_124453.csv'...
Preparing text for preprocessing...


In [9]:
# --- 4. Preprocess and Predict ---
print("Cleaning new text... (This may take a while for large files)")
# Enable the tqdm-backed progress_apply so the per-row cleaning shows a progress bar
tqdm.pandas(desc="Cleaning Text")
df['text_clean'] = df['text_to_process'].progress_apply(preprocess_text)

print("Vectorizing text (using loaded TF-IDF)...")
# IMPORTANT: Use .transform() only. DO NOT use .fit_transform()
# (presumably refitting would replace the fitted state the loaded model expects)
X_new = vectorizer.transform(df['text_clean'])

print("Making predictions...")
# Predict the labels (0 or 1)
predictions = model.predict(X_new)

# Get the confidence probabilities; the row-wise maximum is used later as the score
probabilities = model.predict_proba(X_new)

Cleaning new text... (This may take a while for large files)


Cleaning Text: 100%|██████████| 5000/5000 [18:21<00:00,  4.54it/s] 

Vectorizing text (using loaded TF-IDF)...
Making predictions...





In [10]:
# --- 5. Format and Save Results ---
print("Formatting results...")

# Attach the raw label, its readable name, and the highest class probability
df['predicted_label'] = predictions
df['prediction'] = df['predicted_label'].map({0: 'FAKTA', 1: 'HOAX'})
df['confidence_score'] = probabilities.max(axis=1)

# Keep every column except the intermediate working columns
columns_to_save = [
    col for col in df.columns
    if col not in {'text_to_process', 'title_str', 'text_clean', 'predicted_label'}
]
final_df = df[columns_to_save]

# Write the results to disk without the index column
final_df.to_csv(OUTPUT_CSV_FILE, index=False)

print(
    "\n--- Batch Prediction Complete ---",
    f"Results saved to '{OUTPUT_CSV_FILE}'",
    "\nPreview of results:",
    sep="\n",
)
print(final_df.head())

Formatting results...

--- Batch Prediction Complete ---
Results saved to 'prediction_results_tweets_from_kompascom_20251106_124453.csv'

Preview of results:
                    id                                               text  \
0  1986283688972460183  Asosiasi Sepak Bola Malaysia (FAM) berpotensi ...   
1  1986281660158255211  Di balik ceritanya yang menusuk dan sinematogr...   
2  1986281407132672265  Presiden Meksiko Claudia Sheinbaum mengalami p...   
3  1986281320046338235  Berdasarkan Laporan Harta Kekayaan Penyelengga...   
4  1986279330906071392  Otoritas Portugal menangkap empat orang setela...   

                       created_at        user   username  retweet_count  \
0  Thu Nov 06 04:05:09 +0000 2025  Kompas.com  kompascom              0   
1  Thu Nov 06 03:57:05 +0000 2025  Kompas.com  kompascom              0   
2  Thu Nov 06 03:56:05 +0000 2025  Kompas.com  kompascom              0   
3  Thu Nov 06 03:55:44 +0000 2025  Kompas.com  kompascom              0   
4  T