In [4]:
# --- 2. Load Model and Vectorizer ---
# NOTE(security): joblib.load unpickles the file, and unpickling can execute
# arbitrary code. Only load artifacts you produced yourself or obtained from
# a trusted source, ideally with the same scikit-learn version used to train.
# Both paths are relative, so they are resolved against the current working
# directory.
print("Loading saved model and vectorizer...")
try:
    model = joblib.load('logreg_model.pkl')
    vectorizer = joblib.load('tfidf_vectorizer.pkl')
except FileNotFoundError:
    # Diagnostics go to stderr, and the exit status is non-zero so a calling
    # shell or pipeline can tell that the run failed (a bare sys.exit()
    # reports success).
    print("Error: Could not find 'logreg_model.pkl' or 'tfidf_vectorizer.pkl'.",
          file=sys.stderr)
    print("Make sure these files are in the same folder as this script.",
          file=sys.stderr)
    sys.exit(1)
print("Files loaded successfully.")

# --- 3. Load and Process New CSV File ---
print(f"Loading new data from '{INPUT_CSV_FILE}'...")
try:
    df = pd.read_csv(INPUT_CSV_FILE)
except FileNotFoundError:
    # Report on stderr and exit with a failure status; a bare sys.exit()
    # would signal success to whatever launched this script.
    print(f"Error: Input file '{INPUT_CSV_FILE}' not found.", file=sys.stderr)
    sys.exit(1)

# The text column is mandatory: without it there is nothing to classify.
# List the columns that were actually found to make a typo easy to spot.
if TEXT_COLUMN_NAME not in df.columns:
    print(f"Error: Column '{TEXT_COLUMN_NAME}' not found in your CSV.",
          file=sys.stderr)
    print(f"Available columns are: {list(df.columns)}", file=sys.stderr)
    sys.exit(1)

# Create a combined text column for processing
print("Preparing text for preprocessing...")
# Missing values must become empty strings. Casting first and filling later
# does not work: astype(str) turns NaN into the literal string 'nan', so a
# subsequent fillna('') finds nothing to fill and the token "nan" would be
# fed to the classifier. Masking on the ORIGINAL column's missing-value
# positions is correct regardless of the column's dtype.
text_source = df[TEXT_COLUMN_NAME]
df['text_to_process'] = text_source.astype(str).where(text_source.notna(), '')

# Prepend the title if a title column is configured and present in the data.
if TITLE_COLUMN_NAME and TITLE_COLUMN_NAME in df.columns:
    title_source = df[TITLE_COLUMN_NAME]
    df['title_str'] = title_source.astype(str).where(title_source.notna(), '')
    df['text_to_process'] = df['title_str'] + ' ' + df['text_to_process']

# --- 4. Preprocess and Predict ---
print("Cleaning new text... (This may take a while for large files)")
# Registering tqdm with pandas makes `progress_apply` available: an `apply`
# that shows a progress bar labelled with the given description.
tqdm.pandas(desc="Cleaning Text")
cleaned_text = df['text_to_process'].progress_apply(preprocess_text)
df['text_clean'] = cleaned_text

print("Vectorizing text (using loaded TF-IDF)...")
# Apply the already-fitted vectorizer as-is. Never call .fit_transform()
# here: the features must stay the ones the loaded model was trained on.
X_new = vectorizer.transform(df['text_clean'])

print("Making predictions...")
# Per-class probabilities for every row
probabilities = model.predict_proba(X_new)

# Predicted class labels (0 or 1)
predictions = model.predict(X_new)

# --- 5. Format and Save Results ---
print("Formatting results...")
# Keep the raw numeric label, then translate it to its display name
# (0 -> 'FAKTA', 1 -> 'HOAX').
label_names = {0: 'FAKTA', 1: 'HOAX'}
df['predicted_label'] = predictions
df['prediction'] = df['predicted_label'].map(label_names)

# The highest class probability in each row is reported as the confidence.
df['confidence_score'] = probabilities.max(axis=1)

# Intermediate working columns are left out of the exported file; every
# other column keeps its original order.
helper_columns = {'text_to_process', 'title_str', 'text_clean', 'predicted_label'}
columns_to_save = [col for col in df.columns if col not in helper_columns]
final_df = df[columns_to_save]

# Write the results without the DataFrame index
final_df.to_csv(OUTPUT_CSV_FILE, index=False)

print(
    "\n--- Batch Prediction Complete ---",
    f"Results saved to '{OUTPUT_CSV_FILE}'",
    "\nPreview of results:",
    sep="\n",
)
print(final_df.head())

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Loading saved model and vectorizer...
Files loaded successfully.
Loading new data from 'data/tweets_from_cekfaktacom_20251105_144313.csv'...
Preparing text for preprocessing...
Cleaning new text... (This may take a while for large files)


Cleaning Text: 100%|██████████| 1990/1990 [03:03<00:00, 10.84it/s]

Vectorizing text (using loaded TF-IDF)...
Making predictions...
Formatting results...

--- Batch Prediction Complete ---
Results saved to 'prediction_results.csv'

Preview of results:
                    id                                               text  \
0  1966029663010197798  Baca artikel lengkap penelusuran kami di tauta...   
1  1966029471858962914  Benarkah ada artikel berita milik detikcom yan...   
2  1966029268821110886  Sebuah narasi beredar menyebut bahwa “Anies Ba...   
3  1964980465879891996  (2/2) \nBaca artikel lengkap penelusuran kami ...   
4  1964980252867985492  Sebuah unggahan video di TikTok dan Facebook m...   

                       created_at         user     username  retweet_count  \
0  Thu Sep 11 06:42:53 +0000 2025  cekfaktacom  cekfaktacom              0   
1  Thu Sep 11 06:42:07 +0000 2025  cekfaktacom  cekfaktacom              0   
2  Thu Sep 11 06:41:19 +0000 2025  cekfaktacom  cekfaktacom              0   
3  Mon Sep 08 09:13:45 +0000 2025  cekfak


