In [3]:
# Retrain News vs. Non-News Classifier
# This script retrains the classifier to better distinguish between news and Amazon descriptions

import pandas as pd
import numpy as np
import re
import joblib
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

# Function to preprocess text
def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^\\w\\s]', ' ', text)
        text = re.sub(r'\\s+', ' ', text)
        return text.strip()
    return ""

# Step 1: Load the data
print("Loading datasets...")

# Load non-news data (Amazon descriptions)
non_news_path = 'News_Dataset/descriptions_sampled.txt'
try:
    with open(non_news_path, 'r', encoding='utf-8') as f:
        non_news_texts = f.readlines()
    print(f"Loaded {len(non_news_texts)} non-news samples")
except Exception as e:
    print(f"Error loading non-news data: {str(e)}")
    non_news_texts = []

# Load news data (both fake and real)
news_paths = ['News_Dataset/Fake.csv', 'News_Dataset/True.csv']
news_texts = []

for path in news_paths:
    try:
        df = pd.read_csv(path)
        if 'text' in df.columns:
            news_texts.extend(df['text'].tolist())
        elif 'title' in df.columns and 'text' in df.columns:
            # Combine title and text if both are available
            combined_texts = df['title'] + " " + df['text']
            news_texts.extend(combined_texts.tolist())
        print(f"Loaded {len(df)} samples from {path}")
    except Exception as e:
        print(f"Error loading {path}: {str(e)}")

# Step 2: Create labels and preprocess
print("Preprocessing data...")
# Label: 0 for non-news (Amazon), 1 for news
non_news_labels = [0] * len(non_news_texts)
news_labels = [1] * len(news_texts)

# Combine datasets
all_texts = non_news_texts + news_texts
all_labels = non_news_labels + news_labels

# Preprocess all texts
preprocessed_texts = [preprocess_text(text) for text in all_texts]

# Step 3: Split into training and testing sets
print("Splitting data into train/test sets...")
X_train, X_test, y_train, y_test = train_test_split(
    preprocessed_texts, all_labels, test_size=0.2, random_state=42, stratify=all_labels
)

# Step 4: Create a pipeline with TF-IDF and Logistic Regression
print("Creating and training model pipeline...")
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2))),
    ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced'))
])

# Define hyperparameters to tune
param_grid = {
    'classifier__C': [0.1, 1, 10, 100],
    'classifier__penalty': ['l2'],
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")

# Step 5: Evaluate the model
print("Evaluating model...")
y_pred = best_model.predict(X_test)
accuracy_val = accuracy_score(y_test, y_pred)

print(f"Test Accuracy: {accuracy_val:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Step 6: Save the model and vectorizer
print("Saving model and vectorizer...")
# Extract the vectorizer and classifier from the pipeline
vectorizer = best_model.named_steps['tfidf']
classifier = best_model.named_steps['classifier']

# Create model directory if it doesn't exist
os.makedirs('model', exist_ok=True)

# Save the vectorizer and model
joblib.dump(vectorizer, 'model/tfidf_vectorizer_news_classifier_new.pkl')
joblib.dump(classifier, 'model/logistic_news_classifier_new.pkl')

print("Model and vectorizer saved successfully.")



Loading datasets...
Loaded 44851 non-news samples
Loaded 23481 samples from News_Dataset/Fake.csv
Loaded 21417 samples from News_Dataset/True.csv
Preprocessing data...
Splitting data into train/test sets...
Creating and training model pipeline...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best parameters: {'classifier__C': 0.1, 'classifier__penalty': 'l2'}
Evaluating model...
Test Accuracy: 0.8458
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.78      0.84      8970
           1       0.81      0.91      0.86      8980

    accuracy                           0.85     17950
   macro avg       0.85      0.85      0.85     17950
weighted avg       0.85      0.85      0.85     17950

Saving model and vectorizer...
Model and vectorizer saved successfully.


In [5]:
def predict_text(text, model, vectorizer):
    processed = preprocess_text(text)
    vec = vectorizer.transform([processed])
    pred = model.predict(vec)[0]
    proba = model.predict_proba(vec)[0]
    label = 'News' if pred == 1 else 'Non-News'
    confidence = proba[pred]
    return label, confidence, proba

In [7]:
# Test with a news article
news_sample = "Breaking news: The stock market experienced a dramatic fall today due to unexpected economic reports."
label, conf, proba = predict_text(news_sample, classifier, vectorizer)
print(f"News sample classified as: {label} with confidence {conf:.4f}")
print(f"Probabilities: Non-News: {proba[0]:.4f}, News: {proba[1]:.4f}")

News sample classified as: News with confidence 0.7041
Probabilities: Non-News: 0.2959, News: 0.7041


In [9]:
# Test with an Amazon description
amazon_sample = "This premium quality kitchen knife set includes 5 stainless steel knives with ergonomic handles. Dishwasher safe and perfect for all your cooking needs."
label, conf, proba = predict_text(amazon_sample, classifier, vectorizer)
print(f"Amazon sample classified as: {label} with confidence {conf:.4f}")
print(f"Probabilities: Non-News: {proba[0]:.4f}, News: {proba[1]:.4f}")


Amazon sample classified as: News with confidence 0.5018
Probabilities: Non-News: 0.4982, News: 0.5018
