In [3]:
# This notebook demonstrates building a classifier to distinguish between news and non-news articles using the sampled non-news dataset and an existing news dataset.
# We use descriptions_sampled.txt as our non-news sample, and Fake.csv & True.csv as our news dataset.

import pandas as pd
import numpy as np
import gzip
import random
import re
import joblib
import os

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Step 1: Load the non-news data from descriptions_sampled.txt
non_news_path = 'News_Dataset/descriptions_sampled.txt'

# Each non-blank line of the file is treated as one non-news document.
# Iterating over the handle streams the file line by line instead of first
# materialising every raw line with readlines().
with open(non_news_path, 'r', encoding='utf-8') as f:
    non_news_docs = [line.strip() for line in f if line.strip()]

# Create a DataFrame for non-news articles and label as 0
non_news_df = pd.DataFrame({'text': non_news_docs, 'label': 0})

# Step 2: Load news data from Fake.csv and True.csv and label them as 1.
# Both files are read from the News_Dataset/ directory (relative to the
# current working directory).
news_fake = pd.read_csv('News_Dataset/Fake.csv')
news_true = pd.read_csv('News_Dataset/True.csv')

# The fake/true distinction is ignored here: for this classifier every
# article from either file counts as news (label 1).
news_fake['label'] = 1
news_true['label'] = 1

# Only the 'text' column (plus the label) is kept from the news data.
news_df = pd.concat([news_fake[['text', 'label']], news_true[['text', 'label']]], ignore_index=True)

# Rows with a missing text value (NaN) carry nothing to classify; drop them
# so that downstream text processing only ever sees strings.
news_df = news_df.dropna(subset=['text'])

# To reduce computation time, cap the news data at 5000 randomly chosen rows.
# NOTE(review): the non-news side is not capped, so the two classes can end
# up imbalanced -- keep that in mind when reading accuracy figures.
if len(news_df) > 5000:
    news_df = news_df.sample(n=5000, random_state=42)

# Combine non-news and news into one dataset
combined_df = pd.concat([news_df, non_news_df], ignore_index=True)

# Shuffle rows (fixed seed for reproducibility) and rebuild a clean 0..n-1 index
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Function for simple text preprocessing
import re

def preprocess_text(text):
    """Normalise a document for vectorisation.

    Lower-cases the text, replaces every character that is neither a word
    character nor whitespace with a space, collapses whitespace runs into a
    single space and trims both ends. Underscores are kept because ``\\w``
    matches them.

    Parameters
    ----------
    text : str or None or float
        Raw document. ``None`` and float NaN (how pandas represents a missing
        cell) are treated as an empty document; any other non-string value is
        converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)  # remove punctuation
    text = re.sub(r'\s+', ' ', text)  # remove extra whitespace
    return text.strip()

# Add a cleaned copy of every document next to the raw text
combined_df['clean_text'] = combined_df['text'].apply(preprocess_text)

# Step 3: Split the data for training and validation
X = combined_df['clean_text']
y = combined_df['label']

# 70/30 split, stratified on the label so both parts keep the same class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Step 4: Use TF-IDF vectorizer
# Unigrams + bigrams, terms must occur in at least 3 documents, and the
# vocabulary is capped at 5000 features.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2), min_df=3)

# Vocabulary and IDF weights are learned from the training split only;
# the test split is merely transformed.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 5: Train a Logistic Regression classifier and optimize hyperparameters using Grid Search
# No Pipeline is used here: the grid search runs directly on the pre-computed
# TF-IDF matrix and only tunes C (with an l2 penalty).
# NOTE(review): because the vectorizer was fitted on all of X_train before
# cross-validation, each fold's validation part has already contributed to the
# vocabulary/IDF weights, so CV scores may be slightly optimistic. Wrapping
# vectorizer + classifier in a Pipeline would avoid that. The held-out test
# split is not affected.

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l2'], # the liblinear solver used below also supports 'l1'; add it here to compare both
    # Note: 'max_iter': [100, 200] can be tuned if convergence issues occur
    # If the best C lands on an edge of this list, consider widening the range.
}

lr = LogisticRegression(solver='liblinear', max_iter=200, random_state=42)

# 5-fold CV over the grid, selecting by accuracy, using all CPU cores
grid_search = GridSearchCV(lr, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train_tfidf, y_train)

# best_estimator_ is the winning configuration refitted on the whole training split
best_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

# Evaluate on test set
# NOTE(review): a near-perfect score here may reflect superficial differences
# between the two data sources rather than a general notion of "news" --
# confirm on text from other sources before relying on the model.
y_pred = best_model.predict(X_test_tfidf)
accuracy_val = accuracy_score(y_test, y_pred)

print("Test Accuracy:", accuracy_val)
# The leading "\n" prints a blank separator line before the heading.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Step 6: Save the model and vectorizer for later prediction
# Both artifacts are needed at prediction time: the vectorizer turns cleaned
# text into features, the model scores those features.
joblib.dump(vectorizer, 'tfidf_vectorizer_news_classifier.pkl')
joblib.dump(best_model, 'logistic_news_classifier.pkl')

print("\nModel and vectorizer saved successfully.")

# Step 7: Function for predicting if an uploaded article is news or non-news

def predict_news_article(text):
    """Classify a single raw article as 'News' or 'Non-News'.

    The text is cleaned with ``preprocess_text``, turned into features by the
    module-level ``vectorizer`` and scored by the module-level ``best_model``.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    tuple
        ``(label, prob)`` where ``label`` is ``'News'`` when the predicted
        class equals 1 and ``'Non-News'`` otherwise, and ``prob`` is the
        largest value returned by ``predict_proba`` for this one document.
    """
    features = vectorizer.transform([preprocess_text(text)])
    predicted_class = best_model.predict(features)[0]
    confidence = best_model.predict_proba(features).max()

    if predicted_class == 1:
        return 'News', confidence
    return 'Non-News', confidence

# Example usage: classify one hand-written headline and report the result.
sample_text = (
    "Breaking news: The stock market experienced a dramatic fall today "
    "due to unexpected economic reports."
)
label, conf = predict_news_article(sample_text)
print(f"Sample text classified as: {label} with confidence {conf:.4f}")

# Final marker so a full run can be told apart from one that stopped early.
print("Notebook execution complete.")

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best parameters: {'C': 100, 'penalty': 'l2'}
Test Accuracy: 0.9990449813771368
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8971
           1       1.00      1.00      1.00      1500

    accuracy                           1.00     10471
   macro avg       1.00      1.00      1.00     10471
weighted avg       1.00      1.00      1.00     10471

Model and vectorizer saved successfully.
Sample text classified as: News with confidence 0.9698
Notebook execution complete.
