In [2]:
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/mot/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/mot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mot/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/mot/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Text Preprocessing Pipeline

## 1. Basic Text Cleaning
- Convert to lowercase: `text.lower()`
- Tokenization: Split text into individual words
- Remove non-alphanumeric tokens: `token.isalnum()`

## 2. Language Processing
- Remove stopwords (common words like 'the', 'is', 'at')
- Lemmatization: Reduce words to base form
 - running → run
 - better → good
 - children → child

## 3. TF-IDF Vectorization
- TF (Term Frequency): How often word appears in document
- IDF (Inverse Document Frequency): Word rarity across documents
- Final score = TF * IDF
- Captures word importance while accounting for frequency bias

In [3]:
def combine_news_datasets(fake_path, true_path, final_en_path, welfake_path):
    # Load datasets
    fake_df = pd.read_csv(fake_path)
    true_df = pd.read_csv(true_path)
    final_en_df = pd.read_csv(final_en_path)
    welfake_df = pd.read_csv(welfake_path)

    # Fix fake labels for each dataset
    fake_df['fake'] = 1
    true_df['fake'] = 0
    final_en_df['fake'] = 1 - final_en_df['lebel']  # Convert 0=fake to 1=fake
    welfake_df['fake'] = welfake_df['label']

    # Select required columns and combine
    dfs = []
    for df in [fake_df, true_df, final_en_df, welfake_df]:
        df = df[['title', 'text', 'fake']].copy()
        dfs.append(df)
    
    combined_df = pd.concat(dfs, ignore_index=True)
    
    # Clean data
    combined_df['title'] = combined_df['title'].fillna('').str.strip()
    combined_df['text'] = combined_df['text'].fillna('').str.strip()
    
    # Drop rows with empty title or text
    combined_df = combined_df[
        (combined_df['title'] != '') & 
        (combined_df['text'] != '') &
        (~combined_df['title'].isna()) & 
        (~combined_df['text'].isna())
    ]
    
    # Drop duplicates based on title
    combined_df = combined_df.drop_duplicates(subset=['title'])
    
    # Convert fake to int
    combined_df['fake'] = combined_df['fake'].astype(int)
    
    return combined_df

def preprocess_text(text, stop_words=set(stopwords.words('english'))):
    # Tokenization
    tokens = word_tokenize(text.lower())
    
    # Remove stopwords and lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in stop_words]
    
    return ' '.join(tokens)

def remove_length_outliers(df, threshold_percentile=95):
   length_threshold = df['text_length'].quantile(threshold_percentile/100)
   df_filtered = df[df['text_length'] <= length_threshold].copy()
   print(f"{len(df) - len(df_filtered)} articles > {length_threshold} chars")
   return df_filtered

def preprocess_dataset(df):
    # Clean text
    df['title_clean'] = df['title'].apply(preprocess_text)
    df['text_clean'] = df['text'].apply(preprocess_text)
    
    # Additional features
    df['title_length'] = df['title'].str.len()
    df['text_length'] = df['text'].str.len()
    
    # Remove outliers
    df = remove_length_outliers(df)
    
    return df

df = combine_news_datasets('../data/Fake.csv', '../data/True.csv', '../data/final_en.csv', '../data/WELFake_Dataset.csv')
df = preprocess_dataset(df)

df.info()

3088 articles > 8400.0 chars
<class 'pandas.core.frame.DataFrame'>
Index: 58686 entries, 0 to 127029
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         58686 non-null  object
 1   text          58686 non-null  object
 2   fake          58686 non-null  int64 
 3   title_clean   58686 non-null  object
 4   text_clean    58686 non-null  object
 5   title_length  58686 non-null  int64 
 6   text_length   58686 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 3.6+ MB


# Fake News Detection Model Pipeline

## Architecture
- **Feature Engineering**
  - Title TFIDF (20 features, unigrams & bigrams)
  - Text TFIDF (50 features, unigrams & bigrams)
  - Custom TextSelector for DataFrame column handling
  
## Models Evaluated
| Model | Accuracy | Std Dev |
|-------|----------|---------|
| Logistic Regression | 0.893 | 0.185 |
| Naive Bayes | 0.801 | 0.186 |
| Random Forest | 0.900 | 0.216 |

## Model Selection
While Random Forest achieved marginally better accuracy (0.900 vs 0.893), we opt for Logistic Regression because:
- Similar performance with lower complexity
- Better interpretability
- Faster training and inference
- Lower variance (0.185 vs 0.216)

The 0.7% accuracy improvement from Random Forest doesn't justify its added complexity and computational overhead.

In [18]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

class TextSelector:
    def __init__(self, field):
        self.field = field
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.field]

def create_features():
    combined_features = FeatureUnion([
        ('title', Pipeline([
            ('selector', TextSelector('title_clean')),
            ('tfidf', TfidfVectorizer(max_features=20, ngram_range=(1, 2)))
        ])),
        ('text', Pipeline([
            ('selector', TextSelector('text_clean')),
            ('tfidf', TfidfVectorizer(max_features=50, ngram_range=(1, 2)))
        ])),
    ])
    
    return combined_features


def evaluate_models(df):
    features = create_features()
    
    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
        'Naive Bayes': MultinomialNB(),
        'Random Forest': RandomForestClassifier(n_estimators=120, random_state=42)
    }
    
    results = {}
    for name, model in models.items():
        pipeline = Pipeline([
            ('features', features),
            ('classifier', model)
        ])
        
        scores = cross_val_score(
            pipeline, 
            df,
            df['fake'], 
            cv=5, 
            scoring='accuracy',
            n_jobs=-1
        )
        
        results[name] = {
            'mean_accuracy': scores.mean(),
            'std_accuracy': scores.std()
        }
        
        print(f"\n{name}:")
        print(f"Mean accuracy: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
    
    return results

# Run evaluation
results = evaluate_models(df)

# Find best model
best_model = max(results.items(), key=lambda x: x[1]['mean_accuracy'])
print(f"\nBest model: {best_model[0]} with accuracy: {best_model[1]['mean_accuracy']:.3f}")


Logistic Regression:
Mean accuracy: 0.893 (+/- 0.185)

Naive Bayes:
Mean accuracy: 0.801 (+/- 0.186)

Random Forest:
Mean accuracy: 0.900 (+/- 0.216)

Best model: Random Forest with accuracy: 0.900
