In [3]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pickle
from flask import Flask, request, jsonify

In [4]:
# Ensure required NLTK data is downloaded
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [19]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
#Download punkt_tab
nltk.download('punkt_tab') # Download the punkt_tab data package

def clean_text(text):
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return " ".join(tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [7]:
# Load dataset
df = pd.read_csv("/content/drive/MyDrive/fake_and_real_news.csv.zip")

In [8]:
df.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9900 entries, 0 to 9899
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    9900 non-null   object
 1   label   9900 non-null   object
dtypes: object(2)
memory usage: 154.8+ KB


In [10]:
df.isnull()

Unnamed: 0,Text,label
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
9895,False,False
9896,False,False
9897,False,False
9898,False,False


In [11]:
df.isnull().sum()

Unnamed: 0,0
Text,0
label,0


In [12]:
# Remove missing and duplicate entries
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [20]:
# Clean text
df['clean_text'] = df['Text'].apply(clean_text)

In [23]:
#Feature Engineering & Vectorization
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X = vectorizer.fit_transform(df['clean_text'])
y = df['label']

In [24]:
# Model Training & Evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
models = {
    "Naïve Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "SVM": SVC(kernel='linear', probability=True)
}

In [27]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{name} Performance:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    #Specify pos_label for precision_score, recall_score and f1_score
    print("Precision:", precision_score(y_test, y_pred, pos_label='Real')) # Assuming 'Real' is the positive label
    print("Recall:", recall_score(y_test, y_pred, pos_label='Real')) # Assuming 'Real' is the positive label
    print("F1-score:", f1_score(y_test, y_pred, pos_label='Real')) # Assuming 'Real' is the positive label
    print("-------------------------------------")

Naïve Bayes Performance:
Accuracy: 0.9700963000506843
Precision: 0.9517839922854388
Recall: 0.9909638554216867
F1-score: 0.970978848991638
-------------------------------------
Logistic Regression Performance:
Accuracy: 0.9903699949315763
Precision: 0.9899699097291875
Recall: 0.9909638554216867
F1-score: 0.9904666332162569
-------------------------------------
Random Forest Performance:
Accuracy: 0.9939178915357324
Precision: 0.9949698189134809
Recall: 0.9929718875502008
F1-score: 0.9939698492462311
-------------------------------------
SVM Performance:
Accuracy: 0.9959452610238216
Precision: 0.994
Recall: 0.9979919678714859
F1-score: 0.9959919839679359
-------------------------------------


In [28]:
# Hyperparameter Tuning & Performance Improvement
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10],
    'max_iter': [500, 1000]
}

grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

In [29]:
ensemble_model = VotingClassifier(estimators=[
    ('lr', best_model),
    ('rf', RandomForestClassifier(n_estimators=200)),
    ('svm', SVC(kernel='linear', probability=True))
], voting='soft')
ensemble_model.fit(X_train, y_train)

In [30]:
# Save the best model
with open("fake_news_model.pkl", "wb") as f:
    pickle.dump(ensemble_model, f)