In [2]:
# loading and preprocessing data
import pandas as pd
import numpy as np 
true = pd.read_csv('True.csv')
false = pd.read_csv('Fake.csv')


In [3]:
true['subject'].value_counts(normalize = True)
true['value'] = 1
true = true.drop(columns = ['text', 'date'])
true.columns

Index(['title', 'subject', 'value'], dtype='object')

In [6]:
false = false.drop(columns = ['text', 'date'])
false['value'] = 0
false.columns

Index(['title', 'subject', 'value'], dtype='object')

In [8]:
# final dataset True / Fake news distribution 
df = pd.merge(true, false, how = 'outer')
df.value.value_counts(normalize = True)

value
0    0.522985
1    0.477015
Name: proportion, dtype: float64

In [10]:
# working with nltk to define stop words, working with re to clean the text 
import re
import nltk 
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))
def clean(text): 
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stop_words]) 
    return text

In [12]:
df['title'] = df['title'].apply(clean)
corpus = df['title'] 

In [14]:
df.head(20)

Unnamed: 0,title,subject,value
0,donald trump gets handed 10 year old girl video,News,0
1,ex gop congressman shreds fellow republicans h...,News,0
2,trump gets stomped whining president obama usi...,News,0
3,watch democratic rep delivers scathing rebuke ...,News,0
4,aftertrumpimplodes hashtag hilariously imagine...,News,0
5,blacklivesmatter leader run mayor racially tro...,News,0
6,bringbackobama hashtag blows twitter americans...,News,0
7,freechrischristie twitter reacts hostage situa...,News,0
8,makeamericabrannigan futurama voice actor read...,News,0
9,nevertrump conservative perfect description pu...,News,0


In [16]:
# embedding and splitting the data 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

tfvector = TfidfVectorizer()
x = tfvector.fit_transform(corpus)
y = df.value
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2)

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [20]:
# training logistic regression model 
lr_model = LogisticRegression()
lr_model.fit(x_train, y_train)
y_pred_lr = lr_model.predict(x_test)

In [22]:
# training random forest classifier (best parameters taken from grid search)
rf = RandomForestClassifier(max_depth = 15, n_estimators = 100, random_state = 42)
rf.fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)

In [24]:
# evaluating the model on the test dataset 
def evaluation(model, y_test, y_pred):
    print(f'{model} Model Performance:')
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred))
    print('Recall:', recall_score(y_test, y_pred))
    print('F1 score:', f1_score(y_test, y_pred))

evaluation('Logistic Regression', y_test, y_pred_lr)
evaluation('Random Forest Classifier', y_test, y_pred_rf)
    

Logistic Regression Model Performance:
Accuracy: 0.9444320712694878
Precision: 0.9355570683004312
Recall: 0.950437989857077
F1 score: 0.9429388221841052
Random Forest Classifier Model Performance:
Accuracy: 0.8288418708240535
Precision: 0.9504663879060792
Recall: 0.6811894882434302
F1 score: 0.7936081643614878


In [26]:
# while logistic regression did very nicely with scores, random forest model doesn't show the best performance 
# finetuning it with grid search to find best params
from sklearn.model_selection import GridSearchCV
param_grid = {
"n_estimators": [50, 100, 200],
"max_depth": [5, 10, 15],
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring="accuracy")
grid_search.fit(x_train, y_train)

best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'max_depth': 15, 'n_estimators': 100}
