# Fake News Detection

## Importing required library

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import re
import string
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics as m
import itertools
from sklearn.metrics import plot_confusion_matrix
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from sklearn.model_selection import GridSearchCV

In [9]:
df_fake = pd.read_csv("Fake.csv")
df_true = pd.read_csv("True.csv")

In [10]:
df_fake["class"] = 0
df_true["class"] = 1

In [11]:
print(len(df_fake))
df_fake.drop(df_fake.tail(13400).index, inplace = True)
df_fake

23481


Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
10076,ALT-LEFT PLANS TO HIJACK PRESIDENT TRUMP’S AZ ...,The pieces are in place ahead of President Don...,politics,"Aug 22, 2017",0
10077,WATCH: LOUIE GOHMERT Wants LYING Democrat VA G...,Why did the Democrat VA Governor throw gasolin...,politics,"Aug 22, 2017",0
10078,PATRIOT ARTIST’S LATEST TRIBUTE TO TRUMP SUPPO...,Patriot artist John McNaughton just revealed h...,politics,"Aug 21, 2017",0
10079,CHELSEA CLINTON Uses “Lucifer” To Support Argu...,Chelsea Clinton thought she was quite clever w...,politics,"Aug 21, 2017",0


In [12]:
print(len(df_true))
df_true.drop(df_true.tail(11400).index, inplace = True)
df_true

21417


Unnamed: 0,title,text,subject,date,class
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1
...,...,...,...,...,...
10012,New Jersey's Christie loses bid to freeze Atla...,ATLANTIC CITY (Reuters) - A New Jersey judge o...,politicsNews,"April 8, 2016",1
10013,Leak of Senate encryption bill prompts swift b...,WASHINGTON (Reuters) - Security researchers an...,politicsNews,"April 8, 2016",1
10014,Papal official denies report Sanders invited h...,VATICAN CITY/WASHINGTON (Reuters) - U.S. Democ...,politicsNews,"April 8, 2016",1
10015,Bill Clinton confronts protesters who say his ...,NEW YORK (Reuters) - Former President Bill Cli...,politicsNews,"April 7, 2016",1


In [13]:
df_fake.shape, df_true.shape

((10081, 5), (10017, 5))

## Merging Fake news and true news

In [14]:
df = pd.concat([df_fake, df_true], axis =0 )
df

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
10012,New Jersey's Christie loses bid to freeze Atla...,ATLANTIC CITY (Reuters) - A New Jersey judge o...,politicsNews,"April 8, 2016",1
10013,Leak of Senate encryption bill prompts swift b...,WASHINGTON (Reuters) - Security researchers an...,politicsNews,"April 8, 2016",1
10014,Papal official denies report Sanders invited h...,VATICAN CITY/WASHINGTON (Reuters) - U.S. Democ...,politicsNews,"April 8, 2016",1
10015,Bill Clinton confronts protesters who say his ...,NEW YORK (Reuters) - Former President Bill Cli...,politicsNews,"April 7, 2016",1


In [15]:
df.columns

Index(['title', 'text', 'subject', 'date', 'class'], dtype='object')

In [16]:
df = df.drop(["title", "subject","date"], axis = 1)

In [17]:
df

Unnamed: 0,text,class
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0
...,...,...
10012,ATLANTIC CITY (Reuters) - A New Jersey judge o...,1
10013,WASHINGTON (Reuters) - Security researchers an...,1
10014,VATICAN CITY/WASHINGTON (Reuters) - U.S. Democ...,1
10015,NEW YORK (Reuters) - Former President Bill Cli...,1


## Randomly shuffling the df

In [18]:
df = df.sample(frac = 1)

In [19]:
df

Unnamed: 0,text,class
7265,The Wisconsin Department of Justice just relea...,0
7392,LONDON (Reuters) - World leaders offered to wo...,1
9563,WASHINGTON (Reuters) - U.S. Republican preside...,1
3832,Trump s biggest champion on the other side of ...,0
3386,Kayleigh McEneny wanted an answer and she got ...,0
...,...,...
523,Donald Trump s former Chief Strategist Stephen...,0
2089,Trump just will not stop hammering his border ...,0
2561,CNN is definitely sick and tired of Donald Tru...,0
6914,"Normally, the thought of listening to Coldplay...",0


## Check Duplicates

In [20]:
df.drop_duplicates(inplace = True)

#New shape of the df
df.shape

(20044, 2)

## Checking Missing values


In [21]:
# Check for Missing data (NAN,na,NaN) for each column
df.isnull().sum()

text     0
class    0
dtype: int64

## Reset the index

In [22]:
df.reset_index(inplace = True)
df.drop(["index"], axis = 1, inplace = True)

In [23]:
df

Unnamed: 0,text,class
0,The Wisconsin Department of Justice just relea...,0
1,LONDON (Reuters) - World leaders offered to wo...,1
2,WASHINGTON (Reuters) - U.S. Republican preside...,1
3,Trump s biggest champion on the other side of ...,0
4,Kayleigh McEneny wanted an answer and she got ...,0
...,...,...
20039,Donald Trump s former Chief Strategist Stephen...,0
20040,Trump just will not stop hammering his border ...,0
20041,CNN is definitely sick and tired of Donald Tru...,0
20042,"Normally, the thought of listening to Coldplay...",0


## Creating a function to convert the text in lowercase, remove the extra space, special chr., ulr and links.

In [24]:
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

In [25]:
df["text"] = df["text"].apply(wordopt)

In [26]:
df

Unnamed: 0,text,class
0,the wisconsin department of justice just relea...,0
1,london reuters world leaders offered to wo...,1
2,washington reuters u s republican preside...,1
3,trump s biggest champion on the other side of ...,0
4,kayleigh mceneny wanted an answer and she got ...,0
...,...,...
20039,donald trump s former chief strategist stephen...,0
20040,trump just will not stop hammering his border ...,0
20041,cnn is definitely sick and tired of donald tru...,0
20042,normally the thought of listening to coldplay...,0


## Remove Stop words and stemming

In [27]:
ps = PorterStemmer()

In [28]:
def stemming_and_stopwords(text):
    
    # Remove stopwords and tokenization
    clean_words = [word for word in text.split() if word not in stopwords.words('english')]
    
    # Stemming
    for i in range(len(clean_words)):
        clean_words[i] = ps.stem(clean_words[i])

    return clean_words

### Defining dependent and independent variable as x and y

In [29]:
x = df["text"]
y = df["class"]

In [30]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [31]:
print(x_train.shape)
print(y_train.shape)

(15033,)
(15033,)


In [32]:
print(x_test.shape)
print(y_test.shape)

(5011,)
(5011,)


## Converting text to vectors

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
vectorization = TfidfVectorizer(analyzer=stemming_and_stopwords)
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

In [35]:
print(type(xv_test))

<class 'scipy.sparse.csr.csr_matrix'>


In [36]:
print(xv_test.shape)

(5011, 44742)


In [37]:
xv_test[0]

<1x44742 sparse matrix of type '<class 'numpy.float64'>'
	with 230 stored elements in Compressed Sparse Row format>

## Confusion Matrix


In [38]:
def confusion_mat(Y_test, pred):
    TN = 0
    TP=0
    FN=0
    FP=0
    

    for i in range(len(Y_test)):
        
        if pred[i] == 0 and Y_test[i] == 0:
            TN=TN+1
        elif pred[i] == 1 and Y_test[i] == 1:
            TP=TP+1
        elif pred[i] == 1 and Y_test[i] == 0:
            FP=FP+1
        else:
            FN=FN+1
    
    return TN,TP,FN,FP

In [39]:
def accuracy_score(TN,TP,FN,FP):
    total = TN+TP+FN+FP
    return (TN+TP)/total


def precision_score(TN,TP,FN,FP):
    return TP/(TP+FP)


def recall_score(TN,TP,FN,FP):
    return TP/(TP+FN)


def f1_score(TN,TP,FN,FP):
    p = TP/(TP+FP)
    r = TP/(TP+FN)
    return  2*p*r/(p+r)

## Logistic Regression

In [40]:
from sklearn.linear_model import LogisticRegression

In [41]:
LR = LogisticRegression()
LR.fit(xv_train,y_train)

LogisticRegression()

In [42]:
pred_lr=LR.predict(xv_test)

In [43]:
accuracy = m.accuracy_score(y_test, pred_lr)
accuracy

0.9878267810816205

In [44]:
#CONFUSION MATRIX
TN,TP,FN,FP = confusion_mat(y_test.to_numpy(), pred_lr)

print("True Negative ", TN)
print("False Positive ", FP)
print("False Negative ", FN)
print("True Positive ", TP)

True Negative  2504
False Positive  33
False Negative  28
True Positive  2446


In [45]:
print("ACCURACY   : ",accuracy_score(TN,TP,FN,FP))
print("PRECISION  : ",precision_score(TN,TP,FN,FP))
print("RECALL     : ",recall_score(TN,TP,FN,FP))
print("F1_SCORE : ",f1_score(TN,TP,FN,FP))

ACCURACY   :  0.9878267810816205
PRECISION  :  0.9866881807180314
RECALL     :  0.9886822958771221
F1_SCORE :  0.98768423177872


In [46]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, pred_lr)

array([[2504,   33],
       [  28, 2446]], dtype=int64)

In [47]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2537
           1       0.99      0.99      0.99      2474

    accuracy                           0.99      5011
   macro avg       0.99      0.99      0.99      5011
weighted avg       0.99      0.99      0.99      5011



## Naive Bayes

In [48]:
from sklearn.naive_bayes import MultinomialNB

In [49]:
NB = MultinomialNB()
NB.fit(xv_train, y_train)

MultinomialNB()

In [50]:
pred_NB = NB.predict(xv_test)

In [51]:
accuracy = m.accuracy_score(y_test, pred_NB)
accuracy

0.9369387347834763

In [52]:
#CONFUSION MATRIX
TN,TP,FN,FP = confusion_mat(y_test.to_numpy(), pred_NB)

print("True Negative ", TN)
print("False Positive ", FP)
print("False Negative ", FN)
print("True Positive ", TP)

True Negative  2356
False Positive  181
False Negative  135
True Positive  2339


In [53]:
print("ACCURACY   : ",accuracy_score(TN,TP,FN,FP))
print("PRECISION  : ",precision_score(TN,TP,FN,FP))
print("RECALL     : ",recall_score(TN,TP,FN,FP))
print("F1_SCORE : ",f1_score(TN,TP,FN,FP))

ACCURACY   :  0.9369387347834763
PRECISION  :  0.9281746031746032
RECALL     :  0.9454324979789814
F1_SCORE :  0.9367240688826592


In [54]:
print(classification_report(y_test, pred_NB))

              precision    recall  f1-score   support

           0       0.95      0.93      0.94      2537
           1       0.93      0.95      0.94      2474

    accuracy                           0.94      5011
   macro avg       0.94      0.94      0.94      5011
weighted avg       0.94      0.94      0.94      5011



## Gradient Boosting Classifier

In [55]:
from sklearn.ensemble import GradientBoostingClassifier

In [56]:
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(xv_train, y_train)

GradientBoostingClassifier(random_state=0)

In [57]:
pred_gbc = GBC.predict(xv_test)

In [58]:
accuracy = m.accuracy_score(y_test, pred_gbc)
accuracy

0.9982039513071244

In [59]:
#CONFUSION MATRIX
TN,TP,FN,FP = confusion_mat(y_test.to_numpy(), pred_gbc)

print("True Negative ", TN)
print("False Positive ", FP)
print("False Negative ", FN)
print("True Positive ", TP)

True Negative  2533
False Positive  4
False Negative  5
True Positive  2469


In [60]:
print("ACCURACY   : ",accuracy_score(TN,TP,FN,FP))
print("PRECISION  : ",precision_score(TN,TP,FN,FP))
print("RECALL     : ",recall_score(TN,TP,FN,FP))
print("F1_SCORE : ",f1_score(TN,TP,FN,FP))

ACCURACY   :  0.9982039513071244
PRECISION  :  0.9983825313384553
RECALL     :  0.997978981406629
F1_SCORE :  0.9981807155852032


In [61]:
print(classification_report(y_test, pred_gbc))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2537
           1       1.00      1.00      1.00      2474

    accuracy                           1.00      5011
   macro avg       1.00      1.00      1.00      5011
weighted avg       1.00      1.00      1.00      5011



## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
RFC = RandomForestClassifier(random_state=0)
RFC.fit(xv_train, y_train)

RandomForestClassifier(random_state=0)

In [None]:
pred_rfc = RFC.predict(xv_test)

In [None]:
accuracy = m.accuracy_score(y_test, pred_rfc)
accuracy

0.9870627199337612

In [None]:
#CONFUSION MATRIX
TN,TP,FN,FP = confusion_mat(y_test.to_numpy(), pred_rfc)

print("True Negative ", TN)
print("False Positive ", FP)
print("False Negative ", FN)
print("True Positive ", TP)

True Negative  4220
False Positive  93
False Negative  32
True Positive  5317


In [None]:
print("ACCURACY   : ",accuracy_score(TN,TP,FN,FP))
print("PRECISION  : ",precision_score(TN,TP,FN,FP))
print("RECALL     : ",recall_score(TN,TP,FN,FP))
print("F1_SCORE : ",f1_score(TN,TP,FN,FP))

ACCURACY   :  0.9870627199337612
PRECISION  :  0.9828096118299445
RECALL     :  0.9940175733782015
F1_SCORE :  0.9883818198717353


In [None]:
print(classification_report(y_test, pred_rfc))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      4313
           1       0.98      0.99      0.99      5349

    accuracy                           0.99      9662
   macro avg       0.99      0.99      0.99      9662
weighted avg       0.99      0.99      0.99      9662

