# Detecting Fake News

### Importing the relevant libraries

In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import re

### Loading the raw data

In [3]:
df=pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
df.describe()

Unnamed: 0,id,label
count,20800.0,20800.0
mean,10399.5,0.500625
std,6004.587135,0.500012
min,0.0,0.0
25%,5199.75,0.0
50%,10399.5,1.0
75%,15599.25,1.0
max,20799.0,1.0


### Checking null  values

In [5]:
df.isnull()

Unnamed: 0,id,title,author,text,label
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
20795,False,False,False,False,False
20796,False,False,False,False,False
20797,False,False,False,False,False
20798,False,False,False,False,False


### Declare the inputs and the targets

In [6]:
df = df.drop([16921], axis=0)
features = df.iloc[:, 3].values
labels = df.iloc[:, 4].values

### Preprocessing the data

In [7]:
processed_features = []

for sentence in range(0, len(features)):
    # Remove all the special characters
    processed_feature = re.sub(r'\W', ' ', str(features[sentence]))

    # remove all single characters
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)

    # Remove single characters from the start
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature) 

    # Substituting multiple spaces with single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)

    # Removing prefixed 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)

    # Converting to Lowercase
    processed_feature = processed_feature.lower()

    processed_features.append(processed_feature)

In [8]:
featur = pd.DataFrame(data=processed_features, columns=["texts"])
lab=pd.DataFrame(data=labels,columns=["labels"])

### Train Test Split

In [9]:
x_train,x_test,y_train,y_test=train_test_split(featur["texts"],lab["labels"],test_size=0.2,random_state=7)

### Initialize TfidfVectorizer and Fit the data 

In [10]:
tf_v=TfidfVectorizer(max_features=2500,stop_words='english',max_df=0.7)
tf_train=tf_v.fit_transform(x_train)
tf_test=tf_v.transform(x_test)

### Passive Aggressive Classifier Model

In [11]:
pac=PassiveAggressiveClassifier(max_iter=50,verbose=1)
pac.fit(tf_train,y_train)
y_pred=pac.predict(tf_test)

-- Epoch 1
Norm: 48.38, NNZs: 2500, Bias: 1.055272, T: 16519, Avg. loss: 0.211314
Total training time: 0.01 seconds.
-- Epoch 2
Norm: 59.36, NNZs: 2500, Bias: 1.234261, T: 33038, Avg. loss: 0.118522
Total training time: 0.03 seconds.
-- Epoch 3
Norm: 67.63, NNZs: 2500, Bias: 1.443539, T: 49557, Avg. loss: 0.097333
Total training time: 0.04 seconds.
-- Epoch 4
Norm: 74.67, NNZs: 2500, Bias: 1.508001, T: 66076, Avg. loss: 0.087357
Total training time: 0.05 seconds.
-- Epoch 5
Norm: 80.71, NNZs: 2500, Bias: 1.641200, T: 82595, Avg. loss: 0.078786
Total training time: 0.06 seconds.
-- Epoch 6
Norm: 86.19, NNZs: 2500, Bias: 1.690173, T: 99114, Avg. loss: 0.071010
Total training time: 0.07 seconds.
-- Epoch 7
Norm: 91.25, NNZs: 2500, Bias: 1.749648, T: 115633, Avg. loss: 0.067028
Total training time: 0.08 seconds.
-- Epoch 8
Norm: 95.84, NNZs: 2500, Bias: 1.779323, T: 132152, Avg. loss: 0.062141
Total training time: 0.09 seconds.
-- Epoch 9
Norm: 99.96, NNZs: 2500, Bias: 1.925969, T: 148671,

### Accuracy

In [12]:
score=accuracy_score(y_test,y_pred)
print(score)

0.9266826923076923


### Confusion Matrix

In [13]:
confusion_matrix(y_test,y_pred,labels=[1,0])


array([[1940,  151],
       [ 154, 1915]], dtype=int64)