## Fake news classifier

### data loading

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../Final.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,info
0,0,"Trump on Twitter (Feb 17) - Tom Price, Boeing,...",The following statements were posted to the ve...,politicsNews,"February 17, 2017",1
1,1,Putin says Russia will respond if Russian medi...,"SOCHI, Russia (Reuters) - Russian President Vl...",worldnews,"October 19, 2017",1
2,2,Rosie O’Donnell Thinks Martial Law is in Order...,21st Century Wire says Arguably irrelevant Hol...,Middle-east,"January 18, 2017",0
3,3,Chris Christie Reportedly Told Trump To Take ...,Sources say that Chris Christie was offered se...,News,"December 11, 2016",0
4,4,STATE DEPT EMPLOYEE TAPPED TO OVERSEE HILLARY ...,You seriously can t make up this stuff State D...,Government News,"Sep 8, 2015",0


In [3]:
df = df.sample(frac = 1).reset_index(drop = True)
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,info
0,26917,Taiwan activist to be tried for subversion in ...,Beijing (Reuters) - China will on Monday put o...,worldnews,"September 8, 2017",1
1,10652,REPUBLICAN DEBATE LINE OF THE NIGHT: “Keep swi...,This is hysterical! I don t care who you suppo...,politics,"Feb 25, 2016",0
2,39442,"China punishes over 8,000 people for misuse of...","SHANGHAI (Reuters) - China has punished 8,123 ...",worldnews,"December 23, 2017",1
3,42562,SHOCKING COMMENTS: Hillary Stirs The Pot Incit...,Hillary Clinton used a CNN interview on Friday...,politics,"Jul 9, 2016",0
4,15190,Venezuela has not told U.N. it is changing rep...,NEW YORK (Reuters) - Venezuela s government ha...,worldnews,"November 29, 2017",1


In [4]:
df.drop('Unnamed: 0', axis = 1, inplace = True)
df.head()

Unnamed: 0,title,text,subject,date,info
0,Taiwan activist to be tried for subversion in ...,Beijing (Reuters) - China will on Monday put o...,worldnews,"September 8, 2017",1
1,REPUBLICAN DEBATE LINE OF THE NIGHT: “Keep swi...,This is hysterical! I don t care who you suppo...,politics,"Feb 25, 2016",0
2,"China punishes over 8,000 people for misuse of...","SHANGHAI (Reuters) - China has punished 8,123 ...",worldnews,"December 23, 2017",1
3,SHOCKING COMMENTS: Hillary Stirs The Pot Incit...,Hillary Clinton used a CNN interview on Friday...,politics,"Jul 9, 2016",0
4,Venezuela has not told U.N. it is changing rep...,NEW YORK (Reuters) - Venezuela s government ha...,worldnews,"November 29, 2017",1


In [5]:
df

Unnamed: 0,title,text,subject,date,info
0,Taiwan activist to be tried for subversion in ...,Beijing (Reuters) - China will on Monday put o...,worldnews,"September 8, 2017",1
1,REPUBLICAN DEBATE LINE OF THE NIGHT: “Keep swi...,This is hysterical! I don t care who you suppo...,politics,"Feb 25, 2016",0
2,"China punishes over 8,000 people for misuse of...","SHANGHAI (Reuters) - China has punished 8,123 ...",worldnews,"December 23, 2017",1
3,SHOCKING COMMENTS: Hillary Stirs The Pot Incit...,Hillary Clinton used a CNN interview on Friday...,politics,"Jul 9, 2016",0
4,Venezuela has not told U.N. it is changing rep...,NEW YORK (Reuters) - Venezuela s government ha...,worldnews,"November 29, 2017",1
...,...,...,...,...,...
44893,"Trump, Obama dominate Twitter year, but chicke...",(Reuters) - U.S. President Donald Trump contin...,politicsNews,"December 5, 2017",1
44894,Mattis says no change in U.S. policy to protec...,SEOUL (Reuters) - U.S. Defense Secretary Jim M...,worldnews,"October 27, 2017",1
44895,HERE’S HOW HILLARY’S VP PICK Has Just Proven H...,Hillary s VP pick is proving himself to be a a...,politics,"Jul 26, 2016",0
44896,What Is Going On With The Secret Service?,21st Century Wire says This is a string of dis...,Middle-east,"March 20, 2017",0


In [6]:
df['info'].value_counts()

0    23481
1    21417
Name: info, dtype: int64

In [7]:
X = df.drop('info', axis = 1)
y = df['info']

### preprocessing the text

In [8]:
messages = X.copy()

In [10]:
import nltk
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
corpus = []
ps = PorterStemmer()
for i in range(0, len(messages)) : 
    review = re.sub('^[a-zA-Z]', ' ', messages['title'][i])

    review = review.lower()
    review = review.split()

    review = [ps.stem(words) for words in review if not words in stopwords.words('english')]

    review = ' '.join(review)

    corpus.append(review)

In [13]:
corpus

["aiwan activist tri subvers china 'open' hear",
 'epublican debat line night: “keep swinging’ men, swing fences!” [video]',
 'hina punish 8,000 peopl misus govern funds: xinhua',
 'hock comments: hillari stir pot incit racial divid lie',
 'enezuela told u.n. chang representative: offici',
 '(video) judg jeanine: can’t trust hillari even think vote woman?',
 'vinc foster’ sister furiou donald trump',
 'hy obama approv $418 million sale u.s. weapon kenya last day office…and contract award firm never produc one planes?',
 'hot mic captur humili moment trump tell christi leav stage (video)',
 'hite hous spokesman spicer trump seek fix imag',
 'eb! bush discov secret america realli want president…and it’ someth never expect',
 'actbox: busi empir czech elect front-runn babi',
 'cuador judg order arrest vice presid odebrecht probe',
 'oscow say u.s. north korea start dialogue: ria',
 'reek centrist elect allianc leader boost popular',
 'ow! confirm ice: 5-time deport illeg alien protect san

In [14]:
X_final = np.array(corpus)
y_final = y

In [16]:
X_final.shape

(44898,)

In [18]:
y_final.shape

(44898,)

### word embedding

In [19]:
import tensorflow
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [20]:
vocab_size = 10000
one_hot_rep = [one_hot(word, vocab_size) for word in corpus]

In [23]:
one_hot_rep = pad_sequences(one_hot_rep, maxlen = 30, padding = 'pre')

In [24]:
one_hot_rep

array([[   0,    0,    0, ..., 4316, 5779, 6111],
       [   0,    0,    0, ..., 7775,   37, 9042],
       [   0,    0,    0, ..., 1038, 6283, 1004],
       ...,
       [   0,    0,    0, ...,  554, 8686, 9054],
       [   0,    0,    0, ..., 8615,  219, 5861],
       [   0,    0,    0, ..., 2719,  906, 8855]])

In [25]:
X_final = np.array(one_hot_rep)

In [27]:
X_final.shape

(44898, 30)

In [28]:
y_final.shape

(44898,)

### train test split

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size = 0.25, random_state = 42)

In [30]:
print(f"{X_train.shape} :: {X_test.shape} :: {y_train.shape} :: {y_test.shape}")

(33673, 30) :: (11225, 30) :: (33673,) :: (11225,)


### model building and training

In [32]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Embedding, Dense, Dropout

In [45]:
model = Sequential([
    Embedding(input_dim = 10000, output_dim = 30, input_length = 30),
    Bidirectional(LSTM(64, return_sequences = True)),
    Dropout(0.3),
    Bidirectional(LSTM(32)),
    Dense(64, activation = 'relu'),
    Dense(1, activation = 'sigmoid')
])
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [46]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 30, 30)            300000    
                                                                 
 bidirectional_6 (Bidirectio  (None, 30, 128)          48640     
 nal)                                                            
                                                                 
 dropout_3 (Dropout)         (None, 30, 128)           0         
                                                                 
 bidirectional_7 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 dense_6 (Dense)             (None, 64)                4160      
                                                                 
 dense_7 (Dense)             (None, 1)                

### model training and predictions

In [47]:
model.fit(X_train, y_train, epochs = 10, batch_size = 128, validation_split = 0.20)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1919d0896c0>

In [54]:
y_pred = model.predict(X_test)



In [55]:
y_pred = (y_pred > 0.5).astype(int)

In [56]:
y_pred

array([[0],
       [1],
       [0],
       ...,
       [0],
       [0],
       [1]])

In [52]:
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [57]:
print(accuracy_score(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

0.9345211581291759
[[5739  556]
 [ 179 4751]]
              precision    recall  f1-score   support

           0       0.97      0.91      0.94      6295
           1       0.90      0.96      0.93      4930

    accuracy                           0.93     11225
   macro avg       0.93      0.94      0.93     11225
weighted avg       0.94      0.93      0.93     11225

