In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import re
import string
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics as m
import itertools
from sklearn.metrics import plot_confusion_matrix
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from sklearn.model_selection import GridSearchCV

In [12]:
import tensorflow as tf

In [13]:
tf.__version__

'2.7.0'

In [14]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [15]:
### Vocabulary size
voc_size=5000

In [2]:
df_fake = pd.read_csv("Fake.csv")
df_true = pd.read_csv("True.csv")

df_fake["class"] = 0
df_true["class"] = 1

In [3]:
df_fake

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",0
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",0
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",0
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",0


In [4]:
df_true

Unnamed: 0,title,text,subject,date,class
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1
...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1


In [5]:
df_fake.shape, df_true.shape

((23481, 5), (21417, 5))

In [6]:
df = pd.concat([df_fake, df_true], axis =0 )
df = df.drop(["title", "subject","date"], axis = 1)
df = df.sample(frac = 1)
df

Unnamed: 0,text,class
17382,MADRID (Reuters) - Spain s government will tak...,1
13377,"DENPASAR, Indonesia (Reuters) - A window appea...",1
20987,"XIAMEN, China (Reuters) - Russian President Vl...",1
16154,MANILA (Reuters) - The Philippines top police...,1
13411,The Dem Convention has been a crazy event so f...,0
...,...,...
16745,European nations have naively opened their bor...,0
15866,Our culture is what makes this Nation great an...,0
5492,"JUPITER, Florida (Reuters) - President Donald ...",1
9378,This story is about more than a massive cover-...,0


In [7]:
df.drop_duplicates(inplace = True)

#New shape of the df
df.shape

(38647, 2)

In [8]:
# Check for Missing data (NAN,na,NaN) for each column
df.isnull().sum()

text     0
class    0
dtype: int64

In [9]:
df.reset_index(inplace = True)
df.drop(["index"], axis = 1, inplace = True)

df

Unnamed: 0,text,class
0,MADRID (Reuters) - Spain s government will tak...,1
1,"DENPASAR, Indonesia (Reuters) - A window appea...",1
2,"XIAMEN, China (Reuters) - Russian President Vl...",1
3,MANILA (Reuters) - The Philippines top police...,1
4,The Dem Convention has been a crazy event so f...,0
...,...,...
38642,WASHINGTON (Reuters) - A U.S. judge on Thursda...,1
38643,MOSCOW (Reuters) - Russia has started establis...,1
38644,WASHINGTON (Reuters) - Democratic U.S. lawmake...,1
38645,BAGHDAD (Reuters) - Iraqi Prime Minister Haide...,1


In [21]:
y = df['class']
y.shape

(38647,)

## Preprocessing

In [10]:
ps = PorterStemmer()
corpus = []
for i in range(0, len(df)):
    text = df['text'][i].lower()
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    text = text.split()
    
    text = [ps.stem(word) for word in text if not word in stopwords.words('english')]
    text = ' '.join(text)
    corpus.append(text)

In [11]:
corpus

['madrid reuter spain govern take control catalonia rule directli catalan leader carl puigdemont drop bid split region spain thursday gmt deputi prime minist soraya saenz de santamaria said mr puigdemont still opportun start resolv situat must answer ye declar independ saenz de santamaria said madrid given puigdemont monday gmt clarifi posit independ ye catalan leader directli answer question',
 'denpasar indonesia reuter window appear close friday travel strand indonesian holiday island bali airlin cut back flight fear return plume volcan ash erupt volcano close airport week strand thousand visitor australia china countri wind chang flight resum australian budget airlin jetstar said would cancel nine flight friday meteorolog offici warn ash could hit oper bali airport km mile southwest mount agung volcano malaysia airasia bhd said would oper bali day ash could impair visibl night wind condit area unpredict answer space us next flight said martim cazado travel tri get home portug via s

In [31]:
onehot_repr=[one_hot(words,voc_size)for words in corpus] 

In [32]:
onehot_repr

[[16,
  3290,
  4275,
  2386,
  2420,
  1877,
  1203,
  3802,
  2583,
  1651,
  2072,
  2038,
  2900,
  2577,
  3827,
  2522,
  116,
  4275,
  3411,
  4695,
  351,
  1946,
  1931,
  3125,
  4976,
  2639,
  3235,
  2119,
  120,
  2900,
  577,
  4367,
  670,
  3267,
  4214,
  2310,
  4210,
  3923,
  3633,
  2560,
  4976,
  2639,
  3235,
  2119,
  16,
  308,
  2900,
  944,
  4695,
  1735,
  2409,
  2560,
  3923,
  1651,
  2072,
  2583,
  4210,
  273],
 [4953,
  1558,
  3290,
  540,
  914,
  1406,
  4479,
  1732,
  478,
  3803,
  2216,
  882,
  2832,
  3638,
  37,
  171,
  4138,
  4898,
  3163,
  2083,
  526,
  3219,
  3155,
  2181,
  1406,
  4862,
  506,
  478,
  2290,
  4181,
  4907,
  3931,
  1554,
  799,
  1864,
  4138,
  390,
  1084,
  3343,
  3638,
  3314,
  2119,
  62,
  2997,
  2278,
  4138,
  4479,
  2146,
  534,
  1530,
  3219,
  1267,
  4231,
  2468,
  2832,
  4862,
  3031,
  1240,
  4599,
  405,
  2105,
  2181,
  1296,
  158,
  4352,
  2119,
  62,
  2468,
  2832,
  2288,
  3219

## Embedding Representation

In [17]:
sent_length=20
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[3633 2560 4976 ... 2583 4210  273]
 [2119  639 2288 ... 2181 4750 2832]
 [4406 3681 4175 ... 3269 2606 2923]
 ...
 [4041  782 1324 ... 1669 4984 1240]
 [4443 3757  720 ... 1654 4617  652]
 [4549   74 3536 ... 4476  477 3536]]


In [18]:
embedded_docs[0]

array([3633, 2560, 4976, 2639, 3235, 2119,   16,  308, 2900,  944, 4695,
       1735, 2409, 2560, 3923, 1651, 2072, 2583, 4210,  273])

In [19]:
## Creating model
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 40)            200000    
                                                                 
 lstm (LSTM)                 (None, 100)               56400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [22]:
len(embedded_docs),y.shape

(38647, (38647,))

In [23]:
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [24]:
X_final.shape,y_final.shape

((38647, 20), (38647,))

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

## Model Training

In [26]:
### Finally Training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1e434b93730>

## Adding Dropout

In [27]:
from tensorflow.keras.layers import Dropout
## Creating model
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

## Performance Metrics And Accuracy

In [29]:
y_pred=model.predict_classes(X_test)


AttributeError: 'Sequential' object has no attribute 'predict_classes'

In [None]:
confusion_matrix(y_test,y_pred)


In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)