## Fake News Classifier Using LSTM

### Import required libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

**Dataset link:** https://www.kaggle.com/c/fake-news/data

In [19]:
df=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AI & Deep Learning/Datasets/fake_news.csv')
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


### Dataset information

In [20]:
df.shape

(20800, 5)

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


### Missing values

In [22]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

#### Drop missing rows

In [23]:
df.dropna(inplace=True)
df.isna().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

### Selecting useful columns

In [24]:
data=df[['title','label']]
data.head()

Unnamed: 0,title,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0
2,Why the Truth Might Get You Fired,1
3,15 Civilians Killed In Single US Airstrike Hav...,1
4,Iranian woman jailed for fictional unpublished...,1


### Text preprocessing

In [28]:
import nltk
import re
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
stopwords=stopwords.words("english")
from nltk.stem import WordNetLemmatizer
lemma=WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


### Create function to apply text preprocessing

In [29]:
def text_preprocessing(text):
    text=str(text).lower()
    text=re.sub("[^a-zA-Z]", " ", text)
    text=[word for word in text.split(" ") if word not in stopwords]
    text=[lemma.lemmatize(word) for word in text]
    text=" ".join(text)
    text=re.sub(" +"," ", text)
    return text

data['clean_title']=data['title'].apply(text_preprocessing)
data.head()

Unnamed: 0,title,label,clean_title
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1,house dem aide even see comey letter jason cha...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0,flynn hillary clinton big woman campus breitbart
2,Why the Truth Might Get You Fired,1,truth might get fired
3,15 Civilians Killed In Single US Airstrike Hav...,1,civilian killed single u airstrike identified
4,Iranian woman jailed for fictional unpublished...,1,iranian woman jailed fictional unpublished sto...


### Input and output feaures

In [32]:
X=data['clean_title']
y=data['label']
print(X.shape, y.shape)

(18285,) (18285,)


### Import necessary libraries for creating one hot representation and Embedding layer

In [35]:
import tensorflow as tf

In [36]:
tf.__version__

'2.12.0'

In [38]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM, Dense

### Onehot Representation of corpus

In [46]:
voc_size=10000
onehot_representation=[one_hot(words,voc_size)for words in X] 
onehot_representation[:5]

[[9672, 8240, 9699, 326, 4012, 3146, 6269, 4411, 4710, 4738],
 [2335, 2419, 6790, 9197, 1840, 8184, 805],
 [9621, 5955, 4674, 768],
 [6365, 2587, 2383, 2002, 1497, 313],
 [7395, 1840, 6350, 4858, 7340, 4012, 1840, 4549, 9993, 124]]

In [47]:
print(X[1])
print(onehot_representation[1])

flynn hillary clinton big woman campus breitbart
[2335, 2419, 6790, 9197, 1840, 8184, 805]


In [60]:
tot_words_df=pd.DataFrame(X)
tot_words_df.columns=['sentences']
tot_words_df['word_count']=tot_words_df['sentences'].apply(lambda x: len(x.split(" ")))
tot_words_df['word_count'].max()

47

**Maximum words in our document is 47** so select maxlen parameter as 47

### Word Embedding and padding

In [62]:
total_words_sentence=47
embedded_document=pad_sequences(onehot_representation, padding='post', maxlen=total_words_sentence)
embedded_document[:5]

array([[9672, 8240, 9699,  326, 4012, 3146, 6269, 4411, 4710, 4738,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [2335, 2419, 6790, 9197, 1840, 8184,  805,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [9621, 5955, 4674,  768,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [6365, 2587, 2383, 2002,

In [63]:
print(X[1])
print(onehot_representation[1])
print(embedded_document[1])

flynn hillary clinton big woman campus breitbart
[2335, 2419, 6790, 9197, 1840, 8184, 805]
[2335 2419 6790 9197 1840 8184  805    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0]


### Finalizing input and output features

In [64]:
X_final=np.array(embedded_document)
y_final=np.array(y)
print(X_final.shape, y_final.shape)

(18285, 47) (18285,)


### Train test split

In [65]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.2, random_state=42)
print(x_train.shape, y_train.shape,x_test.shape,y_test.shape)

(14628, 47) (14628,) (3657, 47) (3657,)


### Building model

#### Parameters:
- input_dim : Size of the vocabulary
- output_dim : Length of the vector for each word
- input_length : Maximum length of a sequence

In [82]:
embedding_vector_features=40
model=Sequential()
model.add(Embedding(input_dim=voc_size, output_dim=embedding_vector_features, input_length=total_words_sentence))
model.add(LSTM(32))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 47, 40)            400000    
                                                                 
 lstm_4 (LSTM)               (None, 32)                9344      
                                                                 
 dense_4 (Dense)             (None, 1)                 33        
                                                                 
Total params: 409,377
Trainable params: 409,377
Non-trainable params: 0
_________________________________________________________________
None


### Model Training

In [83]:
model.fit(x_train, y_train, epochs=5, batch_size=32, validation_data=(x_test,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fdeec134250>

### Model Evaluation

In [84]:
y_pred=model.predict(x_test)
y_pred=np.where(y_pred >= 0.5, 1, 0)



In [85]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[1949,  133],
       [1367,  208]])

In [86]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.59      0.94      0.72      2082
           1       0.61      0.13      0.22      1575

    accuracy                           0.59      3657
   macro avg       0.60      0.53      0.47      3657
weighted avg       0.60      0.59      0.50      3657



### Adding dropout layer to suppress some neurons

In [87]:
from tensorflow.keras.layers import Dropout
embedding_vector_features=40
model=Sequential()
model.add(Embedding(input_dim=voc_size, output_dim=embedding_vector_features, input_length=total_words_sentence))
model.add(Dropout(0.25))
model.add(LSTM(32))
model.add(Dropout(0.25))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.fit(x_train, y_train, epochs=5, batch_size=32, validation_data=(x_test,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fdee0cd11e0>

In [88]:
y_pred=model.predict(x_test)
y_pred=np.where(y_pred >= 0.5, 1, 0)



In [89]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[1909  173]
 [ 125 1450]]
              precision    recall  f1-score   support

           0       0.94      0.92      0.93      2082
           1       0.89      0.92      0.91      1575

    accuracy                           0.92      3657
   macro avg       0.92      0.92      0.92      3657
weighted avg       0.92      0.92      0.92      3657

