FAKE NEWS CLASSIFICATION USING LSTM NEURAL NETWORK

In this notebook the different stages to train a model to correctly classify fake news are performed. 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
import nltk
import re
from nltk.corpus import stopwords





In [2]:
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
test.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [4]:
print(train.shape)
print(test.shape)

(20800, 5)
(5200, 4)


In [5]:
train.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [6]:
test.isnull().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [7]:
def filling_NAN(train_data, test_data):
    train=train_data.fillna(" ")
    test=test_data.fillna(" ")
    return train, test

train, test=filling_NAN(train,test)

In [8]:
train['content']=train['title'] + " " + train['author']
test['content']=test['title'] + " " + test['author']

In [9]:
x=train.drop(columns=['label'], axis=1)
y=train['label']

In [10]:
#resetting the index

messages=x.copy()
messages_test=test.copy()
messages.reset_index(inplace=True)
messages_test.reset_index=True

DATA PROCESSING
1) Remove all sequences except english ones
2) characters are converted to lower case to avoid false predictions
3) all sentences are tokenized into words
4) stemming is then applied to the tokenized words
5) words are joined together and stored in the corpus

In [11]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def prepocessing(data):
    corpus = []
    for i in range(0,len(data)):
        review = re.sub('[^a-zA-Z]',' ',data['content'][i])
        review = review.lower()
        review = review.split()
        review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
        review = ' '.join(review)
        corpus.append(review)
    return corpus

train_corpus=prepocessing(messages)
test_corpus=prepocessing(messages_test)


the words are then converted to one-hot vectors with a vocabulary size = 5000

In [12]:
vocab_size=5000
one_hot_train=[]
for word in train_corpus:
    one_hot_word=one_hot(word, vocab_size)
    one_hot_train.append(one_hot_word)

one_hot_test=[]
for word in test_corpus:
    one_hot_word=one_hot(word, vocab_size)
    one_hot_test.append(one_hot_word)

Applying padding so that each sequence has the same length=20. this is done with an embedding layer

In [13]:
total_length=20
embedded_train=pad_sequences(one_hot_train, padding='pre', maxlen=total_length)
embedded_test=pad_sequences(one_hot_test, padding='pre', maxlen=total_length)


In [14]:
#converting into array
x_train_final=np.array(embedded_train)
y_train_final=np.array(y)
x_test_final=np.array(embedded_test)
print(x_test_final.shape, x_train_final.shape, y_train_final.shape)


(5200, 20) (20800, 20) (20800,)


Creating the models.


Three different models will be created and compared: logistic regression, naive bayes and LSTM neural network

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

x_train, x_test, y_train, y_test=train_test_split(x_train_final, y_train_final,
                                                  train_size=0.1,
                                                  random_state=42)


1. LOGISTIC REGRESSION 

In [16]:
model_LR=LogisticRegression(max_iter=600)
model_LR.fit(x_train, y_train )
predictions_LR=model_LR.predict(x_test)
model_LR.score(x_test, y_test)

0.7549679487179487

In [17]:
model_LR_report=classification_report(y_test, predictions_LR)
print(predictions_LR)
print(model_LR_report)

[1 1 0 ... 0 0 1]
              precision    recall  f1-score   support

           0       0.75      0.76      0.76      9338
           1       0.76      0.75      0.75      9382

    accuracy                           0.75     18720
   macro avg       0.75      0.75      0.75     18720
weighted avg       0.76      0.75      0.75     18720



2. NAIVE BAYES

In [18]:
model_NB=MultinomialNB()
model_NB.fit(x_train, y_train)
predictions_NB=model_NB.predict(x_test)
print(predictions_NB)
print(model_NB.score(x_test, y_test))
model_NB_report=classification_report(y_test, predictions_NB)
print(model_NB_report)

[1 1 0 ... 0 0 1]
0.7196047008547009
              precision    recall  f1-score   support

           0       0.74      0.68      0.71      9338
           1       0.71      0.76      0.73      9382

    accuracy                           0.72     18720
   macro avg       0.72      0.72      0.72     18720
weighted avg       0.72      0.72      0.72     18720



3. LSTM 

In [19]:
embedding_feature_vector=40
model=Sequential()
model.add(Embedding(vocab_size,embedding_feature_vector, input_length=total_length ))
model.add(Dropout(0.3))
model.add(LSTM(200))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 40)            200000    
                                                                 
 dropout (Dropout)           (None, 20, 40)            0         
                                                                 
 lstm (LSTM)                 (None, 200)               192800    
                                                                 
 dropout_1 (Dropout)         (None, 200)               0         
                                                                 
 dense (Dense)               (None, 1)                 201       
                                                                 
Total params: 393001 (1.50 MB)
Trainable params: 393001 (1.50 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [26]:
model.fit(x_train, y_train,
          validation_data=(x_test, y_test),
          epochs=10,
          batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x25d29955a10>

In [29]:

y_pred = model.predict(x_test)



In [30]:
y_pred_ = np.argmax(model.predict(x_test),axis=1)



In [31]:
print(y_test)
print(y_pred)

[1 1 0 ... 1 1 1]
[[0.4993032 ]
 [0.50030386]
 [0.4988399 ]
 ...
 [0.49982983]
 [0.5010666 ]
 [0.4998441 ]]


In [37]:
model_report=classification_report(y_test, y_pred_, zero_division=1)
print(model_report)

              precision    recall  f1-score   support

           0       0.50      1.00      0.67      9338
           1       1.00      0.00      0.00      9382

    accuracy                           0.50     18720
   macro avg       0.75      0.50      0.33     18720
weighted avg       0.75      0.50      0.33     18720



In [36]:
score_LR = accuracy_score(y_test,predictions_LR)
score_NB = accuracy_score(y_test,predictions_NB)
score_LSTM = accuracy_score(y_test,y_pred_)

results = pd.DataFrame([["Logistic Regression",score_LR],["Naive Bayes",score_NB],["LSTM",score_LSTM]],columns=["Model","Accuracy"])
results

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.754968
1,Naive Bayes,0.719605
2,LSTM,0.498825
