### Fake News Classifier using LSTM

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/fnc_train.csv')

In [3]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [5]:
df = df.dropna()

In [6]:
X = df.drop('label', axis=1)
y = df['label']

In [8]:
X.shape

(18285, 4)

In [9]:
import tensorflow as tf
tf.__version__

'2.18.0'

In [74]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Input, Dense, LSTM, Dropout
from tensorflow.keras.models import Sequential

In [13]:
 voc_size = 5000

In [14]:
# one hot representation

In [15]:
messages = X.copy()

In [16]:
messages.reset_index(inplace=True)

In [20]:
import nltk
import re
from nltk.corpus import stopwords  # used to clean data (symbols, spaces)

In [24]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []

for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]',' ', messages['title'][i])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [None]:
corpus

In [30]:
one_hot_rep = [one_hot(words, voc_size) for words in corpus]

In [38]:
len(max(one_hot_rep, key=len))

47

In [39]:
##### Embedding Representation
sent_len = 20
Embedded_docs = pad_sequences(one_hot_rep, padding='pre', maxlen=sent_len)

In [42]:
Embedded_docs

array([[   0,    0,    0, ..., 4012, 2295, 3878],
       [   0,    0,    0, ..., 1946,  441, 1403],
       [   0,    0,    0, ...,  965,  776, 1145],
       ...,
       [   0,    0,    0, ...,  736, 3623, 4830],
       [   0,    0,    0, ..., 2716, 1397, 4614],
       [   0,    0,    0, ...,  259, 3473, 1559]], dtype=int32)

In [75]:
# model
embedded_vector_features = 40  #dimensions
model = Sequential()
model.add(Input(shape=(sent_len,)))
model.add(Embedding(voc_size, embedded_vector_features))
model.add(LSTM(100)) # one layer -100 neurons
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])

In [76]:
model.summary()

In [77]:
import numpy as np
X_final = np.array(Embedded_docs)
y_final = np.array(y)

In [78]:
X_final.shape, y.shape

((18285, 20), (18285,))

In [79]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)


In [80]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

Epoch 1/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 53ms/step - accuracy: 0.7920 - loss: 0.4259 - val_accuracy: 0.9120 - val_loss: 0.2058
Epoch 2/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 49ms/step - accuracy: 0.9468 - loss: 0.1380 - val_accuracy: 0.9185 - val_loss: 0.1959
Epoch 3/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 48ms/step - accuracy: 0.9634 - loss: 0.0991 - val_accuracy: 0.9167 - val_loss: 0.2182
Epoch 4/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 51ms/step - accuracy: 0.9746 - loss: 0.0776 - val_accuracy: 0.9148 - val_loss: 0.2541
Epoch 5/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 48ms/step - accuracy: 0.9808 - loss: 0.0617 - val_accuracy: 0.9114 - val_loss: 0.2786
Epoch 6/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 48ms/step - accuracy: 0.9871 - loss: 0.0390 - val_accuracy: 0.9140 - val_loss: 0.3584
Epoch 7/10
[1m192/

<keras.src.callbacks.history.History at 0x1f8edd84790>

In [89]:
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype('int')

[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step


In [90]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [91]:
confusion_matrix(y_test, y_pred)

array([[3127,  292],
       [ 228, 2388]])

In [92]:
accuracy_score(y_test, y_pred)

0.9138359569179785