## Fake News Classification using LSTM

Dataset used: [Fake News](https://www.kaggle.com/datasets/algord/fake-news)

In [None]:
import pandas as pd


In [None]:
# Location of the FakeNewsNet CSV (the /content prefix suggests a Colab runtime —
# adjust the path when running elsewhere).
csv_path = '/content/sample_data/FakeNewsNet.csv'
data = pd.read_csv(csv_path)

In [None]:
# Peek at the first five rows to check the columns and the label field.
data.head(n=5)

Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


In [None]:
# Count missing values per column.
data.isna().sum()

Unnamed: 0,0
title,0
news_url,330
source_domain,330
tweet_num,0
real,0


In [None]:
# Simple network using only the title text.
# Preprocessing: strip non-alphanumerics, lowercase, drop stopwords, stem.
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

nltk.download('stopwords')

# Build the stopword set once. Calling stopwords.words() for every token rebuilds
# the word list each time and makes every membership test a linear scan.
english_stopwords = set(stopwords.words('english'))

# 'A-Z', not 'A-z': the range 'A-z' also covers the characters [ \ ] ^ _ and `,
# so those would survive the cleanup instead of being replaced by a space.
non_alphanumeric = re.compile('[^a-zA-Z0-9]')

ps = PorterStemmer()

corpus = []

# Iterate over the column values directly rather than data['title'][i], which is
# a label lookup and would fail if the frame did not have a 0..n-1 index.
for title in data['title']:
  tokens = non_alphanumeric.sub(' ', title).lower().split()
  stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
  corpus.append(' '.join(stems))




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Raw title of the first row, for comparison with its preprocessed form.
data.loc[0, 'title']

"Kandi Burruss Explodes Over Rape Accusation on 'Real Housewives of Atlanta' Reunion (Video)"

In [None]:
# First preprocessed title: lowercased, stopwords removed, remaining words stemmed.
corpus[0]

'kandi burruss explod rape accus real housew atlanta reunion video'

In [None]:
from keras.layers import Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from keras.layers import LSTM
from keras.layers import Dense

In [None]:
vocabulary_size = 5000
# Encode each preprocessed title as a list of integer word indices bounded by
# vocabulary_size.
# NOTE(review): one_hot is hash-based, so distinct words may share an index —
# confirm against the Keras documentation before relying on the mapping.
X = [one_hot(sent, vocabulary_size) for sent in corpus]
# Preview a handful of encoded titles; echoing the full list prints one row per
# headline and floods the notebook output.
X[:5]

[[1224, 2878, 1181, 1292, 1651, 1105, 4833, 3242, 2656, 435],
 [96, 453, 2780, 227, 3140, 582, 2414, 2504],
 [4965,
  1482,
  85,
  2016,
  2672,
  2311,
  146,
  2952,
  124,
  30,
  4901,
  2562,
  2194,
  2835,
  2523],
 [4474, 406, 3545, 1649, 4794, 2990, 6, 4371],
 [2913,
  2715,
  3808,
  2305,
  2115,
  4087,
  2419,
  2076,
  1273,
  957,
  1183,
  742,
  4966,
  990],
 [1602, 4524, 2007, 4413, 4149, 1436, 3957, 1935, 3284],
 [2980, 32, 3333, 3160, 3146, 1748],
 [2826, 2187, 910, 4591, 2672, 3341, 2746, 2205, 608, 3929],
 [3337, 1007, 4506, 1059, 2911, 4039, 2016, 4122, 327, 2003, 30, 4901],
 [4519, 2490, 2105, 1691, 3066, 4553],
 [1279, 74, 4901, 3239, 1867, 4017, 269, 198, 4855],
 [1737, 2975, 1125, 1084, 3628, 4064, 985],
 [567, 1601, 4217, 3585, 1083, 2311, 3821, 4187, 4410, 363, 4735, 1546],
 [45, 2879, 1008, 332, 1537, 1719, 1173, 230],
 [2109, 3669, 3910, 3107, 4769, 1840],
 [1719, 1992, 4298, 2764, 3665, 3354, 3547, 3609, 4471],
 [1832, 2465, 3837, 2011, 903, 2613, 4165

In [None]:
# Left-pad every encoded title up to the length of the longest one so the
# sequences form a rectangular integer array.
max_sent_length = max(map(len, X))
X = pad_sequences(X, maxlen=max_sent_length, padding='pre')
X


array([[   0,    0,    0, ..., 3242, 2656,  435],
       [   0,    0,    0, ...,  582, 2414, 2504],
       [   0,    0,    0, ..., 2194, 2835, 2523],
       ...,
       [   0,    0,    0, ..., 4770, 4065, 2924],
       [   0,    0,    0, ..., 3221, 2461, 3284],
       [   0,    0,    0, ..., 1442, 4175, 2780]], dtype=int32)

In [None]:
embedding_vector_features = 40

# Embedding -> LSTM -> single sigmoid unit, declared as one layer list.
model = Sequential([
  Embedding(vocabulary_size, embedding_vector_features),
  LSTM(100),
  Dense(1, activation='sigmoid'),
])

# Binary target, so train with binary cross-entropy and report accuracy.
model.compile(
  optimizer='adam',
  loss='binary_crossentropy',
  metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer overview of the compiled model.
model.summary()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  data['real'],
  test_size=0.2,
  random_state=42,
)

In [None]:
# Sanity-check the split sizes: features and labels must line up row for row.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((18556, 35), (4640, 35), (18556,), (4640,))

In [None]:
# Keep the returned History object: it carries the per-epoch loss/accuracy for
# later inspection, and assigning it stops the cell from echoing a
# memory-address repr that changes on every run.
history = model.fit(X_train, y_train, epochs=10, batch_size=64)

Epoch 1/10
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 29ms/step - accuracy: 0.7679 - loss: 0.5213
Epoch 2/10
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 29ms/step - accuracy: 0.8565 - loss: 0.3391
Epoch 3/10
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 29ms/step - accuracy: 0.8740 - loss: 0.2964
Epoch 4/10
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 28ms/step - accuracy: 0.8889 - loss: 0.2618
Epoch 5/10
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 29ms/step - accuracy: 0.9041 - loss: 0.2328
Epoch 6/10
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 29ms/step - accuracy: 0.9184 - loss: 0.2118
Epoch 7/10
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 29ms/step - accuracy: 0.9247 - loss: 0.1884
Epoch 8/10
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 28ms/step - accuracy: 0.9338 - loss: 0.1720
Epoch 9/10
[1m290/290[0m 

<keras.src.callbacks.history.History at 0x78bfcb89a710>

In [None]:
# Raw model outputs for the held-out titles; the network ends in a single
# sigmoid unit, so each row holds one score.
ypred = model.predict(X_test)

[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step


In [None]:
# Turn each predicted score into a hard 0/1 label, cutting at 0.5:
# below the cutoff is class 0, everything else is class 1.
y_pred = []
for row in ypred:
  score = row[0]
  y_pred.append(0 if score < 0.5 else 1)

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted ones.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[ 648,  483],
       [ 509, 3000]])

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Overall share of held-out titles classified correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7862068965517242

In [None]:
# Per-class precision/recall/F1; printed because the report is a multi-line string.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.56      0.57      0.57      1131
           1       0.86      0.85      0.86      3509

    accuracy                           0.79      4640
   macro avg       0.71      0.71      0.71      4640
weighted avg       0.79      0.79      0.79      4640

