<a href="https://colab.research.google.com/github/thedataninja1786/Machine-Learning/blob/main/Fake_news_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fake news classifier using an LSTM network

## Importing the necessary modules


In [None]:
!pip install wandb 
from time import time 
import re 
import sys 
import matplotlib.pyplot as plt 
from matplotlib import rcParams
import seaborn as sns
import nltk 
# Fetch the NLTK resources requested by this notebook.
nltk.download('punkt')
# The tagger resource id is spelled "perceptron"; the previous id
# 'averaged_perception_tagger' does not exist in the NLTK index, so that
# download silently failed.
nltk.download('averaged_perceptron_tagger')
# Stop-word lists, used by the preprocessing step further down.
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import itertools 
import datetime 
import pprint 
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split
import os 
import pandas as pd
import numpy as np 
from google.colab import drive 
from sklearn.utils import shuffle
import wandb
from wandb.keras import WandbCallback
from keras.preprocessing import sequence 
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation 
from keras.layers import Embedding, LSTM 
from keras.layers import Conv1D, Flatten, MaxPooling1D
from keras.preprocessing.sequence import pad_sequences 
from keras.utils.np_utils import to_categorical
from keras.preprocessing import text 
import tensorflow as tf
drive.mount('/content/drive')

## Loading the data and creating labels

In [None]:
# Read the news file from the mounted Drive and keep only the article body
# and its label (as an independent copy, not a view of the full frame).
data = pd.read_csv('/content/drive/My Drive/news.csv').loc[:, ['text', 'label']].copy()

# Force every article body to a string so the text-cleaning helpers can
# process each row uniformly.
data['text'] = data['text'].astype(str)

# Encode the target as an integer: 1 for 'FAKE', 0 for anything else.
data['label'] = (data['label'] == 'FAKE').astype('int64')
data.head()

In [None]:
# Count the articles in each class (label 1 was assigned to 'FAKE', 0 to the rest).
fake_news = data['label'].eq(1).sum()
true_news = data['label'].eq(0).sum()

f'The dataset comprises of {fake_news} articles of fake news and {true_news} articles of true news.'

'The dataset comprises of 3164 articles of fake news and 3171 articles of true news.'

## Data preprocessing

In [None]:
import re 
import string 

def remove_URL(text):
  """Return `text` with every URL removed.

  A URL is any run of non-whitespace characters that starts with
  `http://`, `https://` or `www.`. Surrounding whitespace is left as is.
  """
  return re.sub(r'https?://\S+|www\.\S+', r"", text)

def remove_punct(text):
  """Return `text` with every character from `string.punctuation` deleted.

  Only the ASCII punctuation listed in `string.punctuation` is removed;
  any other character (e.g. typographic quotes) passes through unchanged.
  """
  # Mapping a code point to None tells str.translate to drop that character.
  deletion_table = dict.fromkeys(map(ord, string.punctuation))
  return text.translate(deletion_table)

# English stop words as a set, for fast membership checks.
stop = set(stopwords.words('english'))

def remove_stopwords(text):
  """Lower-case `text` and drop every word found in `stop`.

  The text is split on whitespace, and the surviving lower-cased words are
  re-joined with single spaces.
  """
  lowered_tokens = (token.lower() for token in text.split())
  return " ".join(token for token in lowered_tokens if token not in stop)


In [None]:
# Run the cleaning steps in order: URLs first (while their punctuation is
# still intact), then punctuation, then stop words.
for clean_step in (remove_URL, remove_punct, remove_stopwords):
  data['text'] = data['text'].apply(clean_step)

In [None]:
#Counting unique words
from collections import Counter 

def count_words(text_col):
  """Count how often each whitespace-separated word occurs in a text column.

  Args:
    text_col: object exposing `.values` that yields strings
      (here, a pandas Series of article texts).

  Returns:
    collections.Counter mapping each word to its total number of occurrences.
  """
  count = Counter()
  for text in text_col.values:
    # Counter.update adds one per occurrence of each word in the iterable.
    count.update(text.split())
  return count

# Word frequencies over the whole cleaned corpus; len(counter) is used later
# as the vocabulary size for the tokenizer and the embedding layer.
counter = count_words(data['text'])
# Show only the most frequent words: an unbounded most_common() would print
# every distinct word in the corpus into the cell output.
counter.most_common(20)

## Creating the train and validation data

In [None]:
# 80% of the rows go to training, the remainder to validation.
train_size = int(data.shape[0] * 0.8)

# Shuffle before splitting so the validation set does not depend on the row
# order of the CSV, stratify on the label so both splits keep the same class
# balance, and fix the seed so a rerun gives the same split.
train_df, val_df = train_test_split(
    data,
    train_size=train_size,
    shuffle=True,
    stratify=data['label'],
    random_state=42,
)

#Split text and labels and convert to numpy arrays
train_sentences = train_df['text'].to_numpy()
train_labels = train_df['label'].to_numpy()
val_sentences = val_df['text'].to_numpy()
val_labels = val_df['label'].to_numpy()

In [None]:
# Sanity check: number of articles in the training and validation splits.
train_sentences.shape, val_sentences.shape

## Creating the tokenizer

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

#Vectorize a text corpus by turning each sentence into a sequence of integers
# num_words is set to the number of distinct words counted over the whole
# cleaned corpus (`counter`), while the vocabulary itself is fitted on the
# training sentences only.
tokenizer = Tokenizer(num_words = len(counter))
tokenizer.fit_on_texts(train_sentences)

In [None]:
#Now each word has a unique index
word_index = tokenizer.word_index
# Preview a handful of entries only; displaying the whole mapping would print
# the entire vocabulary into the cell output.
dict(itertools.islice(word_index.items(), 20))

In [None]:
# Convert each split's sentences into lists of word indices with the fitted tokenizer.
train_sequences, val_sequences = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (train_sentences, val_sentences)
)

In [None]:
#Apply padding so all the sequences have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed number of word indices kept per article.
max_length = 100

# Pad short sequences at the end, and cut long ones at the end, so that every
# row holds exactly max_length entries.
train_padded, val_padded = (
    pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    for sequences in (train_sequences, val_sequences)
)
train_padded.shape, val_padded.shape

((5068, 100), (1267, 100))

## Creating the model

In [None]:
#Creating the LSTM model
# Bind `keras` to TensorFlow's bundled Keras. The layers below come from
# `tensorflow.keras`, so the model container (and the loss/optimizer created
# later through `keras.*`) must come from the same implementation; mixing the
# standalone `keras` package with `tensorflow.keras` objects is a known source
# of type errors.
from tensorflow import keras
from tensorflow.keras import layers

model = keras.models.Sequential()
# Embedding: one 32-dimensional vector per vocabulary entry (vocabulary size =
# number of distinct words in `counter`), for inputs of max_length indices.
model.add(layers.Embedding(len(counter), 32, input_length=max_length))
# A single LSTM layer with 64 units and 10% input dropout.
model.add(layers.LSTM(64, dropout=0.1))
# One sigmoid unit: the output is a single score in (0, 1), matching the 0/1 labels.
model.add(layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 32)           3507392   
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 3,532,289
Trainable params: 3,532,289
Non-trainable params: 0
_________________________________________________________________


In [None]:
# The last layer already applies a sigmoid, so the loss receives
# probabilities rather than logits.
loss = keras.losses.BinaryCrossentropy(from_logits=False)
# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases reject.
optim = keras.optimizers.Adam(learning_rate=0.0001)
metrics = ['accuracy']

model.compile(loss=loss, optimizer=optim, metrics=metrics)

In [None]:
# Train for 25 epochs, evaluating on the validation split after each one;
# verbose=2 prints one summary line per epoch.
model.fit(
    x=train_padded,
    y=train_labels,
    epochs=25,
    validation_data=(val_padded, val_labels),
    verbose=2,
)

Epoch 1/25
159/159 - 14s - loss: 0.6920 - accuracy: 0.5298 - val_loss: 0.6879 - val_accuracy: 0.5714
Epoch 2/25
159/159 - 14s - loss: 0.6218 - accuracy: 0.6523 - val_loss: 0.4083 - val_accuracy: 0.8414
Epoch 3/25
159/159 - 14s - loss: 0.3297 - accuracy: 0.8777 - val_loss: 0.2769 - val_accuracy: 0.8966
Epoch 4/25
159/159 - 14s - loss: 0.1515 - accuracy: 0.9546 - val_loss: 0.2361 - val_accuracy: 0.9061
Epoch 5/25
159/159 - 14s - loss: 0.0769 - accuracy: 0.9801 - val_loss: 0.2233 - val_accuracy: 0.9203
Epoch 6/25
159/159 - 14s - loss: 0.0399 - accuracy: 0.9903 - val_loss: 0.2244 - val_accuracy: 0.9203
Epoch 7/25
159/159 - 14s - loss: 0.0272 - accuracy: 0.9951 - val_loss: 0.3055 - val_accuracy: 0.9116
Epoch 8/25
159/159 - 14s - loss: 0.0197 - accuracy: 0.9966 - val_loss: 0.3431 - val_accuracy: 0.9084
Epoch 9/25
159/159 - 14s - loss: 0.0150 - accuracy: 0.9976 - val_loss: 0.3363 - val_accuracy: 0.9084
Epoch 10/25
159/159 - 14s - loss: 0.0129 - accuracy: 0.9972 - val_loss: 0.3837 - val_accura

<tensorflow.python.keras.callbacks.History at 0x7f575d2a9668>

In [None]:
# Score the training articles and turn each sigmoid output into a hard 0/1
# label using a 0.5 threshold (strictly greater than 0.5 -> 1).
predictions = (model.predict(train_padded) > 0.5).astype(int).ravel().tolist()

In [None]:
# Show the first five training texts, their true labels and the predicted
# labels, one after another.
print(train_sentences[0:5], train_labels[0:5], predictions[0:5], sep='\n')

## Saving the tokenizer and the model

In [None]:
import pickle 

# Persist the fitted tokenizer next to the model so new text can later be
# mapped to the same word indices the model was trained on.
with open('/content/drive/My Drive/tokenizer.pickle', 'wb') as tokenizer_file:
    tokenizer_file.write(pickle.dumps(tokenizer, protocol=pickle.HIGHEST_PROTOCOL))

# Write the trained model to Drive.
model.save('/content/drive/My Drive/Fake_news_classifier')

## Testing the model on some new data

In [None]:
#Importing the necessary modules 
import requests 
from bs4 import BeautifulSoup as bs 
from urllib.parse import urljoin 
import time 
import pandas as pd 
import re 
from datetime import datetime as dt
import datetime
import os 


In [None]:
#Function for getting the article's data 
def get_article_info(url):
  """Download one article page and extract its title, publication date and body.

  Args:
    url: address of the article page.

  Returns:
    dict with the keys 'title', 'published_date' and 'content', each holding
    the text of the first element that carries the matching CSS class.

  Raises:
    requests.HTTPError: the server answered with an error status.
    requests.Timeout: the server did not answer within the timeout.
    AttributeError: the page has no element with one of the expected classes.
  """
  # A timeout keeps the notebook from hanging forever on an unresponsive
  # server; raise_for_status surfaces HTTP errors here instead of letting an
  # error page fail later with an obscure parsing error.
  r = requests.get(url, timeout=30)
  r.raise_for_status()
  # Name the parser explicitly: otherwise BeautifulSoup picks whichever one
  # happens to be installed (and warns), so results can differ by environment.
  soup = bs(r.content, 'html.parser')
  articles = {
      'title': soup.find(class_="StandardHeader__title").get_text(),
      'published_date': soup.find(class_="PublicationTime__date").get_text(),
      'content': soup.find(class_="Body__content").get_text(),
  }
  return articles


In [None]:
#Collecting the article links from the website's politics page

# A timeout prevents an indefinite hang; raise_for_status stops the cell on an
# HTTP error instead of parsing an error page.
r = requests.get('http://www.thedailybeast.com/politics.html', timeout=30) #website that publishes political articles
r.raise_for_status()
# Explicit parser so the result does not depend on which parsers are installed.
soup = bs(r.content, 'html.parser')
links = []
urls = soup.find_all(class_="GridStory__title-link")
for url in urls:
  anchor = url.find(class_='TrackingLink')
  # Skip entries without a usable link rather than crashing on them.
  if anchor is None or not anchor.get('href'):
    continue
  # Resolve relative hrefs against the page address; absolute ones pass through unchanged.
  links.append(urljoin(r.url, anchor['href']))

links


In [None]:
#Getting all the necessary info for all the articles
list_of_articles = [get_article_info(link) for link in links]

# Build the frame from the collected records. The previous code passed
# `articles` to the constructor, a name that is not defined at this point
# (it only exists as a local inside get_article_info), which raises a
# NameError on a fresh kernel.
articles = pd.DataFrame(list_of_articles)
articles.head()

In [None]:
#Making prediction with the new articles
contents = articles['content'].to_list()

# Clean the scraped text with the same steps that were applied to the training
# data; otherwise the model sees stop words and URLs it never saw in training.
contents_clean = [remove_stopwords(remove_punct(remove_URL(text))) for text in contents]

# pad_sequences takes the list of index lists directly. Wrapping ragged lists
# in np.array builds an object array, which recent NumPy versions refuse to create.
contents_seq = tokenizer.texts_to_sequences(contents_clean)

# Use the training padding settings: same length, and keep the FIRST
# max_length words (truncating='post') instead of the default, which keeps the last ones.
contents_pad = pad_sequences(contents_seq, maxlen=max_length, padding='post', truncating='post')

# Sigmoid output above 0.5 means class 1, which was encoded from 'FAKE'.
predictions_2 = ['FAKE' if p > 0.5 else 'TRUE' for p in model.predict(contents_pad).ravel()]

In [None]:

# Pair every scraped article with its predicted label, one record per article.
ls = [
    {'Article': content, 'Label': label}
    for content, label in zip(contents, predictions_2)
]

classification = pd.DataFrame(ls)
classification

Unnamed: 0,Article,Label
0,As it seeks to overturn the election results a...,TRUE
1,Let’s say you’re standing next to some railroa...,TRUE
2,President Donald Trump’s administration signal...,FAKE
3,A former cabinet member once said to me that t...,TRUE
4,It’s been three weeks since Donald Trump lost ...,FAKE
5,"Antony Blinken, a longtime aide to President-e...",TRUE
6,Recalling the arduous 2008 path that ultimatel...,FAKE
7,"To understand the wrongness, ignorance, and ju...",TRUE
8,One of the first things Joe Biden can do to he...,TRUE
9,The number screamed at me and I actually wante...,FAKE
