In [33]:
import numpy as np
import pandas as pd
import torch 
import torch.nn as nn 
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from keras.preprocessing import text,sequence
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout
from sklearn.metrics import accuracy_score
import random
from torch.utils.data import DataLoader,Dataset
from torch import optim
import os
#NLP tools
import nltk 
nltk.download('wordnet')
nltk.download("stopwords")   
nltk.download('punkt')
nltk.download('omw-1.4')
from nltk.corpus import stopwords  
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer 
import re

[nltk_data] Downloading package wordnet to /home/jupyter-
[nltk_data]     mwohl4/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jupyter-
[nltk_data]     mwohl4/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter-
[nltk_data]     mwohl4/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jupyter-
[nltk_data]     mwohl4/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [25]:
#overview the data
df1 = pd.read_csv('Fake.csv')
df2 = pd.read_csv('True.csv')
df1['label'] = 0
df2['label'] = 1
df = pd.concat([df1, df2], axis=0)
df.title.head(15)

0      Donald Trump Sends Out Embarrassing New Year’...
1      Drunk Bragging Trump Staffer Started Russian ...
2      Sheriff David Clarke Becomes An Internet Joke...
3      Trump Is So Obsessed He Even Has Obama’s Name...
4      Pope Francis Just Called Out Donald Trump Dur...
5      Racist Alabama Cops Brutalize Black Boy While...
6      Fresh Off The Golf Course, Trump Lashes Out A...
7      Trump Said Some INSANELY Racist Stuff Inside ...
8      Former CIA Director Slams Trump Over UN Bully...
9      WATCH: Brand-New Pro-Trump Ad Features So Muc...
10     Papa John’s Founder Retires, Figures Out Raci...
11     WATCH: Paul Ryan Just Told Us He Doesn’t Care...
12     Bad News For Trump — Mitch McConnell Says No ...
13     WATCH: Lindsey Graham Trashes Media For Portr...
14     Heiress To Disney Empire Knows GOP Scammed Us...
Name: title, dtype: object

In [26]:
#Check for null values
df.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [27]:
#get rid of unwanted columns
del df['text']
del df['subject']
del df['date']

In [28]:
#Clean up title data
def clean_text(text):
    text = text.lower()
    text = text.replace("watch:",'') #gets rid of weird WATCH tag
    test = text.replace('\[[^]]*\]', '') #gets rid of punctuation
    text = text.replace(r"\#",'')
    text = text.replace(r"[^A-Za-z0-9()!?\'\`\"]",' ') #non-alphabet and numbers get replaced with space
    text = text.replace("\s{2,}",' ')
    return text

#do some language processing
def remove_stopwords_and_lemmatization(text):
    final_text = []
    text = nltk.word_tokenize(text)
    for word in text:
        if word not in set(stopwords.words('english')):
            lemma = WordNetLemmatizer()
            word = lemma.lemmatize(word) 
            final_text.append(word)
    return " ".join(final_text)

#Total function
def text_munging(text):
    text = clean_text(text)
    text = remove_stopwords_and_lemmatization(text)
    return text

#Apply function on text column
df['title_mod'] = df['title'].apply(text_munging)



In [29]:
df.title_mod.head(15)   

0     donald trump sends embarrassing new year ’ eve...
1     drunk bragging trump staffer started russian c...
2     sheriff david clarke becomes internet joke thr...
3     trump obsessed even obama ’ name coded website...
4     pope francis called donald trump christmas speech
5     racist alabama cop brutalize black boy handcuf...
6     fresh golf course , trump lash fbi deputy dire...
7     trump said insanely racist stuff inside oval o...
8     former cia director slam trump un bullying , o...
9     brand-new pro-trump ad feature much * * kissin...
10    papa john ’ founder retires , figure racism ba...
11    paul ryan told u ’ care struggling family livi...
12    bad news trump — mitch mcconnell say repealing...
13    lindsey graham trash medium portraying trump ‘...
14    heiress disney empire know gop scammed u – shr...
Name: title_mod, dtype: object

In [41]:
#train on fake news set
X_train = df['title_mod']
y_train = df['label']

In [42]:
max_features = 10000
maxlen = 300
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)
tokenized_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(tokenized_train, maxlen=maxlen)
tokenized_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(tokenized_test, maxlen=maxlen)

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

In [None]:
batch_size = 256
epochs = 10
embed_size = 100
model = Sequential()
#Non-trainable embeddidng layer
model.add(Embedding(max_features, output_dim=embed_size, input_length=maxlen, trainable=False))
#LSTM 
model.add(LSTM(units=128 , return_sequences = True , recurrent_dropout = 0.25 , dropout = 0.25))
model.add(LSTM(units=64 , recurrent_dropout = 0.1 , dropout = 0.1))
model.add(Dense(units = 32 , activation = 'relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer=keras.optimizers.Adam(lr = 0.01), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [43]:
history = model.fit(X_train, y_train, validation_split=0.3, epochs=10, batch_size=batch_size, shuffle=True, verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [46]:
#time to see what the model says!
test_title = input("Enter title of news article to see if model predicts it is true or fake: ")
#you can find fake news examples here: https://libguides.valenciacollege.edu/c.php?g=612299&p=4251645

Enter title of news article to see if model predicts it is true or fake:  Coronavirus Bioweapon – How China Stole Coronavirus From Canada And Weaponized It


In [54]:
test_title_mod = text_munging(test_title)
tokenizer.fit_on_texts(test_title_mod)
tokenized_predict = tokenizer.texts_to_sequences(test_title_mod)
test_title_mod = pad_sequences(tokenized_predict, maxlen=maxlen)
perc = model.predict(test_title_mod)[1]*100
print('according to this model, the likelihood that this story is true based on the title is ' + str(perc))

according to this model, the likelihood that this story is true based on the title is [16.31644]
