In [1]:
import re

import pandas as pd
import numpy as np
import tensorflow as tf
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
import spacy

In [2]:
# Only tokenisation, tagging and lemmatisation are used in this notebook
# (see preprocess), so the dependency parser and NER components are switched
# off: they are the most expensive pipeline stages and are not needed to
# produce `token.lemma_`.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [3]:
# Load both CSV exports and tag each frame with its class label
# (is_False = 1 for the fake-news file, 0 for the real-news file),
# then stack them row-wise into a single frame.
fake_df = pd.read_csv(r"h:\Desktop\BI Projet\FakeNews.csv").assign(is_False=1)
real_df = pd.read_csv(r"h:\Desktop\BI Projet\RealNews.csv").assign(is_False=0)
df = pd.concat([fake_df, real_df], axis=0)

In [4]:
# `df` is already built by the loading cell; rebuilding it here was redundant
# and would silently undo later clean-up if this cell were re-run out of order.
# Preview the first rows only.
df.head()

Unnamed: 0,title,text,subject,date,is_False
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [5]:
# Fixed seed so the displayed rows are the same on every Restart & Run All
df.sample(5, random_state=0)

Unnamed: 0,title,text,subject,date,is_False
15323,EU parliament's Brexit negotiator: 'major issu...,BRUSSELS (Reuters) - Major issues must still...,worldnews,"November 8, 2017",0
16906,Doubts about smoking gun as Duterte lauds Chin...,"MANILA/MARAWI CITY, Philippines (Reuters) - Ph...",worldnews,"October 20, 2017",0
8626,Donna Brazile DESTROYS Palin For Blaming Her ...,Sarah Palin did something entirely expected of...,News,"January 21, 2016",1
16613,Catalan government mulling calling snap electi...,MADRID (Reuters) - The Catalan government is c...,worldnews,"October 24, 2017",0
14363,North Korean foreign minister heads to Cuba,HAVANA (Reuters) - North Korea s foreign minis...,worldnews,"November 20, 2017",0


In [6]:
# (rows, columns) of the fake-news frame, including the added is_False label column
fake_df.shape

(23481, 5)

In [7]:
# (rows, columns) of the real-news frame, including the added is_False label column
real_df.shape

(21417, 5)

In [8]:
# (rows, columns) of the combined frame; the row count should equal fake + real
df.shape

(44898, 5)

In [9]:
# Column dtypes, non-null counts and memory footprint of the combined frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44898 entries, 0 to 21416
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     44898 non-null  object
 1   text      44898 non-null  object
 2   subject   44898 non-null  object
 3   date      44898 non-null  object
 4   is_False  44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.1+ MB


In [10]:
# Number of missing values per column
df.isnull().sum()

title       0
text        0
subject     0
date        0
is_False    0
dtype: int64

In [11]:
# Summary statistics for every column (include='all' adds the non-numeric ones:
# count / unique / top / freq)
df.describe(include='all')

Unnamed: 0,title,text,subject,date,is_False
count,44898,44898.0,44898,44898,44898.0
unique,38729,38646.0,8,2397,
top,Factbox: Trump fills top jobs for his administ...,,politicsNews,"December 20, 2017",
freq,14,627.0,11272,182,
mean,,,,,0.522985
std,,,,,0.499477
min,,,,,0.0
25%,,,,,0.0
50%,,,,,1.0
75%,,,,,1.0


In [12]:
# Keep only the last occurrence of every distinct article body
is_repeat = df.duplicated(subset=['text'], keep="last")
df = df[~is_repeat]

# Numeric summary after de-duplication (only is_False is numeric)
df.describe()


Unnamed: 0,is_False
count,38646.0
mean,0.451638
std,0.497662
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [13]:
# Stop-word collection from the loaded spaCy pipeline's language defaults;
# preprocess() drops every token found in it
stop_words = nlp.Defaults.stop_words

In [14]:
# creating a function to preprocess all text
def preprocess(sentence):
    """Normalise one news text into a space-separated string of lemmas.

    Steps, in order:
      1. lowercase the text;
      2. remove URLs (``http://...`` and ``https://...``) entirely;
      3. replace every character that is not an ASCII letter with a space;
      4. drop tokens contained in the module-level ``stop_words``;
      5. lemmatise the remaining tokens with the module-level ``nlp`` pipeline.

    Parameters
    ----------
    sentence : str
        Raw article text.

    Returns
    -------
    str
        The lemmas joined by single spaces (empty string if nothing is left).
    """
    sentence = sentence.lower()
    # Strip the whole URL, not just the "https://" scheme: otherwise the host
    # and path fragments (e.g. "t", "co", random ids) survive as noise tokens,
    # and plain "http://" links were not handled at all.
    sentence = re.sub(r'https?://\S+', ' ', sentence)
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    # str.split() without arguments already collapses runs of whitespace,
    # so no separate normalisation pass is required before tokenising.
    kept = [word for word in sentence.split() if word not in stop_words]
    return ' '.join(token.lemma_ for token in nlp(' '.join(kept)))

In [15]:
# Raw article bodies as a plain Python list, in the row order of df
corpus = df['text'].tolist()
# Peek at the first article
corpus[0]

'Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.  The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year,  President Angry Pants tweeted.  2018 will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year. 2018 will be a great year for America!  Donald J. Trump (@realDonaldTrump) December 31, 2017Trump s tweet went down about as welll as you d expect.What kind of president sends a New Year s greeting like this despicable, petty, infantile gibberish? Only Trump! His lack of decency won t ev

In [None]:
# preprocessing all the news texts
import re
import tqdm

# Build the cleaned corpus from the raw `df['text']` column in one pass instead
# of overwriting `corpus` element by element. The in-place loop was not
# idempotent: an interrupted or repeated run left `corpus` partly processed
# and then fed already-cleaned text back through preprocess().
corpus = [preprocess(text) for text in tqdm.tqdm(df['text'])]


 86%|█████████████████████████████████████████████████████████████████▍          | 33255/38646 [36:32<03:40, 24.48it/s]

In [None]:
# hashed integer encoding of all news (each word -> id in [1, vocab_size))
from keras.preprocessing.text import hashing_trick

vocab_size = 10000

# `one_hot` hashes words with Python's built-in hash(), which is salted per
# interpreter session for strings, so the word -> id mapping changed on every
# kernel restart. The md5 hash gives the same ids on every run, which makes
# results reproducible and keeps a trained model usable in a later session.
ohe = [
    hashing_trick(sent, vocab_size, hash_function='md5')
    for sent in tqdm.tqdm(corpus)
]

In [None]:
# length (in tokens) of the longest encoded news article
m = max(len(encoded) for encoded in ohe)
m

In [None]:
# prepadding
from keras.utils import pad_sequences

# Bring every encoded article to the common length m; padding='pre' puts the
# filler values in front of the sequence
emb_doc = pad_sequences(ohe, maxlen = m, padding = 'pre')

In [None]:
# creating the model
from keras.models import Sequential
from keras.layers import Dropout, Embedding, Dense, LSTM

# size of the learned vector for each hashed word id
dim = 64
model = Sequential()
# word ids (0 .. vocab_size-1) -> dim-dimensional vectors, sequences of length m
model.add(Embedding(vocab_size, dim, input_length=m))
model.add(Dropout(0.3))
model.add(LSTM(256))
model.add(Dropout(0.3))
# single sigmoid unit: predicted probability that is_False == 1
model.add(Dense(1, activation='sigmoid'))
# `metrics` is passed as a list, the form documented by Keras; a bare string
# is rejected by newer Keras releases.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()

In [None]:
# Feature matrix: one padded id sequence per article
X = np.asarray(emb_doc)
# Labels as a plain NumPy array. `df` keeps the duplicated, non-contiguous
# index left over from concatenation and de-duplication; dropping it here
# guarantees purely positional pairing with the rows of X downstream.
Y = df['is_False'].to_numpy()
# Guard against hidden notebook state (e.g. `df` rebuilt after the corpus was
# encoded): features and labels must describe the same rows.
assert len(X) == len(Y), "X and Y are out of sync; restart the kernel and run all cells in order"

In [None]:
# splitting the data: 30% of the samples are held out as the test set;
# random_state=0 makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=0, test_size=0.3)

In [None]:
# Train for 10 epochs in mini-batches of 64; validation_split=0.1 sets aside
# 10% of the training data for per-epoch validation
model.fit(x_train, y_train, validation_split=0.1, batch_size=64, epochs=10)

In [None]:
# predictions: sigmoid outputs for the test set, thresholded at 0.5
# (strictly greater than 0.5 -> class 1, otherwise class 0)
pred = model.predict(x_test)
y_pred = (pred > 0.5).astype(int)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# A notebook cell only displays its last expression, so the accuracy value was
# computed and then thrown away. Print every metric explicitly, including the
# per-class report that was imported but never used.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion matrix (rows = true class, columns = predicted class):")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))