# Import Packages and Libraries

In [None]:
#import necessary packages and libraries
import codecs
import re
from collections import Counter

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from gensim.models import word2vec
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Input, Flatten, Dropout, Add
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

nltk.download('punkt')


# Read the data

In [None]:
#Read the fake and real news CSV files into dataframes
#DATA_DIR is the single place to edit when the files live somewhere else
DATA_DIR = '/content/drive/My Drive/Colab Notebooks'
df1 = pd.read_csv(f'{DATA_DIR}/Fake.csv')  # fake news articles
df2 = pd.read_csv(f'{DATA_DIR}/True.csv')  # real news articles

# Data Cleaning and Preprocessing

In [None]:
#Label every row so fake and real news can be told apart: 1 = fake (df1), 0 = real (df2)
df1['target'] = [1] * len(df1)
df2['target'] = [0] * len(df2)

In [None]:
#Concatenate the real and fake news datasets and clean the data
df_tot = (
    pd.concat([df1, df2])
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
    .replace(r'^\s*$', np.nan, regex=True)  # empty / whitespace-only cells count as missing
    .dropna()                               # drop rows with a missing value in any column
    .reset_index(drop=True)                 # fresh 0..n-1 index, old index discarded
)
#Title and body joined into one text field; this is the column used downstream
df_tot['combined'] = df_tot['title'] + '. ' + df_tot['text']

In [None]:
#Clean a piece of news text: drop special characters and numbers, lowercase, split into words
def news_wordlist(new, remove_stopwords=False):
    """Return the lowercase alphabetic words of ``new`` as a list.

    Every character that is not an ASCII letter is replaced by a space before
    the text is lowercased and split on whitespace.

    Parameters:
        new: the text to tokenise.
        remove_stopwords: when true, words found in NLTK's English stop-word
            list are dropped. This reads the NLTK ``stopwords`` corpus, which
            has to be available locally (the notebook only downloads 'punkt').

    Returns:
        A list of lowercase words, in their original order.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", new)
    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens
    english_stops = set(stopwords.words("english"))
    return [token for token in tokens if token not in english_stops]

In [None]:
#Split a news text into sentences and clean each sentence into a list of words
def news_sentences(new, remove_stopwords=False):
    """Return ``new`` as a list of sentences, each sentence being a list of words.

    The stripped text is split into sentences with ``nltk.sent_tokenize`` and
    every non-empty sentence is cleaned with ``news_wordlist``.

    Parameters:
        new: the text to process.
        remove_stopwords: forwarded unchanged to ``news_wordlist``.

    Returns:
        A list of word lists, one per sentence, in sentence order.
    """
    return [
        news_wordlist(raw_sentence, remove_stopwords)
        for raw_sentence in nltk.sent_tokenize(new.strip())
        if len(raw_sentence) > 0
    ]

#Tokenised sentences of the whole corpus (one word list per sentence), in article order
sentences = [
    sentence_words
    for article in df_tot["combined"]
    for sentence_words in news_sentences(article)
]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
#Count how often each word occurs in the combined column of the news dataset
#`sentences` already holds the corpus tokenised with the same functions and in the same
#order, so the counts are taken from it instead of sentence-splitting every article again
vocab = Counter()
for sentence_words in sentences:
    vocab.update(sentence_words)

# Create Word2Vec

In [None]:
#Train a Word2Vec model on the tokenised sentences and keep it in `model`
num_features = 100      # passed as `size`
min_word_count = 5      # passed as `min_count`
num_workers = 4         # passed as `workers`
context = 10            # passed as `window`
downsampling = 1e-3     # passed as `sample`

# Training happens inside the constructor because the sentences are handed over directly
print("Training model....")
model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# To make the model memory efficient
model.init_sims(replace=True)

# Persist the model so it can be reloaded later with Word2Vec.load()
model_name = "word2vec_model"
model.save(model_name)

Training model....


In [None]:
#y is the target label; X is the combined title+text column, kept as a one-column dataframe
y = df_tot['target']
X = df_tot[['combined']]

In [None]:
#Split the data into equally sized training and testing sets (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)

In [None]:
#Pull the raw article texts out of the train and test frames as plain Python lists
X_train_sent = [text for text in X_train["combined"].values]
X_test_sent = [text for text in X_test["combined"].values]

# CNN Model

In [None]:
#Import necessary libraries for CNN
from collections import Counter
from keras.preprocessing.text import Tokenizer
import itertools
from keras.preprocessing.sequence import pad_sequences
#from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU, CuDNNGRU, CuDNNLSTM, BatchNormalization
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU, BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras import backend as K
from keras.engine import InputSpec, Layer
from keras.optimizers import Adam


from keras.callbacks import ModelCheckpoint, TensorBoard, Callback, EarlyStopping
from tensorflow.python.keras.layers import LSTM, CuDNNLSTM
from tensorflow.python.keras.layers import GRU, CuDNNGRU


In [None]:
#Number of tokens per article fed to the network
MAX_SEQUENCE_LENGTH = 200
#`model` must still be the Word2Vec model here; a later cell rebinds the name to the Keras model
word_vectors = model.wv
#Vocabulary size of the trained Word2Vec model
MAX_NB_WORDS = len(word_vectors.vocab)

In [None]:
#Tokenise every train and test article and flatten its sentences into one list of words
X_train_sent_pre = [
    list(itertools.chain.from_iterable(news_sentences(article)))
    for article in X_train_sent
]

X_test_sent_pre = [
    list(itertools.chain.from_iterable(news_sentences(article)))
    for article in X_test_sent
]




In [None]:
#Encode every article as a sequence of integer word ids
#An id is the word's frequency rank in `vocab`, starting at 1 for the most common word;
#words outside the MAX_NB_WORDS most common ones are encoded as 0
word_index = {
    word: rank
    for rank, (word, _count) in enumerate(vocab.most_common(MAX_NB_WORDS), start=1)
}
sequences = [
    [word_index.get(token, 0) for token in article_tokens]
    for article_tokens in X_train_sent_pre
]
test_sequences = [
    [word_index.get(token, 0) for token in article_tokens]
    for article_tokens in X_test_sent_pre
]

# bring every sequence to MAX_SEQUENCE_LENGTH: padding goes in front, overlong articles lose their tail
X_train_seq = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding="pre",
    truncating="post",
)
X_test_seq = pad_sequences(
    test_sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding="pre",
    truncating="post",
)

In [None]:
#Shape of X_test_seq: one row per test article, MAX_SEQUENCE_LENGTH word ids per row
X_test_seq.shape

(22134, 200)

In [None]:
#Build the embedding matrix: row i holds the Word2Vec vector of the word whose id is i
WV_DIM = 100
# +1 because word ids start at 1; id 0 is what unknown words are encoded as
nb_words = min(MAX_NB_WORDS, len(word_vectors.vocab)) +1
# every row starts as small random numbers in [-0.1, 0.1); row 0 and any word
# without a Word2Vec vector keep that random initialisation
wv_matrix = (np.random.rand(nb_words, WV_DIM) - 0.5) / 5.0
for word, i in word_index.items():
    # skip only ids that fall outside the matrix; the former `i >= MAX_NB_WORDS`
    # test also skipped the last valid row (id MAX_NB_WORDS)
    if i >= nb_words:
        continue
    try:
        wv_matrix[i] = word_vectors[word]
    except KeyError:
        # word is not in the Word2Vec vocabulary: leave its random row in place
        pass

In [None]:
#Embedding layer initialised with the Word2Vec matrix; its weights are not trained
wv_layer = Embedding(
    input_dim=nb_words,
    output_dim=WV_DIM,
    weights=[wv_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    mask_zero=False,
    trainable=False,
)


In [None]:
#define attention layer
class Attention(tf.keras.Model):
    """Additive (tanh-scored) attention over a sequence, driven by a hidden state.

    Parameters:
        units: width of the two dense projections used to compute the scores.

    Calling the layer with ``(features, hidden)`` returns
    ``(context_vector, attention_weights)``: the weights are a softmax over
    axis 1 of ``features`` and the context vector is the weighted sum of
    ``features`` along that axis.
    """

    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)  # projection applied to the features
        self.W2 = tf.keras.layers.Dense(units)  # projection applied to the hidden state
        self.V = tf.keras.layers.Dense(1)       # reduces each projected step to a single score

    def call(self, features, hidden):
        # hidden: (batch_size, hidden size) -> (batch_size, 1, hidden size),
        # so that it broadcasts against the projected features when added
        hidden_expanded = tf.expand_dims(hidden, 1)

        # (batch_size, max_length, units) before self.V, (batch_size, max_length, 1) after it
        projected = tf.nn.tanh(self.W1(features) + self.W2(hidden_expanded))

        # normalise the scores across axis 1 (the sequence axis)
        attention_weights = tf.nn.softmax(self.V(projected), axis=1)

        # weighted sum over axis 1 -> (batch_size, hidden_size)
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)
        return context_vector, attention_weights

In [None]:
#Define function to build the model
def build_model(wv_layer):
    """Build and compile the CNN + bidirectional LSTM + attention classifier.

    Architecture, in order:
      * integer input of length MAX_SEQUENCE_LENGTH passed through ``wv_layer``;
      * three parallel Conv1D branches (1, 2 and 3 filters with kernel sizes
        2, 3 and 4), each max-pooled with pool size 32 and concatenated on the
        channel axis;
      * a bidirectional LSTM with dropout, followed by a second bidirectional
        LSTM whose final forward/backward hidden states are concatenated;
      * ``Attention(10)`` over the first LSTM's output sequence, driven by
        that concatenated hidden state;
      * Dense(256, relu) -> Dropout(0.25) -> Dense(1, sigmoid).

    Parameters:
        wv_layer: embedding layer applied to the integer input sequences.

    Returns:
        The compiled Keras model (binary cross-entropy loss, Adam optimizer,
        accuracy metric). The model summary is printed as a side effect.
    """
    # named sequence_input so the builtin `input` is not shadowed
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded = wv_layer(sequence_input)

    # (filters, kernel_size) of each convolution branch
    pooled_branches = []
    for n_filters, kernel_size in ((1, 2), (2, 3), (3, 4)):
        branch = Conv1D(filters=n_filters, kernel_size=kernel_size, padding='same')(embedded)
        pooled_branches.append(MaxPooling1D(pool_size=32)(branch))
    cnn = concatenate(pooled_branches, axis=-1)

    x = Bidirectional(LSTM(300, return_sequences=True, dropout=0.25,
                           recurrent_dropout=0.25))(cnn)
    # only the final hidden states of the second BiLSTM are used; its output
    # sequence and cell states are discarded
    _, forward_h, _, backward_h, _ = Bidirectional(
        LSTM(300, return_sequences=True, return_state=True), name="bi_lstm_1")(x)
    state_h = concatenate([forward_h, backward_h])

    context_vector, _ = Attention(10)(x, state_h)
    x = Dense(256, activation="relu")(context_vector)
    x = Dropout(0.25)(x)
    x = Dense(1, activation="sigmoid")(x)

    model = Model(inputs=sequence_input, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so it is not wrapped in print()
    model.summary()
    return model

In [None]:
#Build the model
#NOTE: this rebinds `model`, which until this cell referred to the Word2Vec model; cells
#that read the Word2Vec model (e.g. `word_vectors = model.wv`) must run before this one
model = build_model(wv_layer)

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 200, 100)     4497600     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 200, 1)       201         embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 200, 2)       602         embedding[0][0]                  
_______________________________________________________________________________________

epoch = 10

In [None]:
#Train the model for 10 epochs; 10% of the training data is held out for validation
history = model.fit(
    X_train_seq,
    y_train,
    batch_size=256,
    epochs=10,
    validation_split=0.1,
    shuffle=True,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
#Evaluate the trained model on the test set; the model was compiled with the accuracy metric,
#so the two returned values are unpacked as loss and accuracy
test_loss, test_acc = model.evaluate(X_test_seq, y_test)



In [None]:
#Display the test-set accuracy of the model trained for 10 epochs
test_acc

0.9984639286994934

# Print Model Results

In [None]:
#Print the overall results of the model trained for 10 epochs
#(classification_report is already imported in the notebook's import cell)
y_pred = model.predict(X_test_seq, batch_size=64, verbose=1)
# probabilities above 0.5 become class 1, everything else class 0
y_pred_bool = (y_pred > 0.5).astype("int32")

report = classification_report(y_test, y_pred_bool)
print(report)

In [None]:
#Turn the predicted probabilities into a list of binary predictions (0/1) called binary_predictions
pred = model.predict(X_test_seq)

# a probability of 0.5 or more counts as class 1
binary_predictions = [1 if probability >= 0.5 else 0 for probability in pred]

In [None]:
#Print accuracy, precision, recall and F1 score of the model trained for 10 epochs
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

# scikit-learn metrics take (y_true, y_pred); with the arguments the other way
# round, precision and recall are reported under each other's name
accuracy = accuracy_score(y_test, binary_predictions)
precision = precision_score(y_test, binary_predictions)
recall = recall_score(y_test, binary_predictions)
print('Accuracy on testing set:', accuracy)
print('Precision on testing set:', precision)
print('Recall on testing set:', recall)
# harmonic mean of precision and recall; defined as 0 when both are 0 to avoid dividing by zero
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
print('F1 score on testing set:', f1)

Accuracy on testing set: 0.997334417638023
Precision on testing set: 0.9959503477418787
Recall on testing set: 0.9988521984813703


In [None]:
#Confusion matrix on the test set: rows are the actual classes, columns the predicted classes
#Class order follows the label values: 0 = 'Not Fake' comes first, 1 = 'Fake' second
class_names = ['Not Fake', 'Fake']
cm = pd.DataFrame(confusion_matrix(y_test, y_pred_bool, labels=[0, 1]),
                  index=class_names, columns=class_names)

#Drawn with matplotlib only, so the cell does not depend on a library the notebook never imports
fig, ax = plt.subplots()
image = ax.imshow(cm.values, cmap='Blues')
fig.colorbar(image, ax=ax)
ax.set_xticks(range(len(class_names)))
ax.set_xticklabels(class_names)
ax.set_yticks(range(len(class_names)))
ax.set_yticklabels(class_names)

#Write the count into every cell; light text on dark cells keeps it readable
half_max = cm.values.max() / 2
for row in range(len(class_names)):
    for col in range(len(class_names)):
        count = cm.values[row, col]
        ax.text(col, row, str(count), ha='center', va='center',
                color='white' if count > half_max else 'black')

ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title('Confusion matrix on test data')
plt.show()

# Function to test on external datasets

In [None]:
#Define function for testing on other datasets
def test1(df, vocab):
    """Predict fake (1) / not fake (0) for every article in ``df['text']``.

    The articles are tokenised and encoded exactly like the training data:
    word ids are the frequency ranks in ``vocab`` (1 = most common, limited to
    MAX_NB_WORDS), unknown words become 0, and sequences are padded/truncated
    to MAX_SEQUENCE_LENGTH.

    Parameters:
        df: dataframe with a ``text`` column holding the articles.
        vocab: the word Counter the training word ids were derived from. It is
            only read, never updated: adding the external data's counts would
            change the frequency ranks, and the ids would then no longer match
            the rows of the embedding matrix the model was trained with.

    Returns:
        A 1-D int32 array with one 0/1 prediction per row of ``df``.
    """
    articles = list(df["text"].values)
    tokenised_articles = [
        list(itertools.chain(*news_sentences(article)))
        for article in articles
    ]

    word_index = {t[0]: i+1 for i,t in enumerate(vocab.most_common(MAX_NB_WORDS))}
    test_sequences = [[word_index.get(token, 0) for token in tokens]
                      for tokens in tokenised_articles]
    X_external_seq = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH,
                                   padding="pre", truncating="post")

    result = model.predict(X_external_seq)
    # the model has a single sigmoid output, so argmax over axis 1 would always
    # be 0; threshold the probability instead
    return (result > 0.5).astype("int32").ravel()