In [1]:
# Libraries
import io
import string
import re
import random

import pandas as pd
import numpy as np
from collections import Counter

# nltk:- Natural Language Processing Toolkit
import nltk 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer

# For loading Word2Vec Model
from gensim.models import KeyedVectors

from sklearn import preprocessing

# spliting data into train and test
from sklearn.model_selection import train_test_split
# Training Models
from sklearn import model_selection, svm
from sklearn.ensemble import RandomForestClassifier
# For Results
from sklearn.metrics import classification_report ,accuracy_score

# Loading Dataset

In [2]:
df = pd.read_csv('./Data/Dataset.csv', engine='python')

In [3]:
df.head()

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - ?Last Flag Flying?, a comed...",1
2,https://www.nytimes.com,Trump?s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt?s Cheiron Holdin...,1
4,http://www.cnn.com,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


In [4]:
df.describe()

Unnamed: 0,Label
count,3678.0
mean,0.473083
std,0.499343
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [5]:
df.shape

(3678, 4)

In [6]:
df_sources = df[['URLs','Body','Label']]

In [7]:
df_sources.head()

Unnamed: 0,URLs,Body,Label
0,http://www.bbc.com,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com,"LONDON (Reuters) - ?Last Flag Flying?, a comed...",1
2,https://www.nytimes.com,The feud broke into public view last week when...,1
3,https://www.reuters.com,MEXICO CITY (Reuters) - Egypt?s Cheiron Holdin...,1
4,http://www.cnn.com,"Country singer Jason Aldean, who was performin...",1


In [8]:
# Replacing 1 with "Real" and 0 with "Fake" In a New Colcumn
df_sources['label'] = df_sources['Label'].map(lambda x: 'Real' if x == 1 else 'Fake')

In [9]:
df_sources.head()

Unnamed: 0,URLs,Body,Label,label
0,http://www.bbc.com,Image copyright Getty Images\nOn Sunday mornin...,1,Real
1,https://www.reuters.com,"LONDON (Reuters) - ?Last Flag Flying?, a comed...",1,Real
2,https://www.nytimes.com,The feud broke into public view last week when...,1,Real
3,https://www.reuters.com,MEXICO CITY (Reuters) - Egypt?s Cheiron Holdin...,1,Real
4,http://www.cnn.com,"Country singer Jason Aldean, who was performin...",1,Real


# Data Preprocessing

In [10]:
# cleaning html symbols from the sentence
def cleanhtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', sentence)
    return cleantext

In [11]:
# cleaning punctuations from the sentence
def cleanpunc(sentence):
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    return  cleaned

In [12]:
nltk.download('stopwords')

stop = stopwords.words('english') #All the stopwords in English language
#excluding some useful words from stop words list as we doing sentiment analysis
excluding = ['against','not','don', "don't",'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
             'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 
             'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",'shouldn', "shouldn't", 'wasn',
             "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
stop = [words for words in stop if words not in excluding]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
snow = SnowballStemmer('english') #initialising the snowball stemmer

In [14]:
i = 0
string1 = ' '
final_string = []
fake_words = []                
real_words = []
s = ''

for sent in df_sources['Body'].values:
    filtered_sentence = []
    sent = cleanhtml(str(sent))
    sent = cleanpunc(str(sent))
    for w in sent.split():
        if((w.isalpha()) and (len(w)>2)):  
            if(w.lower() not in stop):    # If it is a stopword
                s = (snow.stem(w.lower())).encode('utf8')
                filtered_sentence.append(s)
                if (df_sources['label'].values)[i] == 'Real':
                    real_words.append(s)
                if(df_sources['label'].values)[i] == 'Fake':
                    fake_words.append(s)
            else:
                continue
        else:
            continue 
    string1 = b" ".join(filtered_sentence) 
    final_string.append(string1)
    i += 1

In [15]:
print("Number of Real words: ", len(real_words))
print("Number of Fake words: ", len(fake_words))

Number of Real words:  567648
Number of Fake words:  404701


In [16]:
df_sources['CleanedText'] = final_string
df_sources.head()

Unnamed: 0,URLs,Body,Label,label,CleanedText
0,http://www.bbc.com,Image copyright Getty Images\nOn Sunday mornin...,1,Real,b'imag copyright getti imag sunday morn donald...
1,https://www.reuters.com,"LONDON (Reuters) - ?Last Flag Flying?, a comed...",1,Real,b'london reuter last flag fli vietnam war vete...
2,https://www.nytimes.com,The feud broke into public view last week when...,1,Real,b'feud broke public view last week corker said...
3,https://www.reuters.com,MEXICO CITY (Reuters) - Egypt?s Cheiron Holdin...,1,Real,b'mexico citi reuter egypt cheiron hold limit ...
4,http://www.cnn.com,"Country singer Jason Aldean, who was performin...",1,Real,b'countri singer jason aldean perform las vega...


In [17]:
# without stem
i = 0
string1 = ' '
final_string_nostem = []
s = ''

for sent in df_sources['Body'].values:
    filtered_sentence=[]
    sent = cleanhtml(str(sent))
    sent = cleanpunc(str(sent))
    for w in sent.split():
        if((w.isalpha()) and (len(w)>2)):  
            if(w.lower() not in stop):
                s = w.lower().encode('utf8')
                filtered_sentence.append(s)
            else:
                continue
        else:
            continue 
    string1 = b" ".join(filtered_sentence)     
    final_string_nostem.append(string1)
    i += 1

In [18]:
df_sources['CleanedText_NoStem'] = final_string_nostem

In [19]:
df_sources.head()

Unnamed: 0,URLs,Body,Label,label,CleanedText,CleanedText_NoStem
0,http://www.bbc.com,Image copyright Getty Images\nOn Sunday mornin...,1,Real,b'imag copyright getti imag sunday morn donald...,b'image copyright getty images sunday morning ...
1,https://www.reuters.com,"LONDON (Reuters) - ?Last Flag Flying?, a comed...",1,Real,b'london reuter last flag fli vietnam war vete...,b'london reuters last flag flying vietnam war ...
2,https://www.nytimes.com,The feud broke into public view last week when...,1,Real,b'feud broke public view last week corker said...,b'feud broke public view last week corker said...
3,https://www.reuters.com,MEXICO CITY (Reuters) - Egypt?s Cheiron Holdin...,1,Real,b'mexico citi reuter egypt cheiron hold limit ...,b'mexico city reuters egypts cheiron holdings ...
4,http://www.cnn.com,"Country singer Jason Aldean, who was performin...",1,Real,b'countri singer jason aldean perform las vega...,b'country singer jason aldean performing las v...


# Feature Extraction

In [20]:
w2v_model = KeyedVectors.load_word2vec_format('Vec.bin.gz', binary=True)

In [21]:
avg_vec = []
datapoint = 3600
sample_cols = random.sample(range(1, datapoint), 3500)
print(sample_cols)

for sent in df_sources['CleanedText_NoStem'].values[sample_cols]:
    cnt = 0
    sent_vec = np.zeros(300)
    sent = sent.decode("utf-8") 
    for word in sent.split():
        try:
            wvec = w2v_model.wv[word]
            sent_vec += wvec
            cnt += 1
        except: 
            pass
    sent_vec /= cnt
    avg_vec.append(sent_vec)
avg_vec = np.array(avg_vec)

[2031, 528, 2172, 1767, 2852, 2910, 2774, 134, 1479, 840, 628, 2622, 3240, 74, 386, 384, 2598, 3003, 1743, 1012, 2401, 499, 1684, 3239, 325, 385, 3090, 694, 2466, 588, 2374, 1107, 707, 429, 1532, 1085, 2243, 2963, 1233, 1459, 2742, 2595, 2797, 2525, 1403, 2782, 2445, 1925, 1794, 258, 3280, 1803, 2209, 47, 2068, 2848, 2266, 235, 3188, 1111, 192, 2444, 1552, 482, 2651, 2279, 2423, 238, 1716, 2757, 2608, 2832, 402, 1646, 718, 664, 2863, 3166, 2189, 477, 2638, 3401, 2656, 1402, 2128, 507, 2792, 311, 2855, 2709, 1686, 1088, 3532, 2273, 698, 2891, 736, 240, 1607, 52, 961, 2496, 2940, 508, 2506, 3297, 2169, 751, 1441, 2490, 1155, 2784, 3122, 1139, 14, 1639, 3583, 2943, 963, 1309, 957, 724, 224, 594, 1171, 3080, 510, 1886, 3426, 1428, 3236, 196, 1656, 103, 1572, 825, 1437, 1670, 2977, 2530, 3246, 3320, 3386, 1447, 2442, 2721, 1703, 3021, 1502, 336, 3158, 2755, 985, 208, 431, 962, 1205, 2112, 33, 1880, 3420, 3496, 3286, 1225, 392, 2095, 1474, 306, 898, 3138, 1530, 3439, 3139, 1738, 466, 3367, 6

  if sys.path[0] == '':


In [22]:
col_mean = np.nanmean(avg_vec, axis=0)

In [23]:
inds = np.where(np.isnan(avg_vec)) 
inds

(array([], dtype=int64), array([], dtype=int64))

In [24]:
avg_vec[inds] = np.take(col_mean, inds[1])

In [25]:
avg_vec

array([[ 0.01997244,  0.00810643,  0.02555133, ..., -0.04480878,
         0.01539862, -0.03777151],
       [-0.00793062,  0.01785057,  0.01505266, ..., -0.03377349,
         0.00850744,  0.01977539],
       [ 0.05515263,  0.02131232,  0.03239809, ..., -0.03908013,
         0.0964587 ,  0.02547928],
       ...,
       [ 0.02048484,  0.04999871,  0.0289135 , ..., -0.05455526,
         0.02844876,  0.02564424],
       [ 0.04568246,  0.05437557,  0.01303368, ..., -0.06733903,
         0.02020742, -0.00325991],
       [-0.01713439,  0.10913958,  0.04739652, ...,  0.00650715,
         0.04477456, -0.01772199]])

In [26]:
from sklearn import preprocessing
avg_vec_norm = preprocessing.normalize(avg_vec)

In [27]:
# labels against the sample data points
s1=[]
for sent in df_sources['label'].values[sample_cols]:
    s1.append(sent)

df1 = pd.DataFrame(s1, columns = ['label']) 
df1

Unnamed: 0,label
0,Real
1,Real
2,Fake
3,Fake
4,Real
...,...
3495,Real
3496,Real
3497,Real
3498,Fake


In [28]:
# spliting data into train and test
X_train, X_test, y_train, y_test = train_test_split(avg_vec, df1['label'], train_size=0.8)

# Training Models

## 1. SVM

In [29]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train,y_train)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(X_test)

### Evalution Measures

In [30]:
# Use accuracy_score and classification_report function to get the accuracy,f1-score, recall, and precision
print(classification_report(y_test,predictions_SVM))
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, y_test)*100)

              precision    recall  f1-score   support

        Fake       0.93      0.96      0.94       373
        Real       0.95      0.92      0.93       327

    accuracy                           0.94       700
   macro avg       0.94      0.94      0.94       700
weighted avg       0.94      0.94      0.94       700

SVM Accuracy Score ->  93.85714285714286


## 2. Random Forest

In [31]:
text_classifier = RandomForestClassifier(n_estimators=200, random_state=42)
text_classifier.fit(X_train,y_train)
predictions = text_classifier.predict(X_test)

### Evalution Measures

In [32]:
# Use accuracy_score and classification_report function to get the accuracy,f1-score, recall, and precision
print(classification_report(y_test,predictions))
print("RF Accuracy Score -> ",accuracy_score(predictions, y_test)*100)

              precision    recall  f1-score   support

        Fake       0.95      0.92      0.94       373
        Real       0.91      0.94      0.93       327

    accuracy                           0.93       700
   macro avg       0.93      0.93      0.93       700
weighted avg       0.93      0.93      0.93       700

RF Accuracy Score ->  93.28571428571428
