In [0]:

from google.colab import drive
drive.mount("/content/gdrive")

import pandas as pd
import os
import re
import numpy as np
from string import punctuation
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pickle

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import pandas as pd
import numpy as np

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
data_real = pd.read_csv('./gdrive/My Drive/real_news_set.csv')
data_real.head()

Unnamed: 0,url,headline,body,label
0,https://web.archive.org/web/20161003021917/htt...,Rep. Kristi Noem: My father’s tragic death and...,"By the time I got to the farm, neighbors and f...",1
1,https://web.archive.org/web/20161004070028/htt...,Russia deploys advanced anti-missile system to...,It comes after Russia’s actions led to the col...,1
2,https://web.archive.org/web/20161004070028/htt...,"Aleppo: ""The international community has turne...","Aleppo has become an abandoned city, reduced t...",1
3,https://web.archive.org/web/20161005034102/htt...,"Pence, Kaine trade attacks, talk over each oth...","Within 10 minutes, they were talking over each...",1
4,https://web.archive.org/web/20161006061117/htt...,US taxpayers made millionaires out of Afghan g...,"A total of $114 billion, which does not includ...",1


In [0]:
## Shape
print("shape: ", data_real.shape)

shape:  (11831, 4)


In [0]:
## Checking for Missing values
data_real.isnull().sum()

url         0
headline    0
body        0
label       0
dtype: int64

In [0]:
data_fake = pd.read_csv('./gdrive/My Drive/fake_news_set.csv')
data_fake.head()
## Shape
print("shape: ", data_fake.shape)
data_fake.isnull().sum()
data = pd.concat([data_real,data_fake])
data = data.sample(frac=1).reset_index(drop=True)
data = data.sample(frac=1).reset_index(drop=True)



shape:  (6822, 4)


In [0]:
length = []
[length.append(len(str(text))) for text in data['body']]
data['length'] = length
data.head()

Unnamed: 0,url,headline,body,label,length
0,dcclothesline.com,Comment on Like a ‘Concentration Camp’ Police ...,"Posted on October 30, 2016 by Claire Bernish \...",0,6718
1,http://www.reuters.com/article/us-quake-newzea...,"Storm lashes New Zealand quake zone, more buil...",Rain and strong winds battered central New Ze...,1,4994
2,https://www.reuters.com/article/us-trade-nafta...,Mexico economy minister downplays trade defici...,Mexico's Economy Minister Ildefonso Guajardo a...,1,1774
3,http://www.npr.org/sections/health-shots/2016/...,Brain Implant Restores Sense Of Touch To Paral...,"Twelve years ago, a car wreck took away Nathan...",1,4110
4,https://web.archive.org/web/20161105000920/htt...,Civilian casualties are starting to rise as Ir...,"GOGJALI, Iraq — The vehicles screeched in...",1,6724


In [0]:
## Minimum Length
print("minimum length of texts: ", min(data['length']))
## Maximum Length
print("maximum length of texts: ", max(data['length']))

minimum length of texts:  1
maximum length of texts:  142961


In [0]:
## Checking for outliers
## typically I think a news with a text length of less than 100 characters is not a news at all
## I think its a glitch
## Checking how many rows have less than 100 characters

print("Number of Outliers: ", len(data[data['length'] < 100]))

## Lets have a look at those texts that I believe they are outliers
data['body'][data['length'] < 100]

Number of Outliers:  193


45       Vietnam Is in Great Danger, You Must Publish a...
58       Vietnam Is in Great Danger, You Must Publish a...
71       Chat with us in Facebook Messenger. Find out w...
221      Vietnam Is in Great Danger, You Must Publish a...
527               A guide to the topical fireworks of 2016
                               ...                        
18138    Vietnam Is in Great Danger, You Must Publish a...
18164    Vietnam Is in Great Danger, You Must Publish a...
18534    (Before It's News)\n \n \nEmigrate While You S...
18548    President Obama said Friday that his wife Mich...
18603    Vietnam Is in Great Danger, You Must Publish a...
Name: body, Length: 193, dtype: object

In [0]:
# dropping the outliers
data = data.drop(data['body'][data['length'] < 100].index, axis = 0)
data.shape

(18460, 5)

In [0]:
# dropping ALL duplicte values 
data.drop_duplicates(subset ="body", 
                     keep = False, inplace = True)
data.shape

(16925, 5)

In [0]:
data['label'].dropna(inplace=True)
print(data['label'].unique())
print(data.shape)

[1 0]
(16925, 5)


In [0]:
data = data.reset_index()
data

Unnamed: 0,index,url,headline,body,label,length
0,1,http://www.reuters.com/article/us-quake-newzea...,"Storm lashes New Zealand quake zone, more buil...",Rain and strong winds battered central New Ze...,1,4994
1,2,https://www.reuters.com/article/us-trade-nafta...,Mexico economy minister downplays trade defici...,Mexico's Economy Minister Ildefonso Guajardo a...,1,1774
2,3,http://www.npr.org/sections/health-shots/2016/...,Brain Implant Restores Sense Of Touch To Paral...,"Twelve years ago, a car wreck took away Nathan...",1,4110
3,4,https://web.archive.org/web/20161105000920/htt...,Civilian casualties are starting to rise as Ir...,"GOGJALI, Iraq — The vehicles screeched in...",1,6724
4,5,https://www.reuters.com/article/us-usa-healthc...,California governor signs drug pricing transpa...,FILE PHOTO: California Governor Jerry Brown at...,1,2833
...,...,...,...,...,...,...
16920,18648,waterfordwhispersnews.com,‘Scattering Of Paedophile Priests Fine Though’...,0 Add Comment \nIN A bid to clear up any confu...,0,2071
16921,18649,http://www.nationalreview.com/article/442073/d...,Here’s Who Trump Should Pick for His National-...,As Donald Trump will now receive much more ...,1,6741
16922,18650,https://web.archive.org/web/20161110000454/htt...,New Jersey takes over debt-ridden Atlantic Cit...,"TRENTON, N. J. — Republican Gov. Chris Chr...",1,3100
16923,18651,https://www.nytimes.com/2017/10/04/arts/televi...,Sarah Silverman Wants to Pop Your Bubble,Since Ms. Silverman became a star at the start...,1,10200


In [0]:
## Minimum Length
print("minimum length of texts: ", min(data['length']))
## Maximum Length
print("maximum length of texts: ", max(data['length']))

max_features = 142900

minimum length of texts:  101
maximum length of texts:  142961


In [0]:
# Change all the text to lower case.
data['body'] = [entry.lower() for entry in data['body']]

In [0]:
#Tokenization : In this each entry in the corpus will be broken into set of words
data['body']= [word_tokenize(entry) for entry in data['body']]

In [0]:
data

Unnamed: 0,index,url,headline,body,label,length
0,1,http://www.reuters.com/article/us-quake-newzea...,"Storm lashes New Zealand quake zone, more buil...","[rain, and, strong, winds, battered, central, ...",1,4994
1,2,https://www.reuters.com/article/us-trade-nafta...,Mexico economy minister downplays trade defici...,"[mexico, 's, economy, minister, ildefonso, gua...",1,1774
2,3,http://www.npr.org/sections/health-shots/2016/...,Brain Implant Restores Sense Of Touch To Paral...,"[twelve, years, ago, ,, a, car, wreck, took, a...",1,4110
3,4,https://web.archive.org/web/20161105000920/htt...,Civilian casualties are starting to rise as Ir...,"[gogjali, ,, iraq, —, the, vehicles, screeched...",1,6724
4,5,https://www.reuters.com/article/us-usa-healthc...,California governor signs drug pricing transpa...,"[file, photo, :, california, governor, jerry, ...",1,2833
...,...,...,...,...,...,...
16920,18648,waterfordwhispersnews.com,‘Scattering Of Paedophile Priests Fine Though’...,"[0, add, comment, in, a, bid, to, clear, up, a...",0,2071
16921,18649,http://www.nationalreview.com/article/442073/d...,Here’s Who Trump Should Pick for His National-...,"[as, donald, trump, will, now, receive, much, ...",1,6741
16922,18650,https://web.archive.org/web/20161110000454/htt...,New Jersey takes over debt-ridden Atlantic Cit...,"[trenton, ,, n., j, ., —, republican, gov, ., ...",1,3100
16923,18651,https://www.nytimes.com/2017/10/04/arts/televi...,Sarah Silverman Wants to Pop Your Bubble,"[since, ms., silverman, became, a, star, at, t...",1,10200


In [0]:
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
i=0
for index,entry in enumerate(data['body']):

    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    data.loc[index,'text_final'] = str(Final_words)
    print(index)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
11926
11927
11928
11929
11930
11931
11932
11933
11934
11935
11936
11937
11938
11939
11940
11941
11942
11943
11944
11945
11946
11947
11948
11949
11950
11951
11952
11953
11954
11955
11956
11957
11958
11959
11960
11961
11962
11963
11964
11965
11966
11967
11968
11969
11970
11971
11972
11973
11974
11975
11976
11977
11978
11979
11980
11981
11982
11983
11984
11985
11986
11987
11988
11989
11990
11991
11992
11993
11994
11995
11996
11997
11998
11999
12000
12001
12002
12003
12004
12005
12006
12007
12008
12009
12010
12011
12012
12013
12014
12015
12016
12017
12018
12019
12020
12021
12022
12023
12024
12025
12026
12027
12028
12029
12030
12031
12032
12033
12034
12035
12036
12037
12038
12039
12040
12041
12042
12043
12044
12045
12046
12047
12048
12049
12050
12051
12052
12053
12054
12055
12056
12057
12058
12059
12060
12061
12062
12063
12064
12065
12066
12067
12068
12069
12070
12071
12072
12073
12074
12075
12076
12077
12078
12079
12080
12081

In [0]:
Train_X, Test_X, Train_Y, Test_Y = train_test_split(data['text_final'],data['label'],test_size=0.3)

In [0]:
Train_Y

2529     1
10184    1
10107    0
10989    1
19       1
        ..
9567     0
12083    1
15186    1
4352     1
3070     1
Name: label, Length: 11847, dtype: int64

In [0]:
data.dtypes

index          int64
url           object
headline      object
body          object
label          int64
length         int64
text_final    object
dtype: object

In [0]:
Tfidf_vect = TfidfVectorizer(max_features=7000)
Tfidf_vect.fit(data['text_final'])

filename = 'tokenizer_news.sav'
pickle.dump(Tfidf_vect, open(filename, 'wb'))


Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [0]:

from google.colab import files
files.download('tokenizer_news.sav')

In [0]:
print(Tfidf_vect.vocabulary_)



In [0]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  84.30484442693974


In [0]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  93.24537219377707


In [0]:
## training - LogisticRegression
logisticRegr = LogisticRegression()
logisticRegr.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_LR = logisticRegr.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("LR Accuracy Score -> ",accuracy_score(predictions_LR, Test_Y)*100)

LR Accuracy Score ->  91.78810555336747


In [0]:

## training - RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_RFC = rfc.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("RFC Accuracy Score -> ",accuracy_score(predictions_RFC, Test_Y)*100)

RFC Accuracy Score ->  86.41197321780228


In [0]:
filename = 'naive_news.sav'
pickle.dump(Naive, open(filename, 'wb'))

In [0]:
filename = 'svm_news.sav'
pickle.dump(SVM, open(filename, 'wb'))

In [0]:
filename = 'LogReg_news.sav'
pickle.dump(logisticRegr, open(filename, 'wb'))

In [0]:
filename = 'RFC_news.sav'
pickle.dump(rfc, open(filename, 'wb'))

In [0]:
preprocessed_data = Train_X_Tfidf, Test_X_Tfidf, Train_Y, Test_Y
filename = 'preprocessed_data_news.sav'
pickle.dump(preprocessed_data, open(filename, 'wb'))

In [0]:
from google.colab import files
files.download('naive_news.sav')
files.download('svm_news.sav')
files.download('LogReg_news.sav')
files.download('RFC_news.sav')

In [0]:
files.download('preprocessed_data_news.sav')