# Fake news detection 

Importing libraries 

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re
from time import time
import string
#import itertools
from pprint import pprint

from nltk import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report

#from gensim import models
#from gensim.models import word2vec,doc2vec

# Data Exploration

In [21]:
df = pd.read_csv("https://s3.amazonaws.com/assets.datacamp.com/blog_assets/fake_or_real_news.csv")
print(df.shape)

(6335, 4)


In [22]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


# Extracting the training data

In [23]:
y = df.label 
df.drop("label", axis=1) 
X_train, X_test, y_train, y_test = train_test_split(df['text'], y, test_size=0.33, random_state=42)

In [24]:
print(X_train.shape)
print(type(X_train))
print(X_train.head())


print(X_test.shape)
print(type(X_test))
print(X_test.head())

(4244,)
<class 'pandas.core.series.Series'>
5593    The next president is most likely to face an i...
6067    Following news of FBI Director James Comey’s d...
3026    Interviews A protester throws a glass bottle c...
4385    Will it be representative government or thugoc...
120     It is increasingly apparent that the U.S. war ...
Name: text, dtype: object
(2091,)
<class 'pandas.core.series.Series'>
1357    Will Trump pull a Brexit times ten? What would...
2080    Clintons Are Under Multiple FBI Investigations...
2718    Dispatches from Eric Zuesse This piece is cros...
812     Print \n[Ed. – Every now and then the facade c...
4886    Nanny In Jail After Force Feeding Baby To Deat...
Name: text, dtype: object


Let’s initialize a TfidfVectorizer with stop words from the English language and a maximum document frequency of 0.7 (terms with a higher document frequency will be discarded). Stop words are the most common words in a language that are to be filtered out before processing the natural language data. And a TfidfVectorizer turns a collection of raw documents into a matrix of TF-IDF features.

In [25]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7) 
tfidf_train = tfidf_vectorizer.fit_transform(X_train) 
tfidf_test = tfidf_vectorizer.transform(X_test)

In [26]:
print(tfidf_train.shape)
print(tfidf_test.shape)

(4244, 56801)
(2091, 56801)


In [27]:
print(tfidf_vectorizer.get_feature_names()[-10:])

['שתי', 'תאמצנה', 'תוצאה', 'תחל', 'תיירות', 'תנותק', 'תעודת', 'תתרכז', 'القادمون', 'عربي']


In [28]:
tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vectorizer.get_feature_names())

In [29]:
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [30]:
count_vectorizer.get_feature_names()[:10]

['00',
 '000',
 '0000',
 '000000031',
 '00000031',
 '0001',
 '000billion',
 '000ft',
 '000km',
 '001']

In [31]:
tfidf_df.head()

Unnamed: 0,00,000,0000,000000031,00000031,0001,000billion,000ft,000km,001,...,שתי,תאמצנה,תוצאה,תחל,תיירות,תנותק,תעודת,תתרכז,القادمون,عربي
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Comparing Models

In [32]:
import sklearn.metrics as metrics

In [33]:
def classify_and_fit(clf, X_train, y_train, X_test, y_test, class_labels = ['FAKE', 'REAL']):
    print("Classifier : ", clf )
    
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    score = metrics.accuracy_score(y_test, pred)
    
    print("Accuracy:   %0.3f" % score)

    print("\nConfusion Matrix :")
    #print(pd.crosstab(y_test, pred, rownames=['True'], colnames=['Predicted'], margins=True))
    cm = metrics.confusion_matrix(y_test, pred, labels=class_labels)
    print(cm)
    
    print("\nReport :")    
    print(classification_report(y_test, pred, target_names=class_labels))
    
    
    return clf

In [34]:
clf = MultinomialNB() 
classify_and_fit(clf, tfidf_train, y_train, tfidf_test, y_test)

Classifier :  MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Accuracy:   0.815

Confusion Matrix :
[[ 704  367]
 [  19 1001]]

Report :
              precision    recall  f1-score   support

        FAKE       0.97      0.66      0.78      1071
        REAL       0.73      0.98      0.84      1020

   micro avg       0.82      0.82      0.82      2091
   macro avg       0.85      0.82      0.81      2091
weighted avg       0.86      0.82      0.81      2091



MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# Testing Linear Models

In [35]:
linear_clf = PassiveAggressiveClassifier(n_iter=50)
classify_and_fit(linear_clf, tfidf_train, y_train, tfidf_test, y_test)

Classifier :  PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
              early_stopping=False, fit_intercept=True, loss='hinge',
              max_iter=None, n_iter=50, n_iter_no_change=5, n_jobs=None,
              random_state=None, shuffle=True, tol=None,
              validation_fraction=0.1, verbose=0, warm_start=False)




Accuracy:   0.939

Confusion Matrix :
[[1011   60]
 [  67  953]]

Report :
              precision    recall  f1-score   support

        FAKE       0.94      0.94      0.94      1071
        REAL       0.94      0.93      0.94      1020

   micro avg       0.94      0.94      0.94      2091
   macro avg       0.94      0.94      0.94      2091
weighted avg       0.94      0.94      0.94      2091



PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
              early_stopping=False, fit_intercept=True, loss='hinge',
              max_iter=None, n_iter=50, n_iter_no_change=5, n_jobs=None,
              random_state=None, shuffle=True, tol=None,
              validation_fraction=0.1, verbose=0, warm_start=False)

# Count versus TF-IDF Features


In [17]:
def most_informative_feature_for_binary_classification(vectorizer, classifier, n=100):
    class_labels = classifier.classes_
    feature_names = vectorizer.get_feature_names()
    topn_class1 = sorted(zip(classifier.coef_[0], feature_names))[:n]
    topn_class2 = sorted(zip(classifier.coef_[0], feature_names))[-n:]
    for coef, feat in topn_class1:
        print(class_labels[0], coef, feat)
    print()
    for coef, feat in reversed(topn_class2):
        print(class_labels[1], coef, feat)


most_informative_feature_for_binary_classification(tfidf_vectorizer, linear_clf, n=30)

FAKE -5.019620107823401 october
FAKE -4.991404304543739 2016
FAKE -4.163823192806515 hillary
FAKE -2.9399579024910416 share
FAKE -2.93190002661481 article
FAKE -2.7568604102431573 election
FAKE -2.75630523788773 november
FAKE -2.523696838421442 snip
FAKE -2.4593226921995712 podesta
FAKE -2.3871970470868282 corporate
FAKE -2.341820614734989 source
FAKE -2.312077564867841 print
FAKE -2.304100745419877 email
FAKE -2.2006567342529175 mosul
FAKE -2.028562280763503 healthcare
FAKE -1.9385052220996728 wikileaks
FAKE -1.871761901124584 donald
FAKE -1.853885163445138 com
FAKE -1.8524514733191586 establishment
FAKE -1.8522099260893314 just
FAKE -1.8515316791872922 stated
FAKE -1.792994089124951 video
FAKE -1.7692811344292134 entire
FAKE -1.7170560749456338 oct
FAKE -1.653156253415706 advertisement
FAKE -1.6521635335266147 photo
FAKE -1.645439054367439 uk
FAKE -1.632347592035885 demand
FAKE -1.62103009800192 ___
FAKE -1.5942594624364452 pay

REAL 5.0043602199944415 said
REAL 3.0937745288610023 sa