In [1]:
## required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

In [2]:
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.width', 1000)

# np.set_printoptions(threshold=np.inf)

In [3]:
## load data
df = pd.read_csv('fake_or_real_news.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [5]:
## create a series to store the labels: y [response variable]
y = df.label

## create training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], y, test_size = 0.33, random_state = 53)

In [6]:
## initialize a CountVectorizer object: count_vectorizer
### CountVectorizer will be used to get bag of words vector and remove the stop words
count_vectorizer = CountVectorizer(stop_words = 'english')

## transform the training data using only the 'text' column values: count_train
### fit and transform the data
### creates the bag of words vectors
### generates mapping of words with ids and vectors representing how many times each word appears in the movie plot
### fit_transform works differently for different models but generally
### fit will find parameters in the data
### transform will apply the model's underlying algorithm or approximation
### here, we are going to create a bag of words dictionary and vector for each document using the training data
count_train = count_vectorizer.fit_transform(X_train.values)

## transform the test data using only the 'text' column values: count_test
### transform will create bag of words vectors for the test data using the same dictionary
### train and test need to use a consistent set of words so that the trained model can understand the test input
count_test = count_vectorizer.transform(X_test.values)

## print the first 10 features of the count_vectorizer
count_vectorizer.get_feature_names()[:10]

['00',
 '000',
 '0000',
 '00000031',
 '000035',
 '00006',
 '0001',
 '0001pt',
 '000ft',
 '000km']

In [7]:
## initialize a TfidfVectorizer object: tfidf_vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words = 'english', max_df=0.7)

## transform the training data: tfidf_train 
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)

## transform the test data: tfidf_test 
tfidf_test = tfidf_vectorizer.transform(X_test.values)

## print the first 10 features
tfidf_vectorizer.get_feature_names()[:-10]

['00',
 '000',
 '0000',
 '00000031',
 '000035',
 '00006',
 '0001',
 '0001pt',
 '000ft',
 '000km',
 '001',
 '0011',
 '002',
 '003',
 '004',
 '006',
 '006s',
 '007',
 '007s',
 '008',
 '008s',
 '009',
 '0099',
 '00am',
 '00p',
 '00pm',
 '01',
 '010',
 '013',
 '014',
 '015',
 '016',
 '018',
 '01am',
 '02',
 '020',
 '022',
 '023',
 '024',
 '025',
 '027',
 '028',
 '02welcome',
 '03',
 '031',
 '032',
 '0325',
 '033',
 '034',
 '035',
 '037',
 '039',
 '03eb',
 '04',
 '040',
 '0400',
 '042',
 '044',
 '048',
 '049',
 '04pm',
 '05',
 '0509245d29',
 '052',
 '056',
 '06',
 '062',
 '066',
 '068',
 '06pm',
 '07',
 '0700',
 '075',
 '076',
 '079',
 '07dryempjx',
 '08',
 '080',
 '081',
 '082',
 '084',
 '089',
 '0891',
 '09',
 '098263',
 '09am',
 '09pm',
 '0_jgdktlmn',
 '0a_merrill',
 '0d',
 '0fjjvowyhg8qtskiz',
 '0h4at2yetra17uxetni02ls2jeg0mty45jrcu7mrzsrpcbq464i',
 '0hq3vb2giv',
 '0in',
 '0jsn6pjkan',
 '0oeekvljlt',
 '0pt',
 '0t5',
 '0txrbwvobzz4fi5nksw6k5a6cxzbb3juxthmdiz93cby8gvrqiypzhajvjnt2',
 '0wo

In [8]:
## print the first 5 vectors of the tfidf training data
print(tfidf_train.A[:5])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [9]:
# Create the CountVectorizer DataFrame: count_df
count_df = pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names())

# Print the head of count_df
count_df.head()

Unnamed: 0,00,000,0000,00000031,000035,00006,0001,0001pt,000ft,000km,...,حلب,عربي,عن,لم,ما,محاولات,من,هذا,والمرضى,ยงade
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Create the TfidfVectorizer DataFrame: tfidf_df
tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vectorizer.get_feature_names())

# Print the head of tfidf_df
tfidf_df.head()

Unnamed: 0,00,000,0000,00000031,000035,00006,0001,0001pt,000ft,000km,...,حلب,عربي,عن,لم,ما,محاولات,من,هذا,والمرضى,ยงade
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Calculate the difference in columns: difference
difference = set(tfidf_df.columns) - set(count_df.columns)
difference

set()

In [12]:
# Check whether the DataFrames are equal
count_df.equals(tfidf_df)

False

In [13]:
# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(count_test)

# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE','REAL'])
cm

0.893352462936394


array([[ 865,  143],
       [  80, 1003]])

In [14]:
# Create a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier_tfidf = MultinomialNB()

# Fit the classifier to the training data
nb_classifier_tfidf.fit(tfidf_train, y_train)

# Create the predicted tags: pred
pred_tfidf = nb_classifier_tfidf.predict(tfidf_test)

# Calculate the accuracy score: score
score_tfidf = metrics.accuracy_score(y_test, pred_tfidf)
print(score_tfidf)

# Calculate the confusion matrix: cm
cm_tfidf = metrics.confusion_matrix(y_test, pred_tfidf, labels=['FAKE', 'REAL'])
cm_tfidf

0.8565279770444764


array([[ 739,  269],
       [  31, 1052]])

In [17]:
# Create the list of alphas: alphas
alphas = np.arange(0.1, 1, 0.1)

# Define train_and_predict()
def train_and_predict(alpha):
    # Instantiate the classifier: nb_classifier
    nb_classifier = MultinomialNB(alpha=alpha)
    # Fit to the training data
    nb_classifier.fit(tfidf_train, y_train)
    # Predict the labels: pred
    pred = nb_classifier.predict(tfidf_test)
    # Compute accuracy: score
    score = metrics.accuracy_score(y_test, pred)
    return score

# Iterate over the alphas and print the corresponding score
for alpha in alphas:
    print('Alpha: ', alpha)
    print('Score: ', train_and_predict(alpha))
    print()

Alpha:  0.1
Score:  0.8976566236250598

Alpha:  0.2
Score:  0.8938307030129125

Alpha:  0.30000000000000004
Score:  0.8900047824007652

Alpha:  0.4
Score:  0.8857006217120995

Alpha:  0.5
Score:  0.8842659014825442

Alpha:  0.6
Score:  0.874701099952176

Alpha:  0.7000000000000001
Score:  0.8703969392635102

Alpha:  0.8
Score:  0.8660927785748446

Alpha:  0.9
Score:  0.8589191774270684



In [18]:
# Get the class labels: class_labels
class_labels = nb_classifier_tfidf.classes_

# Extract the features: feature_names
feature_names = tfidf_vectorizer.get_feature_names()

# Zip the feature names together with the coefficient array and sort by weights: feat_with_weights
feat_with_weights = sorted(zip(nb_classifier_tfidf.coef_[0], feature_names))

# Print the first class label and the top 20 feat_with_weights entries
print(class_labels[0], feat_with_weights[:20])

# Print the second class label and the bottom 20 feat_with_weights entries
print(class_labels[1], feat_with_weights[-20:])

FAKE [(-11.316312804238807, '0000'), (-11.316312804238807, '000035'), (-11.316312804238807, '0001'), (-11.316312804238807, '0001pt'), (-11.316312804238807, '000km'), (-11.316312804238807, '0011'), (-11.316312804238807, '006s'), (-11.316312804238807, '007'), (-11.316312804238807, '007s'), (-11.316312804238807, '008s'), (-11.316312804238807, '0099'), (-11.316312804238807, '00am'), (-11.316312804238807, '00p'), (-11.316312804238807, '00pm'), (-11.316312804238807, '014'), (-11.316312804238807, '015'), (-11.316312804238807, '018'), (-11.316312804238807, '01am'), (-11.316312804238807, '020'), (-11.316312804238807, '023')]
REAL [(-7.742481952533027, 'states'), (-7.717550034444668, 'rubio'), (-7.703583809227384, 'voters'), (-7.654774992495461, 'house'), (-7.649398936153309, 'republicans'), (-7.6246184189367, 'bush'), (-7.616556675728881, 'percent'), (-7.545789237823644, 'people'), (-7.516447881078008, 'new'), (-7.448027933291952, 'party'), (-7.411148410203476, 'cruz'), (-7.410910239085596, 'st