In [None]:
"""Code snippets have been referenced from https://www.datacamp.com/community/tutorials/scikit-learn-fake-news"""

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer


In [2]:
import itertools

In [3]:
from matplotlib import pyplot as plt


In [4]:

# Load the headline dataset. The path is relative, so buzzfeed.csv has to be in
# the notebook's working directory.
df = pd.read_csv("buzzfeed.csv")

# Preview the first rows to check which columns were read
df.head()

Unnamed: 0.1,Unnamed: 0,X,title,label,platform
0,1,1,Another Terrorist Attack in NYC...Why Are we S...,real,buzzfeed
1,2,2,"Trump: Drugs a 'Very, Very Big Factor' in Char...",real,buzzfeed
2,3,3,"Obama To UN: 'Giving Up Liberty, Enhances Secu...",real,buzzfeed
3,4,4,Trump vs. Clinton: A Fundamental Clash over Ho...,real,buzzfeed
4,5,5,President Obama Vetoes 9/11 Victims Bill,real,buzzfeed


In [5]:
# shape of df
df.shape

(176, 5)

In [6]:
# Use the "Unnamed: 0" column as the row index.
# The membership check keeps this cell re-runnable: after the first run the
# column has already become the index, and calling set_index on it a second
# time would raise KeyError.
if "Unnamed: 0" in df.columns:
    df = df.set_index("Unnamed: 0")
df.head()

Unnamed: 0_level_0,X,title,label,platform
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,Another Terrorist Attack in NYC...Why Are we S...,real,buzzfeed
2,2,"Trump: Drugs a 'Very, Very Big Factor' in Char...",real,buzzfeed
3,3,"Obama To UN: 'Giving Up Liberty, Enhances Secu...",real,buzzfeed
4,4,Trump vs. Clinton: A Fundamental Clash over Ho...,real,buzzfeed
5,5,President Obama Vetoes 9/11 Victims Bill,real,buzzfeed


In [7]:
df.columns

Index(['X', 'title', 'label', 'platform'], dtype='object')

In [8]:
# Target vector: the `label` column
y = df.label

# Only the `title` column is passed on as the feature, so `label` does not have
# to be removed from `df`. (A bare `df.drop("label", axis=1)` returns a new
# frame and leaves `df` untouched, so it would be a no-op here.)

# Make training and test sets: 20% held out, fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    df['title'], y, test_size=0.20, random_state=53
)

In [9]:
# Initialize the `count_vectorizer` with the built-in English stop-word list
count_vectorizer = CountVectorizer(stop_words='english')

# Fit on the training titles only and transform them in the same call
count_train = count_vectorizer.fit_transform(X_train) 

# Transform the test set with the already-fitted vectorizer (it is not refit on X_test)
count_test = count_vectorizer.transform(X_test)

In [10]:
# Initialize the `tfidf_vectorizer` with the built-in English stop-word list.
# max_df=0.7 sets the document-frequency cut-off: per the scikit-learn docs a
# float max_df drops terms found in more than that share of the documents.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7) 

# Fit on the training titles only and transform them in the same call
tfidf_train = tfidf_vectorizer.fit_transform(X_train) 

# Transform the test set with the already-fitted vectorizer (it is not refit on X_test)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [11]:
# Print the LAST ten feature names of `tfidf_vectorizer`
# NOTE(review): `get_feature_names()` was deprecated in scikit-learn 1.0 in
# favour of `get_feature_names_out()` -- confirm the pinned scikit-learn
# version before upgrading the environment.
print(tfidf_vectorizer.get_feature_names()[-10:])

# Print the FIRST ten feature names of `count_vectorizer`
print(count_vectorizer.get_feature_names()[:10])

['won', 'word', 'words', 'works', 'world', 'worst', 'wounded', 'wrong', 'york', 'young']
['10', '100percentfedup', '11', '1st', '2011', '2012', '2013', '2014', '2016', '2018']


In [12]:
print(tfidf_vectorizer.get_feature_names()[-10:])

['won', 'word', 'words', 'works', 'world', 'worst', 'wounded', 'wrong', 'york', 'young']


In [13]:
# Dense copy of the training count matrix, one column per feature name.
# `.toarray()` is the spelled-out form of the sparse-matrix `.A` shorthand.
count_df = pd.DataFrame(
    count_train.toarray(),
    columns=count_vectorizer.get_feature_names(),
)

In [14]:
# Dense copy of the training TF-IDF matrix, one column per feature name.
# `.toarray()` is the spelled-out form of the sparse-matrix `.A` shorthand.
tfidf_df = pd.DataFrame(
    tfidf_train.toarray(),
    columns=tfidf_vectorizer.get_feature_names(),
)

In [15]:
# Feature names that appear in the count frame but not in the TF-IDF frame
difference = set(count_df.columns).difference(tfidf_df.columns)
difference

set()

In [16]:
print(count_df.equals(tfidf_df))

False


In [17]:
count_df.head()

Unnamed: 0,10,100percentfedup,11,1st,2011,2012,2013,2014,2016,2018,...,won,word,words,works,world,worst,wounded,wrong,york,young
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
tfidf_df.head()

Unnamed: 0,10,100percentfedup,11,1st,2011,2012,2013,2014,2016,2018,...,won,word,words,works,world,worst,wounded,wrong,york,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a one-line status message and plot a confusion matrix as an
    annotated heat map on the current matplotlib figure.

    See full source and example: 
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix. Rows are drawn on the 'True label' axis and columns
        on the 'Predicted label' axis.
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool, default False
        When True, every row is divided by its row sum before plotting.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None
    """
    cm = np.asarray(cm)

    # Normalize BEFORE drawing, so the heat-map colours, the colour bar and the
    # numbers written into the cells all describe the same matrix.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are written as-is; fractional values are rounded to two
    # decimals so long floats do not overflow the cells.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than the mid-point get white text so they stay readable
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out last so the axis labels are taken into account
    plt.tight_layout()

In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix

In [21]:
clf = MultinomialNB() 

In [22]:
# Train `clf` on the TF-IDF features and score it on the held-out titles
pred = clf.fit(tfidf_train, y_train).predict(tfidf_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
# Confusion-matrix plot kept disabled; note the labels in this data are lowercase.
#cm = confusion_matrix(y_test, pred, labels=['fake', 'real'])
#plot_confusion_matrix(cm, classes=['fake', 'real'])

accuracy:   0.694


In [23]:
clf = MultinomialNB() 

In [24]:
# Train `clf` on the raw count features and score it on the held-out titles
pred = clf.fit(count_train, y_train).predict(count_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
# Confusion-matrix plot kept disabled; note the labels in this data are lowercase.
#cm = confusion_matrix(y_test, pred, labels=['fake', 'real'])
#plot_confusion_matrix(cm, classes=['fake', 'real'])

accuracy:   0.694


In [25]:
from sklearn.linear_model import PassiveAggressiveClassifier

In [26]:
# Fix the seed: per the scikit-learn docs this estimator shuffles the training
# data between epochs by default, so without a random_state the reported
# accuracy can differ from run to run. 53 matches the seed of the train/test split.
linear_clf = PassiveAggressiveClassifier(max_iter=50, random_state=53)

In [27]:
# Train the linear classifier on the TF-IDF features and score it on the held-out titles
pred = linear_clf.fit(tfidf_train, y_train).predict(tfidf_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
# Confusion-matrix plot kept disabled; note the labels in this data are lowercase.
#cm = confusion_matrix(y_test, pred, labels=['fake', 'real'])
#plot_confusion_matrix(cm, classes=['fake', 'real'])

accuracy:   0.667


In [28]:
clf = MultinomialNB(alpha=0.1)

In [29]:
# Sweep the Naive Bayes smoothing parameter and keep the best-scoring model in `clf`.
# NOTE(review): the score is computed on the test split, so the chosen alpha is
# tuned on the same data that is used for evaluation -- consider a validation
# split or cross-validation instead.
last_score = 0
for alpha in np.arange(0,1,.1):
    nb_classifier = MultinomialNB(alpha=alpha)
    nb_classifier.fit(tfidf_train, y_train)
    pred = nb_classifier.predict(tfidf_test)
    score = accuracy_score(y_test, pred)
    if score > last_score:
        clf = nb_classifier
        # Remember the best score so far. Without this update every model with
        # a non-zero score replaced `clf`, so the LAST one was kept, not the best.
        last_score = score
    print("Alpha: {:.2f} Score: {:.5f}".format(alpha, score))

Alpha: 0.00 Score: 0.58333
Alpha: 0.10 Score: 0.63889
Alpha: 0.20 Score: 0.66667
Alpha: 0.30 Score: 0.66667
Alpha: 0.40 Score: 0.66667
Alpha: 0.50 Score: 0.69444
Alpha: 0.60 Score: 0.69444
Alpha: 0.70 Score: 0.69444
Alpha: 0.80 Score: 0.69444
Alpha: 0.90 Score: 0.69444


  'setting alpha = %.1e' % _ALPHA_MIN)


In [30]:
def most_informative_feature_for_binary_classification(vectorizer, classifier, n=100):
    """
    See: https://stackoverflow.com/a/26980472

    Print the most strongly weighted features of a fitted binary linear classifier.

    The features are ranked by the first row of ``classifier.coef_``. The ``n``
    lowest-weighted features are printed first, tagged with
    ``classifier.classes_[0]``; after a blank line the ``n`` highest-weighted
    features are printed in descending order, tagged with ``classifier.classes_[1]``.

    Parameters
    ----------
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` or ``get_feature_names()``.
    classifier : object
        Fitted classifier exposing ``classes_`` and ``coef_``.
    n : int, default 100
        Number of features to print per class. ``n=0`` prints no features.

    Returns
    -------
    None
        The function only prints; it does not return the selected features.
    """

    class_labels = classifier.classes_
    # Prefer the newer accessor when the vectorizer has it and fall back to the
    # older one otherwise, so the helper works with either API.
    get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    feature_names = get_names()

    # Sort once and slice both ends instead of sorting the same pairs twice
    ranked = sorted(zip(classifier.coef_[0], feature_names))
    topn_class1 = ranked[:n]
    # `ranked[-0:]` is the WHOLE list, so n == 0 needs an explicit empty result
    topn_class2 = ranked[-n:] if n else []

    for coef, feat in topn_class1:
        print(class_labels[0], coef, feat)

    print()

    for coef, feat in reversed(topn_class2):
        print(class_labels[1], coef, feat)


# Print the 30 lowest- and 30 highest-weighted TF-IDF features of `linear_clf`
most_informative_feature_for_binary_classification(tfidf_vectorizer, linear_clf, n=30)

fake -1.702796725977134 22c6
fake -1.4696517174319543 freedom
fake -1.275791436692157 daily
fake -1.2542039495835733 muslim
fake -1.1215165727987786 winner
fake -1.114564048340495 campaign
fake -1.0678293623846409 hillary
fake -0.9775476307496065 promise
fake -0.967448792589386 new
fake -0.9492571750819842 2013
fake -0.9492571750819842 deep
fake -0.9206306797082028 goes
fake -0.9113895182699796 exposed
fake -0.8971606804075654 white
fake -0.8792441328604574 confirm
fake -0.8792441328604574 networks
fake -0.8760229616099684 unfit
fake -0.8701251473625365 sued
fake -0.8700168296431802 army
fake -0.8114113534934129 way
fake -0.8075740528168479 just
fake -0.7804288629360127 york
fake -0.768161466034945 adapt
fake -0.7614720655705974 maintenance
fake -0.7614720655705974 website
fake -0.7308720330404348 boom
fake -0.7258341369874883 herald
fake -0.7240349583616174 bombshell
fake -0.7240349583616174 millions
fake -0.7240349583616174 oligarchs

real 1.7215119680557502 trump
real 1.062791307762

In [31]:
# Hash the titles into a fixed-size feature space.
# `alternate_sign=False` keeps every feature value non-negative. It is the
# documented replacement for `non_negative=True`, which was deprecated in
# scikit-learn 0.19 and removed in 0.21.
hash_vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False)
hash_train = hash_vectorizer.fit_transform(X_train)
hash_test = hash_vectorizer.transform(X_test)



In [32]:
# Fix the seed: per the scikit-learn docs this estimator shuffles the training
# data between epochs by default, so without a random_state the reported
# accuracy can differ from run to run. 53 matches the seed of the train/test split.
clf = PassiveAggressiveClassifier(max_iter=50, random_state=53)

In [33]:
# Train `clf` on the hashed features and score it on the held-out titles
pred = clf.fit(hash_train, y_train).predict(hash_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
# Confusion-matrix plot kept disabled; note the labels in this data are lowercase.
#cm = confusion_matrix(y_test, pred, labels=['fake', 'real'])
#plot_confusion_matrix(cm, classes=['fake', 'real'])

accuracy:   0.694


In [34]:
clf = MultinomialNB(alpha=.90)

In [35]:
# Train `clf` on the hashed features and score it on the held-out titles
pred = clf.fit(hash_train, y_train).predict(hash_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
# Confusion-matrix plot kept disabled; note the labels in this data are lowercase.
#cm = confusion_matrix(y_test, pred, labels=['fake', 'real'])
#plot_confusion_matrix(cm, classes=['fake', 'real'])

accuracy:   0.778


In [36]:
# Load a second set of titles and keep only the first ten rows
df_a = pd.read_csv("awetesttext.csv")
df_a = df_a.iloc[:10]
df_a = df_a.set_index("X")
y_a = df_a.label

# Only the `title` column is used as the feature, so `label` does not have to be
# removed from `df_a`. (A bare `df_a.drop("label", axis=1)` returns a new frame
# and leaves `df_a` untouched, so it would be a no-op here.)

# With test_size=0 every row lands in the "train" half, so this call only
# shuffles the ten rows, repeatably because random_state is fixed.
# NOTE(review): recent scikit-learn releases reject test_size=0 -- confirm the
# pinned version before upgrading the environment.
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(df_a['title'], y_a, test_size=0,random_state=1)

# Score the classifier currently bound to `clf` on these titles, using the same
# hashing vectorizer that produced its training features
test_me = hash_vectorizer.transform(X_train_a)
pred = clf.predict(test_me)
score = accuracy_score(y_train_a, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.500




In [37]:
pred

array(['real', 'real', 'real', 'fake', 'real', 'real', 'fake', 'real',
       'fake', 'fake'], dtype='<U4')

In [40]:
X_train_a

X
3.0     My mom took that picture in August when we wer...
10.0    Ford Motor CEO Says Trump Is Lying AGAIN! Comp...
7.0     State Of Minnesota Has Just Handed Over FULL C...
5.0     BREAKING: Barrels Removed From Clinton Propert...
1.0     What happens when you give 4chan Pictures of t...
4.0     A picture someone took of a shark swimming by ...
2.0     New photo of what seems to be Boston bombing s...
8.0     KISS' Gene Simmons: "You Want to Win the War o...
9.0     Kerry says Syrian diplomacy at impasse after t...
6.0     BREAKING: Johnny Depp Taken Into Custody By Se...
Name: title, dtype: object

In [41]:
pred

array(['real', 'real', 'real', 'fake', 'real', 'real', 'fake', 'real',
       'fake', 'fake'], dtype='<U4')