In [None]:
"""Code snippets have been referenced from https://www.datacamp.com/community/tutorials/scikit-learn-fake-news"""

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer


In [2]:
import itertools

In [3]:
from matplotlib import pyplot as plt


In [4]:

df = pd.read_csv("politifact.csv")


df_a = pd.read_csv("awetesttext.csv")
    
# checking the head of dataframe
df.head()

Unnamed: 0.1,Unnamed: 0,X,title,label,platform
0,1,1,Virginia Republican Wants Schools To Check Chi...,fake,politifact
1,2,2,BREAKING: PUTIN INTERFERENCE COULD GIVE COURTS...,fake,politifact
2,3,3,BREAKING: Barrels Removed From Clinton Propert...,fake,politifact
3,4,4,BREAKING: Johnny Depp Taken Into Custody By Se...,fake,politifact
4,5,5,State Of Minnesota Has Just Handed Over FULL C...,fake,politifact


In [5]:
df_a = df_a[0:10]

In [6]:
df_a

Unnamed: 0,X,title,label
0,1.0,What happens when you give 4chan Pictures of t...,fake
1,2.0,New photo of what seems to be Boston bombing s...,real
2,3.0,My mom took that picture in August when we wer...,real
3,4.0,A picture someone took of a shark swimming by ...,fake
4,5.0,BREAKING: Barrels Removed From Clinton Propert...,fake
5,6.0,BREAKING: Johnny Depp Taken Into Custody By Se...,fake
6,7.0,State Of Minnesota Has Just Handed Over FULL C...,fake
7,8.0,"KISS' Gene Simmons: ""You Want to Win the War o...",real
8,9.0,Kerry says Syrian diplomacy at impasse after t...,real
9,10.0,Ford Motor CEO Says Trump Is Lying AGAIN! Comp...,real


In [7]:
# shape of df
df.shape

(208, 5)

In [8]:
df = df.set_index("Unnamed: 0")
df.head()

Unnamed: 0_level_0,X,title,label,platform
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,Virginia Republican Wants Schools To Check Chi...,fake,politifact
2,2,BREAKING: PUTIN INTERFERENCE COULD GIVE COURTS...,fake,politifact
3,3,BREAKING: Barrels Removed From Clinton Propert...,fake,politifact
4,4,BREAKING: Johnny Depp Taken Into Custody By Se...,fake,politifact
5,5,State Of Minnesota Has Just Handed Over FULL C...,fake,politifact


In [9]:
df_a = df_a.set_index("X")
df_a.head()

Unnamed: 0_level_0,title,label
X,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,What happens when you give 4chan Pictures of t...,fake
2.0,New photo of what seems to be Boston bombing s...,real
3.0,My mom took that picture in August when we wer...,real
4.0,A picture someone took of a shark swimming by ...,fake
5.0,BREAKING: Barrels Removed From Clinton Propert...,fake


In [14]:
df.columns

Index(['X', 'title', 'label', 'platform'], dtype='object')

In [15]:
df_a.columns

Index(['title', 'label'], dtype='object')

In [16]:
# Set `y` 
y = df.label 

# Drop the `label` column
df.drop("label", axis=1)

# Make training and test sets 
X_train, X_test, y_train, y_test = train_test_split(df['title'], y, test_size=0.20, random_state=53)

In [87]:
# Set `y` 
y_a = df_a.label 

# Drop the `label` column
df_a.drop("label", axis=1)

# Make training and test sets 
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(df_a['title'], y_a, test_size=0,random_state=1)



In [88]:
X_test_a 

Series([], Name: title, dtype: object)

In [89]:
X_train_a 

X
3.0     My mom took that picture in August when we wer...
10.0    Ford Motor CEO Says Trump Is Lying AGAIN! Comp...
7.0     State Of Minnesota Has Just Handed Over FULL C...
5.0     BREAKING: Barrels Removed From Clinton Propert...
1.0     What happens when you give 4chan Pictures of t...
4.0     A picture someone took of a shark swimming by ...
2.0     New photo of what seems to be Boston bombing s...
8.0     KISS' Gene Simmons: "You Want to Win the War o...
9.0     Kerry says Syrian diplomacy at impasse after t...
6.0     BREAKING: Johnny Depp Taken Into Custody By Se...
Name: title, dtype: object

In [90]:
y_train_a

X
3.0     real
10.0    real
7.0     fake
5.0     fake
1.0     fake
4.0     fake
2.0     real
8.0     real
9.0     real
6.0     fake
Name: label, dtype: object

In [91]:
y_test_a

Series([], Name: label, dtype: object)

In [18]:
# Initialize the `count_vectorizer` 
count_vectorizer = CountVectorizer(stop_words='english')

# Fit and transform the training data 
count_train = count_vectorizer.fit_transform(X_train) 

# Transform the test set 
count_test = count_vectorizer.transform(X_test)

In [19]:
# Initialize the `tfidf_vectorizer` 
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7) 

# Fit and transform the training data 
tfidf_train = tfidf_vectorizer.fit_transform(X_train) 

# Transform the test set 
tfidf_test = tfidf_vectorizer.transform(X_test)

In [20]:
# Get the feature names of `tfidf_vectorizer` 
print(tfidf_vectorizer.get_feature_names()[-10:])

# Get the feature names of `count_vectorizer` 
print(count_vectorizer.get_feature_names()[:10])

['word', 'work', 'working', 'world', 'worst', 'written', 'yeah', 'year', 'years', 'york']
['000', '10', '11', '12', '15', '16', '18', '183st', '19', '1st']


In [21]:
print(tfidf_vectorizer.get_feature_names()[-10:])

['word', 'work', 'working', 'world', 'worst', 'written', 'yeah', 'year', 'years', 'york']


In [22]:
count_df = pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names())

In [23]:
tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vectorizer.get_feature_names())

In [24]:
difference = set(count_df.columns) - set(tfidf_df.columns)
difference

set()

In [25]:
print(count_df.equals(tfidf_df))

False


In [26]:
count_df.head()

Unnamed: 0,000,10,11,12,15,16,18,183st,19,1st,...,word,work,working,world,worst,written,yeah,year,years,york
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [27]:
tfidf_df.head()

Unnamed: 0,000,10,11,12,15,16,18,183st,19,1st,...,word,work,working,world,worst,written,yeah,year,years,york
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.4037,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.246043,0.0,0.0


In [28]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    See full source and example: 
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [29]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix

In [30]:
clf = MultinomialNB() 

In [31]:
clf.fit(tfidf_train, y_train)
pred = clf.predict(tfidf_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
#cm = confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])

accuracy:   0.690


In [32]:
clf = MultinomialNB() 

In [33]:
clf.fit(count_train, y_train)
pred = clf.predict(count_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
#cm = confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])

accuracy:   0.738


In [34]:
from sklearn.linear_model import PassiveAggressiveClassifier

In [35]:
linear_clf = PassiveAggressiveClassifier(max_iter=50)

In [36]:
linear_clf.fit(tfidf_train, y_train)
pred = linear_clf.predict(tfidf_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
#cm = confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])

accuracy:   0.690


In [37]:
clf = MultinomialNB(alpha=0.1)

In [38]:
last_score = 0
for alpha in np.arange(0,1,.1):
    nb_classifier = MultinomialNB(alpha=alpha)
    nb_classifier.fit(tfidf_train, y_train)
    pred = nb_classifier.predict(tfidf_test)
    score = accuracy_score(y_test, pred)
    if score > last_score:
        clf = nb_classifier
    print("Alpha: {:.2f} Score: {:.5f}".format(alpha, score))

Alpha: 0.00 Score: 0.64286
Alpha: 0.10 Score: 0.66667
Alpha: 0.20 Score: 0.69048
Alpha: 0.30 Score: 0.66667
Alpha: 0.40 Score: 0.66667
Alpha: 0.50 Score: 0.66667
Alpha: 0.60 Score: 0.66667
Alpha: 0.70 Score: 0.66667
Alpha: 0.80 Score: 0.66667
Alpha: 0.90 Score: 0.69048


  'setting alpha = %.1e' % _ALPHA_MIN)


In [39]:
def most_informative_feature_for_binary_classification(vectorizer, classifier, n=100):
    """
    See: https://stackoverflow.com/a/26980472
    
    Identify most important features if given a vectorizer and binary classifier. Set n to the number
    of weighted features you would like to show. (Note: current implementation merely prints and does not 
    return top classes.)
    """

    class_labels = classifier.classes_
    feature_names = vectorizer.get_feature_names()
    topn_class1 = sorted(zip(classifier.coef_[0], feature_names))[:n]
    topn_class2 = sorted(zip(classifier.coef_[0], feature_names))[-n:]

    for coef, feat in topn_class1:
        print(class_labels[0], coef, feat)

    print()

    for coef, feat in reversed(topn_class2):
        print(class_labels[1], coef, feat)


most_informative_feature_for_binary_classification(tfidf_vectorizer, linear_clf, n=30)

fake -2.182637174256411 breaking
fake -1.7578110348124643 201c
fake -1.7578110348124643 201d
fake -1.2541085003214751 fired
fake -1.247521501561159 page
fake -1.1396820203491593 arrested
fake -1.0437273828358655 report
fake -1.0091883861017683 home
fake -0.9160215907329232 dead
fake -0.8565606277320624 navy
fake -0.7971662308408348 australia
fake -0.7926819171754316 say
fake -0.7858970419810448 residents
fake -0.7785594786128441 appears
fake -0.7785594786128441 g20
fake -0.7785594786128441 pretending
fake -0.7689563364754535 love
fake -0.7594206221404406 white
fake -0.753327392589855 women
fake -0.748575085508427 attack
fake -0.731788681566071 years
fake -0.7148106047068318 news
fake -0.7073993262274596 americans
fake -0.7002371672603901 murder
fake -0.677214504580184 democrat
fake -0.6712777606574615 canceled
fake -0.6712777606574615 reinstated
fake -0.6712777606574615 sacred
fake -0.6643321091008431 russian
fake -0.6626765322058704 muslims

real 1.4981362735109849 debate
real 1.30717

In [40]:
hash_vectorizer = HashingVectorizer(stop_words='english', non_negative=True)
hash_train = hash_vectorizer.fit_transform(X_train)
hash_test = hash_vectorizer.transform(X_test)



In [41]:
clf = MultinomialNB(alpha=.90)

In [42]:
clf.fit(hash_train, y_train)
pred = clf.predict(hash_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
#cm = confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])

accuracy:   0.738


In [43]:
clf = PassiveAggressiveClassifier(max_iter=50)

In [44]:
clf.fit(hash_train, y_train)
pred = clf.predict(hash_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
#cm = confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])

accuracy:   0.714


In [92]:
test_me = hash_vectorizer.transform(X_train_a)



In [93]:
pred = clf.predict(test_me)

In [94]:
pred

array(['fake', 'real', 'fake', 'fake', 'real', 'fake', 'real', 'real',
       'real', 'fake'], dtype='<U4')

In [95]:
score = accuracy_score(y_train_a, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.800


In [96]:
score

0.8

In [97]:
X_train_a

X
3.0     My mom took that picture in August when we wer...
10.0    Ford Motor CEO Says Trump Is Lying AGAIN! Comp...
7.0     State Of Minnesota Has Just Handed Over FULL C...
5.0     BREAKING: Barrels Removed From Clinton Propert...
1.0     What happens when you give 4chan Pictures of t...
4.0     A picture someone took of a shark swimming by ...
2.0     New photo of what seems to be Boston bombing s...
8.0     KISS' Gene Simmons: "You Want to Win the War o...
9.0     Kerry says Syrian diplomacy at impasse after t...
6.0     BREAKING: Johnny Depp Taken Into Custody By Se...
Name: title, dtype: object

In [None]:
pred