### Libraries

In [127]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re
from time import time
import string
#import itertools
from pprint import pprint

from nltk import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report

#from gensim import models
#from gensim.models import word2vec,doc2vec

### Load Data

In [98]:
df = pd.read_csv("data/fake_or_real_news.csv")
print(df.shape)

In [100]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [101]:
# Set index
df = df.set_index('Unnamed: 0') 
df.head()

Unnamed: 0_level_0,title,text,label
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [102]:
#Rename index
df.index.names = ['docId']
df.head()

Unnamed: 0_level_0,title,text,label
docId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


### Pre-Process  : Article text & title

In [103]:
# Remove Punctuation
# Convert to Lowercase
# Stem words
# Remove Numbers
# Remove Stop Words

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

def remove_punctuation(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    return s

def preprocess_string(s):
    s = remove_punctuation(s)
    s= s.lower()
    
    #tokenize
    words = word_tokenize(s)

    #unique words
    words = list(set(words))

    #stem
    ps = PorterStemmer()    
    words = [ps.stem(w) for w in words]
    
    #stopwords
    STOP_WORDS = list(stopwords.words('english'))         #About 900 stopwords
    words = [w for w in words if not w in STOP_WORDS]

    #recombine words
    s = ' '.join(words)
    
    #regex remove numbers
    #RE_PREPROCESS = r'\W+|\d+' #the regular expressions that matches all non-characters
    #s = re.sub(RE_PREPROCESS, ' ', s)
    
    #remove extra spaces
    s = re.sub(' +',' ',s)
    
    return s

preprocess_string("Baked 3=$ ,Stuart's baked  M  Potatoes")

'stuart 3 potato bake'

In [104]:
def preprocess_df(df, input_col, output_col):
    df[output_col] = df[input_col].apply(preprocess_string)
    return df

#### Note: Preprocessing text is expected to take around 2-3 minutes

In [105]:
df = preprocess_df(df, 'text', 'cleanText')

In [106]:
df = preprocess_df(df, 'title', 'cleanTitle')

In [107]:
df.head()

Unnamed: 0_level_0,title,text,label,cleanText,cleanTitle
docId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,bring knew done result store breath much lie n...,’ fear hillari smell
10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,came print alreadi done day desper exampl limp...,polit suicid trump paul watch ralli video mome...
3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,citi secur later note staffer taken attorney b...,gestur go kerri pari sympathi
10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,accord avoid post vote convent jill work dure ...,warn support tri anger erupt dnc berni twitter
875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,deleg victori point momentum watch convent fro...,york matter primari thi whi battl new


### Create Train-Test set from Article Text

In [16]:
y = df.label 
df.drop("label", axis=1) 
X_train, X_test, y_train, y_test = train_test_split(df['cleanText'], y, test_size=0.33, random_state=42)

In [17]:
print(X_train.shape)
print(type(X_train))
print(X_train.head())


print(X_test.shape)
print(type(X_test))
print(X_test.head())

(4244,)
<class 'pandas.core.series.Series'>
docId
4857    bring secur stake temperament gate affect illf...
9885    declin came watch belong dure anoth new aid an...
6681    accord personnel fals load friend result game ...
9306    bring class fund veneer indic special design e...
2232    egypt tripl painstakingli show magazin new tra...
Name: cleanText, dtype: object
(2091,)
<class 'pandas.core.series.Series'>
docId
9957    bring scandal fragment devour effect knew word...
7596    accord came 3 appear result carolina attorney ...
8905    accord bring 8th effect wish word 3 oppos done...
8752    print individu earlier could octob 27 cnn trut...
7804    later milk proceed attempt homicid fed 66 pour...
Name: cleanText, dtype: object


### Vectorize Article Text (Bag of words)

In [18]:
count_vectorizer = CountVectorizer(stop_words='english') 
count_train = count_vectorizer.fit_transform(X_train) 
count_test = count_vectorizer.transform(X_test)

In [19]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7) 
tfidf_train = tfidf_vectorizer.fit_transform(X_train) 
tfidf_test = tfidf_vectorizer.transform(X_test)

In [21]:
print(count_train.shape)
print(count_test.shape)

print(tfidf_train.shape)
print(tfidf_test.shape)

(4244, 50615)
(2091, 50615)
(4244, 50612)
(2091, 50612)
(4244, 1048576)
(2091, 1048576)


In [22]:
print(tfidf_vectorizer.get_feature_names()[-10:])
print(count_vectorizer.get_feature_names()[:10])

['שתי', 'תאמצנה', 'תוצאה', 'תחל', 'תיירות', 'תנותק', 'תעודת', 'תתרכז', 'القادمون', 'عربي']
['00', '000', '0000', '0000000031', '000000031', '00017b2908ff9fa45188d243fd49aaeeb2dhrcofficecom', '0004', '0006', '0007', '0008']


In [27]:
count_df = pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names())
tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vectorizer.get_feature_names())

In [28]:
difference = set(count_df.columns) - set(tfidf_df.columns)
difference

{'ha', 'thi', 'wa'}

In [29]:
print(count_df.equals(tfidf_df))

False


In [30]:
tfidf_df.head()

Unnamed: 0,00,000,0000,0000000031,000000031,00017b2908ff9fa45188d243fd49aaeeb2dhrcofficecom,0004,0006,0007,0008,...,שתי,תאמצנה,תוצאה,תחל,תיירות,תנותק,תעודת,תתרכז,القادمون,عربي
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
count_df.head()

Unnamed: 0,00,000,0000,0000000031,000000031,00017b2908ff9fa45188d243fd49aaeeb2dhrcofficecom,0004,0006,0007,0008,...,שתי,תאמצנה,תוצאה,תחל,תיירות,תנותק,תעודת,תתרכז,القادمون,عربي
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Classify Article as Fake / Real

#### Define: General Purpose Method to fit, classify and score a model 

In [87]:
def classify_and_fit(clf, X_train, y_train, X_test, y_test, class_labels = ['FAKE', 'REAL']):
    print("Classifier : ", clf )
    
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    score = metrics.accuracy_score(y_test, pred)
    
    print("Accuracy:   %0.3f" % score)

    print("\nConfusion Matrix :")
    #print(pd.crosstab(y_test, pred, rownames=['True'], colnames=['Predicted'], margins=True))
    cm = metrics.confusion_matrix(y_test, pred, labels=class_labels)
    print(cm)
    
    print("\nReport :")    
    print(classification_report(y_test, pred, target_names=class_labels))
    
    
    return clf



### Method 1 A: Naive Bayes Classifier (Tfidf vectors)

In [108]:
clf = MultinomialNB() 
classify_and_fit(clf, tfidf_train, y_train, tfidf_test, y_test)

Classifier :  MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Accuracy:   0.749

Confusion Matrix :
[[ 551  520]
 [   4 1016]]

Report :
             precision    recall  f1-score   support

       FAKE       0.99      0.51      0.68      1071
       REAL       0.66      1.00      0.79      1020

avg / total       0.83      0.75      0.73      2091



MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Method 1 B: Naive Bayes Classifier (Count vectors)

In [109]:
clf = MultinomialNB() 
classify_and_fit(clf, count_train, y_train, count_test, y_test)

Classifier :  MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Accuracy:   0.881

Confusion Matrix :
[[878 193]
 [ 56 964]]

Report :
             precision    recall  f1-score   support

       FAKE       0.94      0.82      0.88      1071
       REAL       0.83      0.95      0.89      1020

avg / total       0.89      0.88      0.88      2091



MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Method 2: Passive Aggressive Classifier (Tfidf vectors)

In [91]:
#http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html

In [92]:
linear_clf = PassiveAggressiveClassifier(n_iter=50)
classify_and_fit(linear_clf, tfidf_train, y_train, tfidf_test, y_test)

Classifier :  PassiveAggressiveClassifier(C=1.0, class_weight=None, fit_intercept=True,
              loss='hinge', n_iter=50, n_jobs=1, random_state=None,
              shuffle=True, verbose=0, warm_start=False)
Accuracy:   0.938

Confusion Matrix :
[[1007   64]
 [  65  955]]

Report :
             precision    recall  f1-score   support

       FAKE       0.94      0.94      0.94      1071
       REAL       0.94      0.94      0.94      1020

avg / total       0.94      0.94      0.94      2091



PassiveAggressiveClassifier(C=1.0, class_weight=None, fit_intercept=True,
              loss='hinge', n_iter=50, n_jobs=1, random_state=None,
              shuffle=True, verbose=0, warm_start=False)

### Method 3: Logistic Regression (Tfidf vectors)

In [113]:
logistic_clf = LogisticRegression()
classify_and_fit(logistic_clf, tfidf_train, y_train, tfidf_test, y_test)

Classifier :  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Accuracy:   0.920

Confusion Matrix :
[[998  73]
 [ 95 925]]

Report :
             precision    recall  f1-score   support

       FAKE       0.91      0.93      0.92      1071
       REAL       0.93      0.91      0.92      1020

avg / total       0.92      0.92      0.92      2091



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [None]:
#feature_names = tfidf_vectorizer.get_feature_names
#tokens_with_weights = sorted(list(zip(feature_names, clf.coef_[0])))


### Improve : Use Grid Search to Optimize Classifier

In [144]:
def optimize_fake_news_pipeline(pipeline, X_train, y_train):
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=True)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(X_train, y_train)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))


#### Optimize: Naive Bayes

In [187]:
ALPHA = (0.01, 0.05, 0.1) #Tune the Smoothing Parameter for Naive Bayes
parameters = {
    'tfidf__min_df':np.array([0]),
    'tfidf__max_df':np.array([0.7]),
    'nb__alpha': ALPHA
}

In [188]:
print("OPTIMIZE: Naive Bayes Pipeline\n")
pipelinparameters = {
    'tfidf__min_df':np.array([0]),
    'tfidf__max_df':np.array([0.7]),
    'nb__alpha': ALPHA
}e = Pipeline([('tfidf', TfidfVectorizer()), ('nb', MultinomialNB())])
optimize_fake_news_pipeline(pipeline, X_train, y_train)

OPTIMIZE: Naive Bayes Pipeline
Performing grid search...
pipeline: ['tfidf', 'nb']
parameters:
{'nb__alpha': (0.01, 0.05, 0.1),
 'tfidf__max_df': array([ 0.7]),
 'tfidf__min_df': array([0])}
Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:   13.3s finished


done in 15.526s

Best score: 0.893
Best parameters set:
	nb__alpha: 0.01
	tfidf__max_df: 0.69999999999999996
	tfidf__min_df: 0


In [195]:
parameters = { 'nb__alpha': ALPHA
}

In [196]:
print("OPTIMIZE: Naive Bayes Pipeline\n")
pipeline = Pipeline([('count', CountVectorizer()), ('nb', MultinomialNB())])
optimize_fake_news_pipeline(pipeline, X_train, y_train)

OPTIMIZE: Naive Bayes Pipeline

Performing grid search...
pipeline: ['count', 'nb']
parameters:
{'nb__alpha': (0.01, 0.05, 0.1)}
Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:   14.5s finished


done in 16.602s

Best score: 0.888
Best parameters set:
	nb__alpha: 0.05


#### Optimize: Logistic Regression 

In [189]:
PENALTY = ('l1', 'l2')    #Tune for Penalty in Logistic Regression
parameters = {
    'tfidf__min_df':np.array([0]),
    'tfidf__max_df':np.array([0.7]),
    'logistic__penalty': PENALTY,
}


print("OPTIMIZE: Logistic Regression Pipeline\n")
pipeline = Pipeline([('tfidf', TfidfVectorizer()), ('logistic', LogisticRegression())])
optimize_fake_news_pipeline(pipeline, X_train, y_train)

OPTIMIZE: Logistic Regression Pipeline
Performing grid search...
pipeline: ['tfidf', 'logistic']
parameters:
{'logistic__penalty': ('l1', 'l2'),
 'tfidf__max_df': array([ 0.7]),
 'tfidf__min_df': array([0])}
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   15.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   15.4s finished


done in 17.784s

Best score: 0.906
Best parameters set:
	logistic__penalty: 'l2'
	tfidf__max_df: 0.69999999999999996
	tfidf__min_df: 0


#### Optimize: Passive Aggressive Classifier

In [190]:
FIT_INTERCEPT = (True, False) #Tune for Intercept fitting in Passive Aggressive Classifier
parameters = {
    'tfidf__min_df':np.array([0]),
    'tfidf__max_df':np.array([0.7]),
    'passive__fit_intercept':FIT_INTERCEPT    
}

In [193]:
print("OPTIMIZE: Passive Aggressive Pipeline\n")
pipeline = Pipeline([('tfidf', TfidfVectorizer()), ('passive', PassiveAggressiveClassifier())])
optimize_fake_news_pipeline(pipeline, X_train, y_train)

OPTIMIZE: Passive Aggressive Pipeline
Performing grid search...
pipeline: ['tfidf', 'passive']
parameters:
{'passive__fit_intercept': (True, False),
 'tfidf__max_df': array([ 0.7]),
 'tfidf__min_df': array([0])}
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    9.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    9.6s finished


done in 11.426s

Best score: 0.922
Best parameters set:
	passive__fit_intercept: True
	tfidf__max_df: 0.69999999999999996
	tfidf__min_df: 0


### Compare: Optimized Models for Naive Bayes, Logistic Regression and Passive Aggressive Classifier

In [199]:
clf_1A = MultinomialNB(alpha = 0.01) 
classify_and_fit(clf_1A, tfidf_train, y_train, tfidf_test, y_test)

Classifier :  MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
Accuracy:   0.904

Confusion Matrix :
[[924 147]
 [ 54 966]]

Report :
             precision    recall  f1-score   support

       FAKE       0.94      0.86      0.90      1071
       REAL       0.87      0.95      0.91      1020

avg / total       0.91      0.90      0.90      2091



MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [200]:
clf_1B = MultinomialNB(alpha = 0.05)
classify_and_fit(clf_1B, count_train, y_train, count_test, y_test)

Classifier :  MultinomialNB(alpha=0.05, class_prior=None, fit_prior=True)
Accuracy:   0.904

Confusion Matrix :
[[936 135]
 [ 66 954]]

Report :
             precision    recall  f1-score   support

       FAKE       0.93      0.87      0.90      1071
       REAL       0.88      0.94      0.90      1020

avg / total       0.91      0.90      0.90      2091



MultinomialNB(alpha=0.05, class_prior=None, fit_prior=True)

In [201]:
clf_2 = LogisticRegression(penalty='l2')
classify_and_fit(clf_2, tfidf_train, y_train, tfidf_test, y_test)

Classifier :  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Accuracy:   0.920

Confusion Matrix :
[[998  73]
 [ 95 925]]

Report :
             precision    recall  f1-score   support

       FAKE       0.91      0.93      0.92      1071
       REAL       0.93      0.91      0.92      1020

avg / total       0.92      0.92      0.92      2091



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [202]:
clf_3 = PassiveAggressiveClassifier(fit_intercept=True)
classify_and_fit(clf_3, tfidf_train, y_train, tfidf_test, y_test)

Classifier :  PassiveAggressiveClassifier(C=1.0, class_weight=None, fit_intercept=True,
              loss='hinge', n_iter=5, n_jobs=1, random_state=None,
              shuffle=True, verbose=0, warm_start=False)
Accuracy:   0.937

Confusion Matrix :
[[1004   67]
 [  64  956]]

Report :
             precision    recall  f1-score   support

       FAKE       0.94      0.94      0.94      1071
       REAL       0.93      0.94      0.94      1020

avg / total       0.94      0.94      0.94      2091



PassiveAggressiveClassifier(C=1.0, class_weight=None, fit_intercept=True,
              loss='hinge', n_iter=5, n_jobs=1, random_state=None,
              shuffle=True, verbose=0, warm_start=False)

### Compare: Informative Features

In [93]:
def most_informative_feature_for_binary_classification(vectorizer, classifier, n=100):
    """
    See: https://stackoverflow.com/a/26980472
    
    Identify most important features if given a vectorizer and binary classifier. Set n to the number
    of weighted features you would like to show. (Note: current implementation merely prints and does not 
    return top classes.)
    """

    class_labels = classifier.classes_
    feature_names = vectorizer.get_feature_names()
    topn_class1 = sorted(zip(classifier.coef_[0], feature_names))[:n]
    topn_class2 = sorted(zip(classifier.coef_[0], feature_names))[-n:]

    for coef, feat in topn_class1:
        print(class_labels[0], coef, feat)

    print()
    for coef, feat in reversed(topn_class2):
        print(class_labels[1], coef, feat)
    
    return topn_class1, topn_class2

In [156]:
res = most_informative_feature_for_binary_classification(tfidf_vectorizer, linear_clf, n=30)

FAKE -5.67113088514 octob
FAKE -5.49568571614 2016
FAKE -3.9827836493 share
FAKE -3.80541627778 novemb
FAKE -3.53490888305 articl
FAKE -2.95730636216 sourc
FAKE -2.90972086612 elect
FAKE -2.83820023232 print
FAKE -2.6397580198 howev
FAKE -2.2844792595 oct
FAKE -2.2712199773 comment
FAKE -2.14705069424 pleas
FAKE -2.05134224276 wikileak
FAKE -2.05010662685 mainstream
FAKE -1.97054067113 video
FAKE -1.94582997157 post
FAKE -1.91824660218 nov
FAKE -1.9113633889 email
FAKE -1.87920257972 podesta
FAKE -1.79691290318 load
FAKE -1.79576577288 corrupt
FAKE -1.77361594032 28
FAKE -1.76939545208 snip
FAKE -1.68621466246 vote
FAKE -1.65550815423 entir
FAKE -1.63163977398 connect
FAKE -1.5973490388 hillari
FAKE -1.57377561834 min
FAKE -1.56899035776 imag
FAKE -1.55678531207 url

REAL 2.6068026986 conserv
REAL 2.45781348786 rush
REAL 2.34140621337 monday
REAL 2.26148297611 debat
REAL 2.16795506615 candid
REAL 2.13226775589 gop
REAL 2.11692841908 dont
REAL 2.07348163174 grow
REAL 2.06721250248 messa