# Training and testing a model to determine real vs. fake headlines

Reference: https://www.datacamp.com/community/tutorials/scikit-learn-fake-news
; https://s3.amazonaws.com/assets.datacamp.com/production/course_5064/slides/chapter4.pdf

In [7]:
import warnings
warnings.filterwarnings('ignore')

In [8]:
# Import basic libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import gzip

# Read in dataset(s)

In [15]:
test_df = pd.DataFrame(pd.read_csv('../00_Resources/fake_or_real_news.csv'))
test_df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [39]:
with open('../00_Resources/true_fake.csv', 'rb') as fd:
    gzip_fd = gzip.GzipFile(fileobj=fd)
    df = pd.read_csv(gzip_fd)
df = df.drop(columns={'Unnamed: 0'})
df = df.sample(frac=1).reset_index(drop=True)
df

Unnamed: 0,title,text,subject,date,category
0,Senator McCain treated for side effects of can...,WASHINGTON (Reuters) - U.S. Senator John McCai...,politicsNews,"December 13, 2017",True
1,"Britain won't pay EU more, or sooner, than if ...",BRUSSELS (Reuters) - Britain will honor all it...,worldnews,"December 8, 2017",True
2,REVEALED: HILLARY’S OVER-THE-TOP Spending Duri...,CLINTON S STATE DEPARTMENT WASTE AND MISMANAGE...,politics,"Oct 28, 2016",False
3,WOW! TOP SPONSORS OF NFL Issue Statements Abou...,"Despite massive boycott threats by consumers, ...",left-news,"Sep 26, 2017",False
4,Suicide bombers kill 10 in Nigeria's Maiduguri...,"MAIDUGURI, Nigeria (Reuters) - Suicide bomb at...",worldnews,"November 15, 2017",True
...,...,...,...,...,...
44893,WATCH: TRUMP STAYS ABOVE FRAY In Flint After D...,Even the Democrat ministers can t help themsel...,left-news,"Sep 15, 2016",False
44894,Philippines' Duterte says to deal with Trump i...,MANILA (Reuters) - Philippine President Rodrig...,worldnews,"October 29, 2017",True
44895,Twitter to meet Congressional panel probing 20...,(Reuters) - Twitter Inc representatives will m...,politicsNews,"September 21, 2017",True
44896,No Class Michelle Obama Takes Several Swipes A...,No class and no decorum! The always bitter Mic...,politics,"Sep 22, 2017",False


# Set X and y and split

In [40]:
# ML libraries
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [41]:
# TODO: find out how to CountVectorize a multi-dimensional array.

# Drop the `label` column
X = df['title']
y = df['category']

In [42]:
# Split the data into training and testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

print(X.shape, y.shape)

(44898,) (44898,)


In [43]:
# This cell is a test 
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(X_train.values)
count_test = count_vectorizer.transform(X_test.values)


count_test

<14817x18180 sparse matrix of type '<class 'numpy.int64'>'
	with 129594 stored elements in Compressed Sparse Row format>

# CountVectorizer and TfidfVectorizer

In [44]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the `count_vectorizer` 
count_vectorizer = CountVectorizer(stop_words='english')

# Fit and transform the training data 
count_train = count_vectorizer.fit_transform(X_train) 

# Transform the test set 
count_test = count_vectorizer.transform(X_test)

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the `tfidf_vectorizer` 
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7) 

# Fit and transform the training data 
tfidf_train = tfidf_vectorizer.fit_transform(X_train) 

# Transform the test set 
tfidf_test = tfidf_vectorizer.transform(X_test)

In [46]:
# Get the feature names of `tfidf_vectorizer` 
print(tfidf_vectorizer.get_feature_names()[-10:])

# Get the feature names of `count_vectorizer` 
print(count_vectorizer.get_feature_names()[:10])

['zor', 'zschaepe', 'zucker', 'zuckerberg', 'zulia', 'zuma', 'zummar', 'zurich', 'état', 'žižek']
['00', '000', '000m', '0045', '0111', '0112', '0130', '0149', '027', '0330']


In [47]:
count_df = pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names())


In [48]:
tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vectorizer.get_feature_names())


In [49]:
difference = set(count_df.columns) - set(tfidf_df.columns)
difference

set()

In [50]:
print(count_df.equals(tfidf_df))

False


In [51]:
count_df

Unnamed: 0,00,000,000m,0045,0111,0112,0130,0149,027,0330,...,zor,zschaepe,zucker,zuckerberg,zulia,zuma,zummar,zurich,état,žižek
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30076,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30077,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30078,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30079,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Test/train SVM

In [52]:
from sklearn.svm import SVC
# Create a support vector machine linear classifer and fit it to the training data
model = SVC(kernel='linear')
model.fit(tfidf_train, y_train)

In [53]:
# Print the model score using the test data
model.score(tfidf_test, y_test)

0.948505095498414

In [54]:
# Calculate the classification report
from sklearn.metrics import classification_report
preds = model.predict(tfidf_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

       False       0.96      0.94      0.95      7691
        True       0.94      0.96      0.95      7126

    accuracy                           0.95     14817
   macro avg       0.95      0.95      0.95     14817
weighted avg       0.95      0.95      0.95     14817



In [55]:
from sklearn.svm import SVC
# Create a support vector machine linear classifer and fit it to the training data
model = SVC(kernel='linear')
model.fit(count_train, y_train)

In [56]:
# Print the model score using the test data
model.score(count_test, y_test)

0.9459404737801175

In [57]:
# Calculate the classification report
from sklearn.metrics import classification_report
preds = model.predict(count_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

       False       0.96      0.94      0.95      7691
        True       0.94      0.95      0.94      7126

    accuracy                           0.95     14817
   macro avg       0.95      0.95      0.95     14817
weighted avg       0.95      0.95      0.95     14817



In [58]:
# Create the SVC Model
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model

In [59]:
# Create the SVC Model
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC 
from sklearn import set_config
set_config(display="diagram")

pipe = make_pipeline(model)
pipe

In [60]:
# Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV
param_grid = {'svc__C': [1, 5, 10, 50],
              'svc__gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(pipe, param_grid, verbose=3)

In [61]:
# Fit the model using the grid search estimator. 
# This will take the SVC model and try each combination of parameters
grid.fit(tfidf_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] svc__C=1, svc__gamma=0.0001 .....................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ......... svc__C=1, svc__gamma=0.0001, score=0.948, total=  50.9s
[CV] svc__C=1, svc__gamma=0.0001 .....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   50.9s remaining:    0.0s


[CV] ......... svc__C=1, svc__gamma=0.0001, score=0.942, total=  40.2s
[CV] svc__C=1, svc__gamma=0.0001 .....................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.5min remaining:    0.0s


[CV] ......... svc__C=1, svc__gamma=0.0001, score=0.943, total=  40.0s
[CV] svc__C=1, svc__gamma=0.0001 .....................................
[CV] ......... svc__C=1, svc__gamma=0.0001, score=0.947, total=  40.6s
[CV] svc__C=1, svc__gamma=0.0001 .....................................
[CV] ......... svc__C=1, svc__gamma=0.0001, score=0.938, total=  40.2s
[CV] svc__C=1, svc__gamma=0.0005 .....................................
[CV] ......... svc__C=1, svc__gamma=0.0005, score=0.948, total=  49.8s
[CV] svc__C=1, svc__gamma=0.0005 .....................................
[CV] ......... svc__C=1, svc__gamma=0.0005, score=0.942, total=  40.0s
[CV] svc__C=1, svc__gamma=0.0005 .....................................
[CV] ......... svc__C=1, svc__gamma=0.0005, score=0.943, total=  40.4s
[CV] svc__C=1, svc__gamma=0.0005 .....................................
[CV] ......... svc__C=1, svc__gamma=0.0005, score=0.947, total=  40.0s
[CV] svc__C=1, svc__gamma=0.0005 .....................................
[CV] .

[CV] ........ svc__C=50, svc__gamma=0.0001, score=0.930, total=  52.0s
[CV] svc__C=50, svc__gamma=0.0001 ....................................
[CV] ........ svc__C=50, svc__gamma=0.0001, score=0.927, total=  51.8s
[CV] svc__C=50, svc__gamma=0.0001 ....................................
[CV] ........ svc__C=50, svc__gamma=0.0001, score=0.926, total=  51.8s
[CV] svc__C=50, svc__gamma=0.0001 ....................................
[CV] ........ svc__C=50, svc__gamma=0.0001, score=0.926, total=  52.0s
[CV] svc__C=50, svc__gamma=0.0001 ....................................
[CV] ........ svc__C=50, svc__gamma=0.0001, score=0.924, total=  49.6s
[CV] svc__C=50, svc__gamma=0.0005 ....................................
[CV] ........ svc__C=50, svc__gamma=0.0005, score=0.930, total=  53.0s
[CV] svc__C=50, svc__gamma=0.0005 ....................................
[CV] ........ svc__C=50, svc__gamma=0.0005, score=0.927, total=  52.0s
[CV] svc__C=50, svc__gamma=0.0005 ....................................
[CV] .

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed: 69.7min finished


In [62]:
# List the best parameters for this dataset
print(f"Best params: {grid.best_params_}")
# List the best score
print(f"Best score: {grid.best_score_}")

Best params: {'svc__C': 1, 'svc__gamma': 0.0001}
Best score: 0.9435522612792123


In [63]:
grid.best_estimator_

In [65]:
# Make predictions with the hypertuned model
predictions = grid.predict(tfidf_test)

# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions, target_names = ["False", "True"]))

              precision    recall  f1-score   support

       False       0.96      0.94      0.95      7691
        True       0.94      0.96      0.95      7126

    accuracy                           0.95     14817
   macro avg       0.95      0.95      0.95     14817
weighted avg       0.95      0.95      0.95     14817



In [67]:
# save model using joblib
import joblib
filename = '../Models/fake_title_SVM_model.sav'
joblib.dump(grid.best_estimator_, filename)

['../Models/fake_title_SVM_model.sav']

# Check out a few other model types

In [74]:
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score

linear_clf = PassiveAggressiveClassifier()

linear_clf.fit(tfidf_train, y_train)
pred = linear_clf.predict(tfidf_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.938


In [77]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=0.1)

last_score = 0
for alpha in np.arange(0,1,.1):
    nb_classifier = MultinomialNB(alpha=alpha)
    nb_classifier.fit(tfidf_train, y_train)
    pred = nb_classifier.predict(tfidf_test)
    score = accuracy_score(y_test, pred)
    if score > last_score:
        clf = nb_classifier
    print("Alpha: {:.2f} Score: {:.5f}".format(alpha, score))

Alpha: 0.00 Score: 0.91132
Alpha: 0.10 Score: 0.93285
Alpha: 0.20 Score: 0.93487
Alpha: 0.30 Score: 0.93528
Alpha: 0.40 Score: 0.93501
Alpha: 0.50 Score: 0.93453
Alpha: 0.60 Score: 0.93460
Alpha: 0.70 Score: 0.93420
Alpha: 0.80 Score: 0.93386
Alpha: 0.90 Score: 0.93359


# Most important features in classification

In [86]:
feature_names = tfidf_vectorizer.get_feature_names()

# Print the "most real" words
sorted(zip(clf.coef_[0], feature_names), reverse=True)[:10]

[(-4.824434878891475, 'trump'),
 (-4.9664158215827445, 'says'),
 (-5.463495922186102, 'house'),
 (-5.805418006096416, 'russia'),
 (-5.846399288968043, 'north'),
 (-5.854134233547253, 'korea'),
 (-5.960795551602371, 'senate'),
 (-5.990360728482187, 'white'),
 (-6.014562240980712, 'china'),
 (-6.053748931513532, 'new')]

In [83]:
def most_informative_feature(vectorizer, classifier, n=100):
    """
    A function to print the most 'informative' words associated with fake and real news for binary classification.
    """
    class_labels = classifier.classes_
    feature_names = vectorizer.get_feature_names()
    topn_class1 = sorted(zip(classifier.coef_[0], feature_names))[:n]
    topn_class2 = sorted(zip(classifier.coef_[0], feature_names))[-n:]

    for coef, feat in topn_class1:
        print(class_labels[0], coef, feat)
    print('---------------------------------')
    for coef, feat in reversed(topn_class2):
        print(class_labels[1], coef, feat)


most_informative_feature(tfidf_vectorizer, linear_clf, n=30)

False -18.501315869574526 video
False -10.183033272142875 breaking
False -8.962783978277576 watch
False -8.695845982465602 racist
False -8.580605081204615 gop
False -7.958337354506038 just
False -7.615314532855993 hillary
False -7.612686163621778 molester
False -7.444991607397199 lied
False -7.132858458564095 joe
False -6.944876361052446 ck
False -6.80641834535091 dem
False -6.709122419175899 destroy
False -6.532911288921217 dumpster
False -6.4579459280415525 dems
False -6.337858502455757 begs
False -6.181717725218552 anonymous
False -6.115966363738506 wow
False -5.9842729083546065 images
False -5.925074557934142 james
False -5.872618046718923 illegals
False -5.807002449528325 lol
False -5.768693183974716 creative
False -5.7635075950422126 sharia
False -5.637259040804631 boiler
False -5.636065224425532 racism
False -5.634786524828486 bernie
False -5.600816961010544 kkk
False -5.569728780953048 ammo
False -5.532800040888096 caroline
-------------------------
True 7.4910269381597265 fact