In [1]:
import warnings
# Silence every warning for the rest of the session: no category, message or
# module restriction is passed, so the 'ignore' action applies to all of them.
# NOTE(review): this also hides library deprecation warnings — consider
# narrowing the filter to the specific warnings that are known to be noise.
warnings.filterwarnings('ignore')

In [2]:
# Import basic libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import gzip

In [4]:
# Load the fake/real news CSV, discard the exported index column and map the
# labels 'FAKE' / 'REAL' to the strings 'False' / 'True'.
# The mapping is applied to the `label` column only: a frame-wide replace
# would also rewrite any title/text cell whose whole value is 'FAKE' or 'REAL'.
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap.
test_df = (
    pd.read_csv('../00_Resources/fake_or_real_news.csv')
    .drop(columns=['Unnamed: 0'])
    .assign(label=lambda frame: frame['label'].replace({'FAKE': 'False', 'REAL': 'True'}))
)
test_df

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",False
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,False
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,True
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",False
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,True
...,...,...,...
6330,State Department says it can't find emails fro...,The State Department told the Republican Natio...,True
6331,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,False
6332,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,False
6333,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",True


In [18]:
# Load the combined true/fake corpus. The file has a .csv extension but is read
# as gzip, so the compression is stated explicitly and pandas opens and closes
# the stream itself (the hand-built GzipFile wrapper was never closed).
df = pd.read_csv('../00_Resources/true_fake.csv', compression='gzip')

# Drop the columns that are not used and rename the target so it matches test_df.
df = df.drop(columns=['Unnamed: 0', 'subject', 'date'])
df = df.rename(columns={'category': 'label'})

# Optional, currently disabled: merge in test_df
# (DataFrame.append was removed in pandas 2.0, hence pd.concat).
# df = pd.concat([df, test_df], ignore_index=True)

# Shuffle the rows with a fixed seed so a rerun reproduces the same order.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Optional, currently disabled: discard rows without a usable label.
# df = df.loc[df['label'] != 'unknown']
df

Unnamed: 0,title,text,label
0,Boiler Room #108 – Who’d Win in a Fight? Boile...,Tune in to the Alternate Current Radio Network...,False
1,Two-Thirds Of Mayors In America Support Raisi...,Republicans in Congress may not support raisin...,False
2,Senate Republican leader says plans vote on co...,WASHINGTON (Reuters) - U.S. Senate Majority Le...,True
3,#PresidentObamaNotBarry Protests Blatant Raci...,There is a hashtag trending on Twitter in prot...,False
4,Tech companies to meet on legal challenge to T...,SAN FRANCISCO (Reuters) - A group of technolog...,True
...,...,...,...
44893,Hyperventilating Christians Freak Out Over Ba...,The Satanic Temple just forced a total shutdow...,False
44894,WHOA! THIS DC DIVA Blows Off Traffic Laws In A...,What the heck! First it s Hillary Clinton who ...,False
44895,Oklahoma House Votes To Let Domestic Abusers ...,You know what would be really stupid? If a gro...,False
44896,North Korea would not commit to peace talks bu...,UNITED NATIONS (Reuters) - United Nations poli...,True


In [5]:
# ML libraries
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [7]:
# TODO: find out how to CountVectorize a multi-dimensional array.
# Features come from the `text` column, the target from the `label` column.
X, y = df['text'], df['label']

In [8]:
# Hold out 33% of the rows for evaluation; the seed keeps the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Sizes of the full feature / target series (before splitting)
print(X.shape, y.shape)

(6335,) (6335,)


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features: English stop words are removed and terms that occur in
# more than 70% of the training documents are ignored
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.7,
    stop_words='english',
)

# The vocabulary and IDF weights are learned from the training texts only ...
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

# ... and the held-out texts are projected onto that same fitted vocabulary
tfidf_test = tfidf_vectorizer.transform(X_test)

tfidf_train

<4244x56801 sparse matrix of type '<class 'numpy.float64'>'
	with 1103130 stored elements in Compressed Sparse Row format>

In [10]:
# Show the training TF-IDF matrix as a labelled DataFrame (one column per term).
# - The frame is built through the sparse accessor so the matrix is never
#   densified; `.A` materialised every zero of a (documents x vocabulary) array.
# - `get_feature_names()` was removed in scikit-learn 1.2, so prefer
#   `get_feature_names_out()` and fall back only on releases that lack it.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()
tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf_train, columns=feature_names)
tfidf_df

Unnamed: 0,00,000,0000,000000031,00000031,0001,000billion,000ft,000km,001,...,שתי,תאמצנה,תוצאה,תחל,תיירות,תנותק,תעודת,תתרכז,القادمون,عربي
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4240,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier trained on the TF-IDF features.
# NOTE(review): per the scikit-learn docs `gamma` is the coefficient of the
# rbf/poly/sigmoid kernels, so it should have no effect here — confirm and drop.
model = SVC(kernel='linear', C=1.0, gamma=0.0001)
model.fit(tfidf_train, y_train)

# Mean accuracy on the training split, then on the held-out split
train_accuracy = model.score(tfidf_train, y_train)
test_accuracy = model.score(tfidf_test, y_test)
print(train_accuracy)
print(test_accuracy)

0.9901036757775683
0.9344811095169775


In [12]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the held-out split
preds = model.predict(tfidf_test)
report = classification_report(y_test, preds)
print(report)

              precision    recall  f1-score   support

       False       0.93      0.95      0.94      1071
        True       0.94      0.92      0.93      1020

    accuracy                           0.93      2091
   macro avg       0.93      0.93      0.93      2091
weighted avg       0.93      0.93      0.93      2091



In [49]:
# Save the fitted estimator to disk with joblib.
# Only `model` is written; the fitted TfidfVectorizer is not part of this file,
# so whoever loads it needs the matching vectorizer from somewhere else.
# NOTE(review): reading the notebook top to bottom, `model` was last fitted on
# features built from df['text'], yet the file name says "title" — confirm
# which model this file is meant to hold.
import joblib
filename = '../05_Models/fake_title_SVM_model.sav'
joblib.dump(model, filename)

['../05_Models/fake_title_SVM_model.sav']

In [13]:
# Headline experiment: same TF-IDF + linear SVC recipe, but trained on the
# `title` column of test_df.
X = test_df['title']
y = test_df['label']

# Split the data into training and testing
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Initialize the `tfidf_vectorizer`
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

# Fit and transform the training data
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

# Transform the test set
tfidf_test = tfidf_vectorizer.transform(X_test)

# Labelled view of the training matrix, kept sparse so it is never densified.
# `get_feature_names()` was removed in scikit-learn 1.2; fall back to it only
# on releases that do not provide `get_feature_names_out()`.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()
tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf_train, columns=feature_names)

# Create a fresh linear support vector classifier for this experiment instead
# of refitting whatever `model` object an earlier cell left in the kernel.
model = SVC(C=1.0, kernel='linear', gamma=0.0001)
model.fit(tfidf_train, y_train)

# Print the model score on the training split, then on the test split
print(model.score(tfidf_train, y_train))
print(model.score(tfidf_test, y_test))

# Calculate the classification report
preds = model.predict(tfidf_test)
print(classification_report(y_test, preds))

0.9634778510838832
0.8010521281683405
              precision    recall  f1-score   support

       False       0.81      0.80      0.81      1071
        True       0.79      0.80      0.80      1020

    accuracy                           0.80      2091
   macro avg       0.80      0.80      0.80      2091
weighted avg       0.80      0.80      0.80      2091



In [51]:
# Predict labels for the current held-out TF-IDF matrix.
# The result is only assigned to `preds`; this cell has no trailing expression
# or print, so it does not display anything by itself.
preds = model.predict(tfidf_test)


Unnamed: 0,00,000,0000,00004,000063,00042,0009,000938,000american,000have,...,zzjjpdaivn,zzn3bqnfsk,zzpx_bzka40police,zzqvyk8xif,zzsg90pbf6,zztaine,zzuml4hkoc,zzzzaaaacccchhh,zzzzzzzz,émigré
0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.030085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30076,0.000000,0.116148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30077,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30078,0.185293,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30079,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Save the fitted estimator to disk with joblib.
# Reading the notebook top to bottom, the `model` in scope here was just fitted
# on TF-IDF features of test_df['title'], so it is stored under the "title"
# name rather than the "body" one.
# Only the estimator is written; the fitted TfidfVectorizer is not included.
import joblib
filename = '../05_Models/fake_title_SVM_model.sav'
joblib.dump(model, filename)

In [19]:
# Body-text experiment packaged as a single pipeline, so the vectorizer and the
# classifier are fitted (and can later be saved) together.
X = test_df['text']
y = test_df['label']

# Split the data into training and testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# A pipeline for the text model.
# probability=True makes SVC fit an additional cross-validated calibration
# whose data shuffling is random; random_state pins it so predict_proba
# output is reproducible across runs. Hard predictions are unaffected.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
model = SVC(C=1.0, kernel='linear', gamma=0.0001, probability=True, random_state=42)
pipeline = make_pipeline(vectorizer, model)

# Fit vectorizer and model
pipeline.fit(X_train, y_train)

# Print model score on the training split, then on the test split
print(pipeline.score(X_train, y_train))
print(pipeline.score(X_test, y_test))

0.9901036757775683
0.9344811095169775


In [20]:
# Save the fitted pipeline to disk with joblib.
# `pipeline` is the object built by make_pipeline(vectorizer, model), so the
# TF-IDF vectorizer and the classifier are written to the same file.
# joblib.dump is the last expression, so its return value is shown as the cell output.
import joblib
filename = '../05_Models/fake_body_SVM_pipeline.sav'
joblib.dump(pipeline, filename)

['../05_Models/fake_body_SVM_pipeline.sav']