References:
- https://www.datacamp.com/community/tutorials/scikit-learn-fake-news
- https://s3.amazonaws.com/assets.datacamp.com/production/course_5064/slides/chapter4.pdf

In [40]:
import warnings
warnings.filterwarnings('ignore')

In [41]:
# Import basic libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import gzip

In [42]:
# Load the "fake or real news" CSV. `pd.read_csv` already returns a DataFrame,
# so no additional `pd.DataFrame(...)` wrapper is needed.
test_df = pd.read_csv('../00_Resources/fake_or_real_news.csv')

# Preview the first rows to check that the columns were parsed as expected.
test_df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [44]:
# `true_fake.csv` holds gzip-compressed data even though its name ends in
# `.csv`, so the compression must be stated explicitly (pandas would otherwise
# infer "no compression" from the extension). Letting pandas decompress also
# guarantees the underlying file handles are closed.
# 'Unnamed: 0' is dropped right away -- presumably a leftover saved index
# (NOTE(review): confirm against how the file was written).
df = (
    pd.read_csv('../00_Resources/true_fake.csv', compression='gzip')
    .drop(columns=['Unnamed: 0'])
)
df

Unnamed: 0,title,text,subject,date,category
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",True
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",True
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",True
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",True
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",True
...,...,...,...,...,...
44893,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",False
44894,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",False
44895,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",False
44896,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",False


In [45]:
# ML libraries
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [74]:
# TODO: find out how to CountVectorize a multi-dimensional array.

# The headline text is the only feature; the `category` column is the target.
X, y = df['title'], df['category']

In [75]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

# Sanity check: features and target must have the same number of rows.
print(X.shape, y.shape)

(44898,) (44898,)


In [76]:
# Scratch cell: try the bag-of-words encoding on the raw title arrays.
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(stop_words='english')

# Learn the vocabulary from the training titles only, then reuse that same
# vocabulary to encode the test titles.
count_train = count_vectorizer.fit_transform(X_train.to_numpy())
count_test = count_vectorizer.transform(X_test.to_numpy())

# Show the summary of the sparse test matrix.
count_test

<14817x18285 sparse matrix of type '<class 'numpy.int64'>'
	with 130145 stored elements in Compressed Sparse Row format>

# CountVectorizer and TfidfVectorizer

In [77]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoder configured with scikit-learn's built-in English stop-word list.
count_vectorizer = CountVectorizer(stop_words='english')

# The vocabulary is learned from the training titles only ...
count_train = count_vectorizer.fit_transform(raw_documents=X_train)

# ... and then applied unchanged to the test titles.
count_test = count_vectorizer.transform(raw_documents=X_test)

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF encoder: English stop words are removed and `max_df=0.7` additionally
# ignores terms that occur in more than 70% of the documents.
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# The vocabulary and IDF weights are learned from the training titles only ...
tfidf_train = tfidf_vectorizer.fit_transform(raw_documents=X_train)

# ... and then applied unchanged to the test titles.
tfidf_test = tfidf_vectorizer.transform(raw_documents=X_test)

In [79]:
def _vocabulary_terms(vectorizer):
    """Return the fitted vectorizer's feature names as a list of plain strings.

    `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in
    1.2 in favour of `get_feature_names_out()`. The newer method is used when
    it exists and the old one otherwise, so the cell runs on either version.
    Converting every entry with `str` keeps the printed list identical whether
    the underlying call returns a list or a NumPy array.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        names = vectorizer.get_feature_names_out()
    else:
        names = vectorizer.get_feature_names()
    return [str(name) for name in names]


# Last ten feature names of `tfidf_vectorizer`
print(_vocabulary_terms(tfidf_vectorizer)[-10:])

# First ten feature names of `count_vectorizer`
print(_vocabulary_terms(count_vectorizer)[:10])

['zoolander', 'zor', 'zschaepe', 'zucker', 'zuckerberg', 'zulia', 'zuma', 'zummar', 'zurich', 'état']
['00', '000', '000m', '0045', '0111', '0112', '0130', '0149', '02', '03']


In [80]:
# Dense DataFrame view of the training count matrix, one column per vocabulary term.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out()` when the installed version provides it.
count_features = (
    count_vectorizer.get_feature_names_out()
    if hasattr(count_vectorizer, 'get_feature_names_out')
    else count_vectorizer.get_feature_names()
)
# `.toarray()` is the documented way to densify a sparse matrix (`.A` is only a shorthand).
# NOTE(review): this materialises a full rows x vocabulary array and may need
# several GB of memory -- confirm this is acceptable for the dataset size.
count_df = pd.DataFrame(count_train.toarray(), columns=count_features)


In [81]:
# Dense DataFrame view of the training TF-IDF matrix, one column per vocabulary term.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out()` when the installed version provides it.
tfidf_features = (
    tfidf_vectorizer.get_feature_names_out()
    if hasattr(tfidf_vectorizer, 'get_feature_names_out')
    else tfidf_vectorizer.get_feature_names()
)
# `.toarray()` is the documented way to densify a sparse matrix (`.A` is only a shorthand).
# NOTE(review): this materialises a full rows x vocabulary array and may need
# several GB of memory -- confirm this is acceptable for the dataset size.
tfidf_df = pd.DataFrame(tfidf_train.toarray(), columns=tfidf_features)


In [82]:
# Vocabulary terms present in the count features but missing from the TF-IDF features.
difference = set(count_df.columns).difference(tfidf_df.columns)
difference

set()

In [83]:
# Same columns does not imply same content: compare the two frames value by value.
frames_identical = count_df.equals(tfidf_df)
print(frames_identical)


False


In [84]:
# Display the dense count matrix: one row per training title, one column per vocabulary term.
count_df


Unnamed: 0,00,000,000m,0045,0111,0112,0130,0149,02,03,...,zoolander,zor,zschaepe,zucker,zuckerberg,zulia,zuma,zummar,zurich,état
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30076,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30077,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30078,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30079,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# TextHero testing

Reference: https://pypi.org/project/texthero/

In [None]:
import texthero as hero

In [None]:
# Clean each headline, vectorise it with TF-IDF and reduce the result with PCA,
# storing the outcome as a new `head_pca` column on `df`.
# The source Series is `df['title']`: the previously referenced `all_news_df`
# is not defined anywhere in this notebook and raised a NameError on a fresh run.
df['head_pca'] = (
    df['title']
    .pipe(hero.clean)
    .pipe(hero.tfidf)
    .pipe(hero.pca)
)
# Untested alternative: run the same pipeline on `df['text']`, using
# `hero.tokenize` in place of `hero.clean`.