In [1]:
import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import PunktSentenceTokenizer

import numpy as np
import xgboost as xgb
from tqdm import tqdm

#Keras/TF
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils

#SKLearn
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping

#NLTK Functions
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

Using TensorFlow backend.


In [21]:
data = pd.read_csv('../Jason/real_fake_combo.csv')

In [22]:
data.head()

Unnamed: 0.1,Unnamed: 0,title,realfake
0,0,#2816: Clinton Pride’s 8(a) Pig Farm Bridge – ...,fake
1,1,#2817: Serco's Zulu Starnet Blackmail – Clinto...,fake
2,2,Roger Stone update on Stop the Steal exit poll...,fake
3,3,#2818: Serco's Zulu Bridge To Mumbai Pig Farm ...,fake
4,4,Trump Advocates the American People's Control ...,fake


In [23]:
#Import stopwords
stopWords = set(stopwords.words('english'))
data['title_tokenized'] = [word_tokenize(i) for i in data['title']]

filtered = []
for words in data['title_tokenized']:
    temp = []
    for w in words:
        if w not in stopWords:
            temp.append(w)
    filtered.append(temp)

data['title_no_stops'] = filtered

In [24]:
data.head()

Unnamed: 0.1,Unnamed: 0,title,realfake,title_tokenized,title_no_stops
0,0,#2816: Clinton Pride’s 8(a) Pig Farm Bridge – ...,fake,"[#, 2816, :, Clinton, Pride, ’, s, 8, (, a, ),...","[#, 2816, :, Clinton, Pride, ’, 8, (, ), Pig, ..."
1,1,#2817: Serco's Zulu Starnet Blackmail – Clinto...,fake,"[#, 2817, :, Serco, 's, Zulu, Starnet, Blackma...","[#, 2817, :, Serco, 's, Zulu, Starnet, Blackma..."
2,2,Roger Stone update on Stop the Steal exit poll...,fake,"[Roger, Stone, update, on, Stop, the, Steal, e...","[Roger, Stone, update, Stop, Steal, exit, poll..."
3,3,#2818: Serco's Zulu Bridge To Mumbai Pig Farm ...,fake,"[#, 2818, :, Serco, 's, Zulu, Bridge, To, Mumb...","[#, 2818, :, Serco, 's, Zulu, Bridge, To, Mumb..."
4,4,Trump Advocates the American People's Control ...,fake,"[Trump, Advocates, the, American, People, 's, ...","[Trump, Advocates, American, People, 's, Contr..."


In [25]:
nltk.pos_tag(data['title_no_stops'][0])

[('#', '#'),
 ('2816', 'CD'),
 (':', ':'),
 ('Clinton', 'NNP'),
 ('Pride', 'NNP'),
 ('’', 'VBD'),
 ('8', 'CD'),
 ('(', '('),
 (')', ')'),
 ('Pig', 'NNP'),
 ('Farm', 'NNP'),
 ('Bridge', 'NNP'),
 ('–', 'NNP'),
 ('Serco', 'NNP'),
 ('Zulu', 'NNP'),
 ('Server', 'NNP'),
 ('Snuff', 'NNP'),
 ('–Soros', 'NNP'),
 ("'s", 'POS'),
 ('Patented', 'NNP'),
 ('Voter', 'NNP'),
 ('Key', 'NNP')]

In [26]:
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(data.realfake.values)

In [27]:
xtrain, xvalid, ytrain, yvalid = train_test_split(data.title.values, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)

In [28]:
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

In [29]:
# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv =  tfv.transform(xtrain) 
xvalid_tfv = tfv.transform(xvalid)

In [30]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [31]:
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.327 


In [32]:
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')

# Fitting Count Vectorizer to both training and test sets (semi-supervised learning)
ctv.fit(list(xtrain) + list(xvalid))
xtrain_ctv =  ctv.transform(xtrain) 
xvalid_ctv = ctv.transform(xvalid)

In [33]:
# Fitting a simple Logistic Regression on Counts
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.265 


In [34]:
# Fitting a simple Naive Bayes on TFIDF
clf = MultinomialNB()
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.288 


In [35]:
# Apply SVD, I chose 120 components. 120-200 components are good enough for SVM model.
svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

# Scale the data obtained from SVD. Renaming variable to reuse without scaling.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

In [36]:
# Fitting a simple SVM #FOOOOOORGET THIS MODEL... 
clf = SVC(C=1.0, probability=True) # since we need probabilities
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.392 


In [37]:
# Fitting a simple xgboost on tf-idf
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_tfv.tocsc(), ytrain)
predictions = clf.predict_proba(xvalid_tfv.tocsc())

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.400 


In [None]:
# Fitting a simple xgboost on tf-idf
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_tfv.tocsc(), ytrain)
predictions = clf.predict_proba(xvalid_tfv.tocsc())

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

In [39]:
xtrain[0]

'Village people: The rural past that unites Russia and Finland'

In [38]:
print(xtrain_tfv[0])

  (0, 23280)	0.4326572358468783
  (0, 17580)	0.26492448494251886
  (0, 17570)	0.5079863614425548
  (0, 15109)	0.3171193350423869
  (0, 14961)	0.3732538150335934
  (0, 7968)	0.49466100214265557


In [31]:
print(xtrain_tfv[1])

  (0, 24372)	0.2676670259740929
  (0, 24355)	0.3931714244117074
  (0, 24354)	0.3931714244117074
  (0, 21020)	0.3658696691007898
  (0, 15789)	0.216195906242261
  (0, 15199)	0.3931714244117074
  (0, 13178)	0.40196062648603204
  (0, 13177)	0.3498991660400383


In [32]:
print(xtrain[0])

Village people: The rural past that unites Russia and Finland


In [33]:
print(xtrain[1])

Mnangagwa will be sworn in as Zimbabwe president: ZANU-PF
