## Import Dependencies

In [1]:
import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import PunktSentenceTokenizer

import numpy as np
import xgboost as xgb
from tqdm import tqdm

#Keras/TF
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils

#SKLearn
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping

#NLTK Functions
from nltk import word_tokenize
from nltk.corpus import stopwords
# English stop-word list loaded from NLTK's `stopwords` corpus.
stop_words = stopwords.words('english')

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

Using TensorFlow backend.


## Define LogLoss Function

In [2]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.

    :param actual: Array-like containing the actual target classes. Either a 1-D
        sequence of integer class indices, or a 2-D indicator (one-hot) matrix
        with one column per class.
    :param predicted: Array-like matrix with class predictions, one probability
        per class (rows are samples, columns are classes).
    :param eps: Probabilities are clipped to ``[eps, 1 - eps]`` before the log is
        taken, so a probability of exactly 0 gives a large finite penalty
        instead of ``-inf``.
    :return: The negative log-likelihood summed over all entries and divided by
        the number of rows.
    """
    # Accept plain lists / Series as well as ndarrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted, dtype=float)

    # Convert 'actual' to a binary indicator matrix if it holds class indices.
    if actual.ndim == 1:
        n_rows = actual.shape[0]
        one_hot = np.zeros((n_rows, predicted.shape[1]))
        one_hot[np.arange(n_rows), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    total = np.sum(actual * np.log(clipped))
    return -1.0 / actual.shape[0] * total

## Read in Data

In [3]:
# Load the dataset; the relative path is resolved against the kernel's working directory.
data = pd.read_csv(filepath_or_buffer='../real_fake_combo.csv')

In [4]:
# Tokenize every title, then drop English stop words from the token lists.
stopWords = set(stopwords.words('english'))
data['title_tokenized'] = [word_tokenize(title) for title in data['title']]

# NLTK's English stop-word list is all lowercase, so compare the lowercased
# token; otherwise capitalized stop words in headlines ("The", "Of", ...)
# are never filtered. The original casing of kept tokens is preserved.
filtered = [
    [w for w in words if w.lower() not in stopWords]
    for words in data['title_tokenized']
]

data['title_no_stops'] = filtered

## Encode labels and train/validation split

In [5]:
# Turn the `realfake` column into integer class ids; `lbl_enc` keeps the mapping
# so predictions can be translated back with `lbl_enc.inverse_transform`.
lbl_enc = preprocessing.LabelEncoder()
raw_labels = data['realfake'].values
y = lbl_enc.fit_transform(raw_labels)

In [6]:
# Hold out 10% of the titles for validation. Stratifying on y keeps the class
# balance the same in both parts; the fixed seed makes the split repeatable.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    data['title'].values,
    y,
    test_size=0.1,
    shuffle=True,
    stratify=y,
    random_state=42,
)

## Use Out-of-the-Box Vectorizers

In [7]:
# Word-level TF-IDF over 1- to 3-grams; terms seen in fewer than 3 documents are dropped.
# The on/off options are passed as real booleans: integer flags (1/0) for boolean
# parameters are deprecated and later rejected by scikit-learn's parameter validation.
tfv = TfidfVectorizer(min_df=3,  max_features=None,
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=True,smooth_idf=True,sublinear_tf=True,
            stop_words = 'english')

# Fit the vocabulary and IDF weights on the training AND validation titles
# (labels are not used at this step).
# NOTE(review): validation text influences the features this way, so validation
# scores may be slightly optimistic compared with fitting on xtrain only.
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv =  tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)

In [8]:
# Raw term counts over word 1- to 3-grams, with English stop words removed.
ctv = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
)

# The vocabulary is learned from the training and validation titles together
# (labels are not involved); each split is then transformed separately.
# NOTE(review): validation text contributes to the vocabulary here.
ctv.fit([*xtrain, *xvalid])
xtrain_ctv = ctv.transform(xtrain)
xvalid_ctv = ctv.transform(xvalid)

In [9]:
# Combine both vectorizers so their outputs are concatenated side by side.
union = FeatureUnion(transformer_list=[("tfv", tfv), ("ctv", ctv)])

In [10]:
# Fit the combined vectorizer on all titles (train + validation), then build the
# stacked TF-IDF + count feature matrices for each split.
union.fit([*xtrain, *xvalid])
xtrain_union, xvalid_union = union.transform(xtrain), union.transform(xvalid)

## Logistic Regression Classifier

In [11]:
# Baseline: logistic regression trained on the TF-IDF features.
clf = LogisticRegression(C=1.0).fit(xtrain_tfv, ytrain)
predictions_y = clf.predict(xvalid_tfv)        # hard class labels
predictions = clf.predict_proba(xvalid_tfv)    # per-class probabilities

# Validation metrics: log loss, confusion matrix (rows = true class,
# columns = predicted class) and mean accuracy.
valid_logloss = multiclass_logloss(yvalid, predictions)
valid_confusion = confusion_matrix(yvalid, predictions_y)
valid_accuracy = clf.score(xvalid_tfv, yvalid)

print("logloss: %0.3f " % valid_logloss)
print(valid_confusion)
print(f'Score: {valid_accuracy}')

logloss: 0.355 
[[1334  327]
 [ 224 2095]]
Score: 0.8615577889447236




In [12]:
# Baseline: logistic regression trained on the raw count features.
clf = LogisticRegression(C=1.0).fit(xtrain_ctv, ytrain)
predictions_y = clf.predict(xvalid_ctv)        # hard class labels
predictions = clf.predict_proba(xvalid_ctv)    # per-class probabilities

# Validation metrics: log loss, confusion matrix (rows = true class,
# columns = predicted class) and mean accuracy.
valid_logloss = multiclass_logloss(yvalid, predictions)
valid_confusion = confusion_matrix(yvalid, predictions_y)
valid_accuracy = clf.score(xvalid_ctv, yvalid)

print("logloss: %0.3f " % valid_logloss)
print(valid_confusion)
print(f'Score: {valid_accuracy}')

logloss: 0.308 
[[1330  331]
 [ 210 2109]]
Score: 0.864070351758794


## Naive Bayes

In [13]:
# Multinomial Naive Bayes trained on the TF-IDF features.
clf = MultinomialNB().fit(xtrain_tfv, ytrain)
predictions_y = clf.predict(xvalid_tfv)        # hard class labels
predictions = clf.predict_proba(xvalid_tfv)    # per-class probabilities

# Validation metrics: log loss, confusion matrix (rows = true class,
# columns = predicted class) and mean accuracy.
valid_logloss = multiclass_logloss(yvalid, predictions)
valid_confusion = confusion_matrix(yvalid, predictions_y)
valid_accuracy = clf.score(xvalid_tfv, yvalid)

print("logloss: %0.3f " % valid_logloss)
print(valid_confusion)
print(f'Score: {valid_accuracy}')

logloss: 0.330 
[[1320  341]
 [ 206 2113]]
Score: 0.8625628140703517


In [14]:
# Multinomial Naive Bayes trained on the raw count features.
clf = MultinomialNB().fit(xtrain_ctv, ytrain)
predictions_y = clf.predict(xvalid_ctv)        # hard class labels
predictions = clf.predict_proba(xvalid_ctv)    # per-class probabilities

# Validation metrics: log loss, confusion matrix (rows = true class,
# columns = predicted class) and mean accuracy.
valid_logloss = multiclass_logloss(yvalid, predictions)
valid_confusion = confusion_matrix(yvalid, predictions_y)
valid_accuracy = clf.score(xvalid_ctv, yvalid)

print("logloss: %0.3f " % valid_logloss)
print(valid_confusion)
print(f'Score: {valid_accuracy}')

logloss: 0.387 
[[1407  254]
 [ 266 2053]]
Score: 0.8693467336683417


## XGBoost

In [16]:
# Fitting a simple xgboost on tf-idf
# Convert to CSC once and reuse the same matrices for fit, predict and score,
# so every call sees the data in the same sparse format (score() was previously
# given the CSR matrix while fit/predict used CSC).
xtrain_tfv_csc = xtrain_tfv.tocsc()
xvalid_tfv_csc = xvalid_tfv.tocsc()

# NOTE(review): `nthread` is kept as in the original; newer xgboost releases
# name this option `n_jobs` — confirm against the installed version.
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8,
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_tfv_csc, ytrain)
predictions = clf.predict_proba(xvalid_tfv_csc)
predictions_y = clf.predict(xvalid_tfv_csc)

# Validation metrics: log loss, confusion matrix and mean accuracy.
print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))
print (confusion_matrix(yvalid,predictions_y))
print (f'Score: {clf.score(xvalid_tfv_csc,yvalid)}')

logloss: 0.459 
[[ 971  690]
 [ 167 2152]]
Score: 0.7846733668341709


In [17]:
# Fitting a simple xgboost on ctv
# Convert to CSC once and reuse the same matrices for fit, predict and score,
# so every call sees the data in the same sparse format (score() was previously
# given the CSR matrix while fit/predict used CSC).
xtrain_ctv_csc = xtrain_ctv.tocsc()
xvalid_ctv_csc = xvalid_ctv.tocsc()

# NOTE(review): `nthread` is kept as in the original; newer xgboost releases
# name this option `n_jobs` — confirm against the installed version.
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8,
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_ctv_csc, ytrain)
predictions = clf.predict_proba(xvalid_ctv_csc)
predictions_y = clf.predict(xvalid_ctv_csc)

# Validation metrics: log loss, confusion matrix and mean accuracy.
print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))
print (confusion_matrix(yvalid,predictions_y))
print (f'Score: {clf.score(xvalid_ctv_csc,yvalid)}')

logloss: 0.461 
[[ 997  664]
 [ 175 2144]]
Score: 0.7891959798994975
