## Loading the Data

In [5]:
#Importing all the libraries
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
import scipy
warnings.filterwarnings ("ignore")

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

In [7]:
#Importing the Data
train = pd.read_csv('data/reddit_200k_train.csv', encoding = 'latin-1')
train = train[['body','REMOVED']]

test = pd.read_csv('data/reddit_200k_test.csv', encoding = 'latin-1')
test = test[['body','REMOVED']]

X_train = pd.DataFrame(train.body)
y_train = pd.DataFrame(train.REMOVED)

In [8]:
#Getting the info about the training set
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167529 entries, 0 to 167528
Data columns (total 1 columns):
body    167529 non-null object
dtypes: object(1)
memory usage: 1.3+ MB


Since the dataset it unbalanced, we would be undersampling the data.

In [9]:
#Undersampling the Data
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(replacement=False)
X_train_subsample, y_train_subsample = rus.fit_sample(
    X_train, y_train)
print(X_train_subsample.shape)
print(y_train_subsample.shape)

(129476, 1)
(129476, 1)


## Task 1.1 Baseline Model 

In [5]:
#Creating a baseline model
count = CountVectorizer()
lr = LogisticRegression()
base_pipeline = make_pipeline(count,lr)

In [6]:
#Cross-Validation Score of the baseline
from sklearn.model_selection import cross_validate

scoring = {'auc': 'roc_auc', 'accuracy':'accuracy', 'average_precision': 'average_precision', 
           'precision':'precision', 'recall':'recall'}

cv_baseline = cross_validate(base_pipeline, X_train_subsample.ravel(), y_train_subsample.ravel(), cv=3, scoring=scoring)


In [7]:
#Getting the result of the baseline model
print("accuracy baseline: {}".format(np.mean(cv_baseline['test_accuracy'])))
print("average_precision baseline: {}".format(np.mean(cv_baseline['test_average_precision'])))
print("precision baseline: {}".format(np.mean(cv_baseline['test_precision'])))
print("recall baseline: {}".format(np.mean(cv_baseline['test_recall'])))
print("auc baseline: {}".format(np.mean(cv_baseline['test_auc'])))

accuracy baseline: 0.686103968430159
average_precision baseline: 0.7002991440316109
precision baseline: 0.662622560552093
recall baseline: 0.7583026878427024
auc baseline: 0.7456980476638077


## Task 1.2 Other Pre-processing steps

Now we will improve our baseline model. Here we will implement three steps:

- Remove the stop words (english stop words)
- Remove the the words that are in less than 100 documents
- Enforce an l2 penalty for the logistic regression

In [8]:
#Removing Stop words and Infrequent words
count_1 = CountVectorizer(stop_words='english', min_df=100)
base_pipeline_update = make_pipeline(count_1,LogisticRegression(penalty='l2'))

cv_baseline_update = cross_validate(base_pipeline_update, X_train_subsample.ravel(), y_train_subsample.ravel(), cv=5, scoring=scoring)

In [9]:
#Getting the result of the updated baseline model with stop words and infrequent words removed
print("accuracy updated_baseline: {}".format(np.mean(cv_baseline_update['test_accuracy'])))
print("average_precision updated_baseline: {}".format(np.mean(cv_baseline_update['test_average_precision'])))
print("precision updated_baseline: {}".format(np.mean(cv_baseline_update['test_precision'])))
print("recall updated_baseline: {}".format(np.mean(cv_baseline_update['test_recall'])))
print("auc updated_baseline: {}".format(np.mean(cv_baseline_update['test_auc'])))

accuracy updated_baseline: 0.6814389945663553
average_precision updated_baseline: 0.693983356147583
precision updated_baseline: 0.6471149396013883
recall updated_baseline: 0.7981247231679718
auc updated_baseline: 0.738510409921234


It appears from the above that all the metric except the recall (which increases by about 4%) show a minor decrease in the performance. Next we will experiment with some more techniques to fine-tune the BoW model. The following steps would be used below:

- Tfidf Vectorizer would be used with english stop words removed.
- Grid Search over the C parameter for logistic regression model.

CV=3 has been taken due to computational limitations.

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

#Creating Pipeline
tfidf = TfidfVectorizer(stop_words='english')
pipeline_tfidf = Pipeline (steps = [('tfidf', TfidfVectorizer(stop_words='english')),
                                    ('lr_tfidf', LogisticRegression())])

In [14]:
#Performing GridSearch with Tfidf
parameters = {'lr_tfidf__C': np.logspace(0,4,10)}

log_tfidf = GridSearchCV(pipeline_tfidf, param_grid = parameters, cv = 3, return_train_score = True, 
                         scoring=scoring,  refit='auc',n_jobs=-1)
log_tfidf.fit(X_train_subsample.ravel(), y_train_subsample.ravel())

#Cross-Validating the best model
cv_log_tfidf = cross_validate(log_tfidf.best_estimator_, X_train_subsample.ravel(), 
                                    y_train_subsample.ravel(), cv=3, scoring=scoring)

#Getting the results of the above model
print("accuracy log_tfidf: {}".format(np.mean(cv_log_tfidf['test_accuracy'])))
print("average_precision log_tfidf: {}".format(np.mean(cv_log_tfidf['test_average_precision'])))
print("precision log_tfidf: {}".format(np.mean(cv_log_tfidf['test_precision'])))
print("recall log_tfidf: {}".format(np.mean(cv_log_tfidf['test_recall'])))
print("auc log_tfidf: {}".format(np.mean(cv_log_tfidf['test_auc'])))

accuracy log_tfidf: 0.6913018545144155
average_precision log_tfidf: 0.7276490057083356
precision log_tfidf: 0.6751870425459776
recall log_tfidf: 0.7372949425667178
auc log_tfidf: 0.7597363230621014


It appears that the tfidf vectorizer does not improve the performance of the model, it remains almost the same. Now we will use a character n-gram model and a more comprehensive grid search. We will be tuning the following parameters in the model:

- The C parameter of the Logistic Regression
- N-grams: To take into account the neighboring words
- Normalizer - Turned on/off

The model will be run with CountVectorizer configuration with a char analyzer and English stop words removed. Note that a few parameters are tested due to the long model training time and limitations in the computation power.

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Normalizer

#More Comprehensive Grid-Search

param_grid_final = {"logisticregression__C": [10, 1, 0.1],
              "countvectorizer__ngram_range": [(1, 1), (1, 2), (2, 3)],
              "normalizer": [None, Normalizer()]
             }

grid_final = GridSearchCV(make_pipeline(CountVectorizer(analyzer="char", stop_words='english'), Normalizer(), LogisticRegression(penalty='l2'),
                                  memory="cache_folder"),
                    param_grid=param_grid_final, cv=3, scoring=scoring, refit='auc',n_jobs=-1
                   )

grid_final.fit(X_train_subsample.ravel(), y_train_subsample.ravel())

cv_grid_final = cross_validate(grid_final.best_estimator_, X_train_subsample.ravel(), 
                                    y_train_subsample.ravel(), cv=3, scoring=scoring)

#Getting the results of the above model
print("accuracy grid_final: {}".format(np.mean(cv_grid_final['test_accuracy'])))
print("average_precision grid_final: {}".format(np.mean(cv_grid_final['test_average_precision'])))
print("precision grid_final: {}".format(np.mean(cv_grid_final['test_precision'])))
print("recall grid_final: {}".format(np.mean(cv_grid_final['test_recall'])))
print("auc grid_final: {}".format(np.mean(cv_grid_final['test_auc'])))

accuracy grid_final: 0.7049105725750858
average_precision grid_final: 0.763202981325995
precision grid_final: 0.6978890131400032
recall grid_final: 0.722651319218849
auc grid_final: 0.7820082290275384


In [12]:
print("grid_final best parameters: {}".format(grid_final.best_params_))

grid_final best parameters: {'countvectorizer__ngram_range': (2, 3), 'logisticregression__C': 10, 'normalizer': Normalizer(copy=True, norm='l2')}


As it can be observed, the scores have imporoved. Average precision and AUC increase the most (almost 5%). Now, we will create some new features and see if we can further enhance the model.

## Task 1.3 Creating New Features

We will now create new features and see if that affects the performance of our model. New features that will be created are:

- Length of the String (number of words in the string)
- Number of exclamation marks, full stops and commas
- Number of capitalization in the text - i.e the number of words that are all capital

In [13]:
#Creating a new dataframe
X_train_subsample_new = pd.DataFrame(X_train_subsample)
X_train_subsample_new.columns = ['body']

#Adding Features - Number of punctuations
X_train_subsample_new['comma_count'] = X_train_subsample_new.body.str.count(',')
X_train_subsample_new['exc_count'] = X_train_subsample_new.body.str.count('!')
X_train_subsample_new['dot_count'] = X_train_subsample_new.body.str.count('.')

#Adding Features - Number of words
X_train_subsample_new['split_body'] = X_train_subsample_new.body.str.split()
X_train_subsample_new['length'] = X_train_subsample_new.split_body.apply(lambda x: len(x))

#Number of Capitalizations in the string
X_train_subsample_new['capital'] = X_train_subsample_new.split_body.apply(lambda x: sum(map(str.isupper, x)))

X_train_subsample_new.drop(columns=['split_body'], inplace=True)
X_train_subsample_new.head()

Unnamed: 0,body,comma_count,exc_count,dot_count,length,capital
0,You don't say. So making ppl work 12-24 hrs ha...,0,0,87,16,0
1,I'm actually looking for a PCP for the first t...,1,0,609,107,8
2,Study: [Reconciling controversies about the â...,7,0,944,122,1
3,Journal reference:\r\n\r\nThe tumbling rotatio...,25,0,1766,241,6
4,"Why can't we have fun, responsibly?",1,0,36,6,0


In [14]:
from sklearn.preprocessing import FunctionTransformer

# Creating a function to vectorize the text and append the numberical features
def vectorizer(df):
    cvdf = CountVectorizer(stop_words='english').fit_transform(df.body)
    return scipy.sparse.hstack((cvdf, df.drop(columns='body')))

#Creating a transformed sample with the vectorizer
X_train_subsample_transformed = FunctionTransformer(func=vectorizer, 
                                                    validate=False).fit_transform(X_train_subsample_new)


cv_features = cross_validate(make_pipeline(LogisticRegression(penalty='l2')), X_train_subsample_transformed, y_train_subsample,
                             cv=3, scoring=scoring)

In [15]:
#Getting the result of the updated baseline model with stop words and infrequent words removed
print("accuracy cv_features: {}".format(np.mean(cv_features['test_accuracy'])))
print("average_precision cv_features: {}".format(np.mean(cv_features['test_average_precision'])))
print("precision cv_features: {}".format(np.mean(cv_features['test_precision'])))
print("recall cv_features: {}".format(np.mean(cv_features['test_recall'])))

accuracy cv_features: 0.6829373721201702
average_precision cv_features: 0.7004825478575661
precision cv_features: 0.6567325518653209
recall cv_features: 0.7666440886797358


Now we will run a more comprehensive grid-search tuning the 'C' parameter in the logistic regression and with normalizer turned on and off.

In [16]:
#More Comprehensive Grid-Search
param_grid_final = {"logisticregression__C": [10, 1, 0.1], "normalizer": [Normalizer(), None]}

grid_feat = GridSearchCV(make_pipeline(Normalizer(), LogisticRegression(penalty='l2')),
            param_grid=param_grid_final, cv=3, scoring=scoring, refit='auc',n_jobs=-1)

grid_feat.fit(X_train_subsample_transformed, y_train_subsample)

cv_grid_feat = cross_validate(grid_feat.best_estimator_, X_train_subsample_transformed, 
                                    y_train_subsample.ravel(), cv=3, scoring=scoring)

#Getting the results of the above model
print("accuracy grid_final: {}".format(np.mean(cv_grid_feat['test_accuracy'])))
print("average_precision grid_final: {}".format(np.mean(cv_grid_feat['test_average_precision'])))
print("precision grid_final: {}".format(np.mean(cv_grid_feat['test_precision'])))
print("recall grid_final: {}".format(np.mean(cv_grid_feat['test_recall'])))
print("auc grid_final: {}".format(np.mean(cv_grid_feat['test_auc'])))

accuracy grid_final: 0.689625851289676
average_precision grid_final: 0.7114884932106532
precision grid_final: 0.6553983153512535
recall grid_final: 0.7997775679604064
auc grid_final: 0.7523790685245612


In [17]:
print("grid_final best parameters: {}".format(grid_feat.best_params_))

grid_final best parameters: {'logisticregression__C': 0.1, 'normalizer': None}


It appears from the above that the Grid Search without the normalizer performs better than the grid search with the normalizer.

Through the BOW analysis, we conclude that the logistic regression model with parameters - {'countvectorizer__ngram_range': (2, 3), 'logisticregression__C': 10, 'normalizer': Normalizer(copy=True, norm='l2')} and without added features performs the best. However, note that there is not much significant different between this model and the baseline model.

All the other models that we tested in the analysis (tfidf, n-grams etc.) though they performance worse than this model, the difference is not significant.

Limitations: We were only able to search over a few hyperparameters due to the computational limitation and the long running times of the algorithm.

## Task 2 Word Vectors

In this section, instead of taking BOW model, we will use some pretrained word embedding to evaluate if the performance of our model improves.

We will use pretrained word vectors from FastText: https://fasttext.cc/docs/en/english-vectors.html

In [2]:
from gensim import models
#Loading the pre-trained word vector from FastText
from gensim.models.wrappers import FastText
model = gensim.models.KeyedVectors.load_word2vec_format('wiki-news-300d-1M.vec')

In this section, we will create a separate validation set (similar to the example in Prof. Mueller's notes), train the data on the training set and perform the validation on the validation set.

In [10]:
#Splitting a validation set from the original training set
from sklearn.model_selection import train_test_split
text_train_sub, text_val, y_train_sub, y_val = train_test_split(
    X_train_subsample, y_train_subsample, stratify=y_train_subsample, random_state=0)

In [11]:
#Initializing the CountVectorizer
vect_w2v = CountVectorizer(vocabulary=model.index2word)
docs = vect_w2v.inverse_transform(vect_w2v.transform(text_train_sub.ravel()))

Since not all documents are vectorized due by the w2v embeddings, we will create a function that removes documents that have a length of zero.

In [12]:
#Creating function to remove the docs that are not vectorized using word2vec
def removeBlankDocs(docInput, yInput):
    i = 0
    blankDocs = []
    for doc in docInput:
        if len(doc) == 0:
            blankDocs.append(i)
        i = i + 1
    return(np.delete(docInput, blankDocs, axis=0), np.delete(yInput, blankDocs, axis=0))

In [13]:
#Creating the revised data using the above function
X_train_w2v = removeBlankDocs(docs, y_train_sub)[0]
y_train_w2v = removeBlankDocs(docs, y_train_sub)[1]

X_train_final = np.vstack([np.mean(model[doc], axis=0) for doc in X_train_w2v])

In [14]:
#Creating and fitting the model
lr_w2v = LogisticRegression().fit(X_train_final, y_train_w2v)

In [15]:
# Performing the necessary preprocessing on the validation set
docs_val = vect_w2v.inverse_transform(vect_w2v.transform(text_val.ravel()))
X_val_w2v = removeBlankDocs(docs_val, y_val)[0]
y_val_w2v = removeBlankDocs(docs_val, y_val)[1]
X_val_final = np.vstack([np.mean(model[doc], axis=0) for doc in X_val_w2v])

Now let us calculate the performance of the model on the validation set.

In [16]:
#Calculating the validation scores

from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

print("Accuracy: " + str(lr_w2v.score(X_val_final, y_val_w2v)))
print("AUC: " + str(roc_auc_score(y_val_w2v, lr_w2v.predict(X_val_final))))
print("Average Precision: " + str(average_precision_score(y_val_w2v, lr_w2v.predict(X_val_final))))
print("Recall: " + str(recall_score(y_val_w2v, lr_w2v.predict(X_val_final))))

Accuracy: 0.6693353941267388
AUC: 0.6693371302898056
Average Precision: 0.6128822327216277
Recall: 0.6749536178107607


As it can be observed, the word2vec model does not improve the performance compared to the best model that was obtained in the previous part. Hence, BOW model is prefered for the this dataset with our current implementation.