In [1]:
import numpy as np
import pandas as pd
import string
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, recall_score, roc_auc_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import nltk

from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag

from src.util import identity, model_report, regtokenize
from src.transformers import GildPreprocessor, WordCounterSubjectivity, WordCounter

# Load Data

In [2]:
# Read both comment datasets from the local data/ directory:
#   df     <- data/askreddit_top_50.csv
#   df_all <- data/askreddit_all.csv
df, df_all = (
    pd.read_csv(csv_path)
    for csv_path in ('data/askreddit_top_50.csv', 'data/askreddit_all.csv')
)

In [3]:
# Top-level comment set: ~9% minority (positive) class, 5000 rows.
X, y = df['body'], df['target']

# 80/20 hold-out split, stratified on the label and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    test_size=0.2,
    random_state=1,
)

In [4]:
# All comments set: ~0.2% minority (positive) class, ~140,000 rows
# (the stratified test fold reported further down holds 61 positives
# out of 28,426 rows).
Xa, ya = df_all['body'], df_all['target']

# 80/20 hold-out split, stratified on the label and seeded for repeatability.
Xa_train, Xa_test, ya_train, ya_test = train_test_split(
    Xa, ya,
    stratify=ya,
    test_size=0.2,
    random_state=1,
)

# Baseline Pipeline

In [5]:
# Bag-of-words vectorizer shared by the baseline cells; tokenization is
# delegated to the project helper `regtokenize` imported from src.util.
cv = CountVectorizer(tokenizer=regtokenize) # Uses NLTK's RegexpTokenizer

In [6]:
# Learn the vocabulary from the training fold only, then apply that same
# vocabulary to the test fold.
X_tr_vec = cv.fit_transform(X_train)
X_te_vec = cv.transform(X_test)

# Baseline classifier: multinomial naive Bayes with default settings.
# `fit` returns the estimator itself, so creation and fitting are one statement.
mnb = MultinomialNB().fit(X_tr_vec, y_train)
mnb  # trailing expression: echo the fitted estimator as the cell output

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [7]:
# Evaluate the count-vector naive Bayes baseline on the held-out fold.
# model_report (src.util) prints the classification report, accuracy, ROC AUC
# and the TP/FP/TN/FN counts that appear in this cell's output.
model_report(mnb, X_te_vec, y_test)

              precision    recall  f1-score   support

           0   0.912371  0.973597  0.941990       909
           1   0.200000  0.065934  0.099174        91

    accuracy                       0.891000      1000
   macro avg   0.556186  0.519766  0.520582      1000
weighted avg   0.847545  0.891000  0.865294      1000

Classes predicted:   [0 1]
Accuracy:   0.891
ROC AUC Score:   0.5110675902755111
True Pos:  6 False Pos:  24
TP/FP:  0.25
True Neg:  885 False Neg:  85


In [8]:
# Use a dedicated vectorizer for the all-comments set. Refitting the shared
# `cv` here would overwrite the vocabulary it learned from X_train, so any
# later `cv.transform(...)` on the top-level data would no longer match the
# feature space that `mnb` was trained on.
cva = CountVectorizer(tokenizer=regtokenize)
Xa_tr_vec = cva.fit_transform(Xa_train)
Xa_te_vec = cva.transform(Xa_test)

# Multinomial naive Bayes baseline (default settings) for the all-comments split.
mnba = MultinomialNB()
mnba.fit(Xa_tr_vec, ya_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [9]:
# Evaluate the all-comments baseline on its held-out fold.
# NOTE(review): the high accuracy is driven by the majority class -- the
# output reports 0 true positives and a ROC AUC below 0.5.
model_report(mnba, Xa_te_vec, ya_test)

              precision    recall  f1-score   support

           0   0.997831  0.989283  0.993538     28365
           1   0.000000  0.000000  0.000000        61

    accuracy                       0.987160     28426
   macro avg   0.498915  0.494641  0.496769     28426
weighted avg   0.995690  0.987160  0.991406     28426

Classes predicted:   [0 1]
Accuracy:   0.987159642580736
ROC AUC Score:   0.33159111465584756
True Pos:  0 False Pos:  304
TP/FP:  0.0
True Neg:  28061 False Neg:  61


# Custom Preprocessor

In [23]:
# Naive Bayes pipeline fed by the project's custom preprocessor.
# The vectorizer gets an identity tokenizer and no lowercasing, presumably
# because GildPreprocessor already emits normalized token lists -- confirm
# in src.transformers.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours; re-run the notebook top to bottom before trusting the outputs.
model = Pipeline(steps=[
    ('normalizer', GildPreprocessor()),
    (
        'vectorizer',
        TfidfVectorizer(lowercase=False, preprocessor=None, tokenizer=identity),
    ),
    ('bayes', MultinomialNB()),
])

In [11]:
# Fit every pipeline step on the raw training comments; the fitted pipeline
# is echoed as the cell output.
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('normalizer', GildPreprocessor(lower=True, strip=True)),
                ('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function identity at 0x1a196d48c8>,
                                 use_idf=True, vocabulary=None)),
                ('bayes',
                 MultinomialNB(alp

In [12]:
# Evaluate the TF-IDF naive Bayes pipeline on the raw test comments (the
# pipeline applies its own preprocessing and vectorizing steps).
# NOTE(review): the output shows only class 0 being predicted, so TP and FP
# are both 0 and the TP/FP ratio printed by model_report comes out as nan.
model_report(model, X_test, y_test)

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0   0.909000  1.000000  0.952331       909
           1   0.000000  0.000000  0.000000        91

    accuracy                       0.909000      1000
   macro avg   0.454500  0.500000  0.476166      1000
weighted avg   0.826281  0.909000  0.865669      1000

Classes predicted:   [0]
Accuracy:   0.909
ROC AUC Score:   0.4967782492534968
True Pos:  0 False Pos:  0
TP/FP:  nan
True Neg:  909 False Neg:  91


  print("TP/FP: ", (tp/fp))


# Cost-Sensitive Learning

In [13]:
# Cost-sensitive variant: same preprocessing and TF-IDF steps, but the final
# estimator is a random forest with class_weight='balanced'.
# NOTE(review): this rebinds `model`, replacing the naive Bayes pipeline
# defined in the earlier cell.
model = Pipeline(steps=[
    ('normalizer', GildPreprocessor()),
    (
        'vectorizer',
        TfidfVectorizer(lowercase=False, preprocessor=None, tokenizer=identity),
    ),
    (
        'trees',
        RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=0),
    ),
])

In [14]:
# Fit the balanced random-forest pipeline on the raw training comments; the
# fitted pipeline is echoed as the cell output.
model.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('normalizer', GildPreprocessor(lower=True, strip=True)),
                ('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_wor...
                 RandomForestClassifier(bootstrap=True, class_weight='balanced',
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
        

In [15]:
# Evaluate the balanced random-forest pipeline on the raw test comments.
# NOTE(review): the output again shows only class 0 being predicted (TP/FP is
# nan), although ROC AUC rises to about 0.62.
model_report(model, X_test, y_test)

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0   0.909000  1.000000  0.952331       909
           1   0.000000  0.000000  0.000000        91

    accuracy                       0.909000      1000
   macro avg   0.454500  0.500000  0.476166      1000
weighted avg   0.826281  0.909000  0.865669      1000

Classes predicted:   [0]
Accuracy:   0.909
ROC AUC Score:   0.6163396559436164
True Pos:  0 False Pos:  0
TP/FP:  nan
True Neg:  909 False Neg:  91


  print("TP/FP: ", (tp/fp))


In [16]:
# Weight given to the positive (minority) class; class 0 keeps weight 1.
w = 50

# Same preprocessing and TF-IDF steps as before, with explicit class weights
# on the random forest instead of 'balanced'.
model2 = Pipeline(steps=[
    ('normalizer', GildPreprocessor()),
    (
        'vectorizer',
        TfidfVectorizer(lowercase=False, preprocessor=None, tokenizer=identity),
    ),
    (
        'trees',
        RandomForestClassifier(class_weight={0: 1, 1: w}, n_jobs=-1, random_state=0),
    ),
])

In [17]:
# Fit the explicitly weighted random-forest pipeline on the raw training
# comments; the fitted pipeline is echoed as the cell output.
model2.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('normalizer', GildPreprocessor(lower=True, strip=True)),
                ('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_wor...
                 RandomForestClassifier(bootstrap=True,
                                        class_weight={0: 1, 1: 50},
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                

In [18]:
# Evaluate the explicitly weighted random forest on the raw test comments.
# The output shows both classes predicted this time (2 true positives,
# 15 false positives).
model_report(model2, X_test, y_test)

              precision    recall  f1-score   support

           0   0.909461  0.983498  0.945032       909
           1   0.117647  0.021978  0.037037        91

    accuracy                       0.896000      1000
   macro avg   0.513554  0.502738  0.491034      1000
weighted avg   0.837406  0.896000  0.862404      1000

Classes predicted:   [0 1]
Accuracy:   0.896
ROC AUC Score:   0.6260532646671262
True Pos:  2 False Pos:  15
TP/FP:  0.13333333333333333
True Neg:  894 False Neg:  89


# Add Word Count, Subjectivity Score Features

In [5]:
w = 50 # The weight for the positive (minority) class


def build_union_forest(extra_features, positive_weight):
    """Build an unfitted pipeline: TF-IDF plus one extra feature block, then a forest.

    The two models in this cell differ only in the transformer placed next to
    the TF-IDF branch, so the shared structure lives here instead of being
    copy-pasted.

    Parameters
    ----------
    extra_features : transformer
        Transformer whose output is concatenated with the TF-IDF features
        under the 'word_counter' step name.
    positive_weight : int or float
        Class weight assigned to label 1; label 0 always gets weight 1.

    Returns
    -------
    Pipeline
        Steps 'preprocess_union' (FeatureUnion of 'term_freq_feature' and
        'word_counter') and 'trees' (RandomForestClassifier).
    """
    # Text branch: project preprocessor followed by TF-IDF over its output.
    term_freq = Pipeline([
        ('normalizer', GildPreprocessor()),
        ('vectorizer', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)),
    ])
    return Pipeline([
        ('preprocess_union', FeatureUnion([
            ('term_freq_feature', term_freq),
            ('word_counter', extra_features),
        ])),
        ('trees', RandomForestClassifier(random_state=0, n_jobs=-1,
                                         class_weight={0: 1, 1: positive_weight})),
    ])


# model3 adds the WordCounter features; model4 adds WordCounterSubjectivity.
model3 = build_union_forest(WordCounter(), w)
model4 = build_union_forest(WordCounterSubjectivity(), w)

In [6]:
# Fit both feature-union pipelines on the raw training comments.
# Only the last expression (model4.fit) is echoed as the cell output.
model3.fit(X_train, y_train)
model4.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('preprocess_union',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('term_freq_feature',
                                                 Pipeline(memory=None,
                                                          steps=[('normalizer',
                                                                  GildPreprocessor(lower=True,
                                                                                   strip=True)),
                                                                 ('vectorizer',
                                                                  TfidfVectorizer(analyzer='word',
                                                                                  binary=False,
                                                                                  decode_error='strict',
                                                                                  dtype=<class 'numpy.flo

In [8]:
# Hold-out report for model3 (WordCounter) followed by model4
# (WordCounterSubjectivity); the bare print() emits a blank separator line
# between the two reports.
model_report(model3, X_test, y_test)
print()
model_report(model4, X_test, y_test)

              precision    recall  f1-score   support

           0   0.908163  0.979098  0.942298       909
           1   0.050000  0.010989  0.018018        91

    accuracy                       0.891000      1000
   macro avg   0.479082  0.495043  0.480158      1000
weighted avg   0.830070  0.891000  0.858188      1000

Classes predicted:   [0 1]
Accuracy:   0.891
ROC AUC Score:   0.6212841064326213
True Pos:  1 False Pos:  19
TP/FP:  0.05263157894736842
True Neg:  890 False Neg:  90

              precision    recall  f1-score   support

           0   0.910387  0.983498  0.945531       909
           1   0.166667  0.032967  0.055046        91

    accuracy                       0.897000      1000
   macro avg   0.538527  0.508233  0.500289      1000
weighted avg   0.842708  0.897000  0.864497      1000

Classes predicted:   [0 1]
Accuracy:   0.897
ROC AUC Score:   0.6487324556631487
True Pos:  3 False Pos:  15
TP/FP:  0.2
True Neg:  894 False Neg:  88
