In [1]:
import numpy as np
import pandas as pd
import string
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, recall_score, roc_auc_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import nltk

from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag

from src.util import identity, model_report, regtokenize
from src.transformers import GildPreprocessor, WordCounterSubjectivity, WordCounter

In [2]:
w = 50 # The weight for the positive (minority) class


def _build_model(counter):
    """Assemble a text-classification pipeline around a random forest.

    The feature stage is a FeatureUnion of two branches:

    * ``term_freq_feature`` -- ``GildPreprocessor`` followed by a
      ``TfidfVectorizer`` that receives ``identity`` as its tokenizer and
      has lowercasing disabled (presumably because the preprocessor already
      emits token lists -- confirm in ``src.transformers``).
    * ``word_counter`` -- the transformer passed in as ``counter``.

    The step names are kept stable because grid-search parameter paths such
    as ``preprocess_union__term_freq_feature__vectorizer__ngram_range``
    depend on them.

    Parameters
    ----------
    counter : transformer
        Transformer instance used for the ``word_counter`` branch.

    Returns
    -------
    Pipeline
        Unfitted pipeline ending in a ``RandomForestClassifier`` whose
        positive class (label 1) is weighted ``w`` times the negative class.
    """
    term_freq = Pipeline([
        ('normalizer', GildPreprocessor()),
        ('vectorizer', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)),
    ])
    features = FeatureUnion([
        ('term_freq_feature', term_freq),
        ('word_counter', counter),
    ])
    # Fixed seed for repeatable forests; n_jobs=-1 uses every available core.
    forest = RandomForestClassifier(random_state=0, n_jobs=-1, class_weight={0: 1, 1: w})
    return Pipeline([
        ('preprocess_union', features),
        ('trees', forest),
    ])


# With word count feature
model3 = _build_model(WordCounter())

# With word count & subjectivity score feature
model4 = _build_model(WordCounterSubjectivity())

# Load Data

In [3]:
# Read the dataset from a CSV path that is relative to the notebook's working directory.
df = pd.read_csv('data/askreddit_top_50.csv')

In [4]:
# Top-level comment set (~9% minority class): features come from the
# 'body' column, labels from the 'target' column.
X, y = df['body'], df['target']

# Hold out 20% for testing; stratify on the label so both parts keep the
# same class ratio, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

# Baseline run

In [5]:
# Baseline: fit the word-count + subjectivity pipeline on the training split,
# leaving the vectorizer and forest hyper-parameters at the values set when it was built.
model4.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('preprocess_union',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('term_freq_feature',
                                                 Pipeline(memory=None,
                                                          steps=[('normalizer',
                                                                  GildPreprocessor(lower=True,
                                                                                   strip=True)),
                                                                 ('vectorizer',
                                                                  TfidfVectorizer(analyzer='word',
                                                                                  binary=False,
                                                                                  decode_error='strict',
                                                                                  dtype=<class 'numpy.flo

In [6]:
# Evaluate the baseline on the held-out split. Judging by its printed output,
# model_report shows a classification report, accuracy, ROC AUC and the
# confusion-matrix counts (its implementation lives in src.util).
model_report(model4, X_test, y_test)

              precision    recall  f1-score   support

           0   0.910387  0.983498  0.945531       909
           1   0.166667  0.032967  0.055046        91

    accuracy                       0.897000      1000
   macro avg   0.538527  0.508233  0.500289      1000
weighted avg   0.842708  0.897000  0.864497      1000

Classes predicted:   [0 1]
Accuracy:   0.897
ROC AUC Score:   0.6487324556631487
True Pos:  3 False Pos:  15
FP to TP Ratio::   5.0
True Neg:  894 False Neg:  88


# GridSearch

In [7]:
# Exhaustive search over the TF-IDF n-gram span and the number of trees,
# ranking candidates by precision on the positive class.
#
# cv is pinned to 3 folds: the call previously relied on the library default
# (reported as cv='warn' in the estimator repr, with three splits in
# cv_results_), and scikit-learn changed that default to 5 folds in 0.22.
# Pinning it keeps the experiment identical across library versions.
#
# NOTE(review): precision is scored as 0 with a warning when a fold predicts
# no positives, which may explain the all-zero rows in the results table;
# a threshold-free metric such as 'average_precision' may rank candidates
# more reliably on this imbalanced target -- confirm before switching.
search = GridSearchCV(
    model4,
    param_grid={
        'preprocess_union__term_freq_feature__vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'trees__n_estimators': [10, 50, 100],
    },
    scoring='precision',
    cv=3,
)

In [8]:
# Run the grid search on the training split only; the test split stays untouched.
# NOTE(review): the repeated "'precision', 'predicted', ..." lines in the output
# look like the tail of scikit-learn's undefined-precision warning, emitted when
# a fold predicts no positive samples -- confirm against the full warning text.
search.fit(X_train, y_train)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocess_union',
                                        FeatureUnion(n_jobs=None,
                                                     transformer_list=[('term_freq_feature',
                                                                        Pipeline(memory=None,
                                                                                 steps=[('normalizer',
                                                                                         GildPreprocessor(lower=True,
                                                                                                          strip=True)),
                                                                                        ('vectorizer',
                                                                                         TfidfVectorizer(analyzer='word',
                   

In [10]:
# Mean cross-validated score (precision, per the scoring argument) of the best candidate.
search.best_score_

0.28894166666666665

In [13]:
# Turn the per-candidate cross-validation results into a DataFrame for inspection.
cvres = pd.DataFrame(search.cv_results_)

In [14]:
# Show the results table: one row per parameter combination (3 n-gram ranges x 3 forest sizes).
cvres

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocess_union__term_freq_feature__vectorizer__ngram_range,param_trees__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,8.32146,0.581647,4.108285,0.163185,"(1, 1)",10,{'preprocess_union__term_freq_feature__vectori...,0.130435,0.090909,0.130435,0.117263,0.018631,5
1,8.431424,0.338721,4.109115,0.154141,"(1, 1)",50,{'preprocess_union__term_freq_feature__vectori...,0.272727,0.142857,0.26087,0.225496,0.058624,3
2,8.021349,0.094532,3.918089,0.23859,"(1, 1)",100,{'preprocess_union__term_freq_feature__vectori...,0.375,0.166667,0.208333,0.250031,0.090021,2
3,8.091803,0.581536,3.908885,0.150554,"(1, 2)",10,{'preprocess_union__term_freq_feature__vectori...,0.5,0.2,0.166667,0.288942,0.149916,1
4,8.406243,0.226818,3.842981,0.239762,"(1, 2)",50,{'preprocess_union__term_freq_feature__vectori...,0.0,0.0,0.0,0.0,0.0,6
5,8.386229,0.279473,3.726926,0.219988,"(1, 2)",100,{'preprocess_union__term_freq_feature__vectori...,0.0,0.0,0.0,0.0,0.0,6
6,7.892056,0.170424,3.718215,0.206701,"(1, 3)",10,{'preprocess_union__term_freq_feature__vectori...,0.0,0.0,0.5,0.166625,0.235688,4
7,9.037331,0.15222,4.05988,0.424411,"(1, 3)",50,{'preprocess_union__term_freq_feature__vectori...,0.0,0.0,0.0,0.0,0.0,6
8,9.782123,0.251414,4.180034,0.048772,"(1, 3)",100,{'preprocess_union__term_freq_feature__vectori...,0.0,0.0,0.0,0.0,0.0,6


In [15]:
# Take the pipeline configured with the best-scoring parameter combination.
cvmodel = search.best_estimator_ 

In [16]:
# Fit the selected pipeline on the full training split.
# NOTE(review): the search is constructed without a `refit` argument, so under
# scikit-learn's default (refit=True) best_estimator_ should already be fitted
# on X_train; this call looks redundant -- confirm before removing it.
cvmodel.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocess_union',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('term_freq_feature',
                                                 Pipeline(memory=None,
                                                          steps=[('normalizer',
                                                                  GildPreprocessor(lower=True,
                                                                                   strip=True)),
                                                                 ('vectorizer',
                                                                  TfidfVectorizer(analyzer='word',
                                                                                  binary=False,
                                                                                  decode_error='strict',
                                                                                  dtype=<class 'numpy.flo

In [17]:
# Evaluate the grid-search winner on the same held-out split used for the baseline,
# so the two reports are directly comparable.
model_report(cvmodel, X_test, y_test)

              precision    recall  f1-score   support

           0   0.908818  0.997800  0.951232       909
           1   0.000000  0.000000  0.000000        91

    accuracy                       0.907000      1000
   macro avg   0.454409  0.498900  0.475616      1000
weighted avg   0.826115  0.907000  0.864670      1000

Classes predicted:   [0 1]
Accuracy:   0.907
ROC AUC Score:   0.5918591859185919
True Pos:  0 False Pos:  2
FP to TP Ratio::   inf
True Neg:  907 False Neg:  91


  print("FP to TP Ratio::  ", (fp/tp))
