In [29]:
import numpy as np
import pandas as pd
import string
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, recall_score, roc_auc_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import nltk

from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag

from src.preprocessor import GildPreprocessor

In [82]:
class WordCounter(BaseEstimator, TransformerMixin):

    def __init__(self):
        return None
        
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        Z = np.array([
            self.count_text(doc) for doc in X
        ])
        return Z.reshape(-1, 1)
    
    def count_text(self, doc):
        clean1 = re.sub(r'['+string.punctuation + '’—”'+']', "", doc.lower())
        clean2 = re.sub(r'\W+', ' ', clean1)
        return len(clean2.split())

In [87]:
class WordCounterSubjectivity(BaseEstimator, TransformerMixin):

    def __init__(self):
        self.subj_toks = ["i", "me", "ive", "im", "my", "mine"]
        return None
        
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        word_counts = np.array([
            len((self.simplify_text(doc)).split()) for doc in X
        ])
        subj_scores = np.array([
        	self.subjectivity_score(doc, word_counts[i]) for i, doc in enumerate(X)
        ])
        word_counts = word_counts.reshape(-1, 1)
        subj_scores = subj_scores.reshape(-1, 1)
        return np.hstack((word_counts, subj_scores))
    
    def simplify_text(self, doc):
        clean1 = re.sub(r'['+string.punctuation + '’—”'+']', "", doc.lower())
        clean2 = re.sub(r'\W+', ' ', clean1)
        return clean2

    def subjectivity_score(self, doc, wc):
        subj_counter = 0
        for word in self.simplify_text(doc):
            if word in self.subj_toks:
                subj_counter += 1
        return subj_counter/wc

In [132]:
def identity(words):
    return words

w = 50 # The weight for the positive class

model = Pipeline([
    ('normalizer', GildPreprocessor()),
    ('vectorizer', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)),
    ('trees', RandomForestClassifier(random_state=0, n_jobs=-1, class_weight={0: 1, 1: w}))
])

In [83]:
model2 = Pipeline([
    ('preprocess_union', FeatureUnion([
    		('term_freq_feature', Pipeline([
    			('normalizer', GildPreprocessor()),
    			('vectorizer', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False))
    			])),
    		('word_counter', WordCounter())
    	])),
    ('trees', RandomForestClassifier(random_state=0, n_jobs=-1, class_weight={0: 1, 1: w}))
])

In [92]:
model3 = Pipeline([
    ('preprocess_union', FeatureUnion([
    		('term_freq_feature', Pipeline([
    			('normalizer', GildPreprocessor()),
    			('vectorizer', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False))
    			])),
    		('word_counter', WordCounterSubjectivity())
    	])),
    ('trees', RandomForestClassifier(random_state=0, n_jobs=-1, class_weight={0: 1, 1: w}))
])

## Load Data

In [None]:
df = pd.read_csv('data/askreddit_top_15_sm.csv')

In [48]:
#X = df['body'].tolist()
#y = df['target'].tolist()
X = df['body']
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

In [98]:
model.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('normalizer', GildPreprocessor(lower=True, strip=True)),
                ('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_wor...
                 RandomForestClassifier(bootstrap=True,
                                        class_weight={0: 1, 1: 50},
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                

In [101]:
pred = model.predict(X_test)
print(classification_report(y_test, pred, digits=6))
# Is our model still predicting just one class?
print( np.unique( pred ) )
 
# How's our accuracy?
print( accuracy_score(y_test, pred) )
 
# What about AUROC?
prob_y = model.predict_proba(X_test)
prob_y = [p[1] for p in prob_y]
print( roc_auc_score(y_test, prob_y) )

tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
print(tp, fp, tn, fn)

              precision    recall  f1-score   support

           0   0.891697  0.935606  0.913124       264
           1   0.260870  0.166667  0.203390        36

    accuracy                       0.843333       300
   macro avg   0.576283  0.551136  0.558257       300
weighted avg   0.815997  0.843333  0.827956       300

[0 1]
0.8433333333333334
0.537773569023569
6 17 247 30


In [65]:
model.steps[2][1].feature_importances_

array([0.00000000e+00, 1.09118354e-04, 1.64289661e-07, ...,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

In [84]:
model2.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('preprocess_union',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('term_freq_feature',
                                                 Pipeline(memory=None,
                                                          steps=[('normalizer',
                                                                  GildPreprocessor(lower=True,
                                                                                   strip=True)),
                                                                 ('vectorizer',
                                                                  TfidfVectorizer(analyzer='word',
                                                                                  binary=False,
                                                                                  decode_error='strict',
                                                                                  dtype=<class 'numpy.flo

In [102]:
pred = model2.predict(X_test)
print(classification_report(y_test, pred, digits=6))
# Is our model still predicting just one class?
print( np.unique( pred ) )
 
# How's our accuracy?
print( accuracy_score(y_test, pred) )
 
# What about AUROC?
prob_y = model.predict_proba(X_test)
prob_y = [p[1] for p in prob_y]
print( roc_auc_score(y_test, prob_y) )

tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
print(tp, fp, tn, fn)

              precision    recall  f1-score   support

           0   0.883636  0.920455  0.901670       264
           1   0.160000  0.111111  0.131148        36

    accuracy                       0.823333       300
   macro avg   0.521818  0.515783  0.516409       300
weighted avg   0.796800  0.823333  0.809207       300

[0 1]
0.8233333333333334
0.537773569023569
4 21 243 32


In [93]:
model3.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('preprocess_union',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('term_freq_feature',
                                                 Pipeline(memory=None,
                                                          steps=[('normalizer',
                                                                  GildPreprocessor(lower=True,
                                                                                   strip=True)),
                                                                 ('vectorizer',
                                                                  TfidfVectorizer(analyzer='word',
                                                                                  binary=False,
                                                                                  decode_error='strict',
                                                                                  dtype=<class 'numpy.flo

In [103]:
pred = model3.predict(X_test)
print(classification_report(y_test, pred, digits=6))
# Is our model still predicting just one class?
print( np.unique( pred ) )
 
# How's our accuracy?
print( accuracy_score(y_test, pred) )
 
# What about AUROC?
prob_y = model.predict_proba(X_test)
prob_y = [p[1] for p in prob_y]
print( roc_auc_score(y_test, prob_y) )

tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
print(tp, fp, tn, fn)

              precision    recall  f1-score   support

           0   0.884615  0.958333  0.920000       264
           1   0.214286  0.083333  0.120000        36

    accuracy                       0.853333       300
   macro avg   0.549451  0.520833  0.520000       300
weighted avg   0.804176  0.853333  0.824000       300

[0 1]
0.8533333333333334
0.537773569023569
3 11 253 33


## GridSearch

In [145]:
search = GridSearchCV(model3, param_grid={
    'preprocess_union__term_freq_feature__vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'trees__n_estimators': [50, 100, 200],
}, scoring='recall')

In [146]:
search.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocess_union',
                                        FeatureUnion(n_jobs=None,
                                                     transformer_list=[('term_freq_feature',
                                                                        Pipeline(memory=None,
                                                                                 steps=[('normalizer',
                                                                                         GildPreprocessor(lower=True,
                                                                                                          strip=True)),
                                                                                        ('vectorizer',
                                                                                         TfidfVectorizer(analyzer='word',
                   