In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report as clsr
import string
import nltk

from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag

## Pipeline

In [12]:
class MyPreprocessor(BaseEstimator, TransformerMixin):

    def __init__(self,
                 lower=True, strip=True):
        self.lower      = lower
        self.strip      = strip
        self.stopwords  = set(sw.words('english'))
        self.punct      = set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [
            list(self.tokenize(doc)) for doc in X
        ]

    def tokenize(self, document):
        for sent in sent_tokenize(document):
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                if token in self.stopwords:
                    continue
                
                if token in ["thanks", "gold", "edit", "obligatory", "gild", "gilded"]:
                    continue

                if all(char in self.punct for char in token):
                    continue

                lemma = self.lemmatize(token, tag)
                yield lemma

    def lemmatize(self, token, tag):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)

In [27]:
def identity(words):
    return words

model = Pipeline([
    ('normalizer', MyPreprocessor()),
    ('vectorizer', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)),
    ('bayes', MultinomialNB()),
])

## Fit & Score

In [None]:
df = pd.read_csv('data/df_gildtest.csv')
df.loc[df['gilded'] >= 1, 'target'] = 1
df.loc[df['gilded'] == 0, 'target'] = 0
df['target'] = df['target'].astype(int)
X = df['body'].tolist()
y = df['target'].tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [7]:
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('normalizer', MyPreprocessor(lower=True, strip=True)),
                ('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function identity at 0x1a22a7e598>,
                                 use_idf=True, vocabulary=None)),
                ('bayes',
                 MultinomialNB(alpha

In [8]:
model.score(X_test, y_test)

0.9444444444444444

In [14]:
df = pd.read_csv('data/askreddit_12.csv')
df.loc[df['gilded'] >= 1, 'target'] = 1
df.loc[df['gilded'] == 0, 'target'] = 0
df['target'] = df['target'].astype(int)
X = df['body'].tolist()
y = df['target'].tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [16]:
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('normalizer', MyPreprocessor(lower=True, strip=True)),
                ('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function identity at 0x1a30c077b8>,
                                 use_idf=True, vocabulary=None)),
                ('bayes',
                 MultinomialNB(alpha

In [11]:
model.score(X_test, y_test)

0.9993679502139245

In [20]:
y_pred = model.predict(X_test)

In [23]:
from sklearn.metrics import classification_report
|
print(classification_report(y_test, y_pred, digits=6))

              precision    recall  f1-score   support

           0   0.999368  1.000000  0.999684     20555
           1   0.000000  0.000000  0.000000        13

    accuracy                       0.999368     20568
   macro avg   0.499684  0.500000  0.499842     20568
weighted avg   0.998736  0.999368  0.999052     20568



In [24]:
from sklearn.metrics import recall_score

print(recall_score(y_test, y_pred))

0.0


In [26]:
df = pd.read_csv('data/askreddit_tops.csv')
df.loc[df['gilded'] >= 1, 'target'] = 1
df.loc[df['gilded'] == 0, 'target'] = 0
df['target'] = df['target'].astype(int)
X = df['body'].tolist()
y = df['target'].tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [28]:
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('normalizer', MyPreprocessor(lower=True, strip=True)),
                ('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function identity at 0x1a31e69488>,
                                 use_idf=True, vocabulary=None)),
                ('bayes',
                 MultinomialNB(alpha

In [29]:
model.score(X_test, y_test)

0.96

In [30]:
y_pred = model.predict(X_test)

In [31]:
print(classification_report(y_test, y_pred, digits=6))

              precision    recall  f1-score   support

           0   0.960000  1.000000  0.979592       240
           1   0.000000  0.000000  0.000000        10

    accuracy                       0.960000       250
   macro avg   0.480000  0.500000  0.489796       250
weighted avg   0.921600  0.960000  0.940408       250



  'precision', 'predicted', average, warn_for)


In [32]:
print(recall_score(y_test, y_pred))

0.0


## GridSearch (hyperparameter tuning)

In [45]:
search = GridSearchCV(model, param_grid={
    'vectorizer__max_features': [None, 20000],
    'bayes__alpha': [0.0, 1.0],
})

In [47]:
search.fit(X, y)

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('normalizer',
                                        MyPreprocessor(lower=True, strip=True)),
                                       ('vectorizer',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=False,
                                                        max_df=1.0,
                                                        max_features=None,
                                     

In [48]:
search.cv_results_

{'mean_fit_time': array([51.6429824 , 50.86932818, 49.44741535, 47.64937782]),
 'std_fit_time': array([12.57975814, 11.59230337, 11.37953344, 11.113098  ]),
 'mean_score_time': array([24.63709569, 25.46759383, 24.00808144, 23.86866434]),
 'std_score_time': array([10.97257155, 11.43602056, 10.62008113, 10.68915501]),
 'param_bayes__alpha': masked_array(data=[0.0, 0.0, 1.0, 1.0],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_vectorizer__max_features': masked_array(data=[None, 20000, None, 20000],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'bayes__alpha': 0.0, 'vectorizer__max_features': None},
  {'bayes__alpha': 0.0, 'vectorizer__max_features': 20000},
  {'bayes__alpha': 1.0, 'vectorizer__max_features': None},
  {'bayes__alpha': 1.0, 'vectorizer__max_features': 20000}],
 'split0_test_score': array([0.99934366, 0.99938013, 0.99938013, 0.99938013]),
 'split1