In [25]:
import string

import nltk
import numpy as np
import pandas as pd
from nltk import WordNetLemmatizer, pos_tag, sent_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report as clsr
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

In [14]:
# Load the comment data; the cell below relies on the 'gilded' and 'body' columns.
df = pd.read_csv('data/df_gildtest.csv')

# Fail loudly (same exception family as the old astype(int) crash) if the gild
# count is missing or negative, instead of silently mislabelling those rows.
if not (df['gilded'] >= 0).all():
    raise ValueError("'gilded' must be a non-negative count for every row")

# Binary label in a single vectorized step: 1 if gilded at least once, else 0.
# This avoids building a float column with implicit NaN and casting it afterwards.
df['target'] = (df['gilded'] >= 1).astype(int)

# Plain Python lists: raw comment text as features, binary label as target.
X = df['body'].tolist()
y = df['target'].tolist()

# Stratify on the label so both splits keep the same class proportions;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y
)

In [23]:
class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    """Transformer that converts raw documents into lists of lemmatized tokens.

    Every document is split into sentences, tokenized, part-of-speech tagged,
    optionally lower-cased and stripped, filtered for stopwords and
    punctuation-only tokens, and finally lemmatized.

    Parameters
    ----------
    stopwords : collection of str, optional
        Tokens to discard. ``None`` selects ``sw.words('english')``.
    punct : collection of str, optional
        Characters regarded as punctuation. ``None`` selects
        ``string.punctuation``.
    lower : bool, default True
        Lower-case each token before filtering.
    strip : bool, default True
        Strip surrounding whitespace, underscores and asterisks from each token.

    Notes
    -----
    Expects ``sw``, ``wn``, ``WordNetLemmatizer``, ``sent_tokenize``,
    ``pos_tag`` and ``wordpunct_tokenize`` to be in scope (presumably the NLTK
    objects of the same names -- confirm against the import cell).
    """

    def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        self.lower = lower
        self.strip = strip
        # Test against None rather than truthiness: an explicitly empty
        # collection (e.g. stopwords=set() to disable filtering) must be kept
        # as given, not silently swapped for the default.
        self.stopwords = set(sw.words('english')) if stopwords is None else stopwords
        self.punct = set(string.punctuation) if punct is None else punct
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """No-op fit; the transformer learns nothing from the data."""
        return self

    def inverse_transform(self, X):
        """Join each token list in ``X`` back into a space-separated string."""
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        """Return one list of lemmas per document in ``X``."""
        return [
            list(self.tokenize(doc)) for doc in X
        ]

    def tokenize(self, document):
        """Yield the lemmas of ``document``, skipping stopwords and punctuation."""
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                if self.lower:
                    token = token.lower()
                if self.strip:
                    # Strip '_' and '*' together so the result does not depend
                    # on the order in which the two markers wrap the token.
                    token = token.strip().strip('_*')

                # If stopword, ignore token and continue
                if token in self.stopwords:
                    continue

                # If punctuation, ignore token and continue. all() over an
                # empty string is True, so tokens stripped to nothing are
                # dropped here too.
                if all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                yield self.lemmatize(token, tag)

    def lemmatize(self, token, tag):
        """Lemmatize ``token`` using the WordNet POS derived from ``tag``.

        The first character of ``tag`` selects noun/verb/adverb/adjective;
        anything else (including an empty tag) falls back to noun.
        """
        wn_pos = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[:1], wn.NOUN)  # tag[:1] is safe on an empty tag, tag[0] is not

        return self.lemmatizer.lemmatize(token, wn_pos)

In [24]:
def identity(words):
    """Return ``words`` untouched.

    Passed as the ``tokenizer`` of a vectorizer whose input documents are
    already lists of tokens, so no further tokenization takes place.
    """
    return words

# Text classifier: NLTK normalization -> TF-IDF features -> multinomial naive Bayes.
model = Pipeline([
    ('normalizer', NLTKPreprocessor()),
    # The normalizer already emits token lists, so the vectorizer must neither
    # tokenize, preprocess nor lower-case again. token_pattern=None states that
    # the default regex is unused and avoids the unused-parameter warning that
    # scikit-learn raises when a custom tokenizer is supplied.
    ('vectorizer', TfidfVectorizer(tokenizer=identity, preprocessor=None,
                                   lowercase=False, token_pattern=None)),
    ('bayes', MultinomialNB()),
])

In [26]:
# Stand-alone preprocessor instance, used to inspect the normalization step
# outside of the pipeline.
ert = NLTKPreprocessor()

In [28]:
# Normalize every document in X into its list of lemmas (one list per document).
bert = ert.transform(X)

In [31]:
# Vectorizer for pre-tokenized documents: identity tokenizer, no preprocessing,
# no lower-casing. token_pattern=None marks the default regex as unused and
# avoids scikit-learn's unused-parameter warning for custom tokenizers.
tfidf = TfidfVectorizer(tokenizer=identity, preprocessor=None,
                        lowercase=False, token_pattern=None)

In [33]:
# Learn the vocabulary / IDF weights and vectorize in one step. Calling
# transform() on a vectorizer that was never fitted raises NotFittedError.
dert = tfidf.fit_transform(bert)

In [4]:
def tokenize(text):
    """Yield the Snowball-stemmed tokens of ``text``, skipping punctuation.

    The text is lower-cased, split with ``nltk.word_tokenize``, and every
    token made up solely of punctuation characters is dropped.

    Parameters
    ----------
    text : str
        Raw document.

    Yields
    ------
    str
        Stemmed token.
    """
    stem = nltk.stem.SnowballStemmer('english')
    punct = set(string.punctuation)

    for token in nltk.word_tokenize(text.lower()):
        # `token in string.punctuation` would be a *substring* test, letting
        # multi-character punctuation tokens such as '``', "''", '...' or '--'
        # through. Check every character instead.
        if all(char in punct for char in token):
            continue
        yield stem.stem(token)

In [19]:
# Word-tokenize each raw document in X (no lower-casing or filtering).
toks = list(map(nltk.word_tokenize, X))

In [20]:
# Preview the token lists of the first ten documents.
toks[:10]

[['Until',
  'the',
  'papers',
  'release',
  'the',
  'name',
  ',',
  'keep',
  'it',
  'off',
  'here',
  '.'],
 ['The', 'Notorious', 'Rapist', '.'],
 ['Hmmmmm',
  'I',
  'wonder',
  'which',
  'sports',
  'star',
  'this',
  'could',
  'be',
  '.',
  'The',
  'mind',
  'boggles'],
 ['Yes', 'it', "'s", 'him', '.'],
 ['I',
  'still',
  'smile',
  'at',
  'the',
  'time',
  'last',
  'December',
  'when',
  'all',
  'this',
  'went',
  'on',
  'and',
  'the',
  'mods',
  'said',
  'the',
  'person',
  'could',
  "n't",
  'be',
  'named',
  'here',
  '.',
  'So',
  'people',
  'started',
  'going',
  '-',
  '``',
  'It',
  "'s",
  'Sonia',
  "O'Sullivan",
  "''",
  'and',
  'left',
  'the',
  'comment',
  'up',
  'yet',
  'the',
  'comments',
  'naming',
  'the',
  'person',
  'was',
  'removed',
  ',',
  'therefore',
  'the',
  'mods',
  'basically',
  'confirmed',
  'who',
  'it',
  'was',
  '.'],
 ['I',
  'feel',
  'like',
  'we',
  "'ve",
  'been',
  'here',
  'before',
  'with',


In [35]:
# Fit the whole pipeline (normalizer -> vectorizer -> classifier) on the training split.
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('normalizer',
                 NLTKPreprocessor(lower=True,
                                  punct={'!', '"', '#', '$', '%', '&', "'", '(',
                                         ')', '*', '+', ',', '-', '.', '/', ':',
                                         ';', '<', '=', '>', '?', '@', '[',
                                         '\\', ']', '^', '_', '`', '{', '|', ...},
                                  stopwords={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'as', 'at', 'be',
                                             'because', 'been', 'b...
                                 lowercase=False, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preproc

In [36]:
# Score the fitted pipeline on the held-out split (for a classifier this is
# presumably mean accuracy -- confirm).
# NOTE(review): accuracy alone can flatter a model when one class dominates;
# check the class balance of `y` and consider a per-class report -- TODO confirm.
model.score(X_test, y_test)

0.9444444444444444