In [4]:
!pip install nltk
!pip install numpy
!pip install pandas

Collecting nltk
  Downloading nltk-3.2.5.tar.gz (1.2MB)
[K    100% |████████████████████████████████| 1.2MB 561kB/s ta 0:00:011
Building wheels for collected packages: nltk
  Running setup.py bdist_wheel for nltk ... [?25ldone
[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/18/9c/1f/276bc3f421614062468cb1c9d695e6086d0c73d67ea363c501
Successfully built nltk
Installing collected packages: nltk
Successfully installed nltk-3.2.5


In [21]:
import nltk as nltk
import numpy as np
import pandas as pd
import collections as coll

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels

def string_to_ngrams(s, n):
    # text = str(s).decode('utf-8').lower()
    text = str(s).lower()
    text = text.replace(' ', '')
    ngrams = nltk.ngrams([c for c in text], n)
    return [''.join(g) for g in ngrams]

class NaiveBayesClassifier(BaseEstimator, ClassifierMixin):  
    def __init__(self,
                 n_gram=1,
                 multimap=True,
                 count_threshold=0,
                 use_uniform_prior=False,
                 laplace_smoothing=1.):
        self.n_gram = n_gram
        self.multimap = multimap
        self.count_threshold = count_threshold
        self.use_uniform_prior = use_uniform_prior
        self.laplace_smoothing = laplace_smoothing

    def fit(self, X=None, y=None):
        W = pd.DataFrame({'X': [X], 'y': [y]})
        
        # Count features
        counts = {}
        for (xx, yy) in zip(X, y):
            category = np.zeros(5)
            category[yy] = 1
            ngrams = string_to_ngrams(xx, self.n_gram)
            if not self.multimap:
                ngrams = list(set(ngrams))
            for ngram in ngrams:
                if ngram in counts:
                    counts[ngram] = counts[ngram] + category
                else:
                    counts[ngram] = category
        counts = pd.DataFrame(counts).transpose()

        # Filter low counts
        keep = counts.apply(lambda row: sum(row) >= self.count_threshold, axis = 1)
        counts = counts[keep == 1]
        
        # Apply Laplace smoothing by adding a letter
        counts[counts.columns[-5:]] += self.laplace_smoothing
        
        # Count the # of n-grams observed in each language.
        class_counts = np.array(counts[counts.columns[-5:]].apply(lambda x: np.sum(x) * 1., axis = 0).values)
        
        # Define P(Y = y) as the proportion of n-grams observed in each language.
        if self.use_uniform_prior:
            class_priors = np.full(5, .1/5)
        else:
            class_priors = class_counts / np.sum(class_counts)
            
        # self.likelihood_ = counts.div(class_counts)
        self.likelihood_ = np.log(counts.div(class_counts)) ###
        self.counts_ = counts
        self.class_priors_ = class_priors
        
        return self

    def predict(self, X):
        predictions = []

        for obs in X:
            levels = string_to_ngrams(obs, n = self.n_gram)
            
            # joint_likelihood = np.full(5, 1.0)
            joint_likelihood = np.full(5, 0.)
            
            # Calculate joint probability
            for level in levels:
                if not level in self.likelihood_.index:
                    continue
                    
                # Calculate likelihood for X
                likelihood = self.likelihood_[self.likelihood_.index == level][:1].reset_index().values
                likelihood = np.array(np.delete(likelihood, 0).astype(float))
                
                # joint_likelihood = np.multiply(joint_likelihood, likelihood)
                joint_likelihood = joint_likelihood + likelihood
                
            # Calculate joint likelihood * class prior
            prop_posterior = np.multiply(joint_likelihood, self.class_priors_)
            
            # Calculate posterior probability
            posterior = prop_posterior # / np.sum(prop_posterior)
            prediction = np.argmax(posterior)
            
            predictions = predictions + [prediction]
        
        return predictions
    
    
    def classify(self, inputs):
        return
 

In [10]:
# from sklearn.pipeline import Pipeline, FeatureUnion

X_train = pd.read_csv("data/train_set_x.csv")['Text'].values
Y_train = pd.read_csv("data/train_set_y.csv")['Category'].values
X_test  = pd.read_csv("data/test_set_x.csv")['Text'].values
nbayes = NaiveBayesClassifier()
nbayes.set_params(n_gram=1,
                  multimap=True,
                  count_threshold=25,
                  use_uniform_prior=False,
                  laplace_smoothing=10.0)
nbayes.fit(X_train, Y_train)
nbayes.likelihood_


Unnamed: 0,0,1,2,3,4
0,0.001344,0.001753,0.001752,0.001522,0.000093
1,0.001772,0.002259,0.001526,0.001774,0.000084
2,0.001578,0.001910,0.001180,0.001220,0.000074
3,0.001046,0.001202,0.000818,0.000781,0.000061
4,0.000723,0.000963,0.000624,0.000420,0.000039
5,0.000850,0.001399,0.000721,0.000559,0.000063
6,0.000602,0.001318,0.000605,0.000625,0.000039
7,0.000550,0.001704,0.000566,0.000431,0.000035
8,0.000514,0.000851,0.000534,0.000290,0.000032
9,0.000363,0.001270,0.000595,0.000377,0.000030


In [57]:
len(nbayes.likelihood_)
len(X_test)
len(X_train)

276517

In [62]:
true_x = X_train[250000:250100]
true_y = Y_train[250000:250100]
pred_y = nbayes.predict(true_x)

loss = np.full((5,5), 1.0)
for i in range(len(pred_y)):
    loss[pred_y[i], true_y[i]] += 1
    
TPR = loss.trace() / loss.sum()
FNR = 1 - TPR

print(loss)
print(TPR)
print(FNR)
print(accuracy_score(pred_y, true_y))
print(precision_score(pred_y, true_y, average = 'macro'))

[[  8.   2.   2.   1.   2.]
 [  1.  42.   6.   3.   1.]
 [  1.   9.  12.   2.   1.]
 [  1.   4.   4.   8.   2.]
 [  1.   1.   1.   1.   9.]]
0.632
0.368
0.74
0.764716981132


In [52]:
#### RANDOM TRAIN-TEST SPLIT

import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

import random

xx = X_train
yy = Y_train

param_grid = [
  {
        'n_gram': [1, 2, 3],
        'laplace_smoothing': [1., 10.],
        'cv_count_threshold': [0., 10., 25.]
  }
 ]

scoring = {'acc':        'accuracy',
           'prec_macro': 'precision_macro',
           'rec_micro':  'recall_macro'}

def train_test_split_model(model, xxx, yyy, test_size=0.005):
    xxx_train, xxx_test, yyy_train, yyy_test = train_test_split(xxx, yyy, test_size=test_size, random_state=0)
    model.fit(xxx_train, yyy_train)
    yyy_pred = model.predict(xxx_test)
    return({
            'test_accuracy': accuracy_score(yyy_test, yyy_pred),
            'test_precision': precision_score(yyy_test, yyy_pred, average = 'macro'),
            'test_recall': recall_score(yyy_test, yyy_pred, average = 'macro')
           })

tts_scores = {}

In [None]:
print(train_test_split_model(NaiveBayesClassifier(n_gram=2, laplace_smoothing=10., count_threshold=25), xx, yy))

In [None]:
for i in [1, 2, 3]:
    k = "n_gram={}".format(i)
    v = train_test_split_model(NaiveBayesClassifier(n_gram=i), xx, yy)
    tts_scores[k] = v

In [54]:
for i in [1., 10.]:
    k = "laplace_smoothing={}".format(i)
    v = train_test_split_model(NaiveBayesClassifier(laplace_smoothing=i), xx, yy)
    tts_scores[k] = v

In [55]:
for i in [1, 10, 25]:
    k = "count_threshold={}".format(i)
    v = train_test_split_model(NaiveBayesClassifier(count_threshold=i), xx, yy)
    tts_scores[k] = v

In [56]:
pd.DataFrame(tts_scores)

Unnamed: 0,count_threshold=0.0,count_threshold=10.0,count_threshold=25.0,laplace_smoothing=1.0,laplace_smoothing=10.0,n_gram=1,n_gram=2,n_gram=3
test_accuracy,0.054152,0.054152,0.054152,0.054152,0.050542,0.054152,0.057762,0.061372
test_precision,0.075943,0.075943,0.075943,0.075943,0.069191,0.075943,0.096794,0.147704
test_recall,0.232258,0.232258,0.232258,0.232258,0.219355,0.232258,0.245161,0.258065


In [26]:
#### K FOLD IS TOO SLOW

import numpy as np
from sklearn.model_selection import KFold
from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score

import random

xx = X_train[:1000]
yy = Y_train[:1000]

scoring = {'acc': 'accuracy',
           'prec_macro': 'precision_macro',
           'rec_micro': 'recall_macro'}

print("cv_n_gram")
cv_n_gram = None
for i in range(1, 3):
    print(i)
    a = cross_validate(NaiveBayesClassifier(n_gram=i),
                       xx,
                       yy,
                       fit_params={},
                       scoring=scoring,
                       cv=2,
                       return_train_score=True)
    a_mean = pd.DataFrame(a).mean(axis=0)
    cv_n_gram = pd.concat((cv_n_gram,
                           a_mean.rename("n_gram={}".format(i))), axis=1)

print("cv_laplace_smoothing")
cv_laplace_smoothing = None
for i in [1., 5., 10.]:
    print(i)
    a = cross_validate(NaiveBayesClassifier(laplace_smoothing=i),
                       xx,
                       yy,
                       fit_params={},
                       scoring=scoring,
                       cv=2,
                       return_train_score=True)
    a_mean = pd.DataFrame(a).mean(axis=0)
    cv_laplace_smoothing = pd.concat((cv_laplace_smoothing,
                                      a_mean.rename("laplace_smoothing={}".format(i))), axis=1)
    
print("cv_count_threshold")
cv_count_threshold = None
for i in [0., 10., 25.]:
    print(i)
    a = cross_validate(NaiveBayesClassifier(count_threshold=i),
                       xx,
                       yy,
                       fit_params={},
                       scoring=scoring,
                       cv=2,
                       return_train_score=True)
    a_mean = pd.DataFrame(a).mean(axis=0)
    cv_count_threshold = pd.concat((cv_count_threshold,
                                    a_mean.rename("count_threshold={}".format(i))), axis=1)

0.853658536585
0.146341463415
cv_n_gram
1


KeyboardInterrupt: 

In [None]:
print(cv_n_gram)
print(cv_laplace_smoothing)
print(cv_count_threshold)