In [4]:
!pip install nltk
!pip install numpy
!pip install pandas

Collecting nltk
  Downloading nltk-3.2.5.tar.gz (1.2MB)
[K    100% |████████████████████████████████| 1.2MB 561kB/s ta 0:00:011
Building wheels for collected packages: nltk
  Running setup.py bdist_wheel for nltk ... [?25ldone
[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/18/9c/1f/276bc3f421614062468cb1c9d695e6086d0c73d67ea363c501
Successfully built nltk
Installing collected packages: nltk
Successfully installed nltk-3.2.5


In [92]:
import nltk as nltk
import numpy as np
import pandas as pd
import collections as coll

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels

def string_to_ngrams(s, n):
    # text = str(s).decode('utf-8').lower()
    text = str(s).lower()
    text = text.replace(' ', '')
    ngrams = nltk.ngrams([c for c in text], n)
    return [''.join(g) for g in ngrams]

class NaiveBayesClassifier(BaseEstimator, ClassifierMixin):  
    def __init__(self,
                 n_gram=1,
                 multimap=True,
                 count_threshold=0,
                 use_uniform_prior=False,
                 laplace_smoothing=1.):
        self.n_gram = n_gram
        self.multimap = multimap
        self.count_threshold = count_threshold
        self.use_uniform_prior = use_uniform_prior
        self.laplace_smoothing = laplace_smoothing

    def fit(self, X=None, y=None):
        W = pd.DataFrame({'X': [X], 'y': [y]})
        
        # Count features
        counts = {}
        for (xx, yy) in zip(X, y):
            category = np.zeros(5)
            category[yy] = 1
            ngrams = string_to_ngrams(xx, self.n_gram)
            if not self.multimap:
                ngrams = list(set(ngrams))
            for ngram in ngrams:
                if ngram in counts:
                    counts[ngram] = counts[ngram] + category
                else:
                    counts[ngram] = category
        counts = pd.DataFrame(counts).transpose()

        # Filter low counts
        keep = counts.apply(lambda row: sum(row) >= self.count_threshold, axis = 1)
        counts = counts[keep == 1]
        
        # Apply Laplace smoothing by adding a letter
        counts[counts.columns[-5:]] += self.laplace_smoothing
        
        # Count the # of n-grams observed in each language.
        class_counts = np.array(counts[counts.columns[-5:]].apply(lambda x: np.sum(x) * 1., axis = 0).values)
        
        # Define P(Y = y) as the proportion of n-grams observed in each language.
        if self.use_uniform_prior:
            class_priors = np.full(5, .1/5)
        else:
            class_priors = class_counts / np.sum(class_counts)
            
        ## self.likelihood_ = counts.div(class_counts)
        self.likelihood_ = np.log(counts.div(class_counts))
        self.counts_ = counts
        ## self.class_priors_ = class_priors
        self.class_priors_ = np.log(class_priors)
        
        return self

    def predict(self, X):
        predictions = []

        for obs in X:
            levels = string_to_ngrams(obs, n = self.n_gram)
            
            joint_likelihood = np.full(5, 1.0)
            ## joint_likelihood = np.full(5, 0.)
            
            # Calculate joint probability
            for level in levels:
                if not level in self.likelihood_.index:
                    continue
                    
                # Calculate likelihood for X
                likelihood = self.likelihood_[self.likelihood_.index == level][:1].reset_index().values
                likelihood = np.array(np.delete(likelihood, 0).astype(float))
                
                ## joint_likelihood = np.multiply(joint_likelihood, likelihood)
                joint_likelihood = joint_likelihood + likelihood
                
            # Calculate joint likelihood * class prior
            ## prop_posterior = np.multiply(joint_likelihood, self.class_priors_)
            prop_posterior = joint_likelihood + self.class_priors_

            # Calculate posterior probability
            posterior = prop_posterior #/ np.sum(prop_posterior)
            prediction = np.argmax(posterior)
            
            predictions = predictions + [prediction]
        
        return predictions
    
    
    def classify(self, inputs):
        return
 

In [93]:
# from sklearn.pipeline import Pipeline, FeatureUnion

X_train = pd.read_csv("data/train_set_x.csv")['Text'].values
Y_train = pd.read_csv("data/train_set_y.csv")['Category'].values
X_test  = pd.read_csv("data/test_set_x.csv")['Text'].values
nbayes = NaiveBayesClassifier()
nbayes.set_params(n_gram=1,
                  multimap=True,
                  count_threshold=25,
                  use_uniform_prior=False,
                  laplace_smoothing=10.0)
nbayes.fit(X_train, Y_train)
nbayes.likelihood_


Unnamed: 0,0,1,2,3,4
0,-6.654122,-6.384704,-6.371843,-6.502534,-9.365380
1,-6.377911,-6.131082,-6.510420,-6.349567,-9.463019
2,-6.493825,-6.298926,-6.767197,-6.723717,-9.600220
3,-6.905085,-6.761981,-7.134177,-7.169426,-9.794376
4,-7.274748,-6.984214,-7.404901,-7.789680,-10.236209
5,-7.112115,-6.610124,-7.259857,-7.504370,-9.759285
6,-7.457988,-6.670123,-7.434524,-7.393207,-10.236209
7,-7.547255,-6.413297,-7.501418,-7.764717,-10.353992
8,-7.615818,-7.107474,-7.559687,-8.160217,-10.418530
9,-7.963319,-6.706963,-7.452328,-7.898029,-10.487523


In [94]:
len(nbayes.likelihood_)
len(X_test)

len(X_train)

276517

In [95]:
## GENERALIZED CONFUSION MATRIX

true_x = X_train[250000:250100]
true_y = Y_train[250000:250100]
pred_y = nbayes.predict(true_x)

loss = np.full((5,5), 1.0)
for i in range(len(pred_y)):
    loss[pred_y[i], true_y[i]] += 1
    
TPR = loss.trace() / loss.sum()
FNR = 1 - TPR

print(loss)
print(TPR)
print(FNR)
print(accuracy_score(pred_y, true_y))
print(precision_score(pred_y, true_y, average = 'macro'))

[[  8.   1.   2.   1.   1.]
 [  1.  50.   5.   2.   1.]
 [  1.   4.  14.   3.   1.]
 [  1.   2.   3.   8.   1.]
 [  1.   1.   1.   1.  11.]]
0.728
0.272
0.86
0.854905660377


In [137]:
#### RANDOM TRAIN-TEST SPLIT

import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

import random

xx = X_train
yy = Y_train

param_grid = [
  {
        'n_gram': [1, 2, 3],
        'laplace_smoothing': [1., 10.],
        'cv_count_threshold': [0., 10., 25.]
  }
 ]

scoring = {'acc':        'accuracy',
           'prec_macro': 'precision_macro',
           'rec_micro':  'recall_macro'}

def train_test_split_model(model, xxx, yyy, test_size=0.004):
    xxx_train, xxx_test, yyy_train, yyy_test = train_test_split(xxx, yyy, test_size=test_size)
    model.fit(xxx_train, yyy_train)
    yyy_pred = model.predict(xxx_test)
    test_accuracy = accuracy_score(yyy_test, yyy_pred)
    test_precision = precision_score(yyy_test, yyy_pred, average = 'macro')
    test_recall = recall_score(yyy_test, yyy_pred, average = 'macro')
    number_of_features = len(model.likelihood_)
    return({
            'n': model.n_gram,
            'lam': model.laplace_smoothing,
            'c': model.count_threshold,
            'number_of_features': number_of_features,
            'test_accuracy': test_accuracy,
            'test_precision': test_precision,
            'test_recall': test_recall,
            'test_average': np.mean([test_accuracy, test_precision, test_recall])
           })

tts_scores = {}

In [138]:
# print(train_test_split_model(NaiveBayesClassifier(n_gram=1, laplace_smoothing=10., count_threshold=25), xx, yy))

In [139]:
#for i_n in [1, 2, 3]:
#    k = "n={},lam={},c={}".format(i_n, 1., 0)
#    v = train_test_split_model(NaiveBayesClassifier(n_gram=i_n), xx, yy)
#    tts_scores[k] = v

In [140]:
#for i_lam in [1., 25., 50.]:
#    k = "n={},lam={},c={}".format(1, i_lam, 0)
#    v = train_test_split_model(NaiveBayesClassifier(laplace_smoothing=i_lam), xx, yy)
#    tts_scores[k] = v

In [141]:
#for i_c in [1, 25, 50]:
#    k = "n={},lam={},c={}".format(1, 1., i_c)
#    v = train_test_split_model(NaiveBayesClassifier(count_threshold=i_c), xx, yy)
#    tts_scores[k] = v

In [142]:
tts_superscore2 = {}

for i_n in [1, 2, 3]:
    for i_lam in [1., 25., 50.]:
        for i_c in [1, 25, 50]:
            k = "n={},lam={},c={}".format(i_n, i_lam, i_c)
            print(k)
            v = train_test_split_model(NaiveBayesClassifier(n_gram=i_n, laplace_smoothing=i_lam, count_threshold=i_c), xx, yy)
            tts_superscore2[k] = v

n=1,lam=1.0,c=1
n=1,lam=1.0,c=25
n=1,lam=1.0,c=50
n=1,lam=25.0,c=1
n=1,lam=25.0,c=25
n=1,lam=25.0,c=50
n=1,lam=50.0,c=1
n=1,lam=50.0,c=25
n=1,lam=50.0,c=50
n=2,lam=1.0,c=1
n=2,lam=1.0,c=25
n=2,lam=1.0,c=50
n=2,lam=25.0,c=1
n=2,lam=25.0,c=25
n=2,lam=25.0,c=50
n=2,lam=50.0,c=1
n=2,lam=50.0,c=25
n=2,lam=50.0,c=50
n=3,lam=1.0,c=1
n=3,lam=1.0,c=25
n=3,lam=1.0,c=50
n=3,lam=25.0,c=1
n=3,lam=25.0,c=25
n=3,lam=25.0,c=50
n=3,lam=50.0,c=1
n=3,lam=50.0,c=25
n=3,lam=50.0,c=50


In [143]:
#### K FOLD IS TOO SLOW

#import numpy as np
#from sklearn.model_selection import KFold
#from sklearn.cross_validation import cross_val_score
#from sklearn.model_selection import cross_validate
#from sklearn.metrics import accuracy_score, precision_score, recall_score

#import random

#xx = X_train[:1000]
#yy = Y_train[:1000]

#scoring = {'acc': 'accuracy',
#           'prec_macro': 'precision_macro',
#           'rec_micro': 'recall_macro'}

#print("cv_n_gram")
#cv_n_gram = None
#for i in range(1, 3):
##    print(i)
#    a = cross_validate(NaiveBayesClassifier(n_gram=i),
#                       xx,
#                       yy,
#                       fit_params={},
#                       scoring=scoring,
#                       cv=2,
#                       return_train_score=True)
#    a_mean = pd.DataFrame(a).mean(axis=0)
#    cv_n_gram = pd.concat((cv_n_gram,
#                           a_mean.rename("n_gram={}".format(i))), axis=1)

#print("cv_laplace_smoothing")
#cv_laplace_smoothing = None
#for i in [1., 5., 10.]:
#    print(i)
#    a = cross_validate(NaiveBayesClassifier(laplace_smoothing=i),
#                       xx,
#                       yy,
#                       fit_params={},
#                       scoring=scoring,
#                       cv=2,
#                       return_train_score=True)
#    a_mean = pd.DataFrame(a).mean(axis=0)
#    cv_laplace_smoothing = pd.concat((cv_laplace_smoothing,
#                                      a_mean.rename("laplace_smoothing={}".format(i))), axis=1)
    
#print("cv_count_threshold")
#cv_count_threshold = None
#for i in [0., 10., 25.]:
#    print(i)
#    a = cross_validate(NaiveBayesClassifier(count_threshold=i),
#                       xx,
#                       yy,
#                       fit_params={},
#                       scoring=scoring,
#                       cv=2,
#                       return_train_score=True)
#    a_mean = pd.DataFrame(a).mean(axis=0)
#    cv_count_threshold = pd.concat((cv_count_threshold,
#                                    a_mean.rename("count_threshold={}".format(i))), axis=1)

In [146]:
pd.DataFrame(tts_superscore2).transpose()

Unnamed: 0,c,lam,n,number_of_features,test_accuracy,test_average,test_precision,test_recall
"n=1,lam=1.0,c=1",1.0,1.0,1.0,131.0,0.878049,0.872551,0.881566,0.85804
"n=1,lam=1.0,c=25",25.0,1.0,1.0,114.0,0.873532,0.876884,0.880161,0.876959
"n=1,lam=1.0,c=50",50.0,1.0,1.0,106.0,0.865402,0.869021,0.871959,0.869701
"n=1,lam=25.0,c=1",1.0,25.0,1.0,131.0,0.854562,0.855466,0.846593,0.865242
"n=1,lam=25.0,c=25",25.0,25.0,1.0,114.0,0.841915,0.851823,0.854716,0.858837
"n=1,lam=25.0,c=50",50.0,25.0,1.0,106.0,0.853659,0.853456,0.85917,0.847539
"n=1,lam=50.0,c=1",1.0,50.0,1.0,131.0,0.859079,0.86302,0.862295,0.867685
"n=1,lam=50.0,c=25",25.0,50.0,1.0,114.0,0.852755,0.841529,0.838411,0.833421
"n=1,lam=50.0,c=50",50.0,50.0,1.0,106.0,0.865402,0.864366,0.864952,0.862743
"n=2,lam=1.0,c=1",1.0,1.0,2.0,4094.0,0.953026,0.95132,0.947365,0.953569


In [147]:
#X_train = pd.read_csv("data/train_set_x.csv")['Text'].values
#Y_train = pd.read_csv("data/train_set_y.csv")['Category'].values
#X_test  = pd.read_csv("data/test_set_x.csv")['Text'].values
nbayest = NaiveBayesClassifier()
nbayest.set_params(n_gram=3,
                   multimap=True,
                   count_threshold=25,
                   use_uniform_prior=False,
                   laplace_smoothing=1.0)
nbayest.fit(X_train, Y_train)
pred = nbayest.predict(X_test)