In [1]:
import nltk as nltk
import numpy as np
import pandas as pd
import collections as coll

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels

def string_to_ngrams(s, n):
    text = str(s).decode('utf-8').lower()
    text = text.replace(' ', '')
    ngrams = nltk.ngrams([c for c in text], n)
    return [''.join(g) for g in ngrams]

class NaiveBayesClassifier(BaseEstimator, ClassifierMixin):  
    def __init__(self,
                 n_gram=1,
                 multimap=True,
                 count_threshold=0,
                 use_uniform_prior=False,
                 laplace_smoothing=1.):
        self.n_gram = n_gram
        self.multimap = multimap
        self.count_threshold = count_threshold
        self.use_uniform_prior = use_uniform_prior
        self.laplace_smoothing = laplace_smoothing

    def fit(self, X=None, y=None):
        W = pd.DataFrame({'X': [X], 'y': [y]})
        
        # Count features
        counts = {}
        for (xx, yy) in zip(X, y):
            category = np.zeros(5)
            category[yy] = 1
            ngrams = string_to_ngrams(xx, self.n_gram)
            if not self.multimap:
                ngrams = list(set(ngrams))
            for ngram in ngrams:
                if ngram in counts:
                    counts[ngram] = counts[ngram] + category
                else:
                    counts[ngram] = category
        counts = pd.DataFrame(counts).transpose()

        # Filter low counts
        keep = counts.apply(lambda row: sum(row) >= self.count_threshold, axis = 1)
        counts = counts[keep == 1]
        
        # Apply Laplace smoothing by adding a letter
        counts[counts.columns[-5:]] += self.laplace_smoothing
        
        # Count the # of n-grams observed in each language.
        class_counts = np.array(counts[counts.columns[-5:]].apply(lambda x: np.sum(x) * 1., axis = 0).values)
        
        # Define P(Y = y) as the proportion of n-grams observed in each language.
        if self.use_uniform_prior:
            class_priors = np.full(5, .1/5)
        else:
            class_priors = class_counts / np.sum(class_counts)
            
        self.likelihood_ = counts.div(class_counts)
        self.counts_ = counts
        self.class_priors_ = class_priors
        
        return self

    def predict(self, X):
        predictions = []

        for obs in X:
            levels = string_to_ngrams(obs, n = self.n_gram)
            
            joint_likelihood = np.full(5, 1.0)
            # joint_likelihood_log = np.log(joint_likelihood)
            
            # Calculate joint probability
            for level in levels:
                if not level in self.likelihood_.index:
                    continue
                    
                # Calculate likelihood for X
                likelihood = self.likelihood_[self.likelihood_.index == level][:1].reset_index().values
                likelihood = np.array(np.delete(likelihood, 0).astype(float))
                
                joint_likelihood = np.multiply(joint_likelihood, likelihood)
                # joint_likelihood_log = joint_likelihood_log + np.log(likelihood)
                
            # Calculate joint likelihood * class prior
            prop_posterior = np.multiply(joint_likelihood, self.class_priors_)
            
            # Calculate posterior probability
            posterior = prop_posterior / np.sum(prop_posterior)
            prediction = np.argmax(posterior)
            
            predictions = predictions + [prediction]
        
        return predictions
    
    
    def classify(self, inputs):
        return
 

In [2]:
# from sklearn.pipeline import Pipeline, FeatureUnion

X_train = pd.read_csv("data/train_set_x.csv")['Text'].values
Y_train = pd.read_csv("data/train_set_y.csv")['Category'].values
X_test  = pd.read_csv("data/test_set_x.csv")['Text'].values
nbayes = NaiveBayesClassifier()
nbayes.set_params(n_gram=1,
                  multimap=True,
                  count_threshold=5,
                  use_uniform_prior=False,
                  laplace_smoothing=1.0)
nbayes.fit(X_train[:1000], Y_train[:1000])
nbayes.likelihood_


Unnamed: 0,0,1,2,3,4
0,0.000824,0.001101,0.000869,0.002100,0.000574
1,0.001237,0.003015,0.000869,0.001750,0.001148
2,0.001237,0.002010,0.000760,0.002275,0.000574
3,0.000412,0.001340,0.000760,0.001050,0.000574
4,0.000824,0.000766,0.000435,0.002100,0.000574
5,0.000412,0.001819,0.000760,0.001050,0.000574
6,0.000412,0.001388,0.000543,0.001225,0.000574
7,0.000412,0.001340,0.000217,0.001400,0.000574
8,0.000412,0.000814,0.000435,0.000350,0.000574
9,0.000412,0.001484,0.000435,0.000700,0.000574


In [None]:
true_x = X_train[1000:1100]
true_y = Y_train[1000:1100]
pred_y = nbayes.predict(true_x)

loss = np.full((5,5), 1.0)
for i in range(len(pred_y)):
    loss[pred_y[i], true_y[i]] += 1
    
TPR = loss.trace() / loss.sum()
FNR = 1 - TPR

print(loss)
print(TPR)
print(FNR)

[[  5.   1.   3.   1.   1.]
 [  1.  56.   6.   2.   1.]
 [  1.   3.  13.   1.   1.]
 [  1.   1.   2.  15.   1.]
 [  1.   1.   1.   1.   5.]]
0.752
0.248


In [None]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score

import random

xx = X_train[:1000].copy()
yy = Y_train[:1000].copy()


TPR = loss.trace() / loss.sum()
FNR = 1 - TPR
print(TPR)
print(FNR)

scoring = {'acc': 'accuracy',
           'prec_macro': 'precision_macro',
           'rec_micro': 'recall_macro'}

print("cv_n_gram")
cv_n_gram = None
for i in range(1, 3 + 1):
    print(i)
    a = cross_validate(NaiveBayesClassifier(n_gram=i),
                       xx,
                       yy,
                       fit_params={},
                       scoring=scoring,
                       cv=2,
                       return_train_score=True)
    a_mean = pd.DataFrame(a).mean(axis=0)
    cv_n_gram = pd.concat((cv_n_gram,
                           a_mean.rename("n_gram={}".format(i))), axis=1)

print("cv_laplace_smoothing")
cv_laplace_smoothing = None
for i in [1., 10., 100.]:
    print(i)
    a = cross_validate(NaiveBayesClassifier(laplace_smoothing=i),
                       xx,
                       yy,
                       fit_params={},
                       scoring=scoring,
                       cv=2,
                       return_train_score=True)
    a_mean = pd.DataFrame(a).mean(axis=0)
    cv_laplace_smoothing = pd.concat((cv_laplace_smoothing,
                                      a_mean.rename("laplace_smoothing={}".format(i))), axis=1)
    
print("cv_count_threshold")
cv_count_threshold = None
for i in [1., 10., 100.]:
    print(i)
    a = cross_validate(NaiveBayesClassifier(count_threshold=i),
                       xx,
                       yy,
                       fit_params={},
                       scoring=scoring,
                       cv=2,
                       return_train_score=True)
    a_mean = pd.DataFrame(a).mean(axis=0)
    cv_count_threshold = pd.concat((cv_count_threshold,
                                    a_mean.rename("count_threshold={}".format(i))), axis=1)

0.752
0.248
cv_n_gram
1
2




3

In [None]:
print(cv_n_gram)
print(cv_laplace_smoothing)