In [411]:
import nltk as nltk
import numpy as np
import pandas as pd
import collections as coll

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels

def string_to_ngrams(s, n):
    text = str(s).decode('utf-8').lower()
    text = text.replace(' ', '') # remove spaces
    ngrams = nltk.ngrams([c for c in text], n)
    return [''.join(g) for g in ngrams]

class NaiveBayesClassifier(BaseEstimator, ClassifierMixin):  
    def __init__(self,
                 n_gram=1,
                 multimap=True,
                 count_threshold=0,
                 
                 use_uniform_prior=False,
                 laplace_smoothing=1.):
        self.n_gram = n_gram
        self.multimap = multimap
        self.count_threshold = count_threshold
        
        self.use_uniform_prior = use_uniform_prior
        self.laplace_smoothing = laplace_smoothing # Bayesian prior for Naive Bayes of laplace_smoothing / 5

    def fit(self, X=None, y=None):
        # X, y = check_X_y(X, y)
        
        #
        # Calculate frequency matrix
        #
        X = pd.concat([X, y], axis=1)
        
        Z = {}
        # Construct hash of arrays.
        for index, row in X.iterrows():
            # Code the language of the observation
            category = np.zeros(5)
            category[row['Category']] = 1
            # Break the text into n-grams
            ngrams = string_to_ngrams(row['Text'], self.n_gram)
            if not self.multimap:
                ngrams = list(set(ngrams))
            for ngram in ngrams:
                if ngram in Z:
                    # Sum element-wise with entries.
                    Z[ngram] = Z[ngram] + category # for some reason += works by reference and glitches
                else:
                    Z[ngram] = category
        
        # Convert into data frame
        Z = pd.DataFrame(Z).transpose()
        
        #
        # Postprocessing
        #
        
        # Filter low counts
        keep = Z.apply(lambda row: sum(row) >= self.count_threshold, axis = 1)
        Z = Z[keep == 1]
        # Apply Laplace smoothing by adding a letter
        Z[Z.columns[-5:]] += self.laplace_smoothing
        # Count the # of n-grams observed in each language.
        CY = np.array(Z[Z.columns[-5:]].apply(lambda x: np.sum(x) * 1., axis = 0).values)
        # Define P(Y = y) as the proportion of n-grams observed in each language.
        if self.use_uniform_prior:
            PY = np.full(5, .1/5)
        else:
            PY = CY / np.sum(CY)
            
        self.X_ = Z
        self.CY_ = CY
        self.PY_ = PY
        
        return self

    def predict(self, X):
        predictions = []

        for idx, row in X.iterrows():
            yi = string_to_ngrams(row['Text'], n = self.n_gram)
            # Initialize P(X1 = x1, X2 = x2, ..., Xn = xn | Y = y)
            PX_Y = np.array([1., 1., 1., 1., 1.])
            # Numerically stable? # PX_Y_log = np.log(PX_Y)
            # Calculate P(X1 = x1, X2 = x2, ..., Xn = xn | Y = y) = prod_i P(Xi = xi | Y = y)
            for yil in yi:
                # Calculate P(X1 = x1 | Y = y)
                if yil in self.X_.index:
                    Px_Y = None
                    Px_Y = self.X_[self.X_.index == yil][:1].reset_index().values
                    Px_Y = np.delete(Px_Y, 0).astype(float)
                    Px_Y = np.array(Px_Y)
                else:
                    continue # ignore unknown n-grams
                # Obtain P(X = x | Y = y) by calculating per-category frequency
                # of current letter
                Px_Y = Px_Y / self.CY_
                # Push to accumulator.
                PX_Y = np.multiply(PX_Y, Px_Y)
                # Numerically stable? # PX_Y_log = PX_Y_log + np.log(Px_Y)
            # Throw in prior: P(X... | Y = y)P(Y)
            PX_Y_PY = np.multiply(PX_Y, self.PY_)
            # Get the posterior: P(Y|X) = P(X|Y) P(Y)/P(X)
            # where P(X) = sum_i P(X... | Y = y_i)P(Y_i) and use
            PY_X = PX_Y_PY / np.sum(PX_Y_PY)
            predictions = predictions + [np.argmax(PY_X)]
        
        return pd.DataFrame({'Id': X['Id'],
                             'Category': predictions},
                            columns=['Id', 'Category'])
 

In [450]:
# from sklearn.pipeline import Pipeline, FeatureUnion

X_train = pd.read_csv("data/train_set_x.csv")
Y_train = pd.read_csv("data/train_set_y.csv")
X_test  = pd.read_csv("data/test_set_x.csv")

nbayes = NaiveBayesClassifier()
nbayes.set_params(n_gram=1,
                  multimap=False,
                  count_threshold=100,
                  use_uniform_prior=True,
                  laplace_smoothing=10.)
nbayes.fit(X_train[:3000], Y_train[:3000])


NaiveBayesClassifier(count_threshold=100, laplace_smoothing=10.0,
           multimap=False, n_gram=1, use_uniform_prior=True)

In [451]:
true_y = Y_train[3000:3100]
pred_x = X_train[3000:3100]
pred_y = nbayes.predict(pred_x)

loss = np.full((5,5), 1.)

for i in pred_y.index:
    loss[pred_y['Category'][i], true_y['Category'][i]] += 1
    
TPR = loss.trace() / loss.sum()
FNR = 1 - TPR



print(loss)
print(TPR)
print(FNR)

0.616
0.384
