In [309]:
from sklearn.base import BaseEstimator, ClassifierMixin

class NaiveBayesClassifier(BaseEstimator, ClassifierMixin):
    """Character n-gram Naive Bayes classifier over five language classes.

    ``fit`` builds the model from a table of n-gram counts and then scores it
    on a frame of labelled texts, printing the confusion matrix and the
    accuracy.

    Parameters
    ----------
    use_uniform_prior : bool
        If True, P(Y) is 1/5 for every class; otherwise P(Y) is each class's
        share of the (smoothed) n-gram counts.
    laplace_smoothing : float
        Pseudo-count added to every (n-gram, class) cell before the counts are
        turned into probabilities.

    Attributes set by ``fit``
    -------------------------
    log_prior_ : ndarray, shape (5,)
        log P(Y = y).
    log_likelihood_ : ndarray, shape (n_ngrams, 5)
        log P(X = x | Y = y), one row per row of the count table.
    ngram_rows_ : dict
        Maps an n-gram to its row in ``log_likelihood_`` (first occurrence
        wins when the table index has duplicates).
    confusion_ : ndarray, shape (5, 5)
        Predicted class in rows, true class in columns.
    accuracy_ : float
        trace(confusion_) / sum(confusion_).
    """

    # Number of classes; the count table's last _N_CLASSES columns are used.
    _N_CLASSES = 5

    def __init__(self, use_uniform_prior = False, laplace_smoothing = 1.):
        self.use_uniform_prior = use_uniform_prior
        self.laplace_smoothing = laplace_smoothing

    def fit(self, X=None, y=None):
        """Estimate the model from ``X`` and evaluate it on ``y``.

        Parameters
        ----------
        X : pandas.DataFrame
            Count table indexed by n-gram; the last five columns hold the
            per-class counts. ``X`` is not modified.
        y : pandas.DataFrame
            Evaluation rows with a 'Text' column and an integer 'Category'
            column in the range 0..4.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` or ``y`` is None.
        """
        if X is None or y is None:
            raise ValueError("fit() needs both the n-gram count table X "
                             "and the labelled evaluation frame y")

        k = self._N_CLASSES

        # Laplace smoothing on a fresh array: the caller's frame stays intact,
        # so fitting twice on the same table gives the same model.
        counts = X[X.columns[-k:]].values.astype(float) + self.laplace_smoothing
        # Total (smoothed) number of n-grams observed in each class.
        class_totals = counts.sum(axis=0)

        if self.use_uniform_prior:
            prior = np.full(k, 1. / k)
        else:
            prior = class_totals / np.sum(class_totals)

        # Work in log space: a product of many per-character probabilities
        # underflows to 0 for long texts, which used to turn the posterior
        # into nan and the prediction into class 0.
        with np.errstate(divide='ignore'):  # log(0) = -inf is intended
            self.log_prior_ = np.log(prior)
            self.log_likelihood_ = np.log(counts / class_totals)

        # One-off n-gram -> row lookup instead of scanning the index per character.
        self.ngram_rows_ = {}
        for pos, gram in enumerate(X.index):
            self.ngram_rows_.setdefault(gram, pos)

        # Confusion matrix: predicted in rows, true in cols.
        confusion = np.zeros((k, k), float)
        for _, row in y.iterrows():
            pred_label = self._log_joint(row['Text']).argmax()
            true_label = int(row['Category'])
            confusion[pred_label, true_label] += 1

        total = np.sum(confusion)
        accuracy = np.trace(confusion) / total if total else float('nan')

        self.confusion_ = confusion
        self.accuracy_ = accuracy

        print(confusion)
        print(accuracy)
        return self

    def _log_joint(self, text):
        """Return log P(text, Y = y) for every class (unigram model).

        The posterior P(Y | text) differs from this only by a term shared by
        all classes, so its argmax is the predicted class. Characters that are
        absent from the fitted count table are ignored.
        """
        rows = [self.ngram_rows_[ch] for ch in str(text) if ch in self.ngram_rows_]
        # Explicit int dtype keeps the fancy index valid when `rows` is empty.
        picked = self.log_likelihood_[np.array(rows, dtype=int)]
        return self.log_prior_ + picked.sum(axis=0)

    def transform(self, X):
        """Return ``X`` unchanged."""
        return X

    def predict(self, X):
        """Placeholder: always returns an empty list."""
        # NOTE(review): not implemented; the expected input format is not
        # visible from this class, so no behaviour is assumed here.
        return []
 

In [312]:
from sklearn.pipeline import Pipeline, FeatureUnion
from NGramGenerator import *

# Training texts and their labels come from separate CSVs; join them on 'Id'.
Xf = pd.read_csv("data/train_set_x.csv")
T  = pd.read_csv("data/train_set_y.csv")
X  = Xf.merge(T, on = 'Id')
# Test-set texts: loaded here but not used anywhere else in this cell.
Y  = pd.read_csv("data/test_set_x.csv")

# NOTE(review): the meaning of NGramGenerator's positional arguments is
# defined in the NGramGenerator module, not here -- confirm before changing.
ngram = NGramGenerator(1, True, 0, False)
nbayes = NaiveBayesClassifier()

# Two-step pipeline: n-gram generation feeding the Naive Bayes step.
p = Pipeline([
    ( 'ngram', ngram ),
    ( 'nbayes', nbayes ),
])

p.set_params(
    nbayes__use_uniform_prior = False,
    nbayes__laplace_smoothing = .1,
)
print(p.get_params())
# The second argument (handed to each step as `y`) is the first 200 rows of
# the same frame whose first 1000 rows are the first argument, so the printed
# score is measured on rows that were also passed in as fitting data.
A = p.fit_transform(X[:1000], X[:200])


{'nbayes__laplace_smoothing': 0.1, 'nbayes': NaiveBayesClassifier(laplace_smoothing=0.1, use_uniform_prior=False), 'ngram': <NGramGenerator.NGramGenerator object at 0x10fa83d10>, 'nbayes__use_uniform_prior': False, 'memory': None, 'steps': [('ngram', <NGramGenerator.NGramGenerator object at 0x10fa83d10>), ('nbayes', NaiveBayesClassifier(laplace_smoothing=0.1, use_uniform_prior=False))]}
[[ 11.   0.   3.   0.   0.]
 [  0.  93.  15.   4.   1.]
 [  0.  14.  26.   1.   1.]
 [  0.   3.   0.  22.   1.]
 [  0.   0.   1.   0.   4.]]
0.78
