In [1]:
%load_ext autoreload
%autoreload 2

In [316]:
import os
import logging
from collections import defaultdict, Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from hydra import initialize, compose
import hydra

from cgpos.utils.util import import_pkl, export_pkl, get_abs_dir, flatten

In [3]:
# Reset hydra: clear the process-wide GlobalHydra singleton first so that this
# cell can be re-run in the same kernel (presumably `initialize` refuses to run
# twice otherwise — confirm against the installed Hydra version).
hydra.core.global_hydra.GlobalHydra.instance().clear()
# Load hydra params from the relative "../conf/" directory and compose the
# config named 'main'; `config` is read by the data-loading cell below.
initialize("../conf/", version_base=None)
config = compose(config_name='main')
# Init logger at INFO level so the project helpers' log messages are shown.
logging.basicConfig(level=logging.INFO) 

In [4]:
import math

from sklearn.model_selection import train_test_split

from cgpos.models.multinomial_naive_bayes import ngrams, count_vectors

In [5]:
# Load the pickled inputs from the paths given in the hydra config.
# The cleaned file unpacks into three objects (uid, text, targets); later cells
# index `targets` per sample and join the items of each `text` entry.
uid, text, targets = import_pkl(config.data.cleaned)
# Per-sample features passed to the classifiers below.
features = import_pkl(config.data.features)
# Lookup used below to obtain the class names for the chosen target.
target_map = import_pkl(config.reference.target_map)

INFO:cgpos.utils.util:Importing /home/tejomay/cgpos/data/processed/cleaned.pkl
INFO:cgpos.utils.util:Importing /home/tejomay/cgpos/data/processed/features.pkl
INFO:cgpos.utils.util:Importing /home/tejomay/cgpos/data/reference/target_map.pkl


In [616]:
# NOTE(review): `d` is not referenced by any other cell visible in this
# notebook — looks like leftover scratch state; confirm before removing.
d = Counter()

In [721]:
class multinomial_naive_bayes():
    """Multinomial naive Bayes classifier over n-gram features.

    Class labels are expected to be non-negative integers; the model allocates
    one slot per label in ``range(max(y) + 1)``.

    Parameters
    ----------
    alpha : float, default 1
        Additive smoothing constant added to every feature count.
        Must be non-negative.
    ngram_range : tuple, default (1, 1)
        Passed through unchanged to ``count_vectors`` (fit) and ``ngrams``
        (predict).

    Attributes set by ``fit``
    -------------------------
    V : number of distinct items returned by ``flatten`` over the fitted features.
    n_classes : ``max(y) + 1``.
    class_counts : list with the number of training samples per class.
    feature_counts : ``defaultdict(Counter)`` of feature counts per class.
    log_priors : list of log class priors (``-inf`` for classes with no samples).
    log_likelihoods : per-class mapping ``feature -> log P(feature | class)``.
    log_unseen : per-class log-likelihood used for features not seen in that class.
    """

    def __init__(self, alpha=1, ngram_range=(1,1)):
        # Store the caller-supplied smoothing value (it used to be overwritten
        # with the constant 1, so the argument had no effect).
        self.alpha = alpha
        self.ngram_range = ngram_range

    @staticmethod
    def _log_ratio(numerator, denominator):
        """Return log(numerator / denominator), or -inf when the ratio is not positive."""
        if numerator <= 0 or denominator <= 0:
            return float("-inf")
        return math.log(numerator / denominator)

    @staticmethod
    def _constant_factory(value):
        """Return a zero-argument callable yielding ``value``.

        Binding through a function argument avoids the late-binding pitfall of
        creating the lambda directly inside a loop.
        """
        return lambda: value

    def fit(self, X, y):
        """Estimate log priors and smoothed log likelihoods from training data.

        Parameters
        ----------
        X : iterable of samples, each converted with ``count_vectors``.
        y : sequence of integer class labels aligned with ``X``.

        Raises
        ------
        ValueError
            If ``y`` is empty or ``alpha`` is negative.
        """
        if self.alpha < 0:
            raise ValueError("alpha must be non-negative")
        N = len(y)
        if N == 0:
            raise ValueError("Cannot fit on an empty training set")

        # NOTE(review): fit uses `count_vectors` while predict uses `ngrams`;
        # presumably both produce the same n-gram keys — confirm in cgpos.models.
        X_grams = [count_vectors(word, self.ngram_range) for word in X]
        V = len(set(flatten(X_grams)))
        n_classes = int(max(y)) + 1
        class_counts = [0] * n_classes
        feature_counts = defaultdict(Counter)
        for i in range(N):
            class_i = y[i]
            class_counts[class_i] += 1
            feature_counts[class_i].update(X_grams[i])

        # A label index that never occurs in y gets a prior of -inf instead of
        # raising on math.log(0); such a class can never win the argmax.
        log_priors = [self._log_ratio(count, N) for count in class_counts]

        log_likelihoods = defaultdict(defaultdict)
        log_unseen = []
        for class_i in range(n_classes):
            counts = feature_counts[class_i]
            denominator = sum(counts.values()) + self.alpha * V
            table = log_likelihoods[class_i]
            for key, value in counts.items():
                table[key] = self._log_ratio(value + self.alpha, denominator)
            # Log-likelihood assigned to any feature never observed with this class.
            unseen = self._log_ratio(self.alpha, denominator)
            table.default_factory = self._constant_factory(unseen)
            log_unseen.append(unseen)

        self.V = V
        self.n_classes = n_classes
        self.class_counts = class_counts
        self.feature_counts = feature_counts
        self.log_priors = log_priors
        self.log_likelihoods = log_likelihoods
        self.log_unseen = log_unseen

    def predict(self, X):
        """Return, for each sample in ``X``, the class index with the highest log score.

        The entry is ``None`` when no class has a score above ``-inf``.
        """
        X_grams = [ngrams(word, self.ngram_range) for word in X]
        tables = [self.log_likelihoods[class_i] for class_i in range(self.n_classes)]
        preds = []
        for x in X_grams:
            probs = self.log_priors.copy()
            for gram in x:
                for class_i, table in enumerate(tables):
                    # .get() rather than indexing: indexing a defaultdict inserts
                    # the missing key, which would grow the fitted model on
                    # every unseen n-gram.
                    probs[class_i] += table.get(gram, self.log_unseen[class_i])
            max_prob = float("-inf")
            argmax = None
            for i, prob in enumerate(probs):
                if prob > max_prob:
                    max_prob = prob
                    argmax = i
            preds.append(argmax)

        return preds

In [707]:
# Index of the target component to model: element `class_i` of every entry in
# `targets` becomes the label.
class_i = 0
# Split features, space-joined text and labels together with one call so the
# three stay aligned; 80% goes to train, with a fixed seed for repeatability.
X_train, X_test, text_train, text_test, y_train, y_test = train_test_split(
    features,
    np.array([" ".join(syllables) for syllables in text]),
    np.array([target[class_i] for target in targets]),
    train_size=0.8, random_state=20
)
# presumably the human-readable class names for the selected target — verify
# against the layout of target_map.
target_names = target_map[1][class_i][1]

In [713]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# scikit-learn reference model: unigram counts fed into MultinomialNB.
# lowercase=False keeps tokens as given; the custom token_pattern also accepts
# single-character tokens.
# NOTE(review): the same pipeline is constructed again in a later cell, which
# rebinds `text_clf`.
text_clf = Pipeline([
    ('vect', CountVectorizer(lowercase=False, ngram_range=(1,1), token_pattern=r"\b\w+\b")),
    ('clf', MultinomialNB()),
])

In [722]:
# Fit the hand-written classifier on the training split and predict the test split.
mnb = multinomial_naive_bayes(alpha=1.0, ngram_range=(1,1))
mnb.fit(X_train, y_train)
# `predict` returns a plain Python list of class indices.
y_pred = mnb.predict(X_test)

In [723]:
# Accuracy of the custom model: fraction of test predictions equal to the label.
np.mean(y_pred == y_test)

0.8187721101117669

In [703]:
# scikit-learn reference model with the same unigram setup as the custom class.
text_clf = Pipeline([
    ('vect', CountVectorizer(lowercase=False, ngram_range=(1,1), token_pattern=r"\b\w+\b")),
    ('clf', MultinomialNB()),
])

# CountVectorizer takes strings, so each sample's items are stringified and
# joined with single spaces.
X_train_skl = [" ".join([str(syllable) for syllable in word]) for word in X_train]
X_test_skl = [" ".join([str(syllable) for syllable in word]) for word in X_test]
# NOTE(review): `i` is not assigned at module level in any cell visible here
# (the `i` inside multinomial_naive_bayes is local to its methods), so this
# line relies on leftover kernel state — define it explicitly or drop the slice.
text_clf.fit(X_train_skl[:i], y_train[:i])

In [683]:
# Inspect the fitted log-probabilities of the pipeline's second step ('clf'),
# for comparison with mnb.log_likelihoods.
text_clf[1].feature_log_prob_

array([[-1.25276297, -1.25276297, -1.94591015, -1.25276297],
       [-1.60943791, -1.60943791, -0.91629073, -1.60943791]])

In [700]:
# Reference predictions for the first `j` test samples.
# NOTE(review): `j` is not assigned in any cell visible here — confirm it is
# defined before this cell runs on a fresh kernel.
text_clf.predict(X_test_skl[:j])

array([ 6,  5,  2,  5,  8, 12,  5,  2,  1,  2])

In [704]:
# Custom-model predictions for the same first `j` test samples, for a
# side-by-side comparison with the scikit-learn output.
# NOTE(review): `j` is not assigned in any cell visible here — confirm it is
# defined before this cell runs on a fresh kernel.
mnb.predict(X_test[:j])

[6, 6, 6, 6, 6, 6, 6, 6, 6, 6]

In [684]:
# Inspect the per-class feature log-likelihood tables learned by the custom model.
mnb.log_likelihoods

defaultdict(collections.defaultdict,
            {5: defaultdict(<function __main__.multinomial_naive_bayes.fit.<locals>.create_default_factory.<locals>.<lambda>()>,
                         {(365,): -1.252762968495368,
                          (1349,): -1.252762968495368,
                          (570,): -1.252762968495368}),
             6: defaultdict(<function __main__.multinomial_naive_bayes.fit.<locals>.create_default_factory.<locals>.<lambda>()>,
                         {(432,): -0.916290731874155})})