In [72]:
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from sympy import *

In [37]:
pd.options.display.max_columns = None

In [67]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [152]:
class NaiveBayes:
    def __init__(self, **kwargs):
        self.smoothing = kwargs.get('smoothing')
        self.model = kwargs.get('model') if kwargs.get('model') else 'bernoulli'

    def fit(self, X, y):
        self.X = X
        self.y = y

        if not y:
            self.y = self.X[:,-1]
            self.X = self.X[:,:-1]



        self.classes, self.class_counts = np.unique(self.y, return_counts=True)
        self.total_count = len(self.y)
        class_cols = []
        class_vals = [[]]

        for i, c in enumerate(self.classes):
            class_cols.append(f'p(y={c})')
            class_vals[0].append(np.round(self.class_counts[i]/self.total_count,4))

        self.prior_df = pd.DataFrame(class_vals, columns=class_cols)

        print('Class priors:')
        #prior_df.style.hide_index()
        print(self.prior_df.to_string(index=False))

        if self.is_bag_of_words():
            self.handle_bag_of_words()
        else:
            print('hi')
    
    def predict(self, X_test):
        self.X_test = X_test
        
        if self.is_bag_of_words():
            self.predict_bag_of_words()
        
    def predict_bag_of_words(self):
        print(f'Test Document, D = {self.X_test}')
        print('\nUsing')
        probs = []
        if self.model == 'bernoulli':
            display(Symbol('P(C_{k}|b)\u221D P(X|C_{k})P(C_{k}))'))
            display(Symbol('P(C_{k}|b)\u221D P(C_{k})\u220F_{t=1}^{|V|} [b_tP(w_t|C_k) + (1-b_t)(1-P(w_t|C_k))]'))
            print('where b= feature vector of document D \nb\u209C = {0,1} => abscence or presence of word w\u209C in the document')
            b =  [int(word in self.X_test) for word in self.vocab]
            print(f'b = {b}')
            for cls in self.classes:
                prob = self.prior_df[f'p(y={cls})'].iloc[0]
                prob_string = f'P(y={cls}|b) \u221D {prob}'
                prob_string2 = f'P(y={cls}|X) \u221D p(y={cls})'
                
                for word in self.vocab:
                    if word in self.X_test:
                        prob *= self.probabilities[cls][word]
                        prob_string += f'*{self.str_probabilities[cls][word]}'
                        prob_string2 += f'*P({word}|y={cls})'
                    else:
                        prob *= 1 - self.probabilities[cls][word]
                        prob_string += f'*(1 - {self.str_probabilities[cls][word]})'
                        prob_string2 += f'*(1 - P({word}|y={cls}))'
                print()
                print(prob_string)
                print(f'= {np.round(prob,4)}')
                probs.append(prob)
                print('=========================================================')
        else:
            display(Symbol('P(C_{k}|D)\u221D P(X|C_{k})P(C_{k}))'))
            display(Symbol('P(C_{k}|D)\u221D P(C_{k})\u220F_{j=1}^{len(X)} P(u_j|C_k)'))
            print('where u - each word in test document X')

            for cls in self.classes:                
                prob = self.prior_df[f'p(y={cls})'].iloc[0]
                prob_string = f'P(y={cls}|X) \u221D {prob}'
                prob_string2 = f'P(y={cls}|X) \u221D p(y={cls})'
                for word in self.X_test.split():                    
                    prob *= self.probabilities[cls][word]
                    prob_string += f'*{self.str_probabilities[cls][word]}'
                    prob_string2 += f'*P({word}|y={cls})'
                   
                print()
                print(prob_string2)
                print(prob_string)
                print(f'= {np.round(prob,4)}')
                probs.append(prob)
                print('=========================================================')
        print(f'\nThe test document D belongs to class y = {self.classes[np.argmax(probs)]}')
        
    def is_bag_of_words(self):
        return np.array(self.X).ndim == 1 \
               and np.array(self.X).dtype.kind in ['S', 'U'] and len(np.unique(self.X)) == self.total_count

    def handle_bag_of_words(self):
        words = [word for sentence in self.X for word in sentence.split()]

        word_counts = Counter(words)

        self.vocab = set(word_counts.keys())
        #print(self.vocab)
        if self.model == 'bernoulli':
            bag_data = [[int(word in sentence) for word in self.vocab] for sentence in self.X]
        else:
            bag_data = []
            for sentence in self.X:
                wc = Counter(sentence.split())
                bag_data.append([wc.get(word) if wc.get(word) else 0 for word in self.vocab ])
        #print(bag_data)
        df_bag = pd.DataFrame(bag_data, columns=self.vocab, index = self.X)
        df_bag['Class(Y)'] = self.y
        print('Data:')
        display(df_bag)
        print('==================================================================================')
        
        self.probabilities = defaultdict(dict)
        self.str_probabilities = defaultdict(dict)
        
        for cls in self.classes:
            df_bag_cls = df_bag[df_bag['Class(Y)'] == cls]
            print(f'\n\nFor Y = {cls}')
            #print(df_bag_cls.sum().sum())
            display(df_bag_cls)
            idx_cls = df_bag_cls.index
            words_cls = [word for sentence in idx_cls for word in sentence.split()]
            for word in self.vocab:
                if self.model == 'bernoulli':
                    if self.smoothing:
                        print(f'\np({word}|y={cls}) = ({df_bag_cls[word].sum()} + 1)/({df_bag_cls.shape[0]} + 2)',
                              f'= {df_bag_cls[word].sum() + 1}/{df_bag_cls.shape[0] +2}',
                              f'= {np.round((df_bag_cls[word].sum() + 1)/(df_bag_cls.shape[0] +2), 4)}')
                        self.probabilities[cls][word] = np.round((df_bag_cls[word].sum() + 1)/(df_bag_cls.shape[0] +2), 4)
                        self.str_probabilities[cls][word] = f'{df_bag_cls[word].sum() + 1}/{df_bag_cls.shape[0] +2}'
                    else:
                        print(f'\np({word}|y={cls}) = {df_bag_cls[word].sum()}/{df_bag_cls.shape[0]}',
                              f'= {np.round((df_bag_cls[word].sum())/(df_bag_cls.shape[0]),4)}')
                        self.probabilities[cls][word] = np.round((df_bag_cls[word].sum())/(df_bag_cls.shape[0]),4)
                        self.str_probabilities[cls][word] = f'{df_bag_cls[word].sum()}/{df_bag_cls.shape[0]}'
                else:                    
                    if self.smoothing:
                        print(f'\np({word}|y={cls}) = ({df_bag_cls[word].sum()} + 1)/({len(words_cls)} + {len(self.vocab)})',
                              f'= {df_bag_cls[word].sum() + 1}/{len(words_cls) +len(self.vocab)}',
                              f'= {np.round((df_bag_cls[word].sum() + 1)/(len(words_cls) +len(self.vocab)), 4)}')
                        self.probabilities[cls][word] = np.round((df_bag_cls[word].sum() + 1)/(len(words_cls) +len(self.vocab)), 4)
                        self.str_probabilities[cls][word] = f'{df_bag_cls[word].sum() + 1}/{len(words_cls) +len(self.vocab)}'
                    else:
                        print(f'\np({word}|y={cls}) = {df_bag_cls[word].sum()}/{len(words_cls)}',
                              f'= {np.round((df_bag_cls[word].sum())/(len(words_cls)),4)}')
                        self.probabilities[cls][word] = np.round((df_bag_cls[word].sum())/(len(words_cls)),4)
                        self.str_probabilities[cls][word] = f'{df_bag_cls[word].sum()}/{len(words_cls)}'
                        
            print(f'\n=====================================================================================\n')
            

## Bag of Words : Multinomial

In [153]:
model = NaiveBayes(model = 'multi', smoothing = True)
model.fit(X=['Chinese Beijing Chinese ', 'Chinese Chinese Shanghai', 'Chinese Macau', 'Tokyo Japan Chinese',
                    ],
                 y=['Yes','Yes','Yes','No'])

Class priors:
 p(y=No)  p(y=Yes)
    0.25      0.75
Data:


Unnamed: 0,Shanghai,Beijing,Macau,Chinese,Japan,Tokyo,Class(Y)
Chinese Beijing Chinese,0,1,0,2,0,0,Yes
Chinese Chinese Shanghai,1,0,0,2,0,0,Yes
Chinese Macau,0,0,1,1,0,0,Yes
Tokyo Japan Chinese,0,0,0,1,1,1,No




For Y = No


Unnamed: 0,Shanghai,Beijing,Macau,Chinese,Japan,Tokyo,Class(Y)
Tokyo Japan Chinese,0,0,0,1,1,1,No



p(Shanghai|y=No) = (0 + 1)/(3 + 6) = 1/9 = 0.1111

p(Beijing|y=No) = (0 + 1)/(3 + 6) = 1/9 = 0.1111

p(Macau|y=No) = (0 + 1)/(3 + 6) = 1/9 = 0.1111

p(Chinese|y=No) = (1 + 1)/(3 + 6) = 2/9 = 0.2222

p(Japan|y=No) = (1 + 1)/(3 + 6) = 2/9 = 0.2222

p(Tokyo|y=No) = (1 + 1)/(3 + 6) = 2/9 = 0.2222




For Y = Yes


Unnamed: 0,Shanghai,Beijing,Macau,Chinese,Japan,Tokyo,Class(Y)
Chinese Beijing Chinese,0,1,0,2,0,0,Yes
Chinese Chinese Shanghai,1,0,0,2,0,0,Yes
Chinese Macau,0,0,1,1,0,0,Yes



p(Shanghai|y=Yes) = (1 + 1)/(8 + 6) = 2/14 = 0.1429

p(Beijing|y=Yes) = (1 + 1)/(8 + 6) = 2/14 = 0.1429

p(Macau|y=Yes) = (1 + 1)/(8 + 6) = 2/14 = 0.1429

p(Chinese|y=Yes) = (5 + 1)/(8 + 6) = 6/14 = 0.4286

p(Japan|y=Yes) = (0 + 1)/(8 + 6) = 1/14 = 0.0714

p(Tokyo|y=Yes) = (0 + 1)/(8 + 6) = 1/14 = 0.0714




In [154]:
model.predict('Chinese Chinese Chinese Tokyo Japan')

Test Document, D = Chinese Chinese Chinese Tokyo Japan

Using


P(C_{k}|D)∝ P(X|C_{k})P(C_{k}))

P(C_{k}|D)∝ P(C_{k})∏_{j=1}^{len(X)} P(u_j|C_k)

where u - each word in test document X

P(y=No|X) ∝ p(y=No)*P(Chinese|y=No)*P(Chinese|y=No)*P(Chinese|y=No)*P(Tokyo|y=No)*P(Japan|y=No)
P(y=No|X) ∝ 0.25*2/9*2/9*2/9*2/9*2/9
= 0.0001

P(y=Yes|X) ∝ p(y=Yes)*P(Chinese|y=Yes)*P(Chinese|y=Yes)*P(Chinese|y=Yes)*P(Tokyo|y=Yes)*P(Japan|y=Yes)
P(y=Yes|X) ∝ 0.75*6/14*6/14*6/14*1/14*1/14
= 0.0003

The test document D belongs to class y = Yes


## Bag of Words : Bernoulli

In [155]:
model = NaiveBayes(model = 'bernoulli', smoothing = True)
model.fit(X=['Chinese Beijing Chinese ', 'Chinese Chinese Shanghai', 'Chinese Macau', 'Tokyo Japan Chinese',
                    ],
                 y=['Yes','Yes','Yes','No'])

Class priors:
 p(y=No)  p(y=Yes)
    0.25      0.75
Data:


Unnamed: 0,Shanghai,Beijing,Macau,Chinese,Japan,Tokyo,Class(Y)
Chinese Beijing Chinese,0,1,0,1,0,0,Yes
Chinese Chinese Shanghai,1,0,0,1,0,0,Yes
Chinese Macau,0,0,1,1,0,0,Yes
Tokyo Japan Chinese,0,0,0,1,1,1,No




For Y = No


Unnamed: 0,Shanghai,Beijing,Macau,Chinese,Japan,Tokyo,Class(Y)
Tokyo Japan Chinese,0,0,0,1,1,1,No



p(Shanghai|y=No) = (0 + 1)/(1 + 2) = 1/3 = 0.3333

p(Beijing|y=No) = (0 + 1)/(1 + 2) = 1/3 = 0.3333

p(Macau|y=No) = (0 + 1)/(1 + 2) = 1/3 = 0.3333

p(Chinese|y=No) = (1 + 1)/(1 + 2) = 2/3 = 0.6667

p(Japan|y=No) = (1 + 1)/(1 + 2) = 2/3 = 0.6667

p(Tokyo|y=No) = (1 + 1)/(1 + 2) = 2/3 = 0.6667




For Y = Yes


Unnamed: 0,Shanghai,Beijing,Macau,Chinese,Japan,Tokyo,Class(Y)
Chinese Beijing Chinese,0,1,0,1,0,0,Yes
Chinese Chinese Shanghai,1,0,0,1,0,0,Yes
Chinese Macau,0,0,1,1,0,0,Yes



p(Shanghai|y=Yes) = (1 + 1)/(3 + 2) = 2/5 = 0.4

p(Beijing|y=Yes) = (1 + 1)/(3 + 2) = 2/5 = 0.4

p(Macau|y=Yes) = (1 + 1)/(3 + 2) = 2/5 = 0.4

p(Chinese|y=Yes) = (3 + 1)/(3 + 2) = 4/5 = 0.8

p(Japan|y=Yes) = (0 + 1)/(3 + 2) = 1/5 = 0.2

p(Tokyo|y=Yes) = (0 + 1)/(3 + 2) = 1/5 = 0.2




In [156]:
model.predict('Chinese Chinese Chinese Tokyo Japan')

Test Document, D = Chinese Chinese Chinese Tokyo Japan

Using


P(C_{k}|b)∝ P(X|C_{k})P(C_{k}))

P(C_{k}|b)∝ P(C_{k})∏_{t=1}^{|V|} [b_tP(w_t|C_k) + (1-b_t)(1-P(w_t|C_k))]

where b= feature vector of document D 
bₜ = {0,1} => abscence or presence of word wₜ in the document
b = [0, 0, 0, 1, 1, 1]

P(y=No|b) ∝ 0.25*(1 - 1/3)*(1 - 1/3)*(1 - 1/3)*2/3*2/3*2/3
= 0.022

P(y=Yes|b) ∝ 0.75*(1 - 2/5)*(1 - 2/5)*(1 - 2/5)*4/5*1/5*1/5
= 0.0052

The test document D belongs to class y = No
