In [1]:
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from sympy import *

In [2]:
pd.options.display.max_columns = None

In [3]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [152]:
class NaiveBayes:
    def __init__(self, **kwargs):
        self.smoothing = kwargs.get('smoothing')
        self.model = kwargs.get('model') if kwargs.get('model') else 'bernoulli'
        self.rounding_digit = kwargs.get('rounding_digit') if kwargs.get('rounding_digit') else 6

    def fit(self, X, y = None):
        self.X = X
        self.y = y
        self.y_col = 'Class(Y)'

        if not y:
            self.y_col = self.X.columns[-1]            
            self.y = self.X.iloc[:,-1]
            self.X = self.X.iloc[:,:-1]


        self.classes, self.class_counts = np.unique(self.y, return_counts=True)
        self.total_count = len(self.y)
        class_cols = []
        class_vals = [[]]

        for i, c in enumerate(self.classes):
            class_cols.append(f'p(y={c})')
            class_vals[0].append(np.round(self.class_counts[i]/self.total_count,self.rounding_digit))

        self.prior_df = pd.DataFrame(class_vals, columns=class_cols)

        print('Class priors:')
        #prior_df.style.hide_index()
        print(self.prior_df.to_string(index=False))

        if self.is_bag_of_words():
            self.fit_bag_of_words()
        else:
            self.fit_data()
    
    def fit_data(self):
        print('\nData:')
        df_nb = self.X
        df_nb[self.y_col] = self.y
        display(df_nb)
        
        self.probabilities = defaultdict(dict)
        self.str_probabilities = defaultdict(dict)

        
        for col in self.X.columns[:-1]:
            print(f'\nFeature: {col}')
            #print(self.X[col].dtype.kind)
            if self.X[col].dtype.kind in {'i', 'u', 'S', 'U', 'O'}:
                print(f'{col} is categorical')
                print('\nUsing:')
                display(Eq(Symbol('P(X_{i}=x|Y=y_{k})'), Mul(Symbol('D(X_{i}=x\u2229Y=y_{k})'),Pow(Symbol('D(Y=y_{k})'),-1))))
                print('Where D denoted no of items in dataset')
                for cls in self.classes:
                    df_nb_cls = df_nb[df_nb[self.y_col] == cls]
                    values, counts = np.unique(df_nb_cls[col], return_counts=True)
                    self.probabilities[col][cls] = dict(zip(values, counts / df_nb_cls.shape[0]))
#                     print(f'{counts} / {df_nb_cls.shape[0]}')
#                     self.str_probabilities[col][cls] = dict(zip(values, f'{counts} / {df_nb_cls.shape[0]}'))
#                 print(self.str_probabilities[col])
#                 print(self.probabilities[col])
                df_prob = pd.DataFrame(self.probabilities[col])
                df_prob.columns = [f'{self.y_col}={col}' for col in  df_prob.columns]
                df_prob.index.name = col
                print('\nProbability Lookup Table:')
                display(df_prob)
            else:
                print(f'{col} is continuous') 
                print('\nUsing:')
                display(Eq(Symbol('P(X_{i}=x|Y=y_{k})'), Mul(Mul(1,Pow(sqrt(Symbol('2\u03C0\u03C3_{ik}^2')),-1)),Pow(Symbol('e'),
                                                Mul(Symbol('-(x-\u03BC_{ik})^2'),Pow(Symbol('2\u03C3_{ik}^2'),-1))))))
                for cls in self.classes:
                    df_nb_cls = df_nb[df_nb[self.y_col] == cls]                    
                    self.probabilities[col][cls] = {"mean": df_nb_cls[col].mean(), "variance": df_nb_cls[col].var()}
                
                df_prob = pd.DataFrame(self.probabilities[col])
                df_prob.columns = [f'{self.y_col}={col}' for col in  df_prob.columns]
                df_prob.index.name = col
                print('\nMean Variance Table Table:')
                display(df_prob)
            print('==================================================================================')
                
    
    def predict(self, X_test):
        self.X_test = X_test
        
        if self.is_bag_of_words():
            self.predict_bag_of_words()
        else:
            self.predict_data()
    
    def predict_data(self):        
        print(f'Test Data = {self.X_test}')
        print('\nUsing')
        probs = []
        display(Eq(Symbol('Y_{new}'), Symbol('argmax_{y_{k}}P(Y=y_{k})\u220F_{i}P(X_{i}|Y=y_{k})')))
        y = self.y_col
        for cls in self.classes:
            prob = self.prior_df[f'p(y={cls})'].iloc[0]
            prob_string = f'P(y={cls}|X) \u221D {prob}'
            prob_string2 = f'P(y={cls}|X) \u221D p(y={cls})'
            for X in self.X_test:
                #print(X)
                if self.X_test.get(X) in self.probabilities[X][cls]:
                    prob*= self.probabilities[X][cls][self.X_test.get(X)]
                    prob_string += f'*{self.probabilities[X][cls][self.X_test.get(X)]}'
                    prob_string2 += f'*P({X} = {self.X_test.get(X)}|y={cls})'
                else:
                    mean = self.probabilities[X][cls]["mean"]
                    var = self.probabilities[X][cls]["variance"]
                    p = np.exp(- (self.X_test.get(X)-mean)**2 / (2 * var)) / np.sqrt(2 * np.pi * var)
                    prob *= p
                    prob_string += f'*{p}'
                    prob_string2 += f'*P({X} = {self.X_test.get(X)}|y={cls})'
            print()
            print(prob_string2)
            print(prob_string)
            print(f'= {prob}')
            probs.append(prob)
            print('=========================================================')
        
        print(f'\nThe test data belongs to class y = {self.classes[np.argmax(probs)]}')
        
    def predict_bag_of_words(self):
        print(f'Test Document, D = {self.X_test}')
        self.X_test = self.X_test.lower().replace(',',' ').replace('.',' ').replace('!',' ')
        print('\nUsing')
        probs = []
        if self.model == 'bernoulli':
            display(Symbol('P(C_{k}|b)\u221D P(X|C_{k})P(C_{k}))'))
            display(Symbol('P(C_{k}|b)\u221D P(C_{k})\u220F_{t=1}^{|V|} [b_tP(w_t|C_k) + (1-b_t)(1-P(w_t|C_k))]'))
            print('where b = feature vector of document D \nb\u209C = {0,1} => abscence or presence of word w\u209C in the document')
            b =  [int(word in self.X_test) for word in self.vocab]
            print(f'\nb = {b}')
            for cls in self.classes:
                df_bag_cls = self.df_bag[self.df_bag['Class(Y)'] == cls]
                prob = self.prior_df[f'p(y={cls})'].iloc[0]
                prob_string = f'P(y={cls}|b) \u221D {prob}'
                prob_string2 = f'P(y={cls}|b) \u221D p(y={cls})'
                
                for word in self.vocab:
                    if not self.probabilities[cls].get(word):
                            self.probabilities[cls][word] = 1/(df_bag_cls.shape[0] + 2)
                            self.str_probabilities[cls][word] = f'1/({df_bag_cls.shape[0]} + 2)'
                    if word in self.X_test:                        
                        prob *= self.probabilities[cls][word]
                        prob_string += f'*{self.str_probabilities[cls][word]}'
                        prob_string2 += f'*P({word}|y={cls})'
                    else:
                        prob *= 1 - self.probabilities[cls][word]
                        prob_string += f'*(1 - {self.str_probabilities[cls][word]})'
                        prob_string2 += f'*(1 - P({word}|y={cls}))'
                print()
                print(prob_string2)
                print(prob_string)
                print(f'= {prob}')
                probs.append(prob)
                print('=========================================================')
        else:
            display(Symbol('P(C_{k}|D)\u221D P(X|C_{k})P(C_{k}))'))
            display(Symbol('P(C_{k}|D)\u221D P(C_{k})\u220F_{j=1}^{len(X)} P(u_j|C_k)'))
            print('where u - each word in test document X')

            for cls in self.classes:  
                df_bag_cls = self.df_bag[self.df_bag['Class(Y)'] == cls]
                idx_cls = df_bag_cls.index
                words_cls = [word for sentence in idx_cls for word in sentence.split()]
                prob = self.prior_df[f'p(y={cls})'].iloc[0]
                prob_string = f'P(y={cls}|X) \u221D {prob}'
                prob_string2 = f'P(y={cls}|X) \u221D p(y={cls})'
                for word in self.X_test.split():
                    if not self.probabilities[cls].get(word):
                            self.probabilities[cls][word] = 1/(len(words_cls) + len(self.vocab))
                            self.str_probabilities[cls][word] = f'1/({len(words_cls)} + {len(self.vocab)})'
                    prob *= self.probabilities[cls][word]
                    prob_string += f'*{self.str_probabilities[cls][word]}'
                    prob_string2 += f'*P({word}|y={cls})'
                   
                print()
                print(prob_string2)
                print(prob_string)
                print(f'= {prob}')
                probs.append(prob)
                print('=========================================================')
        print(f'\nThe test document D belongs to class y = {self.classes[np.argmax(probs)]}')
        
    def is_bag_of_words(self):
        return np.array(self.X).ndim == 1 \
               and np.array(self.X).dtype.kind in ['S', 'U'] and len(np.unique(self.X)) == self.total_count

    def fit_bag_of_words(self):
        self.X = [x.lower().replace(',',' ').replace('.',' ').replace('!',' ') for x in self.X]
        words = [word for sentence in self.X for word in sentence.split()]

        word_counts = Counter(words)

        self.vocab = set(word_counts.keys())
        print(f'\nTotal no of unique words in vocab: {len(self.vocab)}')
        #print(self.vocab)
        if self.model == 'bernoulli':
            bag_data = [[int(word in sentence) for word in self.vocab] for sentence in self.X]
        else:
            bag_data = []
            for sentence in self.X:
                wc = Counter(sentence.split())
                bag_data.append([wc.get(word) if wc.get(word) else 0 for word in self.vocab ])
        #print(bag_data)
        df_bag = pd.DataFrame(bag_data, columns=self.vocab, index = self.X)
        df_bag['Class(Y)'] = self.y
        print('\nData:')
        display(df_bag)
        self.df_bag = df_bag
        print('==================================================================================')
        
        self.probabilities = defaultdict(dict)
        self.str_probabilities = defaultdict(dict)
        
        for cls in self.classes:
            df_bag_cls = df_bag[df_bag['Class(Y)'] == cls]
            print(f'\n\nFor Y = {cls}')
            #print(df_bag_cls.sum().sum())
            display(df_bag_cls)
            idx_cls = df_bag_cls.index
            words_cls = [word for sentence in idx_cls for word in sentence.split()]
            print(f'\nTNo of words in {cls}: {len(words_cls)}')
            for word in self.vocab:
                if self.model == 'bernoulli':
                    if self.smoothing:
                        print(f'\np({word}|y={cls}) = ({df_bag_cls[word].sum()} + 1)/({df_bag_cls.shape[0]} + 2)',
                              f'= {df_bag_cls[word].sum() + 1}/{df_bag_cls.shape[0] +2}',
                              f'= {np.round((df_bag_cls[word].sum() + 1)/(df_bag_cls.shape[0] +2), 4)}')
                        self.probabilities[cls][word] = np.round((df_bag_cls[word].sum() + 1)/(df_bag_cls.shape[0] +2), 4)
                        self.str_probabilities[cls][word] = f'{df_bag_cls[word].sum() + 1}/{df_bag_cls.shape[0] +2}'
                    else:
                        print(f'\np({word}|y={cls}) = {df_bag_cls[word].sum()}/{df_bag_cls.shape[0]}',
                              f'= {np.round((df_bag_cls[word].sum())/(df_bag_cls.shape[0]),4)}')
                        self.probabilities[cls][word] = np.round((df_bag_cls[word].sum())/(df_bag_cls.shape[0]),4)
                        self.str_probabilities[cls][word] = f'{df_bag_cls[word].sum()}/{df_bag_cls.shape[0]}'
                else:                    
                    if self.smoothing:
                        print(f'\np({word}|y={cls}) = ({df_bag_cls[word].sum()} + 1)/({len(words_cls)} + {len(self.vocab)})',
                              f'= {df_bag_cls[word].sum() + 1}/{len(words_cls) +len(self.vocab)}',
                              f'= {np.round((df_bag_cls[word].sum() + 1)/(len(words_cls) +len(self.vocab)), 4)}')
                        self.probabilities[cls][word] = np.round((df_bag_cls[word].sum() + 1)/(len(words_cls) +len(self.vocab)), 4)
                        self.str_probabilities[cls][word] = f'{df_bag_cls[word].sum() + 1}/{len(words_cls) +len(self.vocab)}'
                    else:
                        print(f'\np({word}|y={cls}) = {df_bag_cls[word].sum()}/{len(words_cls)}',
                              f'= {np.round((df_bag_cls[word].sum())/(len(words_cls)),4)}')
                        self.probabilities[cls][word] = np.round((df_bag_cls[word].sum())/(len(words_cls)),4)
                        self.str_probabilities[cls][word] = f'{df_bag_cls[word].sum()}/{len(words_cls)}'
                        
            print(f'\n=====================================================================================\n')
            

## Bag of Words : Multinomial

In [138]:
model = NaiveBayes(model = 'multi', smoothing = True)
model.fit(X=['Chinese Beijing Chinese ', 'Chinese Chinese Shanghai', 'Chinese Macau', 'Tokyo Japan Chinese',
                    ],
                 y=['Yes','Yes','Yes','No'])

Class priors:
 p(y=No)  p(y=Yes)
    0.25      0.75

Data:


Unnamed: 0,chinese,beijing,japan,tokyo,shanghai,macau,Class(Y)
chinese beijing chinese,2,1,0,0,0,0,Yes
chinese chinese shanghai,2,0,0,0,1,0,Yes
chinese macau,1,0,0,0,0,1,Yes
tokyo japan chinese,1,0,1,1,0,0,No




For Y = No


Unnamed: 0,chinese,beijing,japan,tokyo,shanghai,macau,Class(Y)
tokyo japan chinese,1,0,1,1,0,0,No



p(chinese|y=No) = (1 + 1)/(3 + 6) = 2/9 = 0.2222

p(beijing|y=No) = (0 + 1)/(3 + 6) = 1/9 = 0.1111

p(japan|y=No) = (1 + 1)/(3 + 6) = 2/9 = 0.2222

p(tokyo|y=No) = (1 + 1)/(3 + 6) = 2/9 = 0.2222

p(shanghai|y=No) = (0 + 1)/(3 + 6) = 1/9 = 0.1111

p(macau|y=No) = (0 + 1)/(3 + 6) = 1/9 = 0.1111




For Y = Yes


Unnamed: 0,chinese,beijing,japan,tokyo,shanghai,macau,Class(Y)
chinese beijing chinese,2,1,0,0,0,0,Yes
chinese chinese shanghai,2,0,0,0,1,0,Yes
chinese macau,1,0,0,0,0,1,Yes



p(chinese|y=Yes) = (5 + 1)/(8 + 6) = 6/14 = 0.4286

p(beijing|y=Yes) = (1 + 1)/(8 + 6) = 2/14 = 0.1429

p(japan|y=Yes) = (0 + 1)/(8 + 6) = 1/14 = 0.0714

p(tokyo|y=Yes) = (0 + 1)/(8 + 6) = 1/14 = 0.0714

p(shanghai|y=Yes) = (1 + 1)/(8 + 6) = 2/14 = 0.1429

p(macau|y=Yes) = (1 + 1)/(8 + 6) = 2/14 = 0.1429




In [139]:
model.predict('Chinese Chinese Chinese Tokyo Japan')

Test Document, D = Chinese Chinese Chinese Tokyo Japan

Using


P(C_{k}|D)∝ P(X|C_{k})P(C_{k}))

P(C_{k}|D)∝ P(C_{k})∏_{j=1}^{len(X)} P(u_j|C_k)

where u - each word in test document X

P(y=No|X) ∝ p(y=No)*P(chinese|y=No)*P(chinese|y=No)*P(chinese|y=No)*P(tokyo|y=No)*P(japan|y=No)
P(y=No|X) ∝ 0.25*2/9*2/9*2/9*2/9*2/9
= 0.0001354129756629241

P(y=Yes|X) ∝ p(y=Yes)*P(chinese|y=Yes)*P(chinese|y=Yes)*P(chinese|y=Yes)*P(tokyo|y=Yes)*P(japan|y=Yes)
P(y=Yes|X) ∝ 0.75*6/14*6/14*6/14*1/14*1/14
= 0.0003010330557273464

The test document D belongs to class y = Yes


## Bag of Words : Bernoulli

In [194]:
model = NaiveBayes(model = 'bernoulli', smoothing = True)
model.fit(X=['Chinese Beijing Chinese ', 'Chinese Chinese Shanghai', 'Chinese Macau', 'Tokyo Japan Chinese',
                    ],
                 y=['Yes','Yes','Yes','No'])

Class priors:
 p(y=No)  p(y=Yes)
    0.25      0.75

Data:


Unnamed: 0,Shanghai,Beijing,Macau,Chinese,Japan,Tokyo,Class(Y)
Chinese Beijing Chinese,0,1,0,1,0,0,Yes
Chinese Chinese Shanghai,1,0,0,1,0,0,Yes
Chinese Macau,0,0,1,1,0,0,Yes
Tokyo Japan Chinese,0,0,0,1,1,1,No




For Y = No


Unnamed: 0,Shanghai,Beijing,Macau,Chinese,Japan,Tokyo,Class(Y)
Tokyo Japan Chinese,0,0,0,1,1,1,No



p(Shanghai|y=No) = (0 + 1)/(1 + 2) = 1/3 = 0.3333

p(Beijing|y=No) = (0 + 1)/(1 + 2) = 1/3 = 0.3333

p(Macau|y=No) = (0 + 1)/(1 + 2) = 1/3 = 0.3333

p(Chinese|y=No) = (1 + 1)/(1 + 2) = 2/3 = 0.6667

p(Japan|y=No) = (1 + 1)/(1 + 2) = 2/3 = 0.6667

p(Tokyo|y=No) = (1 + 1)/(1 + 2) = 2/3 = 0.6667




For Y = Yes


Unnamed: 0,Shanghai,Beijing,Macau,Chinese,Japan,Tokyo,Class(Y)
Chinese Beijing Chinese,0,1,0,1,0,0,Yes
Chinese Chinese Shanghai,1,0,0,1,0,0,Yes
Chinese Macau,0,0,1,1,0,0,Yes



p(Shanghai|y=Yes) = (1 + 1)/(3 + 2) = 2/5 = 0.4

p(Beijing|y=Yes) = (1 + 1)/(3 + 2) = 2/5 = 0.4

p(Macau|y=Yes) = (1 + 1)/(3 + 2) = 2/5 = 0.4

p(Chinese|y=Yes) = (3 + 1)/(3 + 2) = 4/5 = 0.8

p(Japan|y=Yes) = (0 + 1)/(3 + 2) = 1/5 = 0.2

p(Tokyo|y=Yes) = (0 + 1)/(3 + 2) = 1/5 = 0.2




In [195]:
model.predict('Chinese Chinese Chinese Tokyo Japan')

Test Document, D = Chinese Chinese Chinese Tokyo Japan

Using


P(C_{k}|b)∝ P(X|C_{k})P(C_{k}))

P(C_{k}|b)∝ P(C_{k})∏_{t=1}^{|V|} [b_tP(w_t|C_k) + (1-b_t)(1-P(w_t|C_k))]

where b = feature vector of document D 
bₜ = {0,1} => abscence or presence of word wₜ in the document

b = [0, 0, 0, 1, 1, 1]

P(y=No|X) ∝ p(y=No)*(1 - P(Shanghai|y=No))*(1 - P(Beijing|y=No))*(1 - P(Macau|y=No))*P(Chinese|y=No)*P(Japan|y=No)*P(Tokyo|y=No)
P(y=No|b) ∝ 0.25*(1 - 1/3)*(1 - 1/3)*(1 - 1/3)*2/3*2/3*2/3
= 0.021954458984965713

P(y=Yes|X) ∝ p(y=Yes)*(1 - P(Shanghai|y=Yes))*(1 - P(Beijing|y=Yes))*(1 - P(Macau|y=Yes))*P(Chinese|y=Yes)*P(Japan|y=Yes)*P(Tokyo|y=Yes)
P(y=Yes|b) ∝ 0.75*(1 - 2/5)*(1 - 2/5)*(1 - 2/5)*4/5*1/5*1/5
= 0.005184

The test document D belongs to class y = No


## Categorize Data using Naive Bayes

In [135]:
df = pd.DataFrame({'Study Hours' : [4.5, 7, 2, 4, 2.5, 3, 8.3, 8, 9], 
                   'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male'],
                  'Result': ['Pass', 'Pass', 'Fail', 'Fail', 'Fail', 'Fail', 'Fail', 'Pass', 'Pass']})
model = NaiveBayes()
model.fit(X=df)

Class priors:
 p(y=Fail)  p(y=Pass)
  0.555556   0.444444

Data:


Unnamed: 0,Study Hours,Gender,Result
0,4.5,Male,Pass
1,7.0,Female,Pass
2,2.0,Male,Fail
3,4.0,Female,Fail
4,2.5,Male,Fail
5,3.0,Female,Fail
6,8.3,Male,Fail
7,8.0,Female,Pass
8,9.0,Male,Pass



Feature: Study Hours
Study Hours is continuous

Using:


Eq(P(X_{i}=x|Y=y_{k}), e**(-(x-μ_{ik})^2/2σ_{ik}^2)/sqrt(2πσ_{ik}^2))


Mean Variance Table Table:


Unnamed: 0_level_0,Result=Fail,Result=Pass
Study Hours,Unnamed: 1_level_1,Unnamed: 2_level_1
mean,3.96,7.125
variance,6.433,3.729167



Feature: Gender
Gender is categorical

Using:


Eq(P(X_{i}=x|Y=y_{k}), D(X_{i}=x∩Y=y_{k})/D(Y=y_{k}))

Where D denoted no of items in dataset

Probability Lookup Table:


Unnamed: 0_level_0,Result=Fail,Result=Pass
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,0.4,0.5
Male,0.6,0.5




In [136]:
model.predict({'Study Hours' : 3.5, 'Gender': 'Male'})

Test Data = {'Study Hours': 3.5, 'Gender': 'Male'}

Using


Eq(Y_{new}, argmax_{y_{k}}P(Y=y_{k})∏_{i}P(X_{i}|Y=y_{k}))


P(y=Fail|X) ∝ p(y=Fail)*P(Study Hours = 3.5|y=Fail)*P(Gender = Male|y=Fail)
P(y=Fail|X) ∝ 0.555556*0.1547250702351274*0.6
= 0.05157506467172787

P(y=Pass|X) ∝ p(y=Pass)*P(Study Hours = 3.5|y=Pass)*P(Gender = Male|y=Pass)
P(y=Pass|X) ∝ 0.444444*0.035475873294745566*0.5
= 0.00788351951530495

The test data belongs to class y = Fail
