# ナイーブベイズ推定器の評価関数
（正規化項は省略、事前分布に一様分布を仮定）

$$
\begin{align}
    P(y|x_1, \dots, x_m)\
        &\propto P(x_1, \dots, x_m|y)\\
        &= \prod_{i=1}^{m} P(x_i|y) \\
    \log P(y|x_1, \dots, x_m)\
        &\propto \sum_{i=1}^{m} \log P(x_i|y)\\
        &=\sum_{i \in \text{categorical}}\log P(x_i|y)\
            + \sum_{i \in \text{numerical}}\log P(x_i|y) \\
        &=\sum_{i \in \text{categorical}} \left\{ \
                \log c(x_i|y) - \log c(y) \
            \right\} \
            + \sum_{i \in \text{numerical}}\log X(x_i|y)
\end{align}
$$
$c$は出現回数、$X$は正規分布に従う

In [1]:
import seaborn
import pandas as pd
import pandas_ml
import numpy as np
from scipy.stats import norm

from sklearn.metrics import confusion_matrix

pd.options.display.max_rows = 8

In [2]:
class HbkNaiveBayes:
    def __init__(self, model_frame, numerical=None):
        if not isinstance(model_frame, pandas_ml.core.frame.ModelFrame):
            raise ValueError()

        self._target_name = model_frame.target.name
        self._class_labels = model_frame.target.unique()
        
        self._numerical = numerical
            
        if not numerical is None:
            self._categorical = set(model_frame.data.columns) - set(numerical) 
        else:
            self._categorical = set(model_frame.data.columns)
            
    def train(self, train_mf):
        if not isinstance(train_mf, pandas_ml.core.frame.ModelFrame):
            raise ValueError()
            
        self._train_mf = train_mf
        
    def _condition(self, col_name, value): # -> list of boolean
        return self._train_mf[col_name] == value
        
    def _estimate_by_single_row(self, row):
        def eval_func(class_label, row):
            cl = class_label
            
            eval_value = 0.0
            for c in self._categorical:
                eval_value += np.log(
                    (self._condition(self._target_name, cl) & self._condition(c, row[c])).sum() + 1
                )
                eval_value -= np.log(
                    (self._condition(self._target_name, cl)).sum() + 1
                )
                
            for n in self._numerical:
                cond = self._condition(self._target_name, cl)
                mu = self._train_mf[cond][n].mean()
                sigma = self._train_mf[cond][n].std()
                eval_value += norm.logpdf(row[n], loc=mu, scale=sigma)
                
            return eval_value
            
        eval_values = np.array([eval_func(i, row) for i in self._class_labels])
        return self._class_labels[eval_values.argmax()]
    
    def estimate(self, test):
        return test.apply(self._estimate_by_single_row, axis=1)      

In [3]:
df = seaborn.load_dataset('titanic')
mf = pandas_ml.ModelFrame(df, target='survived')
train, test = mf.model_selection.train_test_split(test_size=0.3)

In [4]:
train

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
497,0,3,male,,0,0,15.1,S,Third,man,True,,Southampton,no,True
111,0,3,female,14.5,1,0,14.4542,C,Third,child,False,,Cherbourg,no,False
480,0,3,male,9,5,2,46.9,S,Third,child,False,,Southampton,no,False
870,0,3,male,26,0,0,7.8958,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,0,1,male,,0,0,27.7208,C,First,man,True,,Cherbourg,no,True
403,0,3,male,28,1,0,15.85,S,Third,man,True,,Southampton,no,False
416,1,2,female,34,1,1,32.5,S,Second,woman,False,,Southampton,yes,False
394,1,3,female,24,0,2,16.7,S,Third,woman,False,G,Southampton,yes,False


In [5]:
hbk = HbkNaiveBayes(mf, numerical=['age', 'fare'])
hbk.train(train)

In [6]:
est = hbk.estimate(test)
est

601    0
264    0
198    0
514    0
      ..
160    0
186    0
33     0
751    1
Length: 268, dtype: int64

In [7]:
test['survived']

601    0
264    0
198    1
514    0
      ..
160    0
186    1
33     0
751    1
Name: survived, Length: 268, dtype: int64

In [8]:
confusion_matrix(list(test['survived']), list(est))

array([[156,   3],
       [ 13,  96]])