# 単純ベイズ分類器

In [91]:
import pprint
import numpy as np
from fractions import Fraction

# Number of top-ranked items to keep (top K)
TOP_K = 3
# Smoothing parameter
ALPHA = 1
# Number of classes
N = 2
# Number of distinct values each feature can take
M = [2, 2, 2, 2, 2, 2]
# Threshold
THETA = 0.5

# One row per item: six feature columns followed by the rating column.
# np.nan in the last column marks a row without a rating.
Du = np.array([
    [1, 0, 0, 0, 1, 0, 1],
    [0, 1, 0, 0, 1, 0, 1],
    [1, 1, 0, 0, 1, 0, 1],
    [1, 0, 0, 1, 1, 0, 1],
    [1, 0, 0, 0, 0, 1, 1],
    [0, 1, 0, 1, 0, 1, 1],
    [0, 0, 1, 0, 1, 0, -1],
    [0, 0, 1, 1, 1, 0, -1],
    [0, 1, 0, 0, 1, 1, -1],
    [0, 0, 1, 0, 0, 1, -1],
    [1, 1, 0, 1, 1, 0, np.nan],
    [0, 0, 1, 0, 1, 1, np.nan],
    [0, 1, 1, 1, 1, 0, np.nan],
])
# Row indices, the feature columns, and the rating column.
I = np.arange(len(Du))
x, ru = Du[:, :-1], Du[:, -1]

# Split the row indices by whether the rating is present (not NaN).
Iu = I[np.logical_not(np.isnan(ru))]
Iu_not = I[np.isnan(ru)]
# Rated rows (suffix L) and unrated rows (suffix U).
DuL, xL, ruL = Du[Iu], x[Iu], ru[Iu]
DuU, xU = Du[Iu_not], x[Iu_not]

In [92]:
# Prior probability of class r = +1: the share of rated rows whose rating is +1
den = len(ruL)
num = np.count_nonzero(ruL == 1)
pR = num / den
pR

0.6

In [145]:
# Prior probability of class r = -1: the share of rated rows whose rating is -1
den = len(ruL)
num = np.count_nonzero(ruL == -1)
nR = num / den
nR

0.4

In [126]:
# Preference prediction under the naive Bayes assumption (no smoothing).
# For every unrated row i and each class r this prints
#     P(R=r) * prod_k P(X_k = x[i, k] | R=r)
# where both factors are plain relative frequencies over the rated rows.
# NOTE(review): the printed label names only the last feature index and reads
# like a single conditional probability, while the value is the product above
# (prior included). The label text is kept unchanged so the output stays
# comparable with earlier runs.
n_features = x.shape[1]
last_k = n_features - 1
for i in Iu_not:
    for r in (1, -1):
        class_rows = xL[ruL == r]
        n_class = len(class_rows)
        R = n_class / ruL.size
        # Per feature: fraction of class-r rows whose value equals row i's value.
        p = np.array([
            np.count_nonzero(class_rows[:, k] == x[i, k]) / n_class
            for k in range(n_features)
        ])
        print('P(X{}=x{},{}|R={:+}) = {}'.format(last_k, i, last_k, r, R*np.prod(p)))

P(X5=x10,5|R=+1) = 0.029629629629629627
P(X5=x10,5|R=-1) = 0.0
P(X5=x11,5|R=+1) = 0.0
P(X5=x11,5|R=-1) = 0.06328125
P(X5=x12,5|R=+1) = 0.0
P(X5=x12,5|R=-1) = 0.00703125


In [127]:
# Same prediction with Laplace smoothing.
# Prior:      (class count + ALPHA) / (rated count + ALPHA * N)
# Likelihood: (matching count + ALPHA) / (class count + ALPHA * M[k])
# NOTE(review): the printed label names only the last feature index and reads
# like a single conditional probability, while the value is the smoothed prior
# times the product of all smoothed likelihoods. The label text is kept
# unchanged so the output stays comparable with earlier runs.
n_features = x.shape[1]
last_k = n_features - 1
for i in Iu_not:
    for r in (1, -1):
        class_rows = xL[ruL == r]
        n_class = len(class_rows)
        R = (n_class + ALPHA) / (ruL.size + ALPHA * N)
        p = np.array([
            (np.count_nonzero(class_rows[:, k] == x[i, k]) + ALPHA)
            / (n_class + ALPHA * M[k])
            for k in range(n_features)
        ])
        print('P(X{}=x{},{}|R={:+}) = {}'.format(last_k, i, last_k, r, R*np.prod(p)))

P(X5=x10,5|R=+1) = 0.023365020751953125
P(X5=x10,5|R=-1) = 0.0008573388203017832
P(X5=x11,5|R=+1) = 0.002002716064453125
P(X5=x11,5|R=-1) = 0.034293552812071325
P(X5=x12,5|R=+1) = 0.002002716064453125
P(X5=x12,5|R=-1) = 0.008573388203017831


# 推薦
スコア関数

$$
\mathrm{score}(u, i) = \frac{P(R = +1) \prod_{k=1}^{d} P(X_{k} = x_{i,k} \mid R = +1)}{P(R = +1) \prod_{k=1}^{d} P(X_{k} = x_{i,k} \mid R = +1) + P(R = -1) \prod_{k=1}^{d} P(X_{k} = x_{i,k} \mid R = -1)}
$$

In [141]:
def _smoothed_likelihoods(class_rows, item):
    """Return the Laplace-smoothed P(X_k = item[k] | class) for every feature k.

    class_rows holds the feature rows of one class; item is one feature row.
    """
    n_class = len(class_rows)
    return np.array([
        (np.count_nonzero(class_rows[:, k] == item[k]) + ALPHA)
        / (n_class + ALPHA * M[k])
        for k in range(len(item))
    ])


# Feature rows of the rated items, split by rating.
liked_rows = xL[ruL == 1]
disliked_rows = xL[ruL == -1]

# Laplace-smoothed class priors.
pR = (len(liked_rows) + ALPHA) / (ruL.size + ALPHA * N)
nR = (len(disliked_rows) + ALPHA) / (ruL.size + ALPHA * N)

# score = joint(+1) / (joint(+1) + joint(-1)) for every unrated row,
# where joint(r) = prior(r) * product of the smoothed likelihoods.
scores = {}
for i in Iu_not:
    joint_pos = pR * np.prod(_smoothed_likelihoods(liked_rows, x[i]))
    joint_neg = nR * np.prod(_smoothed_likelihoods(disliked_rows, x[i]))
    scores[i] = joint_pos / (joint_pos + joint_neg)

In [142]:
# Display the score computed for every unrated row (keyed by row index).
scores

{10: 0.9646054787625311, 11: 0.055176912846500135, 12: 0.18936236007174226}

In [143]:
# Drop every item whose score is below THETA, then keep the TOP_K best
# remaining items, highest score first.
scores = {item: score for item, score in scores.items() if score >= THETA}
ranked = sorted(scores, key=scores.get, reverse=True)
rec_list = {item: scores[item] for item in ranked[:TOP_K]}

In [144]:
# Display the final recommendation list (row index -> score).
rec_list

{10: 0.9646054787625311}