### 나이브 베이즈

참고 사이트  
: https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB  
: https://datascienceschool.net/view-notebook/c19b48e3c7b048668f2bb0a113bd25f7/

scikit-learn에 구현된 나이브 베이즈 분류기
- GaussianNB : 정규분포 나이브 베이즈 (연속 데이터)
- BernoulliNB : 베르누이분포 나이브 베이즈 (이진 데이터)
- MultinomialNB : 다항분포 나이브 베이즈 (카운트 데이터)

#### GaussianNB 

In [1]:
import pandas as pd
from sklearn import preprocessing
import warnings

# Silence every warning for the rest of the session. This keeps the notebook
# output clean, but it also hides library deprecation and data-conversion
# warnings raised by the cells below.
warnings.simplefilter('ignore')

In [2]:
# Load the pickled watermelon dataset (path is relative to the working directory).
# NOTE: unpickling can execute arbitrary code; only load pickle files that come
# from a trusted source.
water_melon_data_3 = pd.read_pickle('water_melon_data_3')

In [3]:
# Number of rows (samples) in the dataset.
len(water_melon_data_3)

17

In [4]:
# Columns that hold string categories and must be turned into integer codes.
categorical_columns = ['color', 'tap_shape', 'sound', 'stripe',
                       'navel_shape', 'texture', 'ripe']

# Fit one LabelEncoder per column. Refitting a single shared encoder would
# overwrite its classes_ on every call, leaving no way to map the integer
# codes back to the original strings (inverse_transform) afterwards.
encoders = {
    column: preprocessing.LabelEncoder().fit(water_melon_data_3[column])
    for column in categorical_columns
}

# Convert the string labels to integer codes.
color = encoders['color'].transform(water_melon_data_3['color'])
tap_shape = encoders['tap_shape'].transform(water_melon_data_3['tap_shape'])
sound = encoders['sound'].transform(water_melon_data_3['sound'])
stripe = encoders['stripe'].transform(water_melon_data_3['stripe'])
navel_shape = encoders['navel_shape'].transform(water_melon_data_3['navel_shape'])
texture = encoders['texture'].transform(water_melon_data_3['texture'])
ripe = encoders['ripe'].transform(water_melon_data_3['ripe'])

# Keep the `le` name available for later cells; it is the encoder fitted on
# the 'ripe' column, which is the state it has always ended up in.
le = encoders['ripe']

In [5]:
# The two numeric columns are taken straight from the frame, without encoding.
density, sweet = (water_melon_data_3[name] for name in ('density', 'sweet'))

In [6]:
# Replace each categorical column in the frame with its integer-encoded values.
encoded_columns = {
    'color': color,
    'tap_shape': tap_shape,
    'sound': sound,
    'stripe': stripe,
    'navel_shape': navel_shape,
    'texture': texture,
    'ripe': ripe,
}
for column_name, codes in encoded_columns.items():
    water_melon_data_3[column_name] = codes

# Preview the first rows to confirm the replacement.
water_melon_data_3.head()

Unnamed: 0,num,color,tap_shape,sound,stripe,navel_shape,texture,density,sweet,ripe
0,1,2,1,2,0,1,0,0.697,0.46,1
1,2,1,1,0,0,1,0,0.774,0.376,1
2,3,1,1,2,0,1,0,0.634,0.264,1
3,4,2,1,0,0,1,0,0.608,0.318,1
4,5,0,1,2,0,1,0,0.556,0.215,1


In [7]:
# Target vector for the classifier.
# NOTE(review): when the cells run in order, the 'ripe' column already holds
# integer codes at this point, so this refit presumably maps each code to
# itself; it also re-fits `le` on the integer values. Confirm before removing.
label=le.fit_transform(water_melon_data_3['ripe'])

In [8]:
# Per-column value sequences, in the feature order fed to the model.
per_column_values = (
    color, tap_shape, sound, stripe,
    navel_shape, texture, density, sweet,
)

# Transpose them into one tuple per sample.
features = list(zip(*per_column_values))

In [9]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

# GaussianNB() is built with its defaults: priors=None, var_smoothing=1e-09.
# fit() returns the estimator itself, so construction and training are chained.
model = GaussianNB().fit(features, label)

# A single sample in the same feature order as `features`
# (color, tap_shape, sound, stripe, navel_shape, texture, density, sweet).
sample = [2, 1, 2, 0, 1, 0, 0.697, 0.46]

# The class labels are 0 and 1 (see model.classes_). The original note mapped
# 0 to "not ripe", so 1 is presumably "ripe" -- confirm against the encoder.
predicted = model.predict([sample])

In [10]:
# Prediction for the single sample and the fitted class statistics.
print('predicted', predicted)
print('model.classes_', model.classes_)
print('model.class_count_', model.class_count_)
print('model.class_prior_', model.class_prior_)

# theta_: per-feature mean (mu) of the Gaussian fitted for each class.
# var_ / sigma_: per-feature variance (sigma^2) of that Gaussian.
# scikit-learn renamed `sigma_` to `var_` (deprecated in 1.0, removed in 1.2),
# so prefer `var_` and fall back to `sigma_` only on older releases. The
# printed label is kept unchanged so it matches previously recorded output.
class_variances = model.var_ if hasattr(model, 'var_') else model.sigma_
print('model.theta_[0]', model.theta_[0])
print('model.sigma_[0]', class_variances[0])

predicted [1]
model.classes_ [0 1]
model.class_count_ [9. 8.]
model.class_prior_ [0.52941176 0.47058824]
model.theta_[0] [0.88888889 1.22222222 1.11111111 1.11111111 1.11111111 0.22222222
 0.49611111 0.15422222]
model.sigma_[0] [0.7654321  0.61728395 0.7654321  0.54320988 0.7654321  0.17283951
 0.03370254 0.01032862]


In [11]:
from sklearn.model_selection import cross_val_score,KFold

# Feature columns used as model input; 'ripe' is the target.
feature_names = ['color', 'tap_shape', 'sound', 'stripe',
                 'navel_shape', 'texture', 'density', 'sweet']

gnb = GaussianNB(var_smoothing=1e-09)

# One fold per sample (leave-one-out). The fold count is derived from the
# data instead of being hard-coded, so it stays valid if rows are added.
kfold = KFold(n_splits=len(water_melon_data_3))

# Pass the target as a 1-D Series: a one-column DataFrame makes scikit-learn
# emit a DataConversionWarning ("column-vector y was passed").
scores = cross_val_score(
    gnb,
    water_melon_data_3[feature_names],
    water_melon_data_3['ripe'],
    cv=kfold,
)

# Per-fold accuracy and its mean.
scores, scores.mean()

(array([1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1.]),
 0.8235294117647058)

In [12]:
# Mean cross-validation accuracy of GaussianNB for different var_smoothing values:
# var_smoothing=1e-09 : 0.8235...
# var_smoothing=1e-01 : 0.6470...

In [13]:
# Feature columns used as model input; 'ripe' is the target.
feature_names = ['color', 'tap_shape', 'sound', 'stripe',
                 'navel_shape', 'texture', 'density', 'sweet']

# NOTE(review): BernoulliNB models binary features and, with its default
# binarize=0.0, thresholds every input at 0. Multi-valued label codes and the
# numeric density/sweet columns therefore lose most of their information;
# treat this score as a rough baseline only.
bnb = BernoulliNB()

# One fold per sample (leave-one-out), derived from the data size.
kfold = KFold(n_splits=len(water_melon_data_3))

# Pass the target as a 1-D Series: a one-column DataFrame makes scikit-learn
# emit a DataConversionWarning ("column-vector y was passed").
scores = cross_val_score(
    bnb,
    water_melon_data_3[feature_names],
    water_melon_data_3['ripe'],
    cv=kfold,
)

# Per-fold accuracy and its mean.
scores, scores.mean()

(array([1., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1.]),
 0.7647058823529411)

In [14]:
# Feature columns used as model input; 'ripe' is the target.
feature_names = ['color', 'tap_shape', 'sound', 'stripe',
                 'navel_shape', 'texture', 'density', 'sweet']

# NOTE(review): MultinomialNB is meant for count data. Here it receives
# arbitrary label codes and two numeric measurements, so the magnitudes it
# treats as counts carry no such meaning; treat this score as a rough
# baseline only.
mnb = MultinomialNB()

# One fold per sample (leave-one-out), derived from the data size.
kfold = KFold(n_splits=len(water_melon_data_3))

# Pass the target as a 1-D Series: a one-column DataFrame makes scikit-learn
# emit a DataConversionWarning ("column-vector y was passed").
scores = cross_val_score(
    mnb,
    water_melon_data_3[feature_names],
    water_melon_data_3['ripe'],
    cv=kfold,
)

# Per-fold accuracy and its mean.
scores, scores.mean()

(array([1., 0., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 0., 1., 0., 1., 1.]),
 0.7058823529411765)