# 朴素贝叶斯分类器

In [94]:
import numpy as np
import pandas as pd

In [95]:
data = pd.read_csv('西瓜数据集.csv',engine='python') # 由于数据集中包含中文，而python3是支持中文的，只是需要指定engine为python 

In [96]:
data

Unnamed: 0,色泽,根蒂,敲声,纹理,脐部,触感,密度,含糖率,好瓜
0,青绿,蜷缩,浊响,清晰,凹陷,硬滑,0.697,0.46,是
1,乌黑,蜷缩,沉闷,清晰,凹陷,硬滑,0.774,0.376,是
2,乌黑,蜷缩,浊响,清晰,凹陷,硬滑,0.634,0.264,是
3,青绿,蜷缩,沉闷,清晰,凹陷,硬滑,0.608,0.318,是
4,浅白,蜷缩,浊响,清晰,凹陷,硬滑,0.556,0.215,是
5,青绿,蜷缩,浊响,清晰,稍凹,软粘,0.403,0.237,是
6,乌黑,稍蜷,浊响,稍糊,稍凹,软粘,0.481,0.149,是
7,乌黑,稍蜷,浊响,清晰,稍凹,硬滑,0.437,0.211,是
8,乌黑,稍蜷,沉闷,稍糊,稍凹,硬滑,0.666,0.091,否
9,青绿,硬挺,清脆,清晰,平坦,软粘,0.243,0.267,否


### 测试样本 
No.17：$x_t$ = [青绿 蜷缩 浊响 清晰 凹陷 硬滑 0.697 0.460 ？]

### 属性编码
这里我们首先对离散特征进行编码

In [97]:
print(data['色泽'].unique())
print(data['根蒂'].unique())
print(data['敲声'].unique())
print(data['脐部'].unique())
print(data['触感'].unique())

['青绿' '乌黑' '浅白']
['蜷缩' '稍蜷' '硬挺']
['浊响' '沉闷' '清脆']
['凹陷' '稍凹' '平坦']
['硬滑' '软粘']


In [100]:
features = ['色泽','根蒂','敲声','脐部','触感','纹理']
for i in features:
    for j,k in enumerate(data[i].unique()):
        data[i].replace(k,j,inplace=True)

In [101]:
data

Unnamed: 0,色泽,根蒂,敲声,纹理,脐部,触感,密度,含糖率,好瓜
0,0,0,0,0,0,0,0.697,0.46,是
1,1,0,1,0,0,0,0.774,0.376,是
2,1,0,0,0,0,0,0.634,0.264,是
3,0,0,1,0,0,0,0.608,0.318,是
4,2,0,0,0,0,0,0.556,0.215,是
5,0,0,0,0,1,1,0.403,0.237,是
6,1,1,0,1,1,1,0.481,0.149,是
7,1,1,0,0,1,0,0.437,0.211,是
8,1,1,1,1,1,0,0.666,0.091,否
9,0,2,2,0,2,1,0.243,0.267,否


In [106]:
test= data.loc[17] # 最后一个样本为测试用例
test['色泽']

0

### 计算测试样本是好瓜的概率
即计算$$P(y\ |\ x) = \frac{P(x\ |\ y)P(y)}{P(x)}，$$朴素贝叶斯假设属性之间相互独立，则有：  
$$P(y\ |\ x) = \frac{P(y)}{P(x)}\prod_{i=1}^dP(x_i\ |\ c)$$

### 按类别划分数据集

In [135]:
good_num = np.sum(data['好瓜'] == '是') # 好瓜数量统计
bad_num = np.sum(data['好瓜'] == '否') # 坏瓜数量统计
class_pro = good_num / (len(data) - 1)
joint_pro = [0] * 6
feature_pro = [0] * 6 # 属性条件概率统计
for index,i in enumerate(data.columns[:-3]):
    for j in range(len(data)):
        if (data.loc[j][i] == test[i]): 
                joint_pro[index] += 1
                if data.loc[j]['好瓜'] == '是': # 离散变量
                    feature_pro[index] += 1
print(joint_pro)       
print(feature_pro)

feature_pro = feature_pro / good_num # 利用python广播计算属性条件概率
joint_pro = np.array(joint_pro) / (len(data) - 1)

[7, 10, 11, 10, 8, 13]
[3, 6, 6, 7, 5, 6]


### 计算$P(c\ |\ x)$
$P(c\ |\ x) = \frac{P(x\ |\ c)P(c)}{P(x)}$

In [136]:
from functools import reduce
def prod(x,y):
    return x * y

feature_pro = reduce(prod,feature_pro)
joint_pro = reduce(prod,joint_pro)

In [137]:
feature_pro * class_pro / joint_pro

1.2271952275629645

In [138]:
class_pro = good_num / (len(data) - 1)
joint_pro = [0] * 6
feature_pro = [0] * 6 # 属性条件概率统计
for index,i in enumerate(data.columns[:-3]):
    for j in range(len(data)):
        if (data.loc[j][i] == test[i]): 
                joint_pro[index] += 1
                if data.loc[j]['好瓜'] == '否': # 离散变量
                    feature_pro[index] += 1
print(joint_pro)       
print(feature_pro)

feature_pro = feature_pro / good_num # 利用python广播计算属性条件概率
joint_pro = np.array(joint_pro) / (len(data) - 1)

[7, 10, 11, 10, 8, 13]
[3, 3, 4, 2, 2, 6]


In [139]:
feature_pro = reduce(prod,feature_pro)
joint_pro = reduce(prod,joint_pro)

In [140]:
feature_pro * class_pro / joint_pro

0.04675029438335103