# 朴素贝叶斯算法实现

通过身高、体重、脚的尺寸来判断性别。

In [1]:
import pandas as pd
import numpy as np


In [2]:
# Training set: one class-label column followed by three numeric feature columns.
data = pd.DataFrame({
    # Class label
    'Gender': ['male', 'male', 'male', 'male', 'female', 'female', 'female', 'female'],
    # Features
    'Height': [6, 5.92, 5.58, 5.92, 5, 5.5, 5.42, 5.75],
    'Weight': [180, 190, 170, 165, 100, 150, 130, 150],
    'FootSize': [12, 11, 12, 10, 6, 8, 7, 9],
})

In [3]:
# Display the training set (all eight rows) as the cell output.
data

Unnamed: 0,Gender,Height,Weight,FootSize
0,male,6.0,180,12
1,male,5.92,190,11
2,male,5.58,170,12
3,male,5.92,165,10
4,female,5.0,100,6
5,female,5.5,150,8
6,female,5.42,130,7
7,female,5.75,150,9


In [4]:
# The sample to classify: a one-row DataFrame holding only the feature
# columns (no 'Gender' label, since that is what we want to predict).
person = pd.DataFrame({
    'Height': [6],
    'Weight': [130],
    'FootSize': [8],
})

In [5]:
# Display the sample that is about to be classified.
person

Unnamed: 0,Height,Weight,FootSize
0,6,130,8


按照上课对朴素贝叶斯的定义来做计算。

In [6]:
# Work on the label column directly; each count below is a count of
# non-null labels selected by a boolean mask.
gender = data['Gender']

# Number of male samples.
n_male = gender[gender == 'male'].count()

# Number of female samples.
n_female = gender[gender == 'female'].count()

# Total number of samples in the training set.
total_ppl = gender.count()

计算两个最简单的概率：男性的先验概率和女性的先验概率。
先验概率是指在不考虑任何特征的情况下，已知数据中男性和女性各自所占的比例。

In [9]:
# Class priors: each class's share of the training samples, computed
# without looking at any feature.
p_male, p_female = n_male / total_ppl, n_female / total_ppl

算两个指标数据，便于接下来的计算。
首先计算所有特征的平均值。

In [10]:
# Mean of every feature within each gender; the result is indexed by
# the gender label, with one column per feature.
data_means = (
    data
    .groupby(by='Gender')
    .mean()
)
data_means

Unnamed: 0_level_0,Height,Weight,FootSize
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,5.4175,132.5,7.5
male,5.855,176.25,11.25


然后，计算所有特征的方差。

In [11]:
# Variance of every feature within each gender, indexed by gender label.
# pandas' groupby .var() defaults to ddof=1, i.e. the sample variance
# (divide by n - 1), not the population variance.
data_variance = (
    data
    .groupby(by='Gender')
    .var()
)
data_variance

Unnamed: 0_level_0,Height,Weight,FootSize
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.097225,558.333333,1.666667
male,0.035033,122.916667,0.916667


获取不同性别不同特征的均值和方差。

In [15]:
# Pull every per-class statistic out as a plain scalar.
# Both summary tables are indexed by gender label, so a label-based
# .loc[row, column] lookup fetches the cell directly; this replaces the
# boolean-mask + .values[0] round trip and yields the same scalar.

# Male: feature means.
male_height_mean = data_means.loc['male', 'Height']
male_weight_mean = data_means.loc['male', 'Weight']
male_footsize_mean = data_means.loc['male', 'FootSize']

# Male: feature variances.
male_height_variance = data_variance.loc['male', 'Height']
male_weight_variance = data_variance.loc['male', 'Weight']
male_footsize_variance = data_variance.loc['male', 'FootSize']

# Female: feature means.
female_height_mean = data_means.loc['female', 'Height']
female_weight_mean = data_means.loc['female', 'Weight']
female_footsize_mean = data_means.loc['female', 'FootSize']

# Female: feature variances.
female_height_variance = data_variance.loc['female', 'Height']
female_weight_variance = data_variance.loc['female', 'Weight']
female_footsize_variance = data_variance.loc['female', 'FootSize']

In [16]:
# Sanity check: print one line per class, listing the three feature
# means followed by the three feature variances.
male_stats = (
    male_height_mean, male_weight_mean, male_footsize_mean,
    male_height_variance, male_weight_variance, male_footsize_variance,
)
female_stats = (
    female_height_mean, female_weight_mean, female_footsize_mean,
    female_height_variance, female_weight_variance, female_footsize_variance,
)

for class_stats in (male_stats, female_stats):
    # Unpacking keeps print's default single-space separator.
    print(*class_stats)

5.855 176.25 11.25 0.0350333333333333 122.91666666666667 0.9166666666666665
5.4175 132.5 7.5 0.09722499999999994 558.3333333333334 1.6666666666666667


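通过平均值和方差可以计算 $P(x|y)$ 的概率。注意这里计算的是 $P(x|y)$，而不是 $P(y|x)$。

假设每个特征在给定性别的条件下都服从高斯分布，所以使用下面的公式：

$$P(x|y) = \frac{1}{\sqrt{2\pi\sigma_y^2}} \exp\left(-\frac{(x-\mu_y)^2}{2\sigma_y^2}\right)$$

其中 $\mu_y$ 和 $\sigma_y^2$ 分别是类别 $y$ 下该特征的均值和方差。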

In [17]:
def p_x_given_y(x, mean_y, variance_y):
    """Evaluate the Gaussian density of ``x`` for one class.

    Args:
        x: Observed feature value.
        mean_y: Mean of the feature within the class.
        variance_y: Variance of the feature within the class.

    Returns:
        The value of the normal probability density function with the
        given mean and variance at ``x``. This is a density, so it can
        exceed 1 when the variance is small.
    """
    # Normalising constant 1 / sqrt(2 * pi * sigma^2).
    normaliser = 1 / (np.sqrt(2 * np.pi * variance_y))
    # Exponent -(x - mu)^2 / (2 * sigma^2).
    exponent = (-((x - mean_y) ** 2)) / (2 * variance_y)
    return normaliser * np.exp(exponent)

待预测样本是男性的后验概率（未归一化，即先验概率乘以各特征的似然）为：

In [24]:
# Unnormalised posterior score for the "male" class: the class prior
# multiplied by the Gaussian density of each feature of the sample
# (row label 0 of `person`), treating the features as independent.
isMale = (
    p_male
    * p_x_given_y(person.loc[0, 'Height'], male_height_mean, male_height_variance)
    * p_x_given_y(person.loc[0, 'Weight'], male_weight_mean, male_weight_variance)
    * p_x_given_y(person.loc[0, 'FootSize'], male_footsize_mean, male_footsize_variance)
)

待预测样本是女性的后验概率（未归一化，即先验概率乘以各特征的似然）为：

In [25]:
# Unnormalised posterior score for the "female" class: the class prior
# multiplied by the Gaussian density of each feature of the sample
# (row label 0 of `person`), treating the features as independent.
isFemale = (
    p_female
    * p_x_given_y(person.loc[0, 'Height'], female_height_mean, female_height_variance)
    * p_x_given_y(person.loc[0, 'Weight'], female_weight_mean, female_weight_variance)
    * p_x_given_y(person.loc[0, 'FootSize'], female_footsize_mean, female_footsize_variance)
)

比较两个值的大小就知道预测的结果了。

In [26]:
# Pick the class with the larger score. The two strict comparisons are
# kept in this order so that a tie (or a NaN score, for which both
# comparisons are False) falls through to the "can't judge" message.
if isMale > isFemale:
    verdict = "This person probable is a male."
elif isMale < isFemale:
    verdict = "This person probable is a female."
else:
    verdict = "The program can't judge gender."

print(verdict)

This person probable is a female.
