In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
#    导入sklearn自带的数据集
from sklearn.datasets import load_iris
#    导入数学函数库
import math

In [2]:
#    Load the iris dataset and unpack its feature matrix and target labels
iris = load_iris()
X, y = iris.data, iris.target

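该数据集的特征都是**连续值**，因此可以假设在给定类别 $c_k$ 的条件下，每一个特征都服从**高斯分布**，并用高斯分布的概率密度函数求出条件概率密度：

$$P(x^{(l)} \mid y = c_k) = \frac{1}{\sqrt{2\pi\sigma_{k,l}^{2}}}\exp\left(-\frac{(x^{(l)}-\mu_{k,l})^{2}}{2\sigma_{k,l}^{2}}\right)$$

其中 $\mu_{k,l}$ 和 $\sigma_{k,l}^{2}$ 分别是训练集中类别 $c_k$ 的第 $l$ 个特征的均值和方差。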

In [3]:
#    Split into training and test sets (40% held out for testing, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=5,
)

In [4]:
from collections import Counter
#    Distinct class labels that occur in the training set
ck = set(y_train)
#    Prior probability of each class = its relative frequency in y_train.
#    The labels are counted once up front instead of rebuilding the Counter
#    for every class.
label_counts = Counter(y_train)
prior_probability = {label: label_counts[label] / len(y_train) for label in ck}
prior_probability

{0: 0.3333333333333333, 1: 0.32222222222222224, 2: 0.34444444444444444}

In [5]:
def gaussian(l, n, ck, X_train, y_train):
    '''
    Gaussian class-conditional density of one feature value.

    Estimates the mean and variance of feature `l` from the training samples
    whose label equals `ck`, then evaluates the normal probability density
    at `n`:

        exp(-(n - mean)^2 / (2 * var)) / sqrt(2 * pi * var)

    l: index of the feature (column of X_train)
    n: value of that feature for the sample being classified
    ck: class label to condition on
    X_train: training feature matrix
    y_train: training labels, one per row of X_train

    Returns the density value, used by the caller as the conditional
    probability P(x_l = n | y = ck).
    '''
    #    Put features and labels in one DataFrame so rows can be filtered by class
    data = pd.DataFrame(X_train)
    data['label'] = y_train
    #    Only samples of class ck contribute to the estimates
    ck_data = data[data['label'] == ck]
    #    Per-column mean
    mean = ck_data.mean()
    #    Per-column variance (pandas default ddof=1, the same estimator that the
    #    previous .std() call used). The density needs sigma^2, not sigma.
    var = ck_data.var()
    #    Normal pdf: the exponent is divided by 2*sigma^2 and the normaliser is
    #    sqrt(2*pi*sigma^2)
    exponent = -((n - mean[l]) ** 2) / (2 * var[l])
    return (math.e ** exponent) / ((2 * math.pi * var[l]) ** 0.5)


In [6]:
def naive_bayes(X_train, y_train, X_test):
    '''
    Naive Bayes classifier for continuous features.

    Class priors are the relative class frequencies in `y_train` (no
    smoothing is applied). The class-conditional likelihood of every feature
    is obtained from `gaussian`, and each test sample is assigned the label
    with the largest prior * product-of-likelihoods score.

    X_train: training feature matrix
    y_train: training labels, one per row of X_train
    X_test:  2-D array-like of test samples (ndarray, nested list, DataFrame)

    Returns a list with one predicted label per row of X_test. An entry is
    None if no class achieved a score greater than -1 (e.g. all scores NaN).
    '''
    #    Accept any 2-D array-like, not only ndarrays
    X_test = np.asarray(X_test)

    #    Prior probabilities; the labels are counted once, outside the loops
    ck = set(y_train)
    label_counts = Counter(y_train)
    n_train = len(y_train)
    prior_probability = {label: label_counts[label] / n_train for label in ck}

    #    Predicted labels, in the order of the test samples
    predict = []
    for sample in X_test:
        #    Unnormalised posterior score of every class for this sample
        post_prob_all = {}
        for label in ck:
            post_prob = prior_probability[label]
            #    Naive independence assumption: multiply per-feature likelihoods
            for j, value in enumerate(sample):
                post_prob *= gaussian(j, value, label, X_train, y_train)
            post_prob_all[label] = post_prob

        #    Pick the label with the maximum score; the first one wins on ties
        predict_i = None
        max_prob = -1
        for label, score in post_prob_all.items():
            if score > max_prob:
                predict_i = label
                max_prob = score

        predict.append(predict_i)
    return predict



In [7]:
#    Classify every sample of the test set
predict = naive_bayes(X_train, y_train, X_test)

#    1 where the predicted label equals the true label, 0 otherwise
correct = [int(truth == guess) for truth, guess in zip(y_test, predict)]
#    Accuracy: fraction of test samples predicted correctly
sum(correct) / len(correct)

0.9666666666666667