In [2]:
import pandas as pd
import numpy as np
import math
import random

### 1.数据集载入
读取并通过取反选择其中两类数据

In [3]:
data_df = pd.read_csv('IrisData.csv')
#data_df = data_df[~data_df["Species"].isin(["Setosa"])]
data_df.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,Species
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


### 2.划分训练集与测试集
参数ratio为训练集的占比,返回值为列表

In [4]:
def splitData(data_list,ratio):
    train_size = int(len(data_list)*ratio)
    random.shuffle(data_list)
    train_set = data_list[:train_size]
    test_set = data_list[train_size:]
    return train_set,test_set

In [5]:
data_list = np.array(data_df).tolist()
trainset,testset = splitData(data_list,ratio = 0.7)
print('Split {0} samples into {1} train and {2} test samples '.format(len(data_df), len(trainset), len(testset)))

Split 150 samples into 105 train and 45 test samples 


### 3.提取数据特征
所收集的训练数据的特征，包含相对于每个类的每个属性的均值和标准差。举例来说，这里有2个类和4个数值属性，然后我们需要每一个属性（4）和类（2）的组合的均值和标准差，也就是8个属性特征。

#### 3.1 按类别划分数据，返回值为划分好的数据，以及划分好的数据集中每个类别的样本数

In [6]:
def seprateByClass(dataset):
    seprate_dict = {}
    info_dict = {}
    for vector in dataset:
        if vector[-1] not in seprate_dict:
            seprate_dict[vector[-1]] = []
            info_dict[vector[-1]] = 0
        seprate_dict[vector[-1]].append(vector)
        info_dict[vector[-1]] +=1
    return seprate_dict,info_dict

In [7]:
train_separated,train_info = seprateByClass(trainset)
train_info

{'Setosa': 41, 'Versicolour': 33, 'Virginica': 31}

#### 3.2 计算均值和方差
$ var = \frac{\sum(x-avg)^{2}}{n-1}$

In [8]:
def mean(list):
    list = [float(x) for x in list] #字符串转数字
    return sum(list)/float(len(list))
def var(list):
    list = [float(x) for x in list]
    avg = mean(list)
    var = sum([math.pow((x-avg),2) for x in list])/float(len(list)-1)
    return var

#### 3.3 计算每个属性的均值和方差
zip函数将数据样本按照属性分组为一个个列表，然后可以对每个属性计算均值和标准差。

In [9]:
def summarizeAttribute(dataset):
    dataset = np.delete(dataset,-1,axis = 1) # delete label
    summaries = [(mean(attr),var(attr)) for attr in zip(*dataset)]
    return summaries

In [10]:
summary = summarizeAttribute(trainset)
summary

[(5.758095238095239, 0.7345732600732595),
 (3.065714285714285, 0.18592857142857133),
 (3.5533333333333323, 3.2627051282051274),
 (1.1142857142857148, 0.6014285714285714)]

#### 3.4 按类别提取属性特征

In [11]:
def summarizeByClass(dataset):
    dataset_separated,dataset_info = seprateByClass(dataset)
    summarize_by_class = {}
    for classValue, vector in dataset_separated.items():
        summarize_by_class[classValue] = summarizeAttribute(vector)
    return summarize_by_class

In [12]:
train_Summary_by_class = summarizeByClass(trainset)
train_Summary_by_class

{'Setosa': [(4.982926829268291, 0.12445121951219511),
  (3.3975609756097565, 0.1417439024390244),
  (1.4707317073170731, 0.03412195121951221),
  (0.24390243902439032, 0.012024390243902434)],
 'Versicolour': [(5.933333333333334, 0.2766666666666667),
  (2.7909090909090906, 0.08960227272727272),
  (4.254545454545454, 0.23755681818181815),
  (1.33030303030303, 0.03905303030303031)],
 'Virginica': [(6.596774193548387, 0.5036559139784946),
  (2.9193548387096775, 0.10427956989247314),
  (5.5612903225806445, 0.37711827956989247),
  (2.0354838709677416, 0.06369892473118278)]}

### 4. 贝叶斯分类器，计算概率并选择具有最大概率的类作为预测结果

#### 算法流程
由于此处的数值为连续值，所以在计算类条件概率的时候要用概率密度函数
1. 计算样本属于某类的先验概率p[yi]， //属于A类的概率，B类的概率....
2. 通过概率密度函数计算 p(xi|c) ，首先计算在一个属性的前提下，该样本属于某类的概率；相乘合并所有属性的概率，即为某个数据样本属于某类的类条件概率 // 若为离散值，则通过类别数量比值计算p(xi|c)
3. 计算1和2的乘积，结果的最大值就是该样本所属的类别

#### 4.1 先验概率

In [13]:
def calulateClassPriorProb(dataset,dataset_info):
    dataset_prior_prob = {}
    sample_sum = len(dataset)
    for class_value, sample_nums in dataset_info.items():
        dataset_prior_prob[class_value] = sample_nums/float(sample_sum)
    return dataset_prior_prob

In [14]:
prior_prob = calulateClassPriorProb(trainset,train_info)
prior_prob

{'Setosa': 0.3904761904761905,
 'Versicolour': 0.3142857142857143,
 'Virginica': 0.29523809523809524}

#### 4.2 类条件概率
计算类条件概率 $\prod_{i=1}^d p(xi|c)$,计算各特征的各条件概率的乘积，如下所示：
 - 判断为A类的概率：p(A|特征1)*p(A|特征2)*p(A|特征3)*p(A|特征4).....
 - 判断为B类的概率：p(B|特征1)*p(B|特征2)*p(B|特征3)*p(B|特征4).....
 - 判断为C类的概率：p(C|特征1)*p(C|特征2)*p(C|特征3)*p(C|特征4).....
 
$ p(xi|c) = \frac{1}{\sqrt{2\pi}\sigma_{c,i}}exp(-\frac{(xi-mean_{c,i})^{2}}{2\sigma_{c,i}^{2}})$ , $\sigma$是标准差（方差开方）

In [15]:
def calculateProb(x,mean,var):
    exponent = math.exp(math.pow((x-mean),2)/(-2*var))
    p = (1/math.sqrt(2*math.pi*var))*exponent
    return p

In [16]:
def calculateClassProb(input_data,train_Summary_by_class):
    prob = {}
    for class_value, summary in train_Summary_by_class.items():
        prob[class_value] = 1
        for i in range(len(summary)):
            mean,var = summary[i]
            x = input_data[i]
            p = calculateProb(x,mean,var)
        prob[class_value] *=p
    return prob

In [17]:
input_vector = testset[1]
input_data = input_vector[:-1]
train_Summary_by_class = summarizeByClass(trainset)
class_prob = calculateClassProb(input_data,train_Summary_by_class)
class_prob

{'Setosa': 3.3579279836005993,
 'Versicolour': 1.5896628317396685e-07,
 'Virginica': 5.176617264913899e-12}

#### 4.3 先验概率*类条件概率
由于P（x）在一个样本中是相等的，所以贝叶斯分类器只需要比较分子部分：先验概率*类条件概率，最终属于哪类的概率最大，则判别为哪类，此处为最小错误率贝叶斯分类，若采用最小风险需要加上判断为每个类别的风险损失值

In [16]:
def bayesianPredictOneSample(input_data):
    prior_prob = calulateClassPriorProb(trainset,train_info)
    train_Summary_by_class = summarizeByClass(trainset)
    classprob_dict = calculateClassProb(input_data,train_Summary_by_class)
    result = {}
    for class_value,class_prob in classprob_dict.items():
        p = class_prob*prior_prob[class_value]
        result[class_value] = p
    return max(result,key=result.get)

### 5.利用分类器进行预测

#### 5.1 单个样本预测类别

In [17]:
input_vector = testset[1]
input_data = input_vector[:-1]
result = bayesianPredictOneSample(input_data)
print("the sameple is predicted to class: {0}.".format(result))

the sameple is predicted to class: Versicolour.


#### 5.2 对测试集进行判别，并计算分类准确率

In [18]:
def calculateAccByBeyesian(dataset):
    correct = 0
    for vector in dataset:
        input_data = vector[:-1]
        label = vector[-1]
        result = bayesianPredictOneSample(input_data)
        if result == label:
            correct+=1
    return correct/len(dataset)

In [19]:
acc = calculateAccByBeyesian(testset)
acc

0.9333333333333333