# 朴素贝叶斯分类器
## 下载数据集
皮马印第安人糖尿病数据集：https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv

In [65]:
import csv
import random
import math

In [66]:
def LoadData(filename):
    """Load a purely numeric CSV file into a list of float rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-blank CSV line; each entry is a list
        holding that line's fields converted with ``float``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If any field cannot be converted to ``float``.
    """
    # The context manager closes the file even when a conversion fails;
    # newline="" is the mode the csv module asks for when reading files.
    with open(filename, "r", newline="") as handle:
        # csv.reader yields an empty list for a blank line (e.g. a trailing
        # newline at the end of the file); such rows carry no sample, so
        # they are dropped instead of being returned as empty rows.
        return [
            [float(value) for value in row]
            for row in csv.reader(handle)
            if row
        ]
# Relative path of the Pima Indians Diabetes CSV (resolved against the
# current working directory).
filename = 'pima-indians-diabetes.data.csv'
# Parse the whole file into a list of float rows.
dataset = LoadData(filename)

In [67]:
# Show the first two parsed rows as a quick sanity check.
dataset[:2]

[[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0],
 [1.0, 85.0, 66.0, 29.0, 0.0, 26.6, 0.351, 31.0, 0.0]]

### 查看数据集
前八列分别表示属性特征：
1．怀孕次数。  
2．2小时口服葡萄糖耐量测试中得到的血糖浓度。  
3．舒张期血压（mm Hg）。  
4．三头肌皮脂厚度（mm）。  
5．2小时血清胰岛素（mu U/ml）。  
6．身体质量指数（BMI，体重（kg）/ 身高（m）的平方）。  
7．糖尿病家族遗传作用值。  
8．年龄。  
最后一列代表类别。

### 划分数据集

In [68]:
def SplitDataset(dataset, ratio):
    """Randomly partition ``dataset`` into a training part and a remainder.

    Args:
        dataset: Sequence of rows; it is copied, not modified.
        ratio: Fraction of the rows to move into the training part. The
            training size is ``int(len(dataset) * ratio)``.

    Returns:
        A two-element list ``[trainset, rest]`` where ``trainset`` holds the
        randomly drawn rows (in draw order) and ``rest`` holds the rows that
        were not drawn, in their original relative order.
    """
    target_size = int(len(dataset) * ratio)
    remaining = list(dataset)
    picked = []
    # Draw without replacement: every pick removes the row from the pool.
    for _ in range(target_size):
        position = random.randrange(len(remaining))
        picked.append(remaining.pop(position))
    return [picked, remaining]

## 提取特征
### 按类别划分数据
将训练数据集中的样本按照类别进行划分，统计每个类的数据数量。

In [69]:
def SeparateByClass(dataset):
    """Group rows by their class label, taken from each row's last element.

    Args:
        dataset: Sequence of rows; the last element of a row is its label.

    Returns:
        A dict mapping each label to the list of rows carrying that label,
        with rows kept in their original order.
    """
    grouped = {}
    for row in dataset:
        # setdefault creates the bucket the first time a label is seen.
        grouped.setdefault(row[-1], []).append(row)
    return grouped

### 计算均值和标准差
计算每个类中各属性的均值和样本标准差（方差的分母为 N-1）。

In [70]:
def Mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

def Stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The variance uses the ``len(numbers) - 1`` denominator.

    Args:
        numbers: Sequence of numeric values.

    Returns:
        The square root of the sample variance, as a float.

    Raises:
        ZeroDivisionError: If ``numbers`` has fewer than two values.
    """
    # The helper in this file is spelled ``Mean``; the lowercase ``mean``
    # used previously is not defined anywhere in the file.
    avg = Mean(numbers)
    variance = sum(pow(x - avg, 2) for x in numbers) / float(len(numbers) - 1)
    return math.sqrt(variance)

### 提取数据集特征
对数据样本的每个属性计算均值和标准差

In [87]:
def Summarize(dataset):
    """Compute ``(mean, stdev)`` for every feature column of ``dataset``.

    Args:
        dataset: Sequence of equally long rows whose last element is the
            class label.

    Returns:
        A list with one ``(mean, stdev)`` tuple per column, excluding the
        last (label) column.
    """
    # zip(*dataset) transposes rows into columns.
    columns = list(zip(*dataset))
    # Drop the label column before computing statistics rather than
    # computing them and discarding the result afterwards.
    return [(Mean(column), Stdev(column)) for column in columns[:-1]]

In [88]:
def SummarizeByClass(dataset):
    """Compute per-class feature statistics.

    Args:
        dataset: Sequence of rows whose last element is the class label.

    Returns:
        A dict mapping each class label to the list of ``(mean, stdev)``
        tuples produced by ``Summarize`` for the rows of that class.
    """
    # The grouping helper in this file is spelled ``SeparateByClass``.
    separated = SeparateByClass(dataset)
    return {
        classValue: Summarize(instances)
        for classValue, instances in separated.items()
    }


## 预测
计算高斯分布的概率密度函数

In [73]:
def ClaculateProbability(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the Gaussian.
        stdev: Standard deviation of the Gaussian.

    Returns:
        The density value as a float.

    Raises:
        ZeroDivisionError: If ``stdev`` is zero.
    """
    squared_deviation = math.pow(x - mean, 2)
    twice_variance = 2 * math.pow(stdev, 2)
    kernel = math.exp(-(squared_deviation / twice_variance))
    normalizer = 1 / (math.sqrt(2 * math.pi) * stdev)
    return normalizer * kernel

### 计算所属类的概率

In [74]:
def CalculateClassProbabilities(summaries, inputvector):
    """Compute, for each class, the product of per-feature Gaussian densities.

    Args:
        summaries: Dict mapping a class label to a list of ``(mean, stdev)``
            tuples, one per feature.
        inputvector: Sequence of feature values, indexed like the summaries.
            Elements beyond the number of summarized features are ignored.

    Returns:
        A dict mapping each class label to the product of the densities of
        the input's features under that class's statistics. The product
        starts at 1; no class prior is multiplied in.
    """
    probabilities = {}
    for classvalue, classsummaries in summaries.items():
        likelihood = 1
        # One factor per feature dimension.
        for i, (mean, stdev) in enumerate(classsummaries):
            # The density helper in this file is spelled
            # ``ClaculateProbability``; the lowercase ``calculateprobability``
            # used previously is not defined anywhere in the file.
            likelihood *= ClaculateProbability(inputvector[i], mean, stdev)
        probabilities[classvalue] = likelihood
    return probabilities

In [75]:
def Predict(summaries, inputvector):
    """Return the class label with the highest score for ``inputvector``.

    Args:
        summaries: Per-class feature statistics, as accepted by
            ``CalculateClassProbabilities``.
        inputvector: Sequence of feature values to classify.

    Returns:
        The label whose score is largest; on ties the label encountered
        first wins. ``None`` if ``summaries`` is empty.
    """
    probabilities = CalculateClassProbabilities(summaries, inputvector)
    # The earlier loop assigned the running best to a misspelled variable,
    # so the comparison always ran against -1 and the last class iterated
    # won regardless of its score. max() performs the intended selection.
    return max(probabilities, key=probabilities.get, default=None)

In [76]:
def GetPredictions(summaries, testset):
    """Predict a class label for every row of ``testset``.

    Args:
        summaries: Per-class feature statistics, as accepted by ``Predict``.
        testset: Sequence of rows to classify.

    Returns:
        A list of predicted labels, in the same order as ``testset``.
    """
    # The classifier in this file is spelled ``Predict``; the lowercase
    # ``predict`` used previously is not defined anywhere in the file.
    return [Predict(summaries, row) for row in testset]


In [77]:
def GetAccuracy(testset, predictions):
    """Return the percentage of rows whose prediction equals the true label.

    Args:
        testset: Sequence of rows; the last element of a row is its label.
        predictions: Sequence of predicted labels, indexed like ``testset``.

    Returns:
        A float between 0.0 and 100.0.

    Raises:
        ZeroDivisionError: If ``testset`` is empty.
    """
    hits = sum(
        1 for position, row in enumerate(testset)
        if row[-1] == predictions[position]
    )
    total = float(len(testset))
    return (hits / total) * 100.0

In [78]:
# Randomly move 67% of the rows into the training set; the rest form the test set.
trainset, testset = SplitDataset(dataset, 0.67)

In [80]:
# Report the sizes of the two partitions.
print(len(trainset))
print(len(testset))

514
254


In [89]:
# Per-class (mean, stdev) statistics of each feature, computed from the training rows.
summaries = SummarizeByClass(trainset)

In [90]:
# Predict a class label for every test row.
predictions = GetPredictions(summaries, testset)

In [91]:
# Percentage of test rows whose predicted label equals the true label.
accuracy = GetAccuracy(testset, predictions)

In [92]:
# Print the accuracy as a percentage.
print('Accuracy: {0}%'.format(accuracy))

Accuracy: 76.37795275590551%
