In [7]:
import csv
import random
import math
 
def loadCsv(filename):#读取数据，并处理
    lines = csv.reader(open(filename, 'r'))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]# 将所有数据变更为float型
    return dataset
 
def splitDataset(dataset, splitRatio):#训练测试集分割
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]

def pre_probs(trainSet):
    pre_p=[0]*4
    for i in range(len(trainSet)):
        if trainSet[i][0]==1 :pre_p[1]=pre_p[1]+1
        if trainSet[i][0]==2 :pre_p[2]=pre_p[2]+1
        if trainSet[i][0]==3 :pre_p[3]=pre_p[3]+1
    for i in range(1,len(pre_p)):
        pre_p[i]=pre_p[i]/float(len(trainSet))
    return pre_p
def separateByClass(dataset):#按照类别（最后一个属性值）对数据进行划分，返回的separated格式为{0: [[2, 21, 0]], 1: [[1, 20, 1], [3, 22, 1]]}
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[0] not in separated):
            separated[vector[0]] = []
        separated[vector[0]].append(vector)
    return separated
 
def mean(numbers):#计算均值，输入为一组list，sum（list）求和，再除以list长度
    return sum(numbers)/float(len(numbers))
 
def stdev(numbers): #计算标准差，输入list，通过计算好的均值计算标准差
    avg = mean(numbers)
    variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
    return math.sqrt(variance)
    
def summarize(dataset):#计算每一个属性的特征值，输入为一个二维list，通过zip函数提取出同一类属性attribute（list），利用函数计算均值
                       #返回一个二维list，[(2.0, 1.0), (21.0, 1.0)]
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[0]
    return summaries
 
def summarizeByClass(dataset):#将不同的类别数据划分，对不同属性值分别计算其特征值
            #{0: [(3.0, 1.4142135623730951), (21.5, 0.7071067811865476)],1: [(2.0, 1.4142135623730951), (21.0, 1.4142135623730951)]}
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries
 
def calculateProbability(x, mean, stdev):#概率计算，每一个属性为单位进行计算
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

def calculateClassProbabilities(summaries, inputVector):#计算每一个类概率
    probabilities = {}
    psum=0
    for classValue, classSummaries in summaries.items():
        
        probabilities[classValue] = 0
        for i in range(len(classSummaries)):#读取每个属性的特征值并计算相对应的概率
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] += calculateProbability(x, mean, stdev)#对数处理之后，采用累加得到对应类别的概率
        psum+=probabilities[classValue]
    for classValue, classSummaries in summaries.items():#概率归一化
        probabilities[classValue]=probabilities[classValue]/psum
    return probabilities
"""
def calculateClassProbabilities(summaries, inputVector):#计算每一个类概率
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 0
        for i in range(len(classSummaries)):#读取每个属性的特征值并计算相对应的概率
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)#累乘之后得到对应类别的概率
    return probabilities
""" 
def predict(summaries, inputVector):#根据概率进行预测
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel
 
def getPredictions(summaries, testSet):#读取每一个分类的特征值和测试集进行预测
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions
 
def getAccuracy(testSet, predictions):#计算精确值
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][0] == predictions[i]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0
 
def main():
    filename = 'winedata.csv'
    splitRatio = 0.70
    dataset = loadCsv(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    print(('Split {0} rows into train={1} and test={2} rows').format(len(dataset), len(trainingSet), len(testSet)))
    # prepare model
    summaries = summarizeByClass(trainingSet)
    # test model
    #pre_p=pre_probs(trainingSet)
    #print(pre_p)
    #predictions = getPredictions(summaries, testSet,pre_p)
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print(('our_Accuracy: {0}%').format(accuracy))

main()

Split 178 rows into train=124 and test=54 rows
our_Accuracy: 61.111111111111114%
