In [1]:
import numpy
import math

# 导入数据集

In [2]:
def importdata(filename = 'dataset1.txt') :
    """Load a whitespace-delimited sample file into a list of records.

    Every non-blank line must contain at least three whitespace-separated
    fields: two numeric feature values followed by a class label. Extra
    fields on a line are ignored.

    Parameters
    ----------
    filename : str
        Path of the text file to read. Defaults to 'dataset1.txt'.

    Returns
    -------
    list
        One ``[float, float, str]`` entry per non-blank line, with the label
        converted to upper case.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If either of the first two fields is not a valid float.
    IndexError
        If a non-blank line has fewer than three fields.
    """
    dataset = []
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(filename, 'r') as handle :
        for line in handle :
            fields = line.split()
            if not fields :
                # Blank / whitespace-only lines (e.g. a trailing newline) carry no sample.
                continue
            dataset.append([float(fields[0]), float(fields[1]), fields[2].upper()])
    return dataset

# 获取各类别的期望、方差、标准差、类别的先验概率，以及两个特征之间的相关系数

In [3]:
def getParameters(dataset) :
    """Estimate per-class statistics, class priors and a pooled correlation.

    Parameters
    ----------
    dataset : sequence
        Records whose first two entries are numeric features and whose last
        entry is the class label. Only the labels 'F' and 'M' are assigned
        to a class.

    Returns
    -------
    tuple
        ``((mean_F, mean_M), (var_F, var_M), (std_F, std_M),
        (prior_F, prior_M), coefficient)``. The mean/var/std entries are
        per-feature NumPy results computed with NumPy's default estimators.
        The priors are each class's share of the 'F' + 'M' samples.
        ``coefficient`` is the mean product of the two standardised features
        taken over *every* record in ``dataset``, whatever its label.

    Raises
    ------
    ZeroDivisionError
        If ``dataset`` contains neither 'F' nor 'M' records.
    """
    def column_stats(points) :
        # Mean, variance and standard deviation of each feature column.
        return (numpy.mean(points, axis=0),
                numpy.var(points, axis=0),
                numpy.std(points, axis=0))

    # The pooled list keeps every record; the class arrays keep only their own label.
    all_points = [[row[0], row[1]] for row in dataset]
    points_f = numpy.array([[row[0], row[1]] for row in dataset if row[-1] == 'F'])
    points_m = numpy.array([[row[0], row[1]] for row in dataset if row[-1] == 'M'])

    mean_f, var_f, std_f = column_stats(points_f)
    mean_m, var_m, std_m = column_stats(points_m)

    # Correlation between the two features, estimated on the pooled samples.
    pooled_mean = numpy.mean(all_points, axis=0)
    pooled_std = numpy.std(all_points, axis=0)
    standardized_products = [
        ((row[0] - pooled_mean[0]) * (row[1] - pooled_mean[1]) / pooled_std[0] / pooled_std[1])
        for row in dataset
    ]
    coefficient = numpy.mean(standardized_products)

    # Priors are relative to the labelled ('F' or 'M') samples only.
    labelled_count = float(len(points_f) + len(points_m))
    priors = (len(points_f) / labelled_count, len(points_m) / labelled_count)

    return (mean_f, mean_m), (var_f, var_m), (std_f, std_m), priors, coefficient
      

# 生成高斯函数

In [4]:
def GaussianFunc(mean, variance, stand_deviation, coefficient) :
    """Build the density function of a bivariate normal distribution.

    Parameters
    ----------
    mean : sequence of two numbers
        Per-feature means.
    variance : sequence of two numbers
        Per-feature variances (the diagonal of the covariance matrix).
    stand_deviation : sequence of two numbers
        Per-feature standard deviations, used for the off-diagonal term
        ``coefficient * stand_deviation[0] * stand_deviation[1]``.
    coefficient : float
        Correlation coefficient between the two features.

    Returns
    -------
    callable
        ``func(X)`` taking a two-element point and returning the density as
        a NumPy array of shape (1, 1).

    Raises
    ------
    numpy.linalg.LinAlgError
        If the covariance matrix is singular (raised when the function is
        built, because the inverse is computed once up front).
    """
    # Everything below depends only on the distribution parameters, so it is
    # computed once here instead of on every evaluation of the density.
    covariance_term = coefficient * stand_deviation[0] * stand_deviation[1]
    B = [[variance[0], covariance_term], [covariance_term, variance[1]]]
    inv_B = numpy.linalg.inv(B)
    # Determinant of B, assuming variance == stand_deviation ** 2.
    B_val = (1.0 - coefficient**2) * variance[0] * variance[1]
    normalizer = 2*math.pi * (B_val ** 0.5)

    def func(X) :
        # Row vector of deviations from the mean, shape (1, 2).
        diff = numpy.array([[X[0] - mean[0], X[1] - mean[1]]])
        # Quadratic form -(1/2) * d * B^-1 * d^T, shape (1, 1).
        exponent = (-0.5) * numpy.dot(numpy.dot(diff, inv_B), diff.T)
        return 1.0 / normalizer * numpy.exp(exponent)
    return func

# 贝叶斯概率计算函数

In [5]:
def f(X, funcs, class_ps, index) :
    """Posterior probability of class ``index`` for point ``X`` (two classes).

    Applies Bayes' rule: the class-conditional density times the prior for
    the requested class, divided by the same product summed over classes
    0 and 1.

    Parameters
    ----------
    X : point passed unchanged to each density function.
    funcs : sequence of callables; ``funcs[k](X)`` is the density of class k.
    class_ps : sequence of prior probabilities, indexed like ``funcs``.
    index : int, the class whose posterior is returned.
    """
    def weighted_density(k) :
        # Joint term p(X | class k) * P(class k).
        return funcs[k](X) * class_ps[k]

    numerator = weighted_density(index)
    evidence = weighted_density(0) + weighted_density(1)
    return numerator / evidence

# 基于最小错误率的贝叶斯判别分类

In [6]:
def classify(X,funcs,class_ps,labels) :
    """Assign ``X`` to the class with the larger posterior probability.

    Parameters
    ----------
    X : point to classify.
    funcs : the two class-conditional density functions.
    class_ps : the two class prior probabilities.
    labels : two-element sequence of labels, ordered like ``funcs``.

    Returns
    -------
    ``labels[0]`` when the posterior of class 0 is strictly greater than
    that of class 1, otherwise ``labels[1]`` (ties go to class 1).
    """
    posterior_first = f(X, funcs, class_ps, 0)
    posterior_second = f(X, funcs, class_ps, 1)
    return labels[0] if posterior_first > posterior_second else labels[1]

# 测试

In [8]:
def test(dataset, funcs,class_ps,labels) :
    positive0 = 0
    positive1 = 0
    F = [item for item in dataset if item[-1] == 'F']
    len_F = len(F)
    len_M = len(dataset) - len_F
    for item in dataset :
        res = classify([item[0],item[1]], funcs, class_ps,labels)
        if res == item[-1] and res == 'F' :
            positive0 += 1
        if res == item[-1] and res == 'M' :
            positive1 += 1
    #print("total":class_ps)
    print("F", positive0 * 1.0 / len_F)
    print ("M", positive1 * 1.0 / len_M)

# 主函数

In [15]:
if __name__ == '__main__' :
    # Fit one bivariate Gaussian per class on the default training file.
    dataset = importdata()
    (mean1,mean2),(variance1,variance2),(stand_deviation1, stand_deviation2), (class1_p, class2_p), coefficient = getParameters(dataset)
    func1 = GaussianFunc(mean1, variance1, stand_deviation1,coefficient)
    func2 = GaussianFunc(mean2, variance2, stand_deviation2,coefficient)
    funcs = [func1, func2]

    # Prior settings to evaluate: the priors estimated from the training
    # data first, followed by manually specified priors.
    classs = [
        [class1_p, class2_p],
        [0.5,0.5],
        [0.4,0.6],
        [0.3,0.7],
        [0.2,0.8],
    ]

    labels = ["F", "M"]

    # The evaluation files do not depend on the priors, so read each one once
    # instead of re-reading it for every prior setting.
    testset0 = importdata('dataset1.txt')
    testset1 = importdata('dataset2.txt')
    testset2 = importdata('dataset3.txt')
    testsets = [("dataset1", testset0), ("dataset2", testset1), ("dataset3", testset2)]

    for class_ps in classs :
        print("-"* 24)
        print(class_ps)
        for name, samples in testsets :
            print("-"*10,name,"-"*10)
            test(samples, funcs, class_ps, labels)

------------------------
[0.23780487804878048, 0.7621951219512195]
---------- dataset1 ----------
F 0.8076923076923077
M 0.968
---------- dataset2 ----------
F 0.8
M 0.9166666666666666
---------- dataset3 ----------
F 0.5625
M 0.8918918918918919
------------------------
[0.5, 0.5]
---------- dataset1 ----------
F 0.8846153846153846
M 0.92
---------- dataset2 ----------
F 0.85
M 0.8690476190476191
---------- dataset3 ----------
F 0.6875
M 0.8783783783783784
------------------------
[0.4, 0.6]
---------- dataset1 ----------
F 0.8717948717948718
M 0.944
---------- dataset2 ----------
F 0.825
M 0.9047619047619048
---------- dataset3 ----------
F 0.6875
M 0.8918918918918919
------------------------
[0.3, 0.7]
---------- dataset1 ----------
F 0.8461538461538461
M 0.956
---------- dataset2 ----------
F 0.825
M 0.9166666666666666
---------- dataset3 ----------
F 0.6875
M 0.8918918918918919
------------------------
[0.2, 0.8]
---------- dataset1 ----------
F 0.8076923076923077
M 0.972
---------