#### 朴素贝叶斯修正

In [1]:
import numpy as np

def loaddata():
    """Return the 15-sample toy dataset used throughout this notebook.

    Returns:
        A tuple ``(X, y)``:
        - X: array of shape (15, 2), one row per sample with two
          categorical features. Each row mixes an int and a str, so NumPy
          stores every entry as a string.
        - y: integer array with the 15 class labels, each -1 or 1.
    """
    first_feature = [1] * 5 + [2] * 5 + [3] * 5
    second_feature = ['S', 'M', 'M', 'S', 'S',
                      'S', 'M', 'M', 'L', 'L',
                      'L', 'M', 'M', 'L', 'L']
    labels = [-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1]

    # Keep the rows as mixed [int, str] lists so the array dtype is inferred
    # by NumPy exactly as it would be from a literal nested list.
    samples = [[a, b] for a, b in zip(first_feature, second_feature)]
    return np.array(samples), np.array(labels)

##### 计算各个概率值

$p(y_i|x)\propto p(x_1|y_i)p(x_2|y_i)\cdots p(x_n|y_i)p(y_i)$

In [34]:
def train(trainset, train_labels):
    """Fit a categorical naive Bayes model with Laplace (add-one) smoothing.

    Without smoothing the estimates would be ``N(label) / m`` for the prior
    and ``N(label, feature=value) / N(label)`` for the conditionals; here one
    pseudo-count is added to every outcome so that no probability is zero.

    Args:
        trainset: 2-D array of shape (m, n). Every column is treated as a
            categorical feature; values are identified by their ``str()``.
        train_labels: 1-D array holding the m class labels.

    Returns:
        A tuple ``(piority_prob_final, condition_prob_final, label_set)``:
        - piority_prob_final: dict mapping ``str(label)`` to the smoothed
          prior ``(N(label) + 1) / (m + K)``, K being the number of classes.
        - condition_prob_final: dict mapping the string
          ``"label,feature_index,value"`` to the smoothed conditional
          ``(N(label, value) + 1) / (N(label) + S_j)``, where S_j is the
          number of distinct values feature j takes in the whole training
          set. An entry exists for every class and every value seen in
          training, including combinations that never occurred together.
        - label_set: set of the distinct labels found in ``train_labels``.
    """
    m = trainset.shape[0]  # number of samples
    n = trainset.shape[1]  # number of features

    # Number of samples per class, keyed by the label's string form.
    label_set = set(train_labels)
    label_counts = {str(label): (train_labels == label).sum()
                    for label in label_set}

    # Distinct values of each feature over the WHOLE training set. Counting
    # them per class instead would shrink the denominator for classes that
    # miss a value, and the conditionals of that class would not sum to 1.
    feature_values = [sorted({str(v) for v in trainset[:, j]})
                      for j in range(n)]

    # Joint counts N(label, feature j = value), in order of first appearance.
    joint_counts = {}
    for i in range(m):
        label_key = str(train_labels[i])
        for j in range(n):
            triple = (label_key, j, str(trainset[i][j]))
            joint_counts[triple] = joint_counts.get(triple, 0) + 1

    # Register combinations that never occurred with a zero count so they
    # still receive a smoothed, non-zero probability (and a dictionary key).
    for label_key in sorted(label_counts):
        for j, values in enumerate(feature_values):
            for value in values:
                joint_counts.setdefault((label_key, j, value), 0)

    # Smoothed conditional probabilities.
    condition_prob_final = {}
    for (label_key, j, value), count in joint_counts.items():
        key = label_key + ',' + str(j) + ',' + value
        condition_prob_final[key] = (count + 1) / (
            label_counts[label_key] + len(feature_values[j]))

    # Smoothed prior probabilities.
    smoothed_total = m + len(label_set)
    piority_prob_final = {label_key: (count + 1) / smoothed_total
                          for label_key, count in label_counts.items()}

    return piority_prob_final, condition_prob_final, label_set

In [35]:
# Fit the model on the toy dataset and show the learned tables.
X, y = loaddata()
piority_prob, condition_prob, labels_set = train(X, y)

# print() already applies str() to each argument, so the dictionaries and the
# label set can be passed directly.
print('piority_prob=', piority_prob)
print('condition_prob=', condition_prob)
print(labels_set)

piority_prob= {'1': np.float64(0.5882352941176471), '-1': np.float64(0.4117647058823529)}
condition_prob= {'-1,0,1': np.float64(0.4444444444444444), '-1,1,S': np.float64(0.4444444444444444), '-1,1,M': np.float64(0.3333333333333333), '1,0,1': np.float64(0.25), '1,1,M': np.float64(0.4166666666666667), '1,1,S': np.float64(0.16666666666666666), '-1,0,2': np.float64(0.3333333333333333), '1,0,2': np.float64(0.3333333333333333), '1,1,L': np.float64(0.4166666666666667), '1,0,3': np.float64(0.4166666666666667), '-1,0,3': np.float64(0.2222222222222222), '-1,1,L': np.float64(0.2222222222222222)}
{np.int64(1), np.int64(-1)}


##### 利用贝叶斯模型预测新样本的分类

In [36]:
def predict(data, piority_prob, condition_prob, labels_set):
    """Return the label with the highest naive Bayes score for one sample.

    For each label the score is the product of the conditional probabilities
    of the sample's feature values and the label's prior, i.e. the posterior
    up to normalisation. The scores are printed before the winner is returned.

    Args:
        data: sequence of feature values, in feature-index order.
        piority_prob: dict mapping ``str(label)`` to the prior probability.
        condition_prob: dict mapping ``"label,feature_index,value"`` to the
            conditional probability.
        labels_set: iterable of candidate labels.

    Returns:
        The label whose score is largest.

    Raises:
        KeyError: if a required prior or conditional entry is missing.
    """
    result = {}  # label -> unnormalised posterior score
    for label in labels_set:
        label_key = str(label)
        score = 1.0
        # Multiply in the conditional probability of every feature value ...
        for position, value in enumerate(data):
            score *= condition_prob[label_key + ',' + str(position) + ',' + str(value)]
        # ... and finally the prior of the class.
        score *= piority_prob[label_key]
        result[label] = score
    print('result=',result)

    ranked = sorted(result.items(), key=lambda item: item[1], reverse=True)
    return ranked[0][0]

In [37]:
# Classify the new sample (feature 0 = 2, feature 1 = 'S') with the model
# trained above.
y_hat = predict(
    data=[2, 'S'],
    piority_prob=piority_prob,
    condition_prob=condition_prob,
    labels_set=labels_set,
)
print('y_hat=', y_hat)

result= {np.int64(1): np.float64(0.032679738562091505), np.int64(-1): np.float64(0.061002178649237467)}
y_hat= -1
