In [16]:
from sklearn import datasets
import numpy as np
import math

In [17]:
def load_iris(ratio=0.8, seed=None):
    """Load the iris dataset and split it into shuffled train/validation parts.

    Parameters
    ----------
    ratio : float, default 0.8
        Fraction of the samples placed in the training split. The training
        size is ``ceil(num_samples * ratio)``; the remainder is validation.
    seed : int or None, default None
        When ``None`` the shuffle draws from NumPy's global random state
        (so a prior ``np.random.seed(...)`` call still controls it). When
        given, a private ``RandomState(seed)`` is used, which makes the split
        reproducible without touching the global state.

    Returns
    -------
    tuple
        ``((train_features, train_target), (valid_features, valid_target))``.
    """
    # return_X_y must be passed by keyword: recent scikit-learn releases make
    # it keyword-only, so the positional form `load_iris(True)` fails there.
    features, target = datasets.load_iris(return_X_y=True)

    num_samples = len(target)
    num_train = math.ceil(num_samples * ratio)

    # Shuffle the sample indices before splitting.
    if seed is None:
        idx = np.random.permutation(np.arange(num_samples))
    else:
        idx = np.random.RandomState(seed).permutation(np.arange(num_samples))

    train_idx, valid_idx = idx[:num_train], idx[num_train:]
    traindata = features[train_idx], target[train_idx]
    validdata = features[valid_idx], target[valid_idx]

    return traindata, validdata

# 作业三

## 二、朴素贝叶斯分类器

要求：

* 用朴素贝叶斯构造一个iris数据集的分类器
* 在尽量不修改代码结构的前提下完成工作

## 定义模型

In [28]:
# 只需要修改这一部分 -- 代码量在40行以内
class NaiveBayes:
    """Categorical naive Bayes classifier with additive (Laplace) smoothing.

    Every distinct value observed in a feature column during ``fit`` is
    treated as one category of that feature. Prediction picks the class that
    maximises ``P(Y = c_k) * prod_j P(X^(j) = x_j | Y = c_k)``.

    Attributes set by ``fit``:
        num_features     -- number of feature columns.
        target_labels    -- sorted unique class labels c_k.
        features_labels  -- features_labels[j][l] is the l-th possible value
                            a_{jl} of feature j.
        lentarget        -- lentarget[k] is the number of training samples of
                            class target_labels[k].
        prior_prob       -- prior_prob[k] is the smoothed P(Y = c_k).
        conditional_prob -- conditional_prob[k][j][l] is the smoothed
                            P(X^(j) = a_{jl} | Y = c_k).
    """

    def __init__(self, smooth = 1):
        self.smooth = smooth # additive smoothing strength (lambda)
        self.conditional_prob = None # conditional probabilities, filled by fit
        self.prior_prob = None # prior probabilities, filled by fit

    def __call__(self, features):
        """Alias for ``predict``."""
        return self.predict(features)

    def fit(self, features, target):
        """Estimate every probability needed at prediction time.

        Parameters
        ----------
        features : array-like of shape (n_samples, n_features)
        target : array-like of shape (n_samples,)
        """
        features = np.asarray(features)
        target = np.asarray(target)

        self.num_features = features.shape[-1]
        self.target_labels = np.unique(target)
        self.features_labels = [np.unique(features[:, j]) for j in range(self.num_features)]
        self.conditional_prob = self._conditional_prob(features, target)
        self.prior_prob = self._prior_prob(target)

    def predict(self, features):
        """Predict class labels for one sample (1-D) or a batch (2-D).

        Returns a 1-D array of labels taken from ``target_labels``; a single
        sample yields an array of length one.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        ValueError
            If ``features`` is neither 1- nor 2-dimensional.
        """
        if self.conditional_prob is None or self.prior_prob is None:
            raise RuntimeError("NaiveBayes must be fitted before calling predict")

        features = np.asarray(features)
        # single sample
        if features.ndim == 1:
            return np.array([self._predict_single(features)])
        # batch of samples
        if features.ndim == 2:
            return np.array([self._predict_single(row) for row in features])
        raise(ValueError("Unsupported features size, should be 1 or 2 dimensional"))

    def _predict_single(self, feature):
        """Return the most probable class label for one feature vector."""
        # Locate each value among the categories seen in training once, up
        # front; None marks a value that never occurred in the training data.
        positions = []
        for j, value in enumerate(feature):
            hits = np.flatnonzero(self.features_labels[j] == value)
            positions.append(hits[0] if hits.size else None)

        scores = np.empty(len(self.target_labels))
        # Sum logs instead of multiplying probabilities so that many features
        # cannot underflow to 0. log(0) (only possible with smooth == 0) is
        # deliberately allowed to become -inf, hence the silenced warning.
        with np.errstate(divide="ignore"):
            for k in range(len(self.target_labels)):
                log_score = np.log(self.prior_prob[k])
                for j, pos in enumerate(positions):
                    if pos is None:
                        # Unseen value: its count is 0, so only the smoothing
                        # term remains in the numerator.
                        num_values = len(self.conditional_prob[k][j])
                        p = (0 + self.smooth) / (self.lentarget[k] + num_values * self.smooth)
                    else:
                        p = self.conditional_prob[k][j][pos]
                    log_score += np.log(p)
                scores[k] = log_score

        # argmax returns the first maximum, i.e. ties go to the earliest class.
        return self.target_labels[int(np.argmax(scores))]

    def _prior_prob(self, target):
        """Estimate the smoothed priors P(Y = c_k) (Eq. 4.11).

        Also stores the per-class sample counts in ``self.lentarget``.
        """
        target = np.asarray(target)
        N = len(target)
        K = len(self.target_labels)
        # Count by actual label value, not by position, so labels do not have
        # to be the integers 0..K-1.
        self.lentarget = [int(np.sum(target == label)) for label in self.target_labels]
        return [(n_k + self.smooth) / (N + K * self.smooth) for n_k in self.lentarget]

    def _conditional_prob(self, features, target):
        """Estimate the smoothed conditionals (Eq. 4.10).

        Returns ``prob`` where ``prob[k][j][l]`` is
        ``P(X^(j) = a_{jl} | Y = c_k)``; ``prob[k][j]`` is an array with one
        entry per distinct value of feature j.
        """
        features = np.asarray(features)
        target = np.asarray(target)

        prob = []
        for label in self.target_labels:
            class_rows = features[target == label]
            n_k = class_rows.shape[0]
            per_feature = []
            for j, values in enumerate(self.features_labels):
                # counts[l] = number of class-k samples whose feature j equals values[l]
                counts = (class_rows[:, j][:, None] == values[None, :]).sum(axis=0)
                # The smoothing term in the denominator is S_j * lambda, so
                # each prob[k][j] sums to 1 for any lambda.
                per_feature.append((counts + self.smooth) / (n_k + len(values) * self.smooth))
            prob.append(per_feature)

        # NOTE(review): this changes NumPy's global float print format as a
        # side effect of fitting. Kept because later cells may rely on it.
        np.set_printoptions(formatter={'float': '{: 0.9f}'.format})
        return prob


读取数据

In [29]:
# Shuffle and split iris using load_iris's default ratio (0.8 of the samples for training).
(X_train, Y_train), (X_valid, Y_valid) = load_iris()

创建模型并拟合数据

In [30]:
# smooth is the additive smoothing strength (lambda) stored on the model.
model = NaiveBayes( smooth = 1.0)
# Compute the prior and conditional probabilities from the training split.
model.fit(X_train, Y_train)

预测结果

In [32]:
# Classify the validation split, then report the fraction of exact label matches.
Y_pred = model.predict(X_valid)
is_correct = Y_pred == Y_valid
accuracy = np.sum(is_correct) / len(Y_valid)
print(f"accuracy: {accuracy:.4f}")

accuracy: 0.9000
