In [1]:
from sklearn import datasets
import numpy as np
import math

In [2]:
def load_iris(ratio=0.8):
    features, target = datasets.load_iris(True)
    
    num_samples = len(target)
    num_train = math.ceil(num_samples * ratio)
    
    # 随机打乱数据
    idx = np.random.permutation(np.arange(num_samples))
    traindata = features[idx[:num_train]], target[idx[:num_train]]
    validdata = features[idx[num_train:]], target[idx[num_train:]]
    
    return traindata, validdata

# 作业三

## 二、朴素贝叶斯分类器

要求：

* 用朴素贝叶斯构造一个iris数据集的分类器
* 在尽量不修改代码结构的前提下完成工作

ETA：1-5 hours

## 定义模型

In [3]:
class NaiveBayes:
    def __init__(self, smooth = 1):
        self.smooth = smooth # lambda
        self.conditional_prob = None # 条件概率
        self.prior_prob = None # 先验概率
        
    def __call__(self, features):
        return self.predict(features)
        
    def fit(self, features, target):
        """
        
        给定特征及真实结果，拟合分类器
        
        将预测过程中所需要用到的条件概率及先验概率全部计算好
        """
        
        self.num_features = features.shape[-1] # 特征的数目；iris数据集中共有4个特征
        self.target_labels = np.unique(target) # 预测结果的可能值：c_k
        # features_labels[j][l]表示第 j 个特征的第 l 个可能值: a_{jl}
        self.features_labels = [np.unique(features[:, i]) for i in range(features.shape[-1])]
        
        self.conditional_prob = self._conditional_prob(features, target)
        self.prior_prob = self._prior_prob(target)
        
    def predict(self, features):
        # 预测单个数据
        if len(features.shape) == 1:
            return np.array([self._predict_single(features)])
        # 批量预测
        elif len(features.shape) == 2:
            N = features.shape[0]
            return np.array([self._predict_single(features[i, :]) for i in range(N)])
        else:
            raise(ValueError("Unsupported features size, should be 1 or 2 dimensional"))
            
    
    def _predict_single(self, feature):
        prod = self.prior_prob.copy()
        for k in range(len(prod)):
            for j in range(self.num_features):
                cur_feature = self.features_labels[j]

                # find the closet feature idx since
                # cur_feature == feature[j] might be all False
                idx = np.argmin(np.abs(cur_feature - feature[j]))
                for l in range(len(cur_feature)):
                    prod[k] *= self.conditional_prob[k][j][idx]
        return np.argmax(prod)
        
    
    def _prior_prob(self, target):
        # 利用式4.11估计先验概率
        N = len(target)
        K = len(self.target_labels)
        prob = [(np.sum(target==c) + self.smooth)/(N + K*self.smooth) for c in self.target_labels]
        return prob
    
    def _conditional_prob(self, features, target):
        # 利用式4.10估计条件概率
        # prob[k][j][l] is P(X^{(j)} = a_{jl} | Y = c_k)
        prob = [[np.zeros(feature.shape) for feature in self.features_labels] 
                for _ in self.target_labels]
        
        for (k, ck) in enumerate(self.target_labels):
            N_ck = np.sum(target == ck)
            for j in range(self.num_features):
                Xj = features[:, j]
                Sj = len(self.features_labels[j])
                for (l, ajl) in enumerate(self.features_labels[j]):
                    prob[k][j][l] = (np.sum(np.bitwise_and(Xj == ajl, target == ck)) + self.smooth) / (
                        N_ck + Sj * self.smooth)
        return prob

读取数据

In [4]:
(X_train, Y_train), (X_valid, Y_valid) = load_iris()

创建模型并拟合数据

In [5]:
model = NaiveBayes()
model.fit(X_train, Y_train)

预测结果

In [6]:
Y_pred = model.predict(X_valid)
accuracy = np.sum(Y_pred == Y_valid)/len(Y_valid)
print(f"accuracy: {accuracy:.4f}")

accuracy: 0.8667
