In [1]:
import  numpy as np

class WeakClassifier():
    """Weak classifier: a decision stump that thresholds a single feature.

    Attributes:
        feature_idx: Column index of the feature the stump splits on.
        feature_val: Threshold the selected feature is compared against.
        threshold_type: Split direction. "less" predicts +1 for values
            strictly below the threshold; any other value predicts +1 for
            values strictly above it.
        alpha: Weight of this classifier in the ensemble.
    """

    def __init__(self):
        # Every field starts unset and is expected to be assigned by the
        # code that trains the stump.
        self.feature_idx = None
        self.feature_val = None
        self.threshold_type = None
        self.alpha = None

    def predict(self, X):
        """Predict +1 where the stump's condition holds and -1 elsewhere.

        Args:
            X: 2-D array-like with one row per sample.

        Returns:
            1-D integer array with one entry (+1 or -1) per row of X.
        """
        column = np.asarray(X)[:, self.feature_idx]
        # Comparisons are strict, so a value equal to the threshold is
        # always predicted as -1.
        if self.threshold_type == "less":
            positive = column < self.feature_val
        else:
            positive = column > self.feature_val
        return np.where(positive, 1, -1)

class AdaBoost:
    """AdaBoost binary classifier built from single-feature decision stumps.

    Labels are handled internally as {-1, +1}: every label different from 1
    is treated as the negative class.

    Attributes:
        n_classifiers: Maximum number of boosting rounds (stumps) to train.
        learning_rate: Factor that scales the weight of every stump.
        classifiers: Fitted stumps, in the order they were trained.
    """

    # Bounds the weighted error away from 0 and 1 so that the log-odds used
    # for the stump weight stay finite even for a perfect stump.
    _EPS = 1e-10

    def __init__(self, n_classifiers=50, learning_rate=0.5):
        """Store the hyper-parameters and start with an empty ensemble.

        Args:
            n_classifiers: Number of weak classifiers to train.
            learning_rate: Multiplier applied to each classifier weight.
        """
        self.n_classifiers = n_classifiers
        self.learning_rate = learning_rate
        self.classifiers = list()

    @property
    def learing_rate(self):
        """Alias kept for callers that use the old, misspelled name."""
        return self.learning_rate

    @learing_rate.setter
    def learing_rate(self, value):
        self.learning_rate = value

    @staticmethod
    def _to_signed_labels(y):
        """Return a new array where label 1 stays 1 and all others are -1."""
        return np.where(np.asarray(y) == 1, 1, -1)

    def _classifier_weight(self, error):
        """Return learning_rate * log((1 - error) / error), kept finite."""
        error = min(max(error, self._EPS), 1 - self._EPS)
        return self.learning_rate * np.log((1 - error) / error)

    def _calc_error_rate(self, X, f_idx, val, threshold_type, y, weight):
        """Compute the weighted error of one candidate stump.

        Args:
            X: 2-D sample array.
            f_idx: Column index of the feature to threshold.
            val: Threshold value.
            threshold_type: "less" or anything else (treated as "greater").
            y: True labels in {-1, +1}.
            weight: Per-sample weights as a NumPy array.

        Returns:
            Tuple of (sum of the weights of misclassified samples,
            stump predictions).
        """
        y_predict = self._stump_predict(X, f_idx, val, threshold_type)
        weight_error = np.sum(weight[y_predict != y])
        return weight_error, y_predict

    def fit(self, X, y, X_test=None, y_test=None):
        """Train the ensemble and print progress after every round.

        Any stumps from a previous call are discarded first.

        Args:
            X: 2-D training samples.
            y: Training labels; every label other than 1 is mapped to -1.
            X_test: Optional held-out samples, used only for the printed
                accuracy and never for training.
            y_test: Optional held-out labels, mapped to {-1, +1} the same
                way as y.
        """
        X = np.asarray(X)
        y = self._to_signed_labels(y)
        report_test = X_test is not None and y_test is not None
        if report_test:
            y_test = self._to_signed_labels(y_test)

        # Start from scratch so that repeated calls do not stack ensembles.
        self.classifiers = list()
        self._n_samples, self._n_features = X.shape
        # Every sample starts with the same weight.
        weight = np.full(self._n_samples, 1 / self._n_samples)

        for n in range(self.n_classifiers):
            wc = WeakClassifier()
            min_error = 1
            y_predict = None
            # Exhaustive search over feature, threshold and split direction.
            for f_idx in range(self._n_features):
                features = np.unique(X[:, f_idx])
                # Candidate thresholds are the midpoints between
                # consecutive distinct values of the feature.
                for val in 0.5 * (features[:-1] + features[1:]):
                    for threshold_type in ["less", "great"]:
                        weight_error, curr_predict = self._calc_error_rate(
                            X, f_idx, val, threshold_type, y, weight)
                        if weight_error < min_error:
                            y_predict = curr_predict
                            min_error = weight_error
                            wc.feature_idx = f_idx
                            wc.feature_val = val
                            wc.threshold_type = threshold_type

            if y_predict is None:
                # No candidate split was accepted (for example every
                # feature is constant), so further rounds cannot help.
                break

            wc.alpha = self._classifier_weight(min_error)
            # Increase the weight of misclassified samples, decrease the
            # weight of correctly classified ones, then renormalise.
            weight = weight * np.exp(-wc.alpha * y * y_predict)
            weight /= np.sum(weight)

            self.classifiers.append(wc)

            print("基础决策器",n,":",np.sum(y_predict !=y),wc.feature_idx,\
                  wc.feature_val,wc.threshold_type,min_error,wc.alpha)
            print("此时训练集正确率为：",np.sum(self.predict(X) == y )/self._n_samples*100,"%")
            if report_test:
                print("此时测试集正确率为：",np.sum(self.predict(X_test) == y_test)/len(y_test)*100,'%')

    def _stump_predict(self, X, f_idx, f_val, threshold_type):
        """Predict +1 where the stump's condition holds and -1 elsewhere.

        Comparisons are strict, so a value equal to f_val is predicted -1.
        """
        column = np.asarray(X)[:, f_idx]
        if threshold_type == "less":
            positive = column < f_val
        else:
            positive = column > f_val
        return np.where(positive, 1, -1)

    def predict(self, X_test):
        """Return the sign of the alpha-weighted vote of all fitted stumps.

        Args:
            X_test: 2-D samples to classify.

        Returns:
            1-D float array of +1.0 / -1.0; 0.0 where the weighted vote is
            exactly zero (including when no stump has been fitted).
        """
        X_test = np.asarray(X_test)
        y_predict = np.zeros(X_test.shape[0])
        for clf in self.classifiers:
            base_predict = self._stump_predict(
                X_test, clf.feature_idx, clf.feature_val, clf.threshold_type)
            y_predict += clf.alpha * base_predict
        return np.sign(y_predict)


为了演示每加一个分类器后的预测准确率，故把测试集的样本和标签也一同传进去了，但是并没有使用它进行训练。

In [2]:

from  sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import numpy as np

# Load the data set and pull out the feature matrix and the label vector.
samples = load_breast_cancer()
x_samples, y_samples = samples.data, samples.target

# Recode the labels in place: everything that is not 1 becomes -1.
y_samples[y_samples != 1] = -1

# Split into training and test parts (no fixed seed, so the split and the
# printed numbers vary between runs).
X_train, X_test, y_train, y_test = train_test_split(x_samples, y_samples)
print(X_train.shape, y_train.shape)

# Train ten stumps; the test split is passed only so that fit can print the
# test accuracy after each round.
adaBoost = AdaBoost(learning_rate=0.5, n_classifiers=10)
adaBoost.fit(X_train, y_train, X_test, y_test)

# Final accuracy on the held-out split, in percent.
y_predict = adaBoost.predict(X_test)
n_correct = np.sum(y_predict == y_test)
print(n_correct / len(y_test) * 100, "%")

(426, 30) (426,)
基础决策器 0 : 35 27 0.1416 less 0.08215962441314556 1.206679749247976
此时训练集正确率为： 91.78403755868545 %
此时测试集正确率为： 91.6083916083916 %
基础决策器 1 : 46 23 784.1500000000001 less 0.13686518085495064 0.9207872702098626
此时训练集正确率为： 91.78403755868545 %
此时测试集正确率为： 91.6083916083916 %
基础决策器 2 : 108 1 19.365000000000002 less 0.1969683567201335 0.702675514723151
此时训练集正确率为： 95.53990610328638 %
此时测试集正确率为： 94.4055944055944 %
基础决策器 3 : 56 13 36.405 less 0.25929658954697754 0.5248138747265142
此时训练集正确率为： 95.07042253521126 %
此时测试集正确率为： 94.4055944055944 %
基础决策器 4 : 192 21 20.225 less 0.22991877278802447 0.6043849572070298
此时训练集正确率为： 95.77464788732394 %
此时测试集正确率为： 95.8041958041958 %
基础决策器 5 : 37 7 0.0501 less 0.2632118331618701 0.5146706336705278
此时训练集正确率为： 95.77464788732394 %
此时测试集正确率为： 95.1048951048951 %
基础决策器 6 : 176 18 0.01744 great 0.288585340326979 0.45113230843195395
此时训练集正确率为： 96.94835680751174 %
此时测试集正确率为： 95.8041958041958 %
基础决策器 7 : 117 28 0.31745 less 0.31924676948510955 0.37861775063996