# 作业四

## 四、AdaBoost算法

利用算法8.1对iris数据集进行分类

* 利用sklearn提供的`DecisionTreeClassifier`构造单层决策树作为基本分类器
* 调整式(8.7)以适用于多分类的情况

ETA：0.5-3 hours

In [1]:
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import math

In [2]:
def load_iris(ratio=0.8):
    """Load the iris dataset and split it randomly into train/validation parts.

    Args:
        ratio: Fraction of the samples assigned to the training split. The
            training size is ``ceil(num_samples * ratio)``.

    Returns:
        A pair ``(traindata, validdata)``; each element is a
        ``(features, target)`` tuple of arrays indexed by the same shuffled
        sample order.
    """
    # `return_X_y` must be passed by keyword: recent scikit-learn releases
    # reject positional use of this argument.
    features, target = datasets.load_iris(return_X_y=True)

    num_samples = len(target)
    num_train = math.ceil(num_samples * ratio)

    # Shuffle the sample indices; uses the global NumPy RNG, so the split
    # differs between runs unless the caller seeds np.random beforehand.
    idx = np.random.permutation(num_samples)
    train_idx, valid_idx = idx[:num_train], idx[num_train:]

    traindata = features[train_idx], target[train_idx]
    validdata = features[valid_idx], target[valid_idx]

    return traindata, validdata

In [3]:
class AdaBoostClassifier:
    """AdaBoost ensemble built from depth-1 decision trees (decision stumps).

    Each boosting round fits a ``DecisionTreeClassifier(max_depth=1)`` on the
    current sample weights, computes its weighted error and coefficient, and
    re-weights the samples so misclassified ones count more in the next round.
    Prediction takes the arg-max over the coefficient-weighted sum of each
    stump's ``predict_proba`` output.

    NOTE(review): the coefficient uses the two-class form
    ``0.5 * log((1 - e) / e)``. The SAMME multi-class variant adds a
    ``log(K - 1)`` term; confirm whether that adjustment is wanted here.
    """

    def __init__(self, n_estimators=50):
        """Store the number of boosting rounds to run in ``fit``."""
        self.n_estimators = n_estimators

    def fit(self, X, y):
        """Fit the ensemble.

        Inputs:
          X: array of shape (N, C)
          y: array of shape (N, )

        Returns:
          self, so calls can be chained.
        """
        assert len(X.shape) == 2
        assert X.shape[0] == y.shape[0]

        # Sorted unique labels; predict() maps arg-max columns back to these.
        self.classes_ = np.unique(y)

        self.estimators_ = []
        # alpha_m for each round.
        self.estimator_weights_ = np.zeros(self.n_estimators)
        # e_m for each round.
        self.estimator_errors_ = np.ones(self.n_estimators)

        # Not read anywhere in this class; kept so the attribute still exists.
        self.boss = 0

        num_samples = X.shape[0]
        # D_1: start from a uniform distribution over the samples.
        sample_weight = np.full((num_samples,), 1. / num_samples)

        for iboost in range(self.n_estimators):
            sample_weight, estimator, estimator_weight, estimator_error = \
                self._boost(X, y, sample_weight)
            self.estimators_.append(estimator)
            self.estimator_weights_[iboost] = estimator_weight
            self.estimator_errors_[iboost] = estimator_error

        return self

    def _boost(self, X, y, sample_weight):
        """Run one boosting round.

        Inputs:
          X: array of shape (N, C)
          y: array of shape (N, )
          sample_weight: array of shape (N, ), the current distribution D_m

        Returns:
          (new_sample_weight, estimator, estimator_weight, estimator_error)
        """
        estimator = DecisionTreeClassifier(max_depth=1)
        estimator.fit(X, y, sample_weight=sample_weight)

        # Compare against the `y` argument (not a module-level variable) so
        # the classifier works on whatever data fit() was given.
        misclassified = estimator.predict(X) != y

        # e_m: weighted error of this round's stump.
        estimator_error = np.sum(sample_weight * misclassified)

        # alpha_m; the small constant keeps the log finite when e_m is 0 or 1.
        eps = 0.00001
        estimator_weight = 0.5 * np.log(
            (1 - estimator_error + eps) / (estimator_error + eps)
        )

        # Scale misclassified samples by exp(alpha_m) and correct ones by
        # exp(-alpha_m), then normalise so the weights sum to 1 (D_{m+1}).
        exponent = np.where(misclassified, estimator_weight, -estimator_weight)
        scaled = sample_weight * np.exp(exponent)
        sample_weight = scaled / np.sum(scaled)

        return sample_weight, estimator, estimator_weight, estimator_error

    def predict(self, X):
        """Predict a class label for every row of X.

        Inputs:
          X: array of shape (N, C)

        Returns:
          array of shape (N, ) holding entries of ``self.classes_``.
        """
        scores = np.zeros((X.shape[0], len(self.classes_)))
        # Soft voting: accumulate alpha_m * predict_proba instead of one-hot
        # encoding each stump's hard prediction.
        for estimator, alpha in zip(self.estimators_, self.estimator_weights_):
            scores = scores + alpha * estimator.predict_proba(X)

        # Map the winning column back to the actual label so the result is
        # correct even when labels are not 0..K-1.
        return self.classes_[np.argmax(scores, axis=1)]

In [4]:
def accuracy(Y_real, Y_pred):
    """Return the fraction of positions where Y_real and Y_pred agree.

    Args:
        Y_real: Array-like of reference labels.
        Y_pred: Array-like of predicted labels, same shape as Y_real.

    Returns:
        The mean of the element-wise equality as a float.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Coerce to arrays first: comparing two plain lists with == yields a
    # single bool rather than an element-wise result.
    real = np.asarray(Y_real)
    pred = np.asarray(Y_pred)
    if real.shape != pred.shape:
        raise ValueError(
            f"shape mismatch: Y_real {real.shape} vs Y_pred {pred.shape}"
        )
    return np.mean(real == pred)

In [5]:
# Split iris into training and validation sets (default 80/20 ratio).
train_set, valid_set = load_iris()
X_train, Y_train = train_set
X_valid, Y_valid = valid_set

# Train a 50-round boosted ensemble; fit() returns the estimator itself.
model = AdaBoostClassifier(n_estimators=50).fit(X_train, Y_train)

# Score the ensemble on the held-out validation samples.
valid_pred = model.predict(X_valid)
accu = accuracy(valid_pred, Y_valid)
print(f"Accuracy: {accu:.4f}")

Accuracy: 0.9667
