# Adaboost Lab

## 准备工作
### 环境准备
请确保完成以下依赖包的安装，并且通过下面代码来导入与验证。

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### 数据集准备
我们将使用以下数据集进行 Adaboost 的训练。

该数据集与决策树部分使用的数据集相同，包括 7 个特征以及一个标签“是否适合攻读博士”。这些特征涵盖了适合攻读博士的各种条件，例如 “love doing research”、“I absolutely want to be a college professor” 等。

请执行下面的代码来加载数据集。


In [2]:
# Load the train / test splits of the PhD-suitability dataset.
train_data = pd.read_csv('train_phd_data.csv')
test_data = pd.read_csv('test_phd_data.csv')

# AdaBoost expects labels in {-1, +1}: remap the last column (0 -> -1, 1 -> 1)
# of both frames in place.
for _frame in (train_data, test_data):
    _frame.iloc[:, -1] = _frame.iloc[:, -1].map({0: -1, 1: 1})

## Adaboost (15 pts)

在上一个lab中，你已经成功完成了 Decision Tree 的构建。在本部分，你可以继续沿用上一部分的代码，学习并完成 Adaboost 模型的训练。

在这个 Adaboost 模型中，我们选择了一层决策树作为弱学习器，并使用基尼系数作为分类标准。

请完成以下类的构建以及相应函数的实现：

1. **weakClassifier()**: 我们采用一层决策树（决策树桩）作为弱学习器，包括划分数据的 `split_data()`、选择最优划分特征的 `best_split()`、训练用的 `fit()` 以及预测用的 `predict()`。你可以参考上一次实验中的代码。

2. **Adaboost()**: 包括弱学习器的集合 `clfs`、拟合过程 `fit()` 和预测过程 `predict()`。


In [3]:
class weakClassifier:
    """Decision stump (one-level decision tree) used as the AdaBoost weak learner.

    The stump picks the single categorical feature whose split minimises the
    sample-weighted Gini impurity and stores, for every observed value of that
    feature, the weighted-majority label. Labels are expected to be -1 / +1.

    Attributes:
        tree: ``{feature_name: {feature_value: label}}`` once fitted, else None.
        alpha: Vote weight of this stump; assigned by the boosting loop.
        default_label: Label returned by ``predict`` for feature values that
            never appeared during training (weighted-majority training label).
    """

    def __init__(self):
        self.tree = None
        self.alpha = None
        self.default_label = -1

    @staticmethod
    def split_data(data, column):
        """Partition ``data`` by the distinct values found in one column.

        Args:
            data: DataFrame to partition.
            column: Positional index of the column to split on.

        Returns:
            A pandas Series keyed "0", "1", ... whose elements are the
            sub-DataFrames, in order of first appearance of each value.
        """
        # The container holds DataFrames, so it is declared as object dtype.
        parts = pd.Series(dtype=object)
        values = data.iloc[:, column]
        for i, value in enumerate(values.unique()):
            parts[str(i)] = data.loc[values == value]
        return parts

    def best_split(self, X, y, sample_weight):
        """Choose the feature with the lowest sample-weighted Gini impurity.

        Side effect: the columns ``label`` and ``SampleWeight`` are written
        into ``X`` itself. This is kept on purpose because existing callers
        drop those two columns from ``X`` after fitting.

        Args:
            X: DataFrame of categorical features (mutated, see above).
            y: Labels in {-1, +1}, matched to the rows of ``X`` by position.
            sample_weight: Non-negative weight of each row, by position.

        Returns:
            ``(feature_name, partitions)`` where ``partitions`` is the result
            of ``split_data`` for the chosen feature, or ``(None, None)`` when
            ``X`` has no feature columns.
        """
        num_features = X.shape[1]

        # Positional assignment keeps labels and weights lined up with the
        # rows even when ``y`` carries a different index than ``X``.
        X['label'] = np.asarray(y)
        X['SampleWeight'] = np.asarray(sample_weight)

        if num_features == 0:
            return None, None

        best_index, best_parts, best_impurity = 0, None, np.inf
        for index in range(num_features):
            parts = self.split_data(X, index)
            impurity = 0.0
            for part in parts:
                weights = part['SampleWeight']
                part_weight = weights.sum()
                if part_weight <= 0:
                    # A weightless partition cannot contribute impurity and
                    # would otherwise cause a 0/0 division.
                    continue
                positive = weights[part['label'] == 1].sum()
                negative = weights[part['label'] == -1].sum()
                # W_j * (1 - (p+^2 + p-^2) / W_j^2): each partition counts in
                # proportion to its weight mass, not its row count. The
                # constant 1 / total_weight factor is omitted because it does
                # not change which feature is the minimum.
                impurity += part_weight - (positive ** 2 + negative ** 2) / part_weight

            # Strict "<" keeps the earliest feature on ties.
            if impurity < best_impurity:
                best_impurity = impurity
                best_index = index
                best_parts = parts

        if best_parts is None:
            # No score was comparable (e.g. NaN weights): fall back to the
            # first feature.
            best_parts = self.split_data(X, 0)

        return X.columns[best_index], best_parts

    def fit(self, X, y, sample_weight):
        """Fit the stump to weighted training data.

        Args:
            X: The features of the data (gains ``label`` / ``SampleWeight``
                columns as a side effect of ``best_split``).
            y: The labels of the data, in {-1, +1}.
            sample_weight: The weight of each sample.

        Returns:
            None, but ``self.tree`` and ``self.default_label`` are updated.
            ``self.tree`` is left untouched when ``X`` has no features.
        """
        best_feature, best_splits = self.best_split(X, y, sample_weight)
        if best_feature is None:
            return

        # Fallback answer for categories that only show up at prediction time.
        overall_vote = np.sum(np.asarray(y) * np.asarray(sample_weight))
        self.default_label = 1 if overall_vote > 0 else -1

        branches = {}
        for part in best_splits:
            if part.empty:
                # Happens for NaN feature values, which never compare equal.
                continue
            value = part[best_feature].iloc[0]
            vote = (part['label'] * part['SampleWeight']).sum()
            # Weighted majority vote; ties go to -1.
            branches[value] = 1 if vote > 0 else -1
        self.tree = {best_feature: branches}

    def predict(self, x):
        """Predict a label for every row of ``x``.

        Args:
            x: DataFrame containing at least the feature the stump splits on.

        Returns:
            A list with one label (-1 or +1) per row. Feature values that were
            not seen during training map to ``self.default_label``.

        Raises:
            ValueError: If the stump has not been fitted.
        """
        if self.tree is None:
            raise ValueError("weakClassifier.predict() called before fit()")

        feature = next(iter(self.tree))
        branches = self.tree[feature]
        return [branches.get(value, self.default_label) for value in x[feature]]

In [4]:
class Adaboost:
    """AdaBoost ensemble of decision stumps for binary labels in {-1, +1}.

    Attributes:
        n_estimators: Maximum number of boosting rounds.
        clfs: Fitted weak classifiers, each carrying its vote weight ``alpha``.
    """

    # Errors are clipped into [_EPS, 1 - _EPS] so that alpha stays finite.
    _EPS = 1e-10

    def __init__(self, n_estimators=10):
        self.n_estimators = n_estimators
        self.clfs = []

    @classmethod
    def _estimator_weight(cls, error):
        """Return alpha = 0.5 * ln((1 - error) / error) with ``error`` clipped off 0 and 1."""
        error = min(max(float(error), cls._EPS), 1.0 - cls._EPS)
        return 0.5 * np.log((1.0 - error) / error)

    def fit(self, X, y):
        """Train up to ``n_estimators`` stumps with AdaBoost re-weighting.

        ``X`` is not modified: every weak learner is fitted on a private copy.
        Calling ``fit`` again discards the previously trained ensemble.

        Args:
            X: DataFrame of features.
            y: Labels in {-1, +1}, matched to the rows of ``X`` by position.

        Returns:
            None, but ``self.clfs`` is replaced by the new ensemble.

        Raises:
            ValueError: If ``X`` has no rows.
        """
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("Adaboost.fit() requires at least one sample")

        labels = np.asarray(y)
        w = np.ones(n_samples) / n_samples
        self.clfs = []

        for _ in range(self.n_estimators):
            clf = weakClassifier()
            # The weak learner writes helper columns into the frame it is
            # given, so hand it a copy and keep the caller's X clean.
            clf.fit(X.copy(), labels, w)

            y_pred = np.asarray(clf.predict(X))
            error = np.sum(w[y_pred != labels])
            alpha = self._estimator_weight(error)

            # save classifier and weight
            clf.alpha = alpha
            self.clfs.append(clf)

            if error <= 0:
                # A perfect stump leaves nothing to re-weight; further rounds
                # would only reproduce the same classifier.
                break

            # Up-weight misclassified samples, down-weight correct ones. A
            # stump worse than chance gets a negative alpha, which inverts
            # its vote.
            w = w * np.exp(-alpha * labels * y_pred)
            w /= np.sum(w)

    def predict(self, X):
        """Predict labels by the alpha-weighted vote of all weak classifiers.

        Args:
            X: The features of the data.

        Returns:
            y_pred: numpy array with the sign of the weighted vote for each
            row (+1.0 or -1.0; 0.0 only when the vote is exactly tied).

        Raises:
            ValueError: If the model has not been fitted.
        """
        if not self.clfs:
            raise ValueError("Adaboost.predict() called before fit()")

        alphas = np.array([clf.alpha for clf in self.clfs])
        votes = np.array([clf.predict(X) for clf in self.clfs])
        return np.sign(np.dot(alphas, votes))

In [5]:
# Train a 10-round AdaBoost ensemble: every column but the last is a feature,
# the last column is the label.
adaboost_model = Adaboost(n_estimators=10)
train_features, train_labels = train_data.iloc[:, :-1], train_data.iloc[:, -1]
adaboost_model.fit(train_features, train_labels)

# Score the ensemble on the held-out split and report the share of exact matches.
test_features = test_data.iloc[:, :-1]
predictions = adaboost_model.predict(test_features)
test_labels = test_data.iloc[:, -1].values
accuracy = np.mean(predictions == test_labels)
print("The accuracy of Adaboost is: ", accuracy)

The accuracy of Adaboost is:  1.0
