# AdaBoost Implementation

## `Algorithm`

1\. Start with the same weight for all data points: $\alpha_j = 1/N$

2\. For $t = 1, \dots, T$:
  * Learn $f_t(x)$ with data weights $\alpha_j$
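  * Compute coefficient $\hat{w}_t$, where $\mbox{E}(\mathbf{\alpha}, \mathbf{\hat{y}})$ is the weighted classification error (total weight of the misclassified points divided by the total weight of all points):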
     $$\hat{w}_t = \frac{1}{2}\ln{\left(\frac{1- \mbox{E}(\mathbf{\alpha}, \mathbf{\hat{y}})}{\mbox{E}(\mathbf{\alpha}, \mathbf{\hat{y}})}\right)}$$
  * Re-compute weights $\alpha_j$:
     $$\alpha_j \gets \begin{cases}
     \alpha_j \exp{(-\hat{w}_t)} & \text{ if }f_t(x_j) = y_j\\
     \alpha_j \exp{(\hat{w}_t)} & \text{ if }f_t(x_j) \neq y_j
     \end{cases}$$
  * Normalize weights $\alpha_j$:
      $$\alpha_j \gets \frac{\alpha_j}{\sum_{i=1}^{N}{\alpha_i}} $$
  

In [33]:
class AdaBoostClassifier:
    """Binary AdaBoost classifier built from weighted, Gini-split decision trees.

    The training frame must contain the feature columns and the target column.
    Target labels are expected to be 0 (negative class) and 1 (positive
    class): the tree-building helpers test labels against exactly 0 and 1.
    Internally each tree votes -1 / +1, and ``predict`` maps the weighted vote
    back to 0 / 1.

    Args:
        num_tree_stumps: Number of boosting rounds (trees) to train.
        max_depth: Maximum number of split levels in each tree (1 = stump).
        verbose: When True (the default), print training progress.
    """

    # The weighted error is clipped into [_ERROR_EPS, 1 - _ERROR_EPS] before the
    # model coefficient is computed. Without this, a tree that classifies the
    # weighted data perfectly (error 0) yields an infinite coefficient, which
    # turns every data weight into NaN on renormalisation and makes the next
    # boosting round fail.
    _ERROR_EPS = 1e-10

    def __init__(self, num_tree_stumps=30, max_depth=1, verbose=True):
        self.max_depth = max_depth
        self.num_tree_stumps = num_tree_stumps
        self.verbose = verbose
        self.n = None
        self.features = None
        self.tree_weights = None
        self.trees = None

    def _log(self, message):
        """Print ``message`` unless verbose output has been switched off."""
        # getattr keeps instances created without __init__ working (verbose on).
        if getattr(self, "verbose", True):
            print(message)

    def fit(self, data, features, target):
        """Train the ensemble.

        Args:
            data: DataFrame holding the feature columns and the target column.
            features: List of feature column names to split on.
            target: Name of the 0/1 label column in ``data``.

        Raises:
            ValueError: If ``data`` has no rows.
        """
        if len(data) == 0:
            raise ValueError("Cannot fit AdaBoostClassifier on an empty dataset.")

        self.n = len(data)
        self.features = features

        self.tree_weights, self.trees = self._train_ada_boost(
            data, features, target, self.num_tree_stumps, self.max_depth)

    def predict(self, X):
        """Return a numpy array of 0/1 predictions, one per row of ``X``.

        Each tree votes -1 or +1, scaled by its coefficient; a strictly
        positive total gives class 1, anything else class 0.
        """
        scores = np.zeros(len(X))

        for tree, tree_weight in zip(self.trees, self.tree_weights):
            scores += self._predict_frame(tree, X) * tree_weight

        return np.where(scores > 0, 1, 0)

    def _train_ada_boost(self, data, features, target, num_tree_stumps, tree_depth):
        """Run the boosting loop.

        Returns:
            A ``(weights, tree_stumps)`` pair: the coefficient of every tree
            and the trees themselves (nested dicts), in training order.
        """
        N = len(data)
        alpha = np.ones(N) / N
        weights = []
        tree_stumps = []
        # Convert labels to AdaBoost format (1 and -1)
        target_values = np.where(data[target].to_numpy() > 0, 1, -1)

        for t in range(num_tree_stumps):
            self._log('=====================================================')
            self._log('AdaBoost Iteration %d' % t)
            self._log('=====================================================')
            # Learn a weighted decision tree (a stump when tree_depth == 1).
            tree_stump = self._train_tree(data, features, target, data_weights=alpha, max_depth=tree_depth)
            tree_stumps.append(tree_stump)

            predictions = self._predict_frame(tree_stump, data)
            is_correct = predictions == target_values

            # Weighted error: share of the total weight on misclassified rows,
            # kept strictly inside (0, 1) so the logarithm below stays finite.
            weighted_error = np.sum(alpha[~is_correct]) / np.sum(alpha)
            weighted_error = np.clip(weighted_error, self._ERROR_EPS, 1 - self._ERROR_EPS)

            # Compute model coefficient using weighted error
            weight = 0.5 * np.log((1 - weighted_error) / weighted_error)
            weights.append(weight)

            # Down-weight correctly classified rows, up-weight mistakes,
            # then renormalise so the weights sum to 1.
            alpha = alpha * np.where(is_correct, np.exp(-weight), np.exp(weight))
            alpha = alpha / np.sum(alpha)

        return weights, tree_stumps

    def _calculate_gini(self, labels_in_node, data_weights):
        """Return the weighted Gini impurity of a node with 0/1 labels.

        An empty node, or a node whose total weight is not positive, has
        impurity 0.
        """
        if len(labels_in_node) == 0:
            return 0

        labels = np.asarray(labels_in_node)
        node_weights = np.asarray(data_weights)

        node_wght = np.sum(node_weights)
        if node_wght <= 0:
            return 0

        first_class_wght = np.sum(node_weights[labels == 1])
        second_class_wght = np.sum(node_weights[labels == 0])

        # Gini = 1 - sum(Pr_c^2)
        return 1 - ((first_class_wght / node_wght) ** 2 + (second_class_wght / node_wght) ** 2)

    def _calculate_gini_and_get_best_split_threshold(self, data, data_weights, feature, target):
        """Find the threshold on ``feature`` with the lowest weighted Gini.

        Rows with ``value < threshold`` go left, rows with
        ``value >= threshold`` go right. Candidate thresholds are the unique
        feature values, or 51 quantiles of them when there are more than 100.

        Returns:
            ``(best_gini, best_threshold)``; the threshold is ``None`` when no
            candidate produced a Gini below infinity.
        """
        feature_values = data[feature]
        labels = data[target].to_numpy()
        all_weights = np.asarray(data_weights)

        unique_inputs = feature_values.unique()
        if len(unique_inputs) > 100:
            unique_inputs = np.quantile(unique_inputs, np.arange(0, 1.01, 0.02))

        N = np.sum(all_weights)
        best_gini = float("inf")
        best_threshold = None

        for val in unique_inputs:
            goes_left = (feature_values < val).to_numpy()
            goes_right = (feature_values >= val).to_numpy()

            L_wgh = all_weights[goes_left]
            R_wgh = all_weights[goes_right]

            gini_L = self._calculate_gini(labels[goes_left], L_wgh)
            gini_R = self._calculate_gini(labels[goes_right], R_wgh)

            # Impurity of the split: children weighted by their share of weight.
            gini_for_current_val = (np.sum(L_wgh) / N) * gini_L + (np.sum(R_wgh) / N) * gini_R

            if gini_for_current_val < best_gini:
                best_threshold = val
                best_gini = gini_for_current_val

        return best_gini, best_threshold

    def _find_best_splitting_feature(self, data, data_weights, features, target):
        """Return ``[best_feature, best_gini, best_split_threshold]``.

        ``best_feature`` is ``None`` when no feature yields a Gini below
        infinity (for example when ``features`` is empty).
        """
        best_feature = None
        best_split_threshold = None
        best_gini = float("inf")

        # Loop through each feature to consider splitting on that feature
        for feature in features:
            current_feature_gini, threshold = \
                self._calculate_gini_and_get_best_split_threshold(data, data_weights, feature, target)

            if current_feature_gini < best_gini:
                best_feature = feature
                best_gini = current_feature_gini
                best_split_threshold = threshold

        return [best_feature, best_gini, best_split_threshold]

    def _predict(self, tree, x):
        """Return the -1/+1 prediction of ``tree`` for a single row ``x``."""
        if tree["is_leaf"]:
            return tree["prediction"]

        if x[tree['splitting_feature']] < tree["split_threshold"]:
            return self._predict(tree["left"], x)
        return self._predict(tree["right"], x)

    def _predict_frame(self, tree, X):
        """Return the predictions of ``tree`` for every row of ``X`` as an array.

        Routes rows exactly like ``_predict`` (``value < threshold`` goes
        left, everything else goes right) but works on whole columns instead
        of one row at a time.
        """
        if tree["is_leaf"]:
            return np.full(len(X), tree["prediction"])

        goes_left = (X[tree['splitting_feature']] < tree["split_threshold"]).to_numpy()
        predictions = np.empty(len(X))
        predictions[goes_left] = self._predict_frame(tree["left"], X[goes_left])
        predictions[~goes_left] = self._predict_frame(tree["right"], X[~goes_left])
        return predictions

    def _calculate_node_weighted_mistakes(self, labels_in_node, data_weights):
        """Return ``(weighted_mistakes, best_class)`` for a node.

        Predicting +1 for the whole node costs the weight of the 0-labelled
        rows; predicting -1 costs the weight of the 1-labelled rows. The
        cheaper option is returned, with ties going to +1.
        """
        labels = np.asarray(labels_in_node)
        node_weights = np.asarray(data_weights)

        weighted_mistakes_all_negative = np.sum(node_weights[labels == +1])
        weighted_mistakes_all_positive = np.sum(node_weights[labels == 0])

        if weighted_mistakes_all_positive <= weighted_mistakes_all_negative:
            return weighted_mistakes_all_positive, +1
        return weighted_mistakes_all_negative, -1

    def _create_leaf(self, target_values, data_weights):
        """Build a leaf dict predicting the weighted-majority class (-1/+1)."""
        _, best_class = self._calculate_node_weighted_mistakes(target_values, data_weights)

        return {'splitting_feature': None,
                'is_leaf': True,
                'prediction': best_class}

    def _create_node(self, splitting_feature, split_threshold, left_tree, right_tree):
        """Build an internal-node dict for a ``feature < threshold`` split."""
        return {"is_leaf": False,
                "prediction": None,
                "splitting_feature": splitting_feature,
                "split_threshold": split_threshold,
                "left": left_tree,
                "right": right_tree}

    def _train_tree(self, data, features, target, data_weights, current_depth=1, max_depth=1):
        """Recursively grow a weighted decision tree and return it as nested dicts.

        Args:
            data: DataFrame with the rows that reached this node.
            features: Feature names still available for splitting.
            target: Name of the 0/1 label column.
            data_weights: One weight per row of ``data``, in the same order.
            current_depth: Depth of this node (the root is 1).
            max_depth: Nodes deeper than this become leaves.
        """
        remaining_features = list(features)
        target_values = data[target]
        data_weights = np.asarray(data_weights)

        self._log("--------------------------------------------------------------------")
        self._log("Subtree, depth = %s (%s data points)." % (current_depth, len(target_values)))

        # Stopping condition 1. Error is 0.
        if self._calculate_node_weighted_mistakes(target_values, data_weights)[0] <= 1e-15:
            self._log("Stopping condition 1 reached, error is 0")
            return self._create_leaf(target_values, data_weights)

        # Stopping condition 2: No more features to split on.
        if not remaining_features:
            self._log("Stopping condition 2 reached. No remaining features.")
            return self._create_leaf(target_values, data_weights)

        # Early stopping condition 1: Reached max depth limit.
        if current_depth > max_depth:
            self._log("Early stopping condition 1 reached. Reached maximum depth.")
            return self._create_leaf(target_values, data_weights)

        # Find the best splitting feature
        splitting_feature, _, split_threshold = \
            self._find_best_splitting_feature(data, data_weights, features, target)

        # No candidate split scored below infinity (e.g. degenerate weights):
        # fall back to a leaf instead of failing on remove(None) below.
        if splitting_feature is None:
            self._log("No valid split found. Creating leaf node.")
            return self._create_leaf(target_values, data_weights)

        # A feature is used at most once along any root-to-leaf path.
        remaining_features.remove(splitting_feature)

        goes_left = (data[splitting_feature] < split_threshold).to_numpy()
        goes_right = (data[splitting_feature] >= split_threshold).to_numpy()

        left_split = data[goes_left]
        right_split = data[goes_right]

        left_data_weights = data_weights[goes_left]
        right_data_weights = data_weights[goes_right]

        self._log("Split on feature %s. (%s, %s)" % (splitting_feature, len(left_split), len(right_split)))

        # If one side received every row the split separates nothing: make a leaf.
        if len(left_split) == len(data) or len(right_split) == len(data):
            self._log("Creating leaf node.")
            return self._create_leaf(target_values, data_weights)

        # Repeat (recurse) on left and right subtrees
        left_tree = self._train_tree(
            left_split, remaining_features, target, left_data_weights, current_depth + 1, max_depth)
        right_tree = self._train_tree(
            right_split, remaining_features, target, right_data_weights, current_depth + 1, max_depth)

        return self._create_node(splitting_feature, split_threshold, left_tree, right_tree)

# Load and prepare data for training

In [34]:
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt

In [35]:
# Load the cardio training data; the file uses ";" as its field separator.
cardio_df = pd.read_csv(
    filepath_or_buffer="./data/cardio_train.csv",
    delimiter=";",
)

In [36]:
# Remove the "id" column so it is not picked up as a feature.
cardio_df = cardio_df.drop("id", axis=1)

In [37]:
# Age is given in days: convert it to whole years (the fractional part is dropped).
cardio_df["age"] = (cardio_df["age"] / 365).astype(int)

In [38]:
# The label column is "cardio"; every other column is used as a feature.
target = "cardio"
features = cardio_df.columns.tolist()
features.remove(target)

In [39]:
# Sample 75% of the rows for training (fixed seed for reproducibility);
# every row whose index label was not sampled forms the test set.
train = cardio_df.sample(frac=0.75, random_state=1)
test = cardio_df[~cardio_df.index.isin(train.index)]

## Train and compare Custom AdaBoost and SKlearn AdaBoost 

In [40]:
# Fit the custom AdaBoost ensemble: 50 boosting rounds of depth-1 trees.
custom_tree_model = AdaBoostClassifier(max_depth=1, num_tree_stumps=50)
custom_tree_model.fit(data=train, features=features, target=target)

AdaBoost Iteration 0
--------------------------------------------------------------------
Subtree, depth = 1 (52500 data points).
Split on feature ap_hi. (31007, 21493)
--------------------------------------------------------------------
Subtree, depth = 2 (31007 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 2 (21493 data points).
Early stopping condition 1 reached. Reached maximum depth.
AdaBoost Iteration 1
--------------------------------------------------------------------
Subtree, depth = 1 (52500 data points).
Split on feature age. (23398, 29102)
--------------------------------------------------------------------
Subtree, depth = 2 (23398 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 2 (29102 data points).
Early stopping condition 1 reached. Reached maximum

Split on feature weight. (6079, 46421)
--------------------------------------------------------------------
Subtree, depth = 2 (6079 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 2 (46421 data points).
Early stopping condition 1 reached. Reached maximum depth.
AdaBoost Iteration 14
--------------------------------------------------------------------
Subtree, depth = 1 (52500 data points).
Split on feature age. (45135, 7365)
--------------------------------------------------------------------
Subtree, depth = 2 (45135 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 2 (7365 data points).
Early stopping condition 1 reached. Reached maximum depth.
AdaBoost Iteration 15
--------------------------------------------------------------------
Subtree, depth = 1 (52500 data po

AdaBoost Iteration 27
--------------------------------------------------------------------
Subtree, depth = 1 (52500 data points).
Split on feature ap_hi. (37807, 14693)
--------------------------------------------------------------------
Subtree, depth = 2 (37807 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 2 (14693 data points).
Early stopping condition 1 reached. Reached maximum depth.
AdaBoost Iteration 28
--------------------------------------------------------------------
Subtree, depth = 1 (52500 data points).
Split on feature ap_hi. (31007, 21493)
--------------------------------------------------------------------
Subtree, depth = 2 (31007 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 2 (21493 data points).
Early stopping condition 1 reached. Reached max

Split on feature cholesterol. (46413, 6087)
--------------------------------------------------------------------
Subtree, depth = 2 (46413 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 2 (6087 data points).
Early stopping condition 1 reached. Reached maximum depth.
AdaBoost Iteration 41
--------------------------------------------------------------------
Subtree, depth = 1 (52500 data points).
Split on feature ap_hi. (31007, 21493)
--------------------------------------------------------------------
Subtree, depth = 2 (31007 data points).
Early stopping condition 1 reached. Reached maximum depth.
--------------------------------------------------------------------
Subtree, depth = 2 (21493 data points).
Early stopping condition 1 reached. Reached maximum depth.
AdaBoost Iteration 42
--------------------------------------------------------------------
Subtree, depth = 1 (5250

In [41]:
from sklearn.ensemble import AdaBoostClassifier as AdaBoost
from sklearn.tree import DecisionTreeClassifier as DTree

# Reference model: scikit-learn AdaBoost over depth-1 trees with 50 estimators,
# matching the settings used for the custom model.
sklearn_dtree = DTree(max_depth=1)
# The weak learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot means the same in both.
sklearn_ada_boost = AdaBoost(sklearn_dtree, n_estimators=50)
sklearn_ada_boost.fit(train[features], train[target])

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1))

In [43]:
from sklearn.metrics import accuracy_score

# Score both ensembles on the held-out rows. The custom model receives the whole
# test frame, the scikit-learn model only the feature columns.
accuracy_custom = accuracy_score(
    y_true=test[target],
    y_pred=custom_tree_model.predict(test),
)
accuracy_sklearn = accuracy_score(
    y_true=test[target],
    y_pred=sklearn_ada_boost.predict(test[features]),
)

print("Custom model accuracy score on test", accuracy_custom)
print("SKlearn model accuracy score on test", accuracy_sklearn)

Custom model accuracy score on test 0.7273714285714286
SKlearn model accuracy score on test 0.73
