In [1]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn import datasets
# Import train_test_split function
from sklearn.model_selection import train_test_split
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

In [5]:
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [4]:
abc = AdaBoostClassifier(n_estimators=50,
                         learning_rate=0.1)
# Train Adaboost Classifer
model = abc.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = model.predict(X_test)


In [5]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))


Accuracy: 0.9333333333333333


In [16]:
import numpy as np
import copy
class Node:
    def __init__(self, gini, num_samples, num_samples_per_class, predicted_class):
        self.gini = gini
        self.num_samples = num_samples
        self.num_samples_per_class = num_samples_per_class
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.threshold = 0
        self.left = None
        self.right = None

class DecisionTreeClassifier:
    
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
    
    def _gini(self, y, sample_weight):
        
        class_weight = [0.0, 0.0, 0.0]
        
        for i, label in enumerate(y):
            class_weight[label] += sample_weight[i]
        
        return 1.0 - sum((class_weight[c])**2 for c in range(self.n_classes_))
    
    def fit(self, X, y, sample_weight):
        self.N = len(y)
        self.n_classes_ = len(set(y))
        self.n_features_ = X.shape[1]
        self.tree_ = self._grow_tree(X, y, sample_weight, 0)
        
    def predict(self, X):
        return [self._predict(inputs) for inputs in X]

    def _predict(self, inputs):
        node = self.tree_
        while node.left:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.predicted_class
    
    def _grow_tree(self, X, y, sample_weight, depth=0):
        
        num_samples_per_class = [0, 0, 0]
        for i, label in enumerate(y):
            num_samples_per_class[label] += sample_weight[i]*self.N
        
        predicted_class = np.argmax(num_samples_per_class)
        node = Node(
            gini=self._gini(y, sample_weight), 
            num_samples=len(y), 
            num_samples_per_class=num_samples_per_class,
            predicted_class=predicted_class,
            )
        
        if depth < self.max_depth:
            idx, thr = self._best_split(X, y, sample_weight)
            if idx is not None:
                indices_left = X[:, idx] < thr
                X_left, y_left, sample_weight_left = X[indices_left], y[indices_left], sample_weight[indices_left]
                X_right, y_right, sample_weight_right = X[~indices_left], y[~indices_left], sample_weight[~indices_left]
                node.feature_index = idx
                node.threshold = thr
                node.left = self._grow_tree(X_left, y_left, sample_weight_left, depth + 1)
                node.right = self._grow_tree(X_right, y_right, sample_weight_right, depth + 1)
        return node
    
    def _best_split(self, X, y, sample_weight):
        
        m = y.size
        if m <= 1:
            return None, None
        
        num_parent = [0, 0, 0]
        for i, label in enumerate(y):
            num_parent[label] += sample_weight[i] * self.N
    
        best_gini = 1.0 - sum((n / m) ** 2 for n in num_parent)
        best_idx, best_thr = None, None
        
        for idx in range(self.n_features_):
            
            thresholds, classes, weights = zip(*sorted(zip(X[:, idx], y, sample_weight)))
            
            num_left = [0] * self.n_classes_
            num_right = num_parent.copy()
            for i in range(1, m):
                c = classes[i - 1]
                num_left[c] += weights[i-1]*self.N
                num_right[c] -= weights[i-1]*self.N
                
                gini_left = 1.0 - sum(
                    (num_left[x] / i) ** 2 for x in range(self.n_classes_)
                )
                gini_right = 1.0 - sum(
                    (num_right[x] / (m - i)) ** 2 for x in range(self.n_classes_)
                )
               
                gini = (i * gini_left + (m - i) * gini_right) / m
                
                if thresholds[i] == thresholds[i-1]:
                    continue
                
                if gini < best_gini:
                    best_gini = gini
                    best_idx = idx
                    best_thr = (thresholds[i] + thresholds[i - 1]) / 2
        
        return best_idx, best_thr

In [17]:
# implement a Adaboost classifier from scratch!

class AdaBoostClassifier:
    
    def __init__(self, base_estimator, n_estimaters, learning_rate):
        self.base_estimator = base_estimator
        self.n_estimater = n_estimaters
        self.lr = learning_rate
        
        self.estimators_ = []
        self.estimator_weight_ = np.zeros(self.n_estimater, dtype=np.float64)
    
    def fit(self, X, y):
        sample_weight = np.ones(X.shape[0])/X.shape[0]
        for tree in range(self.n_estimater):
            estimator, sample_weight, estimator_weight = self._boost(X, y, sample_weight)
            self.estimators_.append(estimator)
            self.estimator_weight_[tree] = estimator_weight
            
    def _boost(self, X, y, sample_weight):
        estimator = copy.deepcopy(self.base_estimator)
        estimator.fit(X, y, sample_weight=sample_weight)
        pred_y = estimator.predict(X)
        indicator = np.ones(X.shape[0])*[pred_y!=y][0]
        err = np.dot(sample_weight, indicator) / np.sum(sample_weight)
        alpha = np.log((1-err)/err)
        new_sample_weight = sample_weight * np.exp(alpha*indicator)
        return estimator, new_sample_weight, alpha
    
    def predict(self, X):
        predicts = []
        for estimator in self.estimators_:
            pred = estimator.predict(X)
            pred = np.array(pred)
            pred[pred==0] = -1
            predicts.append(pred)
        
        predicts = np.array(predicts)
        
        pr = np.sign(np.dot(self.estimator_weight_, predicts))
        pr[pr==-1] = 0
        return pr.astype(int)

In [20]:

myTree = DecisionTreeClassifier(max_depth=10)
myAda = AdaBoostClassifier(myTree, 50, 0.1)
myAda.fit(X_train, y_train)
y_pred = myAda.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.0




In [22]:
for i, j in enumerate(y_train):
    print(i, j)


0 0
1 1
2 0
3 1
4 1
5 1
6 1
7 2
8 1
9 2
10 0
11 0
12 2
13 2
14 0
15 2
16 1
17 1
18 1
19 0
20 2
21 2
22 1
23 2
24 1
25 2
26 0
27 0
28 2
29 0
30 2
31 2
32 1
33 1
34 0
35 0
36 2
37 2
38 1
39 2
40 2
41 0
42 0
43 2
44 0
45 0
46 2
47 1
48 0
49 2
50 1
51 0
52 0
53 1
54 1
55 0
56 2
57 2
58 2
59 1
60 2
61 1
62 1
63 0
64 2
65 1
66 2
67 2
68 0
69 2
70 0
71 0
72 2
73 1
74 0
75 2
76 0
77 1
78 0
79 0
80 2
81 0
82 1
83 1
84 0
85 1
86 1
87 2
88 0
89 2
90 1
91 0
92 2
93 1
94 0
95 0
96 2
97 1
98 2
99 1
100 2
101 0
102 0
103 1
104 1
