In [None]:
'''
Reference:
1. https://adataanalyst.com/machine-learning/adaboost-python-3/
2. https://www.python-course.eu/Boosting.php
3. https://cole-maclean.github.io/blog/Adaboost-Predicting-Churn/
4. https://github.com/eriklindernoren/ML-From-Scratch/blob/master/mlfromscratch/supervised_learning/adaboost.py
5. https://medium.com/@chih.sheng.huang821/%E6%A9%9F%E5%99%A8%E5%AD%B8%E7%BF%92-ensemble-learning%E4%B9%8Bbagging-boosting%E5%92%8Cadaboost-af031229ebc3
6. https://www.youtube.com/watch?v=tH9FH1DH5n0
'''
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("white")
import pandas as pd
import numpy as np
import random
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split


In [136]:


data = datasets.load_digits()
X = data.data
y = data.target

digit1 = 1
digit2 = 8
idx = np.append(np.where(y == digit1)[0], np.where(y == digit2)[0])
y = data.target[idx]
# Change labels to {-1, 1}
y[y == digit1] = -1
y[y == digit2] = 1
X = data.data[idx]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [163]:


class DecisionStump():
    def __init__(self):
        self.polarity = 1
        self.feature_dim = None
        self.threshold = None
        self.alpha = None

class MyAdaboost():
    def __init__(self, n_classifier):
        self.n_classifier= n_classifier
    
    def fit(self, X, y):
        n_samples, n_dim = np.shape(X)
        weight = np.ones(n_samples)/n_samples
        self.classifiers = []
        
        for _ in range(self.n_classifier):
            # Decision Stump is a decision tree with depth 1
            classifier = DecisionStump()
            
            # Find random dimension for 
            # random.seed(random.random())
            # d = random.randint(0, n_dim-1)
            # using random dimension for classifier result is not good
            min_error = float('inf')
            for d in range(n_dim):
                unique_vals = set(X[:, d])
                for thres in unique_vals:
                    p = 1
                    predictions = np.ones(n_samples)
                    predictions[X[:, d] < thres] = -1
                    error = sum(weight[y!=predictions])
                    if error > 0.5:
                        error = 1-error
                        p = -1

                    if error < min_error:
                        min_error = error
                        classifier.polarity = p
                        classifier.threshold = thres
                        classifier.feature_dim = d
                        
            classifier.alpha = 0.5*np.log((1-min_error)/(min_error+1e-10))
            print("Minimum Error {}, Alpha {}".format(min_error, classifier.alpha))
            predictions = np.ones(n_samples)
            wrong_preds = (classifier.polarity * X[:, classifier.feature_dim] < classifier.polarity * classifier.threshold)
            predictions[wrong_preds] = -1
            weight = weight * np.exp(-classifier.alpha*y*predictions)
            weight = weight / np.sum(weight)
            
            self.classifiers.append(classifier)
            
        
    def predict(self, X):
        n_samples, _ = X.shape
        sum_predictions = np.zeros(n_samples)
        
        for classifier in self.classifiers:
            pred = np.ones(n_samples)
            wrong_preds = (classifier.polarity * X[:, classifier.feature_dim] < classifier.polarity * classifier.threshold)
            pred[wrong_preds==True] = -1
            sum_predictions = sum_predictions + classifier.alpha*pred
            
        
        sum_predictions = np.sign(sum_predictions)
        return sum_predictions
            

In [164]:
# Adaboost classification with 5 weak classifiers

clf = MyAdaboost(20)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

count = 0
for i in range(len(y_pred)):
    if y_pred[i] == y_test[i]:
        count += 1

accuracy = count / len(y_pred)        
print ("Accuracy:", accuracy)


Minimum Error 0.16901408450704325, Alpha 0.7963153967630227
Minimum Error 0.18294701986318807, Alpha 0.7482536686991507
Minimum Error 0.1508113203689272, Alpha 0.8641259384669009
Minimum Error 0.2032580524603338, Alpha 0.6830272399883083
Minimum Error 0.20410360060316576, Alpha 0.6804206571722484
Minimum Error 0.21122395525854742, Alpha 0.658781731595795
Minimum Error 0.12631090385213894, Alpha 0.9669891138814865
Minimum Error 0.23339702137891738, Alpha 0.5946140400539544
Minimum Error 0.2699123520880072, Alpha 0.4975336538982501
Minimum Error 0.2609749480726081, Alpha 0.5204537007185551
Minimum Error 0.26248465785084, Alpha 0.5165471282448121
Minimum Error 0.27239337534646135, Alpha 0.4912566483185151
Minimum Error 0.27082202447721226, Alpha 0.4952279847375006
Minimum Error 0.270967241152971, Alpha 0.49486036755070123
Minimum Error 0.2655170851693572, Alpha 0.5087437722413432
Minimum Error 0.2803278924513035, Alpha 0.47141786962083587
Minimum Error 0.274836918316292, Alpha 0.485109335

In [161]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),algorithm="SAMME",n_estimators=30)
bdt.fit(X_train, y_train)
print(bdt.estimator_weights_)
z = bdt.predict(X_test)
count = 0
for i in range(len(z)):
    if z[i] == y_test[i]:
        count += 1

accuracy = count / len(y_pred)        
print ("Accuracy:", accuracy) 


[1.59263079 1.47097947 1.53696734 1.28559835 1.30838849 1.14555895
 0.9459087  1.10738491 0.76739656 1.32938437 0.94785876 0.54436414
 0.75950441 0.84790799 0.87500963 1.00451105 0.70949051 0.82946359
 0.57939936 0.65517751 1.00476565 0.72773119 0.83652    0.78600597
 0.59929934 0.69675216 0.47656019 0.59679759 0.63984881 1.00663187]
Accuracy: 0.9583333333333334
