In [11]:
import sys, os
import csv
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
import math

absFilePath = os.path.abspath('')

filepath_test = absFilePath + '/dataset/adaboost_test.csv'
filepath_train = absFilePath + '/dataset/adaboost_train.csv'

## Read data from files

In [12]:
def read_file(filepath): 
    dataframe = pd.read_csv(filepath, index_col=0)
    return dataframe

In [13]:
test_data = read_file(filepath_test).sort_index()
train_data = read_file(filepath_train).sort_index()

In [14]:
test_data.head()

Unnamed: 0,y,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
2,-1.0,-0.46192,0.038536,0.371783,0.304404,1.444908,0.960392,0.725383,0.687336,-0.045559,1.543303
3,-1.0,-0.975133,0.004793,-1.191792,0.735661,-0.32838,0.043966,0.153503,0.654422,-0.337848,1.440259
5,-1.0,-0.018381,-0.691168,0.500622,-1.071849,-0.692466,0.415805,0.04639,0.298364,0.459021,0.626613
8,1.0,-2.276532,0.090605,-0.079632,-1.856046,-0.63577,1.254386,2.003412,1.053793,0.98537,0.180492
12,-1.0,-0.839241,-0.369428,-0.14055,-1.093575,-0.967626,0.277246,-0.695233,0.636772,0.337332,-1.178125


In [15]:
train_data.head()

Unnamed: 0,y,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
0,-1.0,0.254988,0.870128,-0.683589,0.258015,0.936893,-0.359203,-1.034167,0.039811,1.118325,0.002867
1,-1.0,0.75497,-1.533848,-0.200174,-1.408173,0.871535,0.575223,-0.225445,0.739054,1.15635,-0.391315
4,1.0,0.399106,0.983204,1.249376,1.042185,0.956406,-1.258943,-0.448505,2.135672,0.971649,1.355421
6,1.0,-3.050537,-0.405085,-0.15281,-0.235555,-0.928292,-0.511155,1.113148,1.774802,1.063016,-1.826765
7,-1.0,1.840329,-0.60101,0.471581,0.782108,-0.931921,-0.312348,0.432793,0.121716,-0.042086,0.479016


## Prepare training data

In [16]:
# FIXME: take a sample for development purposes
train_data = train_data.sample(n=1000)

# According to pseudo code, AdaBoost takes a training set of data as input
X = train_data.drop(labels='y', axis=1)
X.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
6199,0.806554,-1.342476,-1.322086,-0.038186,0.011433,-0.698058,1.132161,-1.040981,0.877792,-1.972443
3199,-0.899661,0.859686,-0.821379,-0.122366,-0.621698,-1.454805,-0.771455,-0.369231,0.964371,0.267027
88,-0.58807,-0.757006,-0.629864,-0.716267,-0.028775,0.546602,0.550433,0.26318,1.528215,0.197674
8298,1.222291,0.752867,1.438666,0.519545,0.411937,0.151247,0.616759,-1.385639,0.685371,1.275034
832,-0.994046,-0.847345,0.728672,0.042686,1.094674,1.552953,1.28764,0.596837,-0.701361,0.507333


In [17]:
y = train_data['y']
y.head()

6199    1.0
3199   -1.0
88     -1.0
8298   -1.0
832    -1.0
Name: y, dtype: float64

## AdaBoost

In [18]:
def err(X, y, y_predicted, w):
    # Function to compute the error in each step according to pseudocode
    
    N = len(X)
    
    sum_errors = 0
    for i in range(N): 
        if y[i] == y_predicted[i]: 
            # Correct predicted, I = 0
            sum_errors += 0
        else: 
            # y[i] != y_predicted[i] => Error!, I = 1
            sum_errors += w[i] * 1
    sum_of_weights = sum(w)
    
    return sum_errors/sum_of_weights

In [19]:
def alfa(error): 
    return math.log(((1-error)/error), 2)

In [20]:
def adaboost(X, y, sample_weight, M):
    # X, y: Training data
    # sample_weight: Should be one of the parameters, according to the task
    # M: Maximum number of iterations
    
    N = len(X) # Number of observations
    
    weights = [1/N for i in range(N)] # Initialize the observation weights w_i = 1/N, i = 1, 2, …, N
    
    G_list = []
    y_predicted_list = []
    alfa_list = []
    
    y_np_array = np.array(y)
    
    for m in range(M):
        
        G = DecisionTreeClassifier(max_depth = 1, max_leaf_nodes=2)
        G.fit(X, y_np_array, weights)
        y_predicted = G.predict(X)
        
        err_m = err(X, y_np_array, y_predicted, weights)
        
        alfa_m = alfa(err_m)
        
        for i in range(len(weights)):
            if y_np_array[i] == y_predicted[i]: 
            # Correct predicted, I = 0
                weights[i] = weights[i] * 1
            else: 
            # y_np_array[i] != y_predicted[i] => Error!, I = 1
                weights[i] = weights[i] * math.exp(alfa_m)
                
        G_list.append(G)
        y_predicted_list.append(y_predicted)
        alfa_list.append(alfa_m)
    
    before_sum = []
    for i in range(M):
        before_sum.append(alfa_list[i] * y_predicted_list[i])
    
    data = np.array(before_sum)
    output = np.sign(sum(data,0))
    
    print('Accuracy = ', (output == y).sum() / N) 
    
    return output


In [21]:
result = adaboost(X, y, 1, 100)

Accuracy =  0.918
