# Boosting
-- Thiago Vieira de Alcantara Silva<br/>
-- 2017719891

In [10]:
import pandas as pd
import numpy as np
from math import log2, fabs, exp

RSEED = 42
np.random.seed(RSEED)

### Reading the dataset...

In [11]:
data = pd.read_csv('tic-tac-toe.data', header=None, delimiter=',')
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive


### Treating the dataset...
Since the target variable is *positive* whenever the player x won the game, we consider both **o** and **b** as in the same class.

In [12]:
data = data.applymap(lambda x: 1 if (x == 'x' or x == 'positive') else -1)
data.describe()

# TODO(thiagovas): Add a new column with zeros, to ease the implementation of the 'always true' stump and
#                  'always false' stump.

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,958.0,958.0,958.0,958.0,958.0,958.0,958.0,958.0,958.0,958.0
mean,-0.127349,-0.210856,-0.127349,-0.210856,-0.043841,-0.210856,-0.127349,-0.210856,-0.127349,0.306889
std,0.992376,0.978028,0.992376,0.978028,0.99956,0.978028,0.992376,0.978028,0.992376,0.952242
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
50%,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [23]:
class AdaBoost():
    def __init__(self):
        pass


    def train(self, data):
        '''
            This function trains the model on the data received.
        '''
        self.bweights = [1.0/data.shape[0] for i in range(data.shape[0])]
        self.X = None
        self.Y = None
        self.data = data
        self.alphas = []
        self.stumps = []
        last_accuracy = 0

        while True:
            best_stump, cur_alpha = self.get_best_stump()
            self.alphas.append(cur_alpha)
            self.stumps.append(best_stump)
            self.update_bweights()
            cur_accuracy = self.evaluate_model(self.data)
            if fabs(cur_accuracy-last_accuracy) < 0.01:
                break
            last_accuracy = cur_accuracy


    def test(self, test_data):
        '''
            This function runs the model on the data received and returns the classifications.
        '''

        results = []
        for entry in test_data.values:
            cur_result = 0
            for s_index in range(len(self.stumps)):
                cur_result += self.alphas[s_index]*self.classify_stump(self.stumps[s_index], entry)
            if cur_result < 0:
                results.append(-1)
            else:
                results.append(1)

        return results

    
    def evaluate_model(self, test_data):
        results = self.test(test_data)
        acc = 0
        for i in range(test_data.shape[0]):
            if results[i] == test_data.iloc[i][-1]:
                acc += 1
        acc = float(acc) / test_data.shape[0]
        return acc
        

    def classify_stump(self, stump, data_entry):
        '''
            Given a stump and a data entry, this function returns the
            stump's classification.
        '''
        if data_entry[stump[0]] <= 0:
            return stump[1]
        else:
            return -stump[1]


    def get_best_stump(self):
        '''
            This function gets the stump that minimizes the empirical error,
            and add it to the list of stumps with its alpha value.
        '''
        best_stump = (0, 0)
        best_error = 1000000
        for i in range(9):
            for j in [-1, 1]:
                cur_error = 0
                cur_stump = (i, j)
                for k in range(self.data.shape[0]):
                    if self.classify_stump(cur_stump, self.data.iloc[k]) != self.data.iloc[k][9]:
                        cur_error += self.bweights[k]
                if cur_error < best_error:
                    best_error = cur_error
                    best_stump = (i, j)
        alpha = 0.0
        if best_error == 0:
            alpha = 100000
        else:
            best_error = float(best_error)/self.data.shape[0]
            alpha = log2((1.0-best_error)/best_error)/2.0

        return best_stump, alpha


    def update_bweights(self):
        '''
            ...
        '''
        for i in range(len(self.bweights)):
            exp_sign = self.classify_stump(self.stumps[-1], self.data.iloc[i]) * self.data.iloc[i][9]
            self.bweights[i] = self.bweights[i] * exp(-self.alphas[-1]*exp_sign)

        for i in range(len(self.bweights)):
            self.bweights[i] /= float(sum(self.bweights))

## Training the AdaBooost...

In [24]:
clf = AdaBoost()
clf.train(data)

KeyError: -1