# Trabalho Prático 2 - Implementação do Algoritmo de Boosting

- **Aluno:** Vítor Gabriel Reis Caitité
- **Matrícula:** 2021712430

In [7]:
# Imports
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import pandas as pd
from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import tree
from sklearn import svm
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV 
from sklearn.pipeline import Pipeline
import seaborn as sns
import tabulate

## Leitura e Pré-processamento dos Dados

In [8]:
names = ['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'class']
df = pd.read_csv('data/tic-tac-toe.data', header=None, sep=',', names=names)
df.head()

Unnamed: 0,P1,P2,P3,P4,P5,P6,P7,P8,P9,class
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive


In [9]:
# Encode categorical label as numbers
df['class'] = df['class'].replace({'negative': -1, 'positive': 1})
y = df[['class']].to_numpy()
y = y.flatten()
# One-hot encode board space columns
X = df.drop('class', axis=1)
one_hot_encoder = OneHotEncoder()
X = one_hot_encoder.fit_transform(X).toarray()

## Implementação do Algoritmo de Boosting

In [18]:
class AdaBoost:
    
    def __init__(self, n_estimators=100):
        self.n_estimators = n_estimators # number of estimators
        self.estimators = []            # list of estimators               
        self.alpha = []                  # list of weights relation with each weak classifier
    
    
    # Function to fit the model. Arguments:
    # X: independent variables - numpy array (n_samples x n_features)
    # y: target variable - numpy array (targets must be -1 or 1)
    def fit(self, X, y):
        # Step 1. Initialize all the observation weights (wi) with 1/(number_of_samples):
        self.w = np.ones([len(y)])/ len(y) 
                
        # Step 2: Iterate over the estimators:
        for estimator in range(self.n_estimators):
            # 2.a: Fit a estimator with the training data using the weights w:
            estimator = DecisionTreeClassifier(max_depth=1)
            estimator.fit(X, y, sample_weight=self.w)
            self.estimators.append(estimator) # saving on the list of wek classifiers
    
            # 2.b: Compute the error:
            y_hat = estimator.predict(X)
            error = 0
            for idx in range(len(y)):
                error += self.w[idx] * abs(y_hat[idx] - y[idx])/2
                        
            # 2.c: Compute alpha_i:
            self.alpha.append(0.5 * np.log((1 - error)/error))
                        
            # 2.d: Update weights w:
            self.w =  self.w * np.exp(-self.alpha[-1] * y_hat * y)
    
    # Function to predict. Arguments:
    # X: independent variables - numpy array (n_samples x n_features)
    def predict(self, X): 
        predictions = np.zeros([X.shape[0], self.n_estimators])
         # Predict class label for each weak classifier (weighted by alpha[i])
        for idx, classifier in enumerate(self.estimators):
            predictions[:, idx] = classifier.predict(X) * self.alpha[idx]
        # Final predictions:
        y_hat = np.zeros(X.shape[0])
        for idx in range(X.shape[0]):
            y_hat[idx] = (1 * np.sign(predictions[idx, :].sum())).astype(int)
        y_hat[y_hat==0] = 1 
        return y_hat

## Validação do Modelo (5-fold cross-validation) 

In [20]:
from sklearn.model_selection import train_test_split
#Applies one hot encoding to data and divides it into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.95) 
ab = AdaBoost()
ab.fit(X_train, y_train)

# Predict on test set
y_pred = ab.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy_score(y_pred, y_test)


0.6458333333333334