**Sumário**<a id='toc0_'></a>    
- [Otimização de hiperparâmetros](#toc1_)    
  - [Árvore de decisão](#toc1_1_)    
    - [Resultados](#toc1_1_1_)    
  - [XGBoost](#toc1_2_)    
    - [Resultados](#toc1_2_1_)    
  - [SVM](#toc1_3_)    
    - [Resultados](#toc1_3_1_)    
  - [Rede Neural](#toc1_4_)    
    - [Resultados](#toc1_4_1_)    
  - [Floresta Aleatória](#toc1_5_)    
    - [Resultados](#toc1_5_1_)    
  - [KNN](#toc1_6_)    
    - [Resultados](#toc1_6_1_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc1_'></a>[Otimização de hiperparâmetros](#toc0_)

In [2]:
from sklearnex import patch_sklearn   # noreorder

patch_sklearn()

from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR
from xgboost import XGBRegressor

import numpy as np
import pandas as pd
from pathlib import Path
import joblib
from time import time

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Carregando o banco de dados com 4 atributos

In [3]:
DATASET_PATH = Path('D:/dados_tcc/dataset')
SAVE_PATH = Path('D:/dados_tcc/files')

X = pd.read_feather(DATASET_PATH / 'X_clean.feather', columns=[0, 11, 15, 20])
y = np.load(DATASET_PATH / 'y_clean.npy')

X_train, _, y_train, _ = train_test_split(
    X, y, test_size=0.2, random_state=123
)

A normalização é tratada como um hiperparâmetro a ser ajustado. Uma classe seletora é criada de forma que se possa utilizá-la com o `GridSearchCV`.

In [4]:
# https://stackoverflow.com/a/65467569
class ScalerSelector(BaseEstimator, TransformerMixin):
    def __init__(self, scaler=StandardScaler()):
        super().__init__()
        self.scaler = scaler

    def fit(self, X, y=None):
        return self.scaler.fit(X)

    def transform(self, X, y=None):
        return self.scaler.transform(X)

## <a id='toc1_1_'></a>[Árvore de decisão](#toc0_)

In [None]:
tree_pipeline = Pipeline(
    steps=[
        ('scaler_selector', ScalerSelector()),
        ('tree', DecisionTreeRegressor(random_state=123)),
    ]
)

param_grid = {
    'scaler_selector__scaler': [
        StandardScaler(),
        MinMaxScaler(),
        RobustScaler(),
    ],
    'tree__criterion': ['squared_error', 'friedman_mse'],
    'tree__splitter': ['best', 'random'],
    'tree__max_depth': [3, 5, 7, 9, 11, 13],
}

tree_grid = GridSearchCV(
    tree_pipeline,
    param_grid=param_grid,
    scoring='r2',
    refit=False,
    verbose=4,
    return_train_score=True,
)

tree_grid.fit(X_train, y_train)

with open(SAVE_PATH / 'hyperparameter_tree.joblib', 'wb') as file:
    joblib.dump(tree_grid, file)

### <a id='toc1_1_1_'></a>[Resultados](#toc0_)

In [34]:
with open(SAVE_PATH / 'hyperparameter_tree.joblib', 'rb') as file:
    tree_results = joblib.load(file)

tree_results = pd.DataFrame(tree_results.cv_results_).sort_values(
    'mean_test_score', ascending=False
)

In [40]:
tree_results[
    [
        'rank_test_score',
        'mean_train_score',
        'mean_test_score',
        'param_scaler_selector__scaler',
        'param_tree__criterion',
        'param_tree__max_depth',
        'param_tree__splitter',
    ]
].head()

Unnamed: 0,rank_test_score,mean_train_score,mean_test_score,param_scaler_selector__scaler,param_tree__criterion,param_tree__max_depth,param_tree__splitter
58,1,0.899995,0.890062,RobustScaler(),squared_error,13,best
70,2,0.899995,0.890045,RobustScaler(),friedman_mse,13,best
10,3,0.900023,0.89004,StandardScaler(),squared_error,13,best
22,4,0.900023,0.89003,StandardScaler(),friedman_mse,13,best
46,5,0.89949,0.889793,MinMaxScaler(),friedman_mse,13,best


## <a id='toc1_2_'></a>[XGBoost](#toc0_)

Devido ao funcionamento da funcionalidade de *Early Stopping* do XGboost, é necessário realizar uma busca em grade de forma manual.

O Código desenvolvido foi baseado em https://xgboosting.com/xgboost-early-stopping-with-grid-search/

In [4]:
def params_generator(param_grid):
    for scaler in param_grid['scaler']:
        for booster in param_grid['booster']:
            for max_depth in param_grid['max_depth']:
                for n_estimators in param_grid['n_estimators']:
                    yield (scaler, booster, max_depth, n_estimators)

In [None]:
param_grid = {
    'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'booster': ['gbtree', 'gblinear'],
    'max_depth': [3, 5, 7, 9, 11, 13],
    'n_estimators': [70, 80, 90, 100, 110, 120, 130, 140, 150],
}

results = {
    'scaler': [],
    'booster': [],
    'max_depth': [],
    'n_estimators': [],
    'mean_r2_test': [],
    'mean_r2_train': [],
}


candidates = np.prod([len(val) for val in param_grid.values()])
kfold = KFold(shuffle=True, random_state=123)
best_score = 0
candidate = 1

print(
    f'Fitting 5 folds for each of {candidates} candidates, ',
    f'totalling {5*candidates} fits',
)

# Grid search
for scaler, booster, max_depth, n_estimators in params_generator(param_grid):
    print(f'\n---- Candidate {candidate}/{candidates} ----\n')

    test_score, train_score = [], []
    counter = 1

    # Cross validation in 5 folds
    for train_index, val_index in kfold.split(X_train, y_train):
        initial_time = time()

        X_train_fold = X_train.values[train_index]
        X_val_fold = X_train.values[val_index]
        y_train_fold = y_train[train_index]
        y_val_fold = y_train[val_index]

        X_train_fold = scaler.fit_transform(X_train_fold)
        X_val_fold = scaler.transform(X_val_fold)

        xgb = XGBRegressor(
            booster=booster,
            max_depth=max_depth,
            n_estimators=n_estimators,
            early_stopping_rounds=6,
            random_state=123,
            eval_metric=[
                'mae',
            ],
        )

        xgb.fit(
            X_train_fold,
            y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],
        )

        r2_train = xgb.score(X_train_fold, y_train_fold)
        r2_test = xgb.score(X_val_fold, y_val_fold)
        train_score.append(r2_train)
        test_score.append(r2_test)

        loop_time = time() - initial_time
        print(
            f'[CV {counter}/5] END {scaler=}, {booster=}, {max_depth=}, ',
            f'{n_estimators=}, r2=(train={r2_train}, test={r2_test}), ',
            f'{loop_time=:.2f} s',
        )
        counter += 1

    candidate += 1
    mean_score = np.mean(test_score)

    # saving iteration results
    results['scaler'].append(scaler)
    results['booster'].append(booster)
    results['max_depth'].append(max_depth)
    results['n_estimators'].append(n_estimators)
    results['mean_r2_test'].append(mean_score)
    results['mean_r2_train'].append(np.mean(r2_train))

    if mean_score > best_score:
        best_score = mean_score
        best_params = {
            'scaler': scaler,
            'booster': booster,
            'max_depth': max_depth,
            'n_estimators': n_estimators,
        }

with open(SAVE_PATH / 'hyperparameter_xgb.joblib', 'wb') as file:
    joblib.dump((results, best_params), file)

### <a id='toc1_2_1_'></a>[Resultados](#toc0_)

In [7]:
with open(SAVE_PATH / 'hyperparameter_xgb.joblib', 'rb') as file:
    xgb_results, best_params_xgb = joblib.load(file)

xgb_results = pd.DataFrame(xgb_results).sort_values(
    'mean_r2_test', ascending=False
)

In [16]:
xgb_results[
    [
        'mean_r2_train',
        'mean_r2_test',
        'scaler',
        'booster',
        'n_estimators',
        'max_depth',
    ]
].head()

Unnamed: 0,mean_r2_train,mean_r2_test,scaler,booster,n_estimators,max_depth
53,0.977208,0.941047,StandardScaler(),gbtree,150,13
269,0.977215,0.941005,RobustScaler(),gbtree,150,13
161,0.977608,0.94096,MinMaxScaler(),gbtree,150,13
52,0.97612,0.940889,StandardScaler(),gbtree,140,13
268,0.976167,0.940863,RobustScaler(),gbtree,140,13


## <a id='toc1_3_'></a>[SVM](#toc0_)

In [None]:
svm_pipeline = Pipeline(
    steps=[
        ('scaler_selector', ScalerSelector()),
        (
            'svm',
            LinearSVR(
                random_state=123,
                dual=False,
                loss='squared_epsilon_insensitive',
            ),
        ),
    ]
)

param_grid = {
    'scaler_selector__scaler': [
        StandardScaler(),
        MinMaxScaler(),
        RobustScaler(),
    ],
    'svm__C': [1, 2, 5, 10, 15, 20],
    'svm__epsilon': [0, 1e-4, 1e-3, 1e-2, 1e-1],
}

svm_grid = GridSearchCV(
    svm_pipeline,
    param_grid=param_grid,
    scoring='r2',
    refit=False,
    verbose=4,
    return_train_score=True,
)

svm_grid.fit(X_train, y_train)

with open(SAVE_PATH / 'hyperparameter_svm.joblib', 'wb') as file:
    joblib.dump(svm_grid, file)

In [18]:
with open(SAVE_PATH / 'hyperparameter_svm.joblib', 'rb') as file:
    svm_results = joblib.load(file)

svm_results = pd.DataFrame(svm_results.cv_results_).sort_values(
    'mean_test_score', ascending=False
)

### <a id='toc1_3_1_'></a>[Resultados](#toc0_)

In [20]:
svm_results[
    [
        'rank_test_score',
        'mean_train_score',
        'mean_test_score',
        'param_scaler_selector__scaler',
        'param_svm__C',
        'param_svm__epsilon',
    ]
].head()

Unnamed: 0,rank_test_score,mean_train_score,mean_test_score,param_scaler_selector__scaler,param_svm__C,param_svm__epsilon
2,1,0.340489,0.340466,StandardScaler(),1,0.001
7,2,0.340489,0.340466,StandardScaler(),2,0.001
12,3,0.340489,0.340466,StandardScaler(),5,0.001
17,4,0.340489,0.340466,StandardScaler(),10,0.001
22,5,0.340489,0.340466,StandardScaler(),15,0.001


## <a id='toc1_4_'></a>[Rede Neural](#toc0_)

In [None]:
nn_pipeline = Pipeline(
    steps=[
        ('scaler_selector', ScalerSelector()),
        (
            'nn',
            MLPRegressor(
                random_state=123,
                early_stopping=True,
                n_iter_no_change=6,
                max_iter=300,
            ),
        ),
    ]
)

param_grid = {
    'scaler_selector__scaler': [
        StandardScaler(),
        MinMaxScaler(),
        RobustScaler(),
    ],
    'nn__hidden_layer_sizes': [
        (2,),
        (4,),
        (8,),
        (16,),
        (32,),
        (2, 2),
        (4, 2),
        (4, 4),
        (8, 4),
        (8, 8),
        (16, 4),
        (16, 8),
        (16, 16),
        (32, 8),
        (32, 16),
        (32, 32),
    ],
}

nn_grid = GridSearchCV(
    nn_pipeline,
    param_grid=param_grid,
    scoring='r2',
    refit=False,
    verbose=4,
    return_train_score=True,
)

nn_grid.fit(X_train, y_train)

with open(SAVE_PATH / 'hyperparameter_nn.joblib', 'wb') as file:
    joblib.dump(nn_grid, file)

### <a id='toc1_4_1_'></a>[Resultados](#toc0_)

In [21]:
with open(SAVE_PATH / 'hyperparameter_nn.joblib', 'rb') as file:
    nn_results = joblib.load(file)

nn_results = pd.DataFrame(nn_results.cv_results_).sort_values(
    'mean_test_score', ascending=False
)

In [22]:
nn_results[
    [
        'rank_test_score',
        'mean_train_score',
        'mean_test_score',
        'param_scaler_selector__scaler',
        'param_nn__hidden_layer_sizes',
    ]
].head()

Unnamed: 0,rank_test_score,mean_train_score,mean_test_score,param_scaler_selector__scaler,param_nn__hidden_layer_sizes
47,1,0.897126,0.896893,RobustScaler(),"(32, 32)"
44,2,0.894068,0.893833,RobustScaler(),"(32, 16)"
45,3,0.890151,0.890018,StandardScaler(),"(32, 32)"
41,4,0.889558,0.889474,RobustScaler(),"(32, 8)"
38,5,0.887385,0.887152,RobustScaler(),"(16, 16)"


## <a id='toc1_5_'></a>[Floresta Aleatória](#toc0_)

In [None]:
rf_pipeline = Pipeline(
    steps=[
        ('scaler_selector', ScalerSelector()),
        ('rf', RandomForestRegressor(random_state=123)),
    ]
)

param_grid = {
    'scaler_selector__scaler': [
        StandardScaler(),
        MinMaxScaler(),
        RobustScaler(),
    ],
    'rf__n_estimators': [70, 80, 90, 100, 110, 120, 130, 140, 150],
    'rf__max_depth': [3, 5, 7, 9, 11, 13],
    'rf__max_features': ['sqrt', None],
}

rf_grid = GridSearchCV(
    rf_pipeline,
    param_grid=param_grid,
    scoring='r2',
    refit=False,
    verbose=4,
    return_train_score=True,
)

rf_grid.fit(X_train, y_train)

with open(SAVE_PATH / 'hyperparameter_rf.joblib', 'wb') as file:
    joblib.dump(rf_grid, file)

### <a id='toc1_5_1_'></a>[Resultados](#toc0_)

In [26]:
with open(SAVE_PATH / 'hyperparameter_rf.joblib', 'rb') as file:
    rf_results = joblib.load(file)

rf_results = pd.DataFrame(rf_results.cv_results_).sort_values(
    'mean_test_score', ascending=False
)

In [38]:
rf_results[
    [
        'rank_test_score',
        'mean_train_score',
        'mean_test_score',
        'param_scaler_selector__scaler',
        'param_rf__max_depth',
        'param_rf__n_estimators',
        'param_rf__max_features',
    ]
].head()

Unnamed: 0,rank_test_score,mean_train_score,mean_test_score,param_scaler_selector__scaler,param_rf__max_depth,param_rf__n_estimators,param_rf__max_features
282,1,0.903946,0.899951,StandardScaler(),13,110,sqrt
285,2,0.903723,0.899728,StandardScaler(),13,120,sqrt
284,3,0.903677,0.899683,RobustScaler(),13,110,sqrt
283,3,0.903677,0.899683,MinMaxScaler(),13,110,sqrt
294,5,0.903634,0.899636,StandardScaler(),13,150,sqrt


## <a id='toc1_6_'></a>[KNN](#toc0_)

In [None]:
knn_pipeline = Pipeline(
    steps=[
        ('scaler_selector', ScalerSelector()),
        ('knn', KNeighborsRegressor()),
    ]
)

param_grid = {
    'scaler_selector__scaler': [
        StandardScaler(),
        MinMaxScaler(),
        RobustScaler(),
    ],
    'knn__n_neighbors': [7, 9, 11, 13, 15, 17, 19, 21, 23],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan'],
}

knn_grid = GridSearchCV(
    knn_pipeline,
    param_grid=param_grid,
    scoring='r2',
    refit=False,
    verbose=4,
    return_train_score=True,
)

knn_grid.fit(X_train, y_train)

with open(SAVE_PATH / 'hyperparameter_knn.joblib', 'wb') as file:
    joblib.dump(knn_grid, file)

### <a id='toc1_6_1_'></a>[Resultados](#toc0_)

In [28]:
with open(SAVE_PATH / 'hyperparameter_knn.joblib', 'rb') as file:
    knn_results = joblib.load(file)

knn_results = pd.DataFrame(knn_results.cv_results_).sort_values(
    'mean_test_score', ascending=False
)

In [39]:
knn_results[
    [
        'rank_test_score',
        'mean_train_score',
        'mean_test_score',
        'param_scaler_selector__scaler',
        'param_knn__n_neighbors',
        'param_knn__weights',
        'param_knn__metric',
    ]
].head()

Unnamed: 0,rank_test_score,mean_train_score,mean_test_score,param_scaler_selector__scaler,param_knn__n_neighbors,param_knn__weights,param_knn__metric
59,1,1.0,0.931521,RobustScaler(),7,distance,manhattan
65,2,1.0,0.930808,RobustScaler(),9,distance,manhattan
71,3,1.0,0.929976,RobustScaler(),11,distance,manhattan
77,4,1.0,0.929078,RobustScaler(),13,distance,manhattan
5,5,1.0,0.928537,RobustScaler(),7,distance,euclidean
