# ECO904 - Atividade 1 - Etapa 4 - Ajuste de Hiperparêmtros

[Proposta On-line](https://docs.google.com/document/d/e/2PACX-1vTbH1JBaBWc30jTjL6ECWhNfLF23-Iv9afdu7KL2oVP8WbiDxUewcHaAE5y6dQJVs6heCOiGmyO9fFX/pub)

Seleção de Metodologias e Hiperparâmetros de Aprendizado de Máquina
- Incorporar as metodologias XGBoost e LightGBM;
- Utilizar as 5 melhores metodologias da etapa anterior;
- Utilizar o recurso de grade de hiper-parâmetros e validação cruzada para melhorar resultados com métricas selecionadas;
- Identificar os 2 melhores modelos obtidos.

In [2]:
import pandas as pd
df = pd.read_csv('base_filtrada.csv')
df.head()

Unnamed: 0,dispositivo_1,dispositivo_2,dispositivo_3,dispositivo_4,dispositivo_5,dispositivo_6,dispositivo_7,dispositivo_8,dispositivo_9,dispositivo_10,...,dispositivo_42,dispositivo_43,dispositivo_44,dispositivo_45,dispositivo_46,dispositivo_47,dispositivo_48,dispositivo_49,dispositivo_50,falha
0,48.7,36.58,42.64,51.02,66.17,43.68,51.84,57.06,40.92,33.1,...,42.58,45.03,55.41,56.54,34.13,50.11,49.88,49.82,69.11,0
1,45.65,69.17,48.58,34.39,42.41,41.61,59.15,55.03,59.03,59.72,...,74.03,48.05,39.78,58.47,63.05,54.8,68.53,45.07,71.07,0
2,63.11,49.81,38.17,59.98,61.59,59.39,48.5,55.62,52.2,30.47,...,43.08,47.89,32.3,66.46,54.78,60.01,21.4,53.12,50.01,0
3,28.41,38.22,43.15,39.12,58.32,71.58,36.61,45.84,35.68,45.38,...,58.2,55.04,36.48,52.88,54.85,66.86,50.58,58.64,53.66,0
4,64.94,49.23,63.78,54.09,53.86,66.0,36.42,23.26,46.84,57.69,...,55.68,57.47,42.4,49.21,52.69,66.94,55.73,38.38,38.92,1


## Incorporar as metodologias XGBoost e LightGBM

In [3]:
!pip install XGBoost LightGBM



In [4]:
from xgboost import XGBRFClassifier
from lightgbm import LGBMClassifier

## Utilizar as 5 melhores metodologias da etapa anterior

In [5]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.neural_network import MLPClassifier

## Utilizar o recurso de grade de hiper-parâmetros e validação cruzada para melhorar resultados com métricas selecionadas

In [6]:
from sklearn.model_selection import train_test_split

X = df.drop('falha', axis=1).values
y = df['falha'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.95, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6912, 50), (131328, 50), (6912,), (131328,))

In [7]:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.model_selection import GridSearchCV
import numpy as np
from tqdm import tqdm

prepros = [
    (None,None),
    (StandardScaler(),{'with_mean':[True,False],'with_std':[True,False]}),
    (MinMaxScaler(),{'feature_range':[(0,1),(-1,1)]}),
]

redutores = [
    (None,None),
    (PCA(random_state=42),{'n_components':[16,32,None]}),
    (TruncatedSVD(random_state=42),{'n_components':[16,32]}),
]

aprendizados = [
    (XGBRFClassifier(random_state=42),{}),
    (LGBMClassifier(random_state=42),{}),
    (RandomForestClassifier(random_state=42),{'criterion': ['gini', 'log_loss'],'max_depth':[5,10]}),
    (DecisionTreeClassifier(random_state=42),{'criterion': ['gini', 'log_loss'],'max_depth':[5,10]}),
    (GradientBoostingClassifier(random_state=42),{'loss': ['deviance', 'exponential']}),
    (ExtraTreeClassifier(random_state=42),{'criterion': ['gini', 'log_loss'],'max_depth':[5,10]}),
    (MLPClassifier(random_state=42), {'hidden_layer_sizes': [(100,), (50, 50)], 'activation': ['relu', 'tanh']}),
]

resultados = []
for pp, ppp, red, redp, ap, app in tqdm([(pp, ppp, red, redp, ap, app) for pp, ppp in prepros for red, redp in redutores for ap, app in aprendizados]):

    param_grid = {}
    steps = []

    pre_nome = pp.__class__.__name__
    red_nome = red.__class__.__name__
    ap_nome = ap.__class__.__name__

    if pp is not None:
        steps.append((pre_nome, pp))
        # parametros do pré-processamento
        for key in ppp.keys():
            param_grid[pre_nome + '__' + key] = ppp[key]

    if red is not None:
        steps.append((red_nome, red))
        # parametros do redutor
        for key in redp.keys():
            param_grid[red_nome + '__' + key] = redp[key]

    steps.append((ap_nome, ap))
    # parametros do aprendizado
    for key in app.keys():
        param_grid[ap_nome + '__' + key] = app[key]

    pipe = Pipeline(steps)

    grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)
    grid.fit(X_train, y_train)
    cv = grid.cv_results_
    res = {
        'preprocessamento': pre_nome,
        'reducao': red_nome,
        'aprendizado': ap_nome,
        'tempo': cv['std_fit_time'],
        'f1': cv['mean_test_score'],
    }
    resultados.append(res)

  3%|▎         | 2/63 [00:08<03:56,  3.88s/it]

[LightGBM] [Info] Number of positive: 3429, number of negative: 3483
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002030 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12750
[LightGBM] [Info] Number of data points in the train set: 6912, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496094 -> initscore=-0.015625
[LightGBM] [Info] Start training from score -0.015625


5 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Repositorio Offline\ECO904-2024\ECO904\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Repositorio Offline\ECO904-2024\ECO904\.venv\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Repositorio Offline\ECO904-2024\ECO904\.venv\lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Repositorio Offline\ECO904-2024\ECO904\.ven

[LightGBM] [Info] Number of positive: 3429, number of negative: 3483
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003405 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12750
[LightGBM] [Info] Number of data points in the train set: 6912, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496094 -> initscore=-0.015625
[LightGBM] [Info] Start training from score -0.015625


15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Repositorio Offline\ECO904-2024\ECO904\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Repositorio Offline\ECO904-2024\ECO904\.venv\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Repositorio Offline\ECO904-2024\ECO904\.venv\lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Repositorio Offline\ECO904-2024\ECO904\.ve

[LightGBM] [Info] Number of positive: 3429, number of negative: 3483
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001220 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8160
[LightGBM] [Info] Number of data points in the train set: 6912, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496094 -> initscore=-0.015625
[LightGBM] [Info] Start training from score -0.015625


10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Repositorio Offline\ECO904-2024\ECO904\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Repositorio Offline\ECO904-2024\ECO904\.venv\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Repositorio Offline\ECO904-2024\ECO904\.venv\lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Repositorio Offline\ECO904-2024\ECO904\.ve

[LightGBM] [Info] Number of positive: 3429, number of negative: 3483
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001299 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12750
[LightGBM] [Info] Number of data points in the train set: 6912, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496094 -> initscore=-0.015625
[LightGBM] [Info] Start training from score -0.015625


20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Repositorio Offline\ECO904-2024\ECO904\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Repositorio Offline\ECO904-2024\ECO904\.venv\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Repositorio Offline\ECO904-2024\ECO904\.venv\lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Repositorio Offline\ECO904-2024\ECO904\.v

KeyboardInterrupt: 

## Identificar os 2 melhores modelos obtidos

In [8]:
df_res = pd.DataFrame(resultados)
# Ordenar decrescente os melhores modelos avaliados pelas métricas
df_res.sort_values('f1', ascending=False).head(2)

ValueError: operands could not be broadcast together with shapes (12,) (16,) 