---
## 🤖 Fase 2: Selección de modelo y optimización de hiperparámetros

---

In [1]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.linear_model import (
    LinearRegression,
    Lasso,
    Ridge,
    ElasticNet,
    LassoCV
)
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import (
    ExtraTreesRegressor,
    RandomForestRegressor,
    AdaBoostRegressor,
    GradientBoostingRegressor,
    BaggingRegressor
)
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from scipy.stats import uniform, loguniform, randint

import sys
sys.path.append('..\src')
from functions import (cv_modelos,
                       optimizar_hiperparametros
                       )

In [2]:
datos = pd.read_csv('../data/df_pipeline.csv', index_col="Unnamed: 0")
datos.head()

Unnamed: 0,x2,x3,x4,x5,x6,x7,x8,x9,x10,deseada,x1_bin
0,0.0,2.215558,176.46,4.49,1058.6,780.09,28.0,-1.867265,0.900023,21.53923,0.0
1,98.06,1.406881,,6.65,1066.0,785.52,,0.633919,0.862797,17.836744,1.0
2,0.0,0.0,192.0,,931.2,842.6,7.0,-0.203045,0.461557,23.404952,2.0
3,26.0,2.093422,210.0,3.93,882.0,699.0,28.0,-1.3828,0.338268,55.551081,3.0
4,124.1,0.0,185.7,0.0,1083.4,,28.0,-0.510016,0.603488,17.596806,0.0


---
Con la información proporcionada por el EDA, el paso siguiente es pretratar los datos antes de utilzarlos como input en los modelos.

Para ello, vamos a generar 1 único pipeline de preprocesado, ya que, tras testear diversas combinaciones, se descartó la presencia de variables categóricas u ordinales camufladas entre las numéricas del dataset.

En lo que respecta a las relaciones polinómicas detectadas mediante test de Ramsey, se ha desestimado su modelización debido a los problemas que arrojaban los modelos (r2 ajustado disparado incluso por encima de la unidad, error inflado, etc.).

---

In [3]:
y = datos['deseada']
X = datos.drop(columns=['deseada'])

prep = Pipeline([
    ('imputadorKNN', KNNImputer(n_neighbors=7)),
    ('escalador', RobustScaler())
    # adoptamos escalado robusto en lugar de otras técnicas, como
    # windsorización, para conservar toda la información posible
])

In [4]:
# debido a la escasez de datos de entrenamiento + riesgo de overfitting
# introducimos regularización con L1, porque sabemos que hay variables
# que contienen muy poca información
seed = 42
selector_lasso = SelectFromModel(
    LassoCV(
        alphas=np.logspace(-4, 1, 100),
        cv=5,
        random_state=seed,
        max_iter=10000
        ),
        threshold='median'
        )

In [5]:
linear_models = {'OLS': LinearRegression(n_jobs=-1),
                 'Lasso': Lasso(random_state=seed),
                 'Ridge': Ridge(random_state=seed),
                 'ElasticNet': ElasticNet(random_state=seed)}
other_models = {'SVM': LinearSVR(random_state=seed),
                'Arbol_decision': DecisionTreeRegressor(random_state=seed),
                'Arboles_aleatorios': ExtraTreesRegressor(random_state=seed),
                'Random_Forest': RandomForestRegressor(random_state=seed),
                'Bagging': BaggingRegressor(random_state=seed),
                'AdaBoost': AdaBoostRegressor(random_state=seed),
                'GradientBoosting': GradientBoostingRegressor(random_state=seed),
                'XGBoost': XGBRegressor(random_state=seed),
                'KVecinos': KNeighborsRegressor(),
                'LightGBM': LGBMRegressor(random_state=seed),
                'CatBoost': CatBoostRegressor(random_state=seed, 
                                              verbose=False,
                                              allow_writing_files=False)}

In [6]:
# Probamos con todos los modelos instanciados
resultados_lineales = cv_modelos(linear_models, prep, selector_lasso, X, y)
resultados_otros = cv_modelos(other_models, prep, selector_lasso, X, y)
resultados = {**resultados_lineales, **resultados_otros}

In [7]:
df_resultados = pd.DataFrame.from_dict(resultados, orient='index')
df_resultados.sort_values(by=['r2_test'], ascending=False, inplace=True)
df_resultados.head(15)

Unnamed: 0,r2_test,rmse_test,mae_test,r2_test_var,rmse_test_var,mae_test_var,r2_train,rmse_train,mae_train,fit_time,r2_ratio
CatBoost,0.783519,7.75632,5.494502,0.000605,0.243856,0.081156,0.951252,3.700772,2.653711,2.736003,1.214077
LightGBM,0.767889,8.004882,5.758912,0.001078,0.032925,0.03169,0.925092,4.591111,3.233669,3.799426,1.20472
GradientBoosting,0.763023,8.104149,6.072102,0.001212,0.39926,0.154795,0.870008,6.047196,4.468032,0.19818,1.140212
XGBoost,0.743631,8.444029,6.074711,0.000609,0.147571,0.031471,0.987731,1.827648,0.732325,0.210401,1.328253
Random_Forest,0.739775,8.4972,6.181261,0.000975,0.219254,0.086061,0.953471,3.616399,2.479676,0.363971,1.288866
Arboles_aleatorios,0.729654,8.670533,6.133896,0.0013,0.575919,0.210503,0.989407,1.69259,0.310229,0.296799,1.355996
Bagging,0.713461,8.902772,6.435706,0.001681,0.270763,0.087474,0.941442,4.056847,2.652729,0.136271,1.319541
KVecinos,0.645013,9.928786,7.42886,0.002199,0.553593,0.227522,0.771498,8.009611,6.002164,0.094041,1.196097
AdaBoost,0.64033,9.994251,7.984168,0.001832,0.4036,0.153092,0.696247,9.242639,7.633498,0.185783,1.087326
Arbol_decision,0.551082,11.184562,7.89471,0.002592,0.721677,0.11485,0.989407,1.69259,0.310199,0.112903,1.795391


---
De todos los modelos evaluados, los que arrojan mejores métricas en test son:


1.   Category Boost
2.   LightGBM
3.   Gradient Boosting

Las varianzas que presentan para la validación cruzada en R² son bastante aceptables, lo que indica que no hay mucha variación entre foldings.

Si bien es cierto que los modelos basados en árboles han demostrado ser superiores en explicabilidad a los modelos lineales, la realidad es que los modelos que mejor rendimiento presentan también dan signos de overfitting (R² en training es superior al de test).

Esto se debe, entre otros potenciales factores, al reducido conjunto de datos que tenemos disponible, y que hace que el ratio entre registros (n) y variables independientes (k) se quede muy cerca de sobrepasar el margen óptimo.

Puesto que ya se ha hecho selección de variables en el pipeline, el siguiente paso es optimizar hiperparámetros de los modelos, teniendo en mente que el principal objetivo es reducir el overfitting.

---

In [8]:
gb_params = {
    'n_estimators': randint(100, 800),
    'learning_rate': loguniform(0.01, 0.3),
    'max_depth': randint(3, 12),
    'subsample': uniform(0.6, 0.4),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': uniform(0.5, 0.5)
}

lgbm_params = {
    'n_estimators': randint(100, 1000),
    'learning_rate': loguniform(0.01, 0.3),
    'max_depth': randint(3, 15),
    'num_leaves': randint(20, 80),
    'min_child_samples': randint(5, 30),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'reg_alpha': loguniform(1e-6, 10),
    'reg_lambda': loguniform(1e-6, 10)
}

cat_params = {
    'iterations': randint(100, 1000),
    'learning_rate': loguniform(0.01, 0.3),
    'depth': randint(3, 12),
    'l2_leaf_reg': loguniform(1e-3, 10),
    'bagging_temperature': uniform(0, 1),
    'border_count': randint(32, 255),
    'random_strength': uniform(0, 1),
    'subsample': uniform(0.6, 0.4)
}

gb_params_pipeline = {f'modelo__{k}': v for k, v in gb_params.items()}
lgbm_params_pipeline = {f'modelo__{k}': v for k, v in lgbm_params.items()}
cat_params_pipeline = {f'modelo__{k}': v for k, v in cat_params.items()}

modelos_parametros_pipeline = {
    'GradientBoosting': (GradientBoostingRegressor(random_state=seed),
                         gb_params_pipeline),
    'LightGBM': (LGBMRegressor(random_state=seed), lgbm_params_pipeline),
    'CatBoost': (CatBoostRegressor(random_state=seed, verbose=False),
                 cat_params_pipeline)
}

In [10]:
opt = optimizar_hiperparametros(modelos_parametros_pipeline, 
                                prep, selector_lasso, 
                                X, 
                                y, 
                                seed)
df_opt = pd.DataFrame.from_dict(opt, orient='index')
print(df_opt)

Exception in thread Thread-7 (_readerthread):
Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\envs\datasci_minimal\Lib\threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "c:\ProgramData\anaconda3\envs\datasci_minimal\Lib\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "c:\ProgramData\anaconda3\envs\datasci_minimal\Lib\threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "c:\ProgramData\anaconda3\envs\datasci_minimal\Lib\subprocess.py", line 1599, in _readerthread
    buffer.append(fh.read())
                  ^^^^^^^^^
  File "<frozen codecs>", line 322, in decode
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa2 in position 107: invalid start byte


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000329 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 437
[LightGBM] [Info] Number of data points in the train set: 772, number of used features: 5
[LightGBM] [Info] Start training from score 35.895711
                                                    mejor_estimador  \
GradientBoosting  ((KNNImputer(n_neighbors=7), RobustScaler()), ...   
LightGBM          ((KNNImputer(n_neighbors=7), RobustScaler()), ...   
CatBoost          ((KNNImputer(n_neighbors=7), RobustScaler()), ...   

                                                 mejores_parametros  mejor_r2  \
GradientBoosting  {'modelo__learning_rate': 0.017279572377986996...  0.781521   
LightGBM          {'modelo__colsample_bytree': 0.665173770832571...  0.774191   
CatBoost          {'modelo__bagging_temperature': 0.456534570482...  0.779901   

                  mejor_rmse_test  mejor_mae_test  

In [None]:
df_opt.to_excel('params_opt.xlsx')