# Implementación de Modelos

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz


# Utilidades pre-procesamiento, pipelines y automatización de entrenamiento
from sklearn.model_selection import (
    train_test_split, 
    cross_val_score, 
    KFold, 
    StratifiedKFold, 
    GridSearchCV
    )

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer

# Métricas de performance
from sklearn.metrics import (
    classification_report,
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    roc_auc_score, 
    roc_curve, auc,
    confusion_matrix
)

# Modelos
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVR, SVC
from sklearn.neural_network import MLPClassifier
from src import MOEConformalTransformer
from src import StableTabularMoE as stm

from src.trainClassifiers import train_LogisticRegression # Custom LogisticRegression para evaluar en multiples tresholds.

# Diccionario con el grid de algoritmos y especificación de sus respectivos hiperparámetros
from src.models_config import config 

# Función para muestreo con SMOTE y Tomek Links.
from src.TabularDataPipeline import TabularDataPipeline


# Utilidades
import missingno as msno
import time
from datetime import datetime
from typing import Dict, List, Tuple, Optional, Any, Union
import warnings
warnings.filterwarnings('ignore')

## Carga y Preprocesamiento de datos

In [2]:
# Load the dataset from the public GitHub raw URL into a DataFrame.
# NOTE(review): this requires network access on every run; consider caching a local copy.
url = 'https://raw.githubusercontent.com/tuliorozco/applied-statistics/refs/heads/main/data/diabetes_dataset.csv'
data = pd.read_csv(url)

In [3]:
# Preview the first rows to sanity-check the loaded columns.
data.head()

Unnamed: 0,year,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
0,2020,Female,32.0,Alabama,0,0,0,0,1,0,0,never,27.32,5.0,100,0
1,2015,Female,29.0,Alabama,0,1,0,0,0,0,0,never,19.95,5.0,90,0
2,2015,Male,18.0,Alabama,0,0,0,0,1,0,0,never,23.76,4.8,160,0
3,2015,Male,41.0,Alabama,0,0,1,0,0,0,0,never,27.32,4.0,159,0
4,2016,Female,52.0,Alabama,1,0,0,0,0,0,0,never,23.75,6.5,90,0


In [4]:
# Drop exact duplicate rows, then report the remaining record count and the frame shape.
data = data.drop_duplicates()
print(f"Nuevo total de registros: {len(data)}")
data.shape

Nuevo total de registros: 99986


(99986, 16)

In [5]:
# =================================== PREPROCESSING ===================================
# 1. Separate the predictor features (X) from the target variable (y).
#    TARGET_COL is the single source of truth for the target name.
TARGET_COL = 'diabetes'
X = data.drop(columns=[TARGET_COL])
y = data[TARGET_COL]

# 2. Group the columns by type so each group gets the appropriate transformation.
numerical_cols = ["age", "bmi", "hbA1c_level", "blood_glucose_level"]

# Indicator columns; they are passed through the preprocessor untouched.
binary_cols = ['race:AfricanAmerican', 'race:Asian', 'race:Caucasian',
               'race:Hispanic', 'race:Other', 'hypertension', 'heart_disease']

# Every remaining predictor is treated as categorical. Deriving the list from
# X (not data) keeps the target out by construction, so no manual removal of
# the target column is needed afterwards.
_non_categorical = set(numerical_cols) | set(binary_cols)
categorical_cols = [col for col in X.columns if col not in _non_categorical]


# 3. Train/test split (70/30), stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)


# 4. Column-wise preprocessing: one-hot encode categoricals, standardize
#    numericals, and pass the indicator columns through unchanged.
preprocessor = ColumnTransformer(transformers=[
    ("cat", OneHotEncoder(drop="first", handle_unknown='ignore'), categorical_cols),
    ("num", StandardScaler(), numerical_cols),
    ("bin", "passthrough", binary_cols)
])


## Metodología

### Especificación del Grid de Hiperparámetros por Tipo de Modelo

Teniendo en cuenta que los modelos a entrenar y analizar pueden ser ejecutados de forma automática mediante `GridSearchCV` (a excepción del algoritmo propuesto), se especificará un diccionario *models_config* estructurado de la siguiente forma:

* Key (clave): Identificación del tipo de algoritmo, ejemplo: KNN, LogisticRegression, DecisionTree, etc.
* Value (valor): El conjunto de funciones (la implementación de cada algoritmo en los paquetes `scikit-learn` o `xgboost`), los parámetros de inicialización del algoritmo (si los tiene) y sus respectivos hiperparámetros a evaluar en el entrenamiento.

En la especificación se muestra de la siguiente manera:

```python
models_config = {
        'KNN': {
            'function': KNeighborsClassifier,
            'param_grid': {
            'classifier__n_neighbors': [3, 5, 7, 9, 11, 13, 15],
            'classifier__weights': ['uniform', 'distance'],
            'classifier__metric': ['euclidean']
            }   
        } ,
         'LogisticRegression': {
            'function': LogisticRegression,
            'init_params': {'max_iter': 1000, 'random_state': random_state}, 
            'param_grid': {
                'classifier__C': [0.01, 0.1, 1, 10, 100],
                'classifier__penalty': ['l1', 'l2'],
                'classifier__solver': ['liblinear', 'saga']
            }
         },
    # Y así sucesivamente ...
    'MLP': {
        'function': MLPClassifier,
        'init_params': {
            'max_iter': 500,
            'random_state': 101
        },
        'param_grid': {
            'classifier__hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'classifier__activation': ['relu', 'tanh'],
            'classifier__solver': ['adam', 'sgd'],
            'classifier__alpha': [0.0001, 0.001, 0.01]
        }
    }
}
```
Así mismo, se define una variable tipo `Dict` denominada *common_params* con las siguientes *key-values*:

* `X_train`: Variables predictoras del conjunto de entrenamiento.
* `X_test`: Variables predictoras del conjunto de test.
* `y_train`: Variable *target* del conjunto de entrenamiento.
* `y_test`: Variable *target* del conjunto de test.
* `cv_folds`: Número de pliegues para la validación cruzada (cross validation).
* `random_state`: Semilla de reproducibilidad.

In [None]:
# Seed shared by the model grid and the cross-validation splitter.
random_state = 101

# Model grid (estimator class, optional init params, hyper-parameter grid per
# model) produced by the factory imported from src.models_config.
models_config = config(random_state=random_state)

# Inputs shared by every model during training and evaluation.
common_params = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    cv_folds=5,
    random_state=random_state,
)

### Entrenamiento de los Modelos

Definidos los algoritmos con sus respectivos hiperparámetros así como los parámetros comunes (conjuntos de entrenamiento y pruebas), se define un bucle *for* que recorrerá el diccionario y procesará cada algoritmo según lo especificado en *models_config*. Las métricas resultantes del entrenamiento se almacenarán en listas para el análisis comparativo posterior frente al modelo propuesto.

In [None]:
# =================================== TRAINING ===================================

# Accumulators for the metrics required by the benchmark.
metrics_rows = []
best_configs_list = []
cv_results_dfs = []   # one GridSearchCV cv_results_ DataFrame per model
roc_curves = {}       # model name -> (fpr, tpr, auc)

# The CV splitter does not depend on the model, so it is built once.
skf = StratifiedKFold(
    n_splits=common_params['cv_folds'],
    shuffle=True,
    random_state=common_params['random_state'],
)

# The loop variable is deliberately NOT named `config`: that name is the factory
# imported from src.models_config, and rebinding it to a dict here would break
# any later re-run of the cell that calls config(random_state=...).
for model_name, model_cfg in models_config.items():
    print(f"\nEntrenando modelo: {model_name}")

    # 1. Build the pipeline: shared preprocessing followed by the classifier.
    #    'init_params' is optional; fall back to the estimator defaults.
    init_params = model_cfg.get('init_params', {})
    pipeline = Pipeline([
        ("preprocessing", preprocessor),
        ("classifier", model_cfg['function'](**init_params)),
    ])

    # 2. Hyper-parameter search with stratified cross-validation, scored by ROC-AUC.
    grid_search = GridSearchCV(pipeline, model_cfg['param_grid'], cv=skf,
                               scoring='roc_auc', n_jobs=-1, verbose=1)

    # Wall-clock time of the full grid search for this model.
    inicio = time.perf_counter()
    grid_search.fit(common_params['X_train'], common_params['y_train'])
    fin = time.perf_counter()
    train_time = fin - inicio

    # Keep the complete GridSearchCV results, tagged with the model name.
    cv_results_df = pd.DataFrame(grid_search.cv_results_)
    cv_results_df["Model"] = model_name
    cv_results_dfs.append(cv_results_df)

    # 3. Test-set predictions from the refitted best estimator.
    y_pred = grid_search.predict(common_params['X_test'])
    y_proba = grid_search.predict_proba(common_params['X_test'])[:, 1]

    # 4. Main metrics. confusion_matrix rows are true labels and columns are
    #    predicted labels, hence cm[0, 1] is FP and cm[1, 0] is FN.
    cm = confusion_matrix(common_params['y_test'], y_pred)
    metrics_row = {
        'Model': model_name,
        'Precision': precision_score(common_params['y_test'], y_pred, zero_division=0),
        'Recall': recall_score(common_params['y_test'], y_pred, zero_division=0),
        'F1-Score': f1_score(common_params['y_test'], y_pred, zero_division=0),
        'Accuracy': accuracy_score(common_params['y_test'], y_pred),
        'ROC-AUC': roc_auc_score(common_params['y_test'], y_proba),
        # Strip the pipeline step prefix so the parameters read like estimator kwargs.
        'Best Params': {k.replace("classifier__", ""): v for k, v in grid_search.best_params_.items()},
        'TN': int(cm[0, 0]),
        'TP': int(cm[1, 1]),
        'FP': int(cm[0, 1]),
        'FN': int(cm[1, 0])
    }
    metrics_rows.append(metrics_row)

    # 5. One long-format record per metric, used later to pick the best model per metric.
    for metric in ['Precision', 'Recall', 'F1-Score', 'Accuracy', 'ROC-AUC']:
        best_configs_list.append({
            'Modelo': model_name,
            'Métrica': metric,
            'Valor': metrics_row[metric],
            'Parámetros': metrics_row['Best Params']
        })

    # 6. Store the ROC curve points and their area for plotting.
    fpr, tpr, _ = roc_curve(common_params['y_test'], y_proba)
    roc_auc = auc(fpr, tpr)
    roc_curves[model_name] = (fpr, tpr, roc_auc)

    print(f"Entrenamiento completado. Tiempo entrenamiento {model_name}: {train_time:.4f} segundos")

# Benchmark table: one row per model, columns in presentation order.
col_order = [
    'Model', 'Precision', 'Recall', 'F1-Score', 'Accuracy', 'ROC-AUC',
    'Best Params', 'TN', 'TP', 'FP', 'FN'
]
results_df = pd.DataFrame(metrics_rows)[col_order]

best_config_df = pd.DataFrame(best_configs_list)[['Modelo', 'Métrica', 'Valor', 'Parámetros']]


Entrenando modelo: KNN
Fitting 5 folds for each of 14 candidates, totalling 70 fits
Entrenamiento completado. Tiempo entrenamiento KNN: 302.4487 segundos

Entrenando modelo: LogisticRegression
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Entrenamiento completado. Tiempo entrenamiento LogisticRegression: 90.0171 segundos

Entrenando modelo: DecisionTree
Fitting 5 folds for each of 45 candidates, totalling 225 fits
Entrenamiento completado. Tiempo entrenamiento DecisionTree: 10.2351 segundos

Entrenando modelo: RandomForest
Fitting 5 folds for each of 45 candidates, totalling 225 fits
Entrenamiento completado. Tiempo entrenamiento RandomForest: 286.5900 segundos

Entrenando modelo: XGBoost
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Entrenamiento completado. Tiempo entrenamiento XGBoost: 50.7437 segundos

Entrenando modelo: SVM
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Entrenamiento completado. Tiempo entrenamiento SVM: 3210.0026 seg

---

Construcción del modelo:

In [6]:
# --- 3. Data split for the MoE models ---

# 70% train; the remaining 30% is halved into calibration (15%) and test (15%).
# NOTE(review): this rebinds X_train / X_test / y_train / y_test from the earlier
# 70/30 split, so any later cell that expects the 70/30 split sees these instead.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
X_cal, X_test, y_cal, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Report the shape of each split.
print("Tamaños:")
for split_label, split_features in (
    ("Train:", X_train),
    ("Calibración:", X_cal),
    ("Test:", X_test),
):
    print(split_label, split_features.shape)

Tamaños:
Train: (69990, 15)
Calibración: (14998, 15)
Test: (14998, 15)


In [7]:
# Vocabulary size (number of distinct values) for each categorical column.
# NOTE(review): sizes are computed on the full `data` frame (all splits) — confirm this
# matches the integer codes the model actually receives.
cat_vocab_sizes = {col: data[col].nunique() for col in categorical_cols}
print(cat_vocab_sizes)

{'year': 7, 'gender': 3, 'location': 55, 'smoking_history': 6}


In [8]:
# Build the model with the builder from src.StableTabularMoE, passing the column
# groups and the per-column categorical vocabulary sizes.
# NOTE(review): the architecture values below are hard-coded; their exact meaning
# is defined by the builder in src.StableTabularMoE.
model = stm.build_switch_transformer_tabular(
    cont_cols=numerical_cols,
    bin_cols=binary_cols,
    cat_cols=categorical_cols,
    cat_vocab_sizes=cat_vocab_sizes,
    d_model=64,
    num_heads=4,
    d_ff=256,
    num_experts=4,
    num_layers=2,
    dropout=0.1
)

2025-08-29 09:39:19.521794: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3
2025-08-29 09:39:19.521835: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 24.00 GB
2025-08-29 09:39:19.521845: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 8.00 GB
2025-08-29 09:39:19.521867: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-08-29 09:39:19.521883: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [8]:
# OPTION 1: no class balancing. This value is passed as `smote_config` to stm.train.
smote_config = None

In [9]:
from sklearn.preprocessing import OrdinalEncoder

# Categories not seen during fit are encoded as -1 instead of raising.
# NOTE(review): confirm the downstream model can handle a -1 code.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Fit on the training split ONLY, overwriting its categorical columns with the codes.
X_train[categorical_cols] = encoder.fit_transform(X_train[categorical_cols])

# Reuse the already-fitted encoder (never refit) on the calibration and test splits.
for holdout_features in (X_cal, X_test):
    holdout_features[categorical_cols] = encoder.transform(holdout_features[categorical_cols])


# Re-attach the target to each split; every frame is a copy of its features
# with the target appended as the last column.
train_df = X_train.assign(**{TARGET_COL: y_train})   # training frame
cal_df = X_cal.assign(**{TARGET_COL: y_cal})         # calibration frame
test_df = X_test.assign(**{TARGET_COL: y_test})      # frame consumed by evaluate

Entrenamiento del modelo:

In [None]:
# Train the model via src.StableTabularMoE.train; returns the model and its training history.
# NOTE(review): cal_df is passed here as the validation frame and is also the frame used
# for conformal calibration in the following step — confirm this reuse is intended.
model, history = stm.train(
    model=model,
    df_train=train_df,
    df_val=cal_df,   # validation data used during training
    cont_cols=numerical_cols,
    bin_cols=binary_cols,
    cat_cols=categorical_cols,
    target_col=TARGET_COL,
    smote_config=smote_config,  # balancing (or not) is controlled by this setting
    epochs=30,
    lr=3e-4,
    batch_size=256,
    focal_loss=True
)

Calibrar conformal (split-conformal sobre cal_df):

In [None]:
# Conformal calibration on the calibration frame at error level alpha=0.1.

qhats = stm.calibrate_conformal_conditional(
    model,
    df_val=cal_df,   # calibration split
    cont_cols=numerical_cols,
    bin_cols=binary_cols,
    cat_cols=categorical_cols,
    target_col=TARGET_COL,
    alpha=0.1
)
print("Umbrales de calibración:", qhats)

Evaluar en el conjunto test:

In [None]:
# Evaluation of the StableTabularMoE model on the test frame at threshold 0.5.
# The outputs are bound to stm-specific names on purpose: `results_df` is the
# benchmark table built by the baseline training loop and formatted in the
# results section, so rebinding it here would corrupt that table on a
# top-to-bottom run.
stm_results_df, stm_cm = stm.evaluate(
    model,
    df_test=test_df,
    cont_cols=numerical_cols,
    bin_cols=binary_cols,
    cat_cols=categorical_cols,
    target_col=TARGET_COL,
    threshold=0.5
)
print(stm_results_df)
print("Matriz de confusión:", stm_cm)

# TODO: check whether evaluate accepts a list of thresholds,
# e.g. thresholds=[0.3, 0.35, 0.4, 0.45, 0.5, conf_threshold].

In [10]:
from src import TopKTabularMoE as tkm

# Build the Top-K variant of the model.
# NOTE: this rebinds `model`, replacing the model built with src.StableTabularMoE.
model = tkm.build_switch_transformer_tabular(
    cont_cols=numerical_cols,
    bin_cols=binary_cols,
    cat_cols=categorical_cols,
    cat_vocab_sizes=cat_vocab_sizes,
    d_model=64,
    num_heads=4,
    d_ff=256,
    num_experts=8, # 8 experts in total
    num_layers=2,
    dropout=0.2,
    k=2  # <-- each token is processed by the 2 best experts
)

model.summary()
print("Modelo construido exitosamente.")


Modelo construido exitosamente.


In [None]:
# Run the training function; returns the model and its training history.
# NOTE(review): cal_df serves as the validation frame here and as the calibration
# frame in the conformal step — confirm this reuse is intended.
model, history = tkm.train(
    model=model,
    df_train=train_df,
    df_val=cal_df,
    cont_cols=numerical_cols,
    bin_cols=binary_cols,
    cat_cols=categorical_cols,
    target_col=TARGET_COL,
    smote_config={'apply': False}, # SMOTE is not used for now
    epochs=30,                     # number of epochs
    batch_size=256,                # batch size
    lr=3e-4,                       # learning rate
    focal_loss=True,               # use focal loss because of the class imbalance
    mixed_precision=True
)
print("Entrenamiento completado.")

Epoch 1/30


2025-08-29 09:40:01.879247: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.



Epoch 1: val_auc improved from -inf to 0.72004, saving model to model.weights.best.weights.h5
501/501 - 115s - 229ms/step - accuracy: 0.8349 - auc: 0.8930 - loss: 0.0770 - precision: 0.9758 - recall: 0.6868 - val_accuracy: 0.9129 - val_auc: 0.7200 - val_loss: 0.0356 - val_precision: 0.3571 - val_recall: 0.0314
Epoch 2/30

Epoch 2: val_auc improved from 0.72004 to 0.87200, saving model to model.weights.best.weights.h5
501/501 - 84s - 168ms/step - accuracy: 0.8683 - auc: 0.9459 - loss: 0.0668 - precision: 0.9910 - recall: 0.7433 - val_accuracy: 0.9301 - val_auc: 0.8720 - val_loss: 0.0218 - val_precision: 0.8229 - val_recall: 0.2259
Epoch 3/30

Epoch 3: val_auc improved from 0.87200 to 0.89290, saving model to model.weights.best.weights.h5
501/501 - 96s - 191ms/step - accuracy: 0.8896 - auc: 0.9633 - loss: 0.0630 - precision: 0.9877 - recall: 0.7891 - val_accuracy: 0.9367 - val_auc: 0.8929 - val_loss: 0.0179 - val_precision: 0.9683 - val_recall: 0.2635
Epoch 4/30

Epoch 4: val_auc improv

In [None]:
# --- STEP 4: CONFORMAL CALIBRATION ---
# Use the calibration frame to derive a decision threshold at error level ALPHA.
print("\nPaso 4: Realizando la calibración conformal...")
ALPHA = 0.1 # acceptable error level (targets 90% confidence)

qhat, calibrated_threshold = tkm.calibrate_conformal(
    model,
    df_cal=cal_df,
    cont_cols=numerical_cols,
    bin_cols=binary_cols,
    cat_cols=categorical_cols,
    target_col=TARGET_COL,
    alpha=ALPHA
)
print("Calibración completada.")
print(f"Nivel de error alpha: {ALPHA}")
print(f"Cuantil de no-conformidad (q-hat): {qhat:.4f}")
print(f"Umbral de decisión calibrado: {calibrated_threshold:.4f}")

In [None]:
# --- STEP 5: FINAL EVALUATION ON THE TEST SET ---
# Measure the model on the held-out test split at two decision thresholds.
print("\nPaso 5: Evaluando el modelo en el conjunto de prueba...")

# Test features plus the target column in a single frame.
df_test = X_test.assign(**{TARGET_COL: y_test})

# Arguments shared by both evaluation runs; only the threshold differs.
eval_kwargs = dict(
    df_test=df_test,
    cont_cols=numerical_cols,
    bin_cols=binary_cols,
    cat_cols=categorical_cols,
    target_col=TARGET_COL,
)

# Baseline: the standard 0.5 threshold.
print("\n--- Evaluación con umbral estándar de 0.5 ---")
results_50, cm_50 = tkm.evaluate(model, threshold=0.5, **eval_kwargs)
print(results_50)
print("Matriz de confusión (umbral 0.5):", cm_50)

# Threshold obtained from the conformal calibration step.
print(f"\n--- Evaluación con umbral calibrado de {calibrated_threshold:.4f} ---")
results_cal, cm_cal = tkm.evaluate(model, threshold=calibrated_threshold, **eval_kwargs)
print(results_cal)
print("Matriz de confusión (umbral calibrado):", cm_cal)

## Resultados y *Benchmark*

In [15]:
# Banner for the benchmark comparison table.
banner = "=" * 100
print(f"\n{banner}")
print(f"{' ' * 35}TABLA COMPARATIVA DE RESULTADOS")
print(banner)

# Render the table without its index, metrics shown with three decimals.
metric_cols = ["Precision", "Recall", "F1-Score", "Accuracy", "ROC-AUC"]
results_df.style.hide(axis="index").format({col: "{:.3f}" for col in metric_cols})


                                   TABLA COMPARATIVA DE RESULTADOS


Model,Precision,Recall,F1-Score,Accuracy,ROC-AUC,Best Params,TN,TP,FP,FN
KNN,0.987,0.522,0.683,0.959,0.929,"{'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'distance'}",27428,1331,18,1219
LogisticRegression,0.875,0.628,0.732,0.961,0.96,"{'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}",27218,1602,228,948
DecisionTree,0.971,0.676,0.797,0.971,0.974,"{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}",27394,1725,52,825
RandomForest,0.998,0.666,0.799,0.971,0.964,"{'max_depth': None, 'min_samples_split': 10, 'n_estimators': 300}",27442,1699,4,851
XGBoost,0.985,0.677,0.803,0.972,0.978,"{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}",27420,1727,26,823
SVM,0.922,0.597,0.725,0.961,0.959,"{'C': 10, 'gamma': 'scale', 'kernel': 'linear'}",27318,1522,128,1028
MLP,0.938,0.682,0.79,0.969,0.972,"{'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (50,), 'solver': 'adam'}",27332,1738,114,812


In [20]:
# For each metric, find the row label holding the highest 'Valor'.
valor_by_metric = best_config_df.groupby("Métrica")["Valor"]
idx = valor_by_metric.idxmax()

# Keep only those winning rows, renumbered from zero.
best_overall_df = best_config_df.loc[idx].reset_index(drop=True)

rule = "=" * 90
print(f"\n{rule}")
print(f"{' ' * 30}TABLA MEJORES CONFIGURACIONES")
print(rule)
best_overall_df.style.hide(axis="index").format({"Valor": "{:.3f}"})


                              TABLA MEJORES CONFIGURACIONES


Modelo,Métrica,Valor,Parámetros
XGBoost,Accuracy,0.972,"{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}"
XGBoost,F1-Score,0.803,"{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}"
RandomForest,Precision,0.998,"{'max_depth': None, 'min_samples_split': 10, 'n_estimators': 300}"
XGBoost,ROC-AUC,0.978,"{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}"
MLP,Recall,0.682,"{'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (50,), 'solver': 'adam'}"


In [17]:
# Train a logistic regression model and evaluate it at decision thresholds
# between 0.30 and 0.48.

# Decision thresholds to evaluate: 0.30, 0.32, ..., 0.48.
thresholds = [round(0.30 + i * 0.02, 2) for i in range(10)]

# Rebuild the benchmark 70/30 split from the untouched X / y. By this point
# X_train / X_test have been rebound to the 70/15/15 split and their
# categorical columns ordinal-encoded in place, which matches neither what
# `preprocessor` was designed for (one-hot over the raw categories) nor the
# test set used for the benchmark table this sweep is compared against.
# The arguments are the same as the benchmark split, so the rows are identical.
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# NOTE: this rebinds `results_df` to the per-threshold table shown in the next cell.
results_df, best_configs = train_LogisticRegression(
    X_train=X_train_lr,
    X_test=X_test_lr,
    y_train=y_train_lr,
    y_test=y_test_lr,
    preprocessor=preprocessor,
    thresholds=thresholds,
    param_grid={
        'classifier__C': [0.01, 0.1, 1, 10, 100],
        'classifier__solver': ['liblinear', 'lbfgs']
    },
    cv_folds=5,
    max_iter=1000,
    random_state=101
)

In [18]:
# Per-threshold metrics returned by train_LogisticRegression.
results_df

Unnamed: 0,Threshold,Precision,Recall,F1-Score,Accuracy,ROC-AUC,Best Params,TN,TP,FP,FN
0,0.3,0.714342,0.70902,0.711671,0.95116,0.960556,C: 0.1,26723,1808,723,742
1,0.32,0.733664,0.695686,0.714171,0.95266,0.960556,C: 0.1,26802,1774,644,776
2,0.34,0.751398,0.685098,0.716718,0.953961,0.960556,C: 0.1,26868,1747,578,803
3,0.36,0.768,0.677647,0.72,0.955194,0.960556,C: 0.1,26924,1728,522,822
4,0.38,0.783809,0.668235,0.721423,0.956127,0.960556,C: 0.1,26976,1704,470,846
5,0.4,0.804379,0.662745,0.726725,0.957628,0.960556,C: 0.1,27035,1690,411,860
6,0.42,0.820098,0.656078,0.728976,0.958528,0.960556,C: 0.1,27079,1673,367,877
7,0.44,0.836198,0.648627,0.730565,0.959328,0.960556,C: 0.1,27122,1654,324,896
8,0.46,0.850731,0.639216,0.72996,0.959795,0.960556,C: 0.1,27160,1630,286,920
9,0.48,0.862179,0.632941,0.729986,0.960195,0.960556,C: 0.1,27188,1614,258,936


In [19]:
# Best threshold per metric returned by train_LogisticRegression.
best_configs

Unnamed: 0,Métrica,Mejor Threshold,Valor,Parámetros
0,Precision,0.48,0.862179,C: 0.1
1,Recall,0.3,0.70902,C: 0.1
2,F1-Score,0.44,0.730565,C: 0.1
3,Accuracy,0.48,0.960195,C: 0.1
4,ROC-AUC,0.3,0.960556,C: 0.1


# Evaluación de Métricas de Performance