# Implementación de Modelos

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz


# Utilidades pre-procesamiento, pipelines y automatización de entrenamiento
from sklearn.model_selection import (
    train_test_split, 
    cross_val_score, 
    KFold, 
    StratifiedKFold, 
    GridSearchCV
    )

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer

# Métricas de performance
from sklearn.metrics import (
    classification_report,
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    roc_auc_score, 
    roc_curve, auc,
    confusion_matrix
)

# Modelos
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVR, SVC
from sklearn.neural_network import MLPClassifier
from src import MOEConformalTransformer

from src.trainClassifiers import train_LogisticRegression # Custom LogisticRegression para evaluar en multiples tresholds.

# Diccionario con el grid de algoritmos y especificación de sus respectivos hiperparámetros
from src.models_config import config 

# Función para muestreo con SMOTE y Token Links.
from src.TabularDataPipeline import TabularDataPipeline


# Utilidades
import missingno as msno
import time
from datetime import datetime
from typing import Dict, List, Tuple, Optional, Any, Union
import warnings
warnings.filterwarnings('ignore')

## Carga y Preprocesamiento de datos

In [5]:
# Importación de los datos
url = 'https://raw.githubusercontent.com/tuliorozco/applied-statistics/refs/heads/main/data/diabetes_dataset.csv'
data = pd.read_csv(url)

In [3]:
data.head()

Unnamed: 0,year,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
0,2020,Female,32.0,Alabama,0,0,0,0,1,0,0,never,27.32,5.0,100,0
1,2015,Female,29.0,Alabama,0,1,0,0,0,0,0,never,19.95,5.0,90,0
2,2015,Male,18.0,Alabama,0,0,0,0,1,0,0,never,23.76,4.8,160,0
3,2015,Male,41.0,Alabama,0,0,1,0,0,0,0,never,27.32,4.0,159,0
4,2016,Female,52.0,Alabama,1,0,0,0,0,0,0,never,23.75,6.5,90,0


In [6]:
# Eliminación de observaciones duplicadas
data = data.drop_duplicates()
print(f"Nuevo total de registros: {len(data)}")
data.shape

Nuevo total de registros: 99986


(99986, 16)

In [7]:
# =================================== PREPROCESAMIENTO ===================================
# 1. Separar las features predictoras (X) de la variable objetivo/target (y).
X = data.drop('diabetes', axis=1)
y = data['diabetes']

# 2. Clasificar los campos por tipo de variable para el pre-procesamiento adecuado
numerical_cols = ["age", "bmi", "hbA1c_level", "blood_glucose_level"] 

binary_cols = ['race:AfricanAmerican','race:Asian','race:Caucasian',
               'race:Hispanic','race:Other','hypertension','heart_disease'] 

categorical_cols = [col for col in data.columns if col not in numerical_cols + binary_cols]
categorical_cols.remove('diabetes') 


# 3. División Train/Test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)


# 4. Pre-procesamiento de los datos
preprocessor = ColumnTransformer(transformers=[
    ("cat", OneHotEncoder(drop="first", handle_unknown='ignore'), categorical_cols),  
    ("num", StandardScaler(), numerical_cols),
    ("bin", "passthrough", binary_cols)  
])


## Metodología

### Especificación del Grid de Hiperparámetros por Tipo de Modelo

Teniendo en cuenta que los modelos a entrenar y analizar pueden ser ejecutados de forma automática mediante `GridSearchCV` (A excepción del algoritmo propuesto), se especificará un diccionario *models_config* estructurado de la siguiente forma:

* Key (clave): Identificación del tipo de algoritmo, ejemplo: KNN, LogisticRegresion, DecisionTree, etc.
* Value (valor): El conjunto de funciones (la implementación de cada algortimo en los paquetes `scikit-learn` o `xgboost`), los parámetros de inicialización del algoritmo (si los tiene) y sus respectivos hiperparámetros a evaluar en el entrenamiento. 

En la especificación se muestra de la siguiente manera:

```python
models_config = {
        'KNN': {
            'function': KNeighborsClassifier,
            'param_grid': {
            'classifier__n_neighbors': [3, 5, 7, 9, 11, 13, 15],
            'classifier__weights': ['uniform', 'distance'],
            'classifier__metric': ['euclidean']
            }   
        } ,
         'LogisticRegression': {
            'function': LogisticRegression,
            'init_params': {'max_iter': 1000, 'random_state': random_state}, 
            'param_grid': {
                'classifier__C': [0.01, 0.1, 1, 10, 100],
                'classifier__penalty': ['l1', 'l2'],
                'classifier__solver': ['liblinear', 'saga']
            }
         },
    # Y así sucesivamente ...
    'MLP': {
        'function': MLPClassifier,
        'init_params': {
            'max_iter': 500,
            'random_state': 101
        },
        'param_grid': {
            'classifier__hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'classifier__activation': ['relu', 'tanh'],
            'classifier__solver': ['adam', 'sgd'],
            'classifier__alpha': [0.0001, 0.001, 0.01]
        }
    }
}
```
Así mismo, se define una variable tipo `Dict` denominada *common_params* con las siguientes *key-values*:

* `X_train`: Variables predictoras del conjunto de entrenamiento.
* `X_test`: Variables predictoras del conjunto de test.
* `y_train`: Variable *target* del conjunto de entrenamiento.
* `y_test`: Variable *target* del conjunto de test.
* `cv_folds`: Número de pliegues para la validación cruzada (cross validation).
* `random_state`: Semilla de reproducibilidad.

In [None]:
random_state = 101
models_config = config(random_state=random_state)

# Parámetros comunes para el entrenamiento
common_params = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'cv_folds': 5,
    'random_state': random_state
}

### Entrenamiento de los Modelos

Definidos los algoritmos con sus respectivos hiperparámetros así como los parámetros comunes (conjuntos de entrenamiento y pruebas), se define un bucle *for* que recorrerá el diccionario y procesará cada algoritmo según lo especificado en *models_config*. Las métricas resultantes del entrenamiento se almacenarán en listas para el análisis comparativo posterior frente al modelo propuesto.

In [None]:
# =================================== ENTRENAMIENTO ===================================

# Estructuras para almacenar las métricas requeridas en el benchmark.
metrics_rows = []
best_configs_list = []
cv_results_dfs = []   # lista para guardar todos los resultados de GridSearchCV
roc_curves = {}       # diccionario para guardar las curvas ROC

for model_name, config in models_config.items():
    print(f"\nEntrenando modelo: {model_name}")
    
    # 1. Construcción del pipeline
    steps = []
    steps.append(("preprocessing", preprocessor))

    # Usar init_params si existen, si no dict vacío
    init_params = config.get('init_params', {})
    steps.append(("classifier", config['function'](**init_params)))
    pipeline = Pipeline(steps)
    
    # 2. Cross-validation
    skf = StratifiedKFold(n_splits=common_params['cv_folds'], shuffle=True, random_state=common_params['random_state'])
    grid_search = GridSearchCV(pipeline, config['param_grid'], cv=skf, scoring='roc_auc', n_jobs=-1, verbose=1)

    # calcular los tiempos de entranamiento de cada modelo.
    inicio = time.perf_counter()

    grid_search.fit(common_params['X_train'], common_params['y_train'])
    fin = time.perf_counter()
    train_time = fin - inicio

    # Guardar resultados completos de GridSearchCV
    cv_results_df = pd.DataFrame(grid_search.cv_results_)
    cv_results_df["Model"] = model_name  # para identificar de qué modelo son
    cv_results_dfs.append(cv_results_df)
    
    # 3. Predicciones
    y_pred = grid_search.predict(common_params['X_test'])
    y_proba = grid_search.predict_proba(common_params['X_test'])[:, 1]
    
    # 4. Resultados principales
    cm = confusion_matrix(common_params['y_test'], y_pred)
    metrics_row = {
        'Model': model_name,
        'Precision': precision_score(common_params['y_test'], y_pred, zero_division=0),
        'Recall': recall_score(common_params['y_test'], y_pred, zero_division=0),
        'F1-Score': f1_score(common_params['y_test'], y_pred, zero_division=0),
        'Accuracy': accuracy_score(common_params['y_test'], y_pred),
        'ROC-AUC': roc_auc_score(common_params['y_test'], y_proba),
        'Best Params': {k.replace("classifier__", ""): v for k, v in grid_search.best_params_.items()},
        'TN': int(cm[0, 0]),
        'TP': int(cm[1, 1]),
        'FP': int(cm[0, 1]),
        'FN': int(cm[1, 0])
    }
    metrics_rows.append(metrics_row)
    
    # 5) Guardar las configuraciones de mejores métricas en la lista
    for metric in ['Precision', 'Recall', 'F1-Score', 'Accuracy', 'ROC-AUC']:
        best_configs_list.append({
            'Modelo': model_name,
            'Métrica': metric,
            'Valor': metrics_row[metric],
            'Parámetros': metrics_row['Best Params']
        })

    # 6. Guardar curva ROC
    fpr, tpr, _ = roc_curve(common_params['y_test'], y_proba)
    roc_auc = auc(fpr, tpr)
    roc_curves[model_name] = (fpr, tpr, roc_auc)
    
    print(f"Entrenamiento completado. Tiempo entrenamiento {model_name}: {train_time:.4f} segundos")

col_order = [
    'Model', 'Precision', 'Recall', 'F1-Score', 'Accuracy', 'ROC-AUC',
    'Best Params', 'TN', 'TP', 'FP', 'FN'
]
results_df = pd.DataFrame(metrics_rows)[col_order]

best_config_df = pd.DataFrame(best_configs_list)[['Modelo', 'Métrica', 'Valor', 'Parámetros']]


Entrenando modelo: KNN
Fitting 5 folds for each of 14 candidates, totalling 70 fits
Entrenamiento completado. Tiempo entrenamiento KNN: 302.4487 segundos

Entrenando modelo: LogisticRegression
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Entrenamiento completado. Tiempo entrenamiento LogisticRegression: 90.0171 segundos

Entrenando modelo: DecisionTree
Fitting 5 folds for each of 45 candidates, totalling 225 fits
Entrenamiento completado. Tiempo entrenamiento DecisionTree: 10.2351 segundos

Entrenando modelo: RandomForest
Fitting 5 folds for each of 45 candidates, totalling 225 fits
Entrenamiento completado. Tiempo entrenamiento RandomForest: 286.5900 segundos

Entrenando modelo: XGBoost
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Entrenamiento completado. Tiempo entrenamiento XGBoost: 50.7437 segundos

Entrenando modelo: SVM
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Entrenamiento completado. Tiempo entrenamiento SVM: 3210.0026 seg

---

Construcción del modelo:

In [11]:
# Selección de muestra balanceada para evaluar el algoritmo
from imblearn.combine import SMOTETomek

# Transformar X → numérico
X_prep = preprocessor.fit_transform(X)

# Ahora sí aplicar SMOTE + Tomek
smt = SMOTETomek(sampling_strategy=1.0, random_state=42)
X_res, y_res = smt.fit_resample(X_prep, y)

# Convertir a DataFrame después del resampling
X_res_df = pd.DataFrame(X_res.toarray())  # .toarray() convierte sparse → numpy
y_res_df = pd.Series(y_res, name="diabetes")

# Concatenar
df_res = pd.concat([X_res_df, y_res_df], axis=1)


df_balanced = (df_res.groupby("diabetes", group_keys=False)
                       .apply(lambda x: x.sample(8000, random_state=42)))

In [14]:
X_train.head()

Unnamed: 0,year,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level
84407,2019,Female,9.0,South Carolina,1,0,0,0,0,0,0,No Info,16.51,6.0,140
62008,2019,Female,64.0,New Hampshire,0,1,0,0,0,0,0,never,27.32,6.2,158
63952,2019,Male,60.0,New Jersey,0,0,0,0,1,0,0,current,18.46,4.5,85
85354,2019,Male,80.0,South Carolina,0,0,0,1,0,1,1,No Info,27.32,5.7,140
30206,2019,Male,50.0,Indiana,0,1,0,0,0,1,0,never,27.32,6.2,200


In [None]:
df_balanced

In [7]:
print("\nDataFrame Balanceado y Listo para Entrenar:")
balanced_df.head()


DataFrame Balanceado y Listo para Entrenar:


Unnamed: 0,age,bmi,hbA1c_level,blood_glucose_level,year,gender,location,smoking_history,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,diabetes
0,0.293398,1.353667,3.183348,0.038482,3.0,0.0,15.320869,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.320869,1
1,-0.394644,-0.000116,0.254498,-1.426227,3.0,0.0,39.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
2,0.493601,-0.000116,1.001695,-1.303403,3.0,0.0,15.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
3,1.204198,0.509134,1.001695,0.538966,3.0,0.0,31.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
4,0.945856,1.02511,0.990515,-0.154907,3.0,0.302815,29.302815,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [12]:
df_balanced.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,16000.0,0.077735,0.262473,0.0,0.0,0.0,0.0,1.000000
1,16000.0,0.024232,0.148796,0.0,0.0,0.0,0.0,1.000000
2,16000.0,0.817054,0.378605,0.0,1.0,1.0,1.0,1.000000
3,16000.0,0.000145,0.010680,0.0,0.0,0.0,0.0,0.924060
4,16000.0,0.000062,0.007781,0.0,0.0,0.0,0.0,0.984247
...,...,...,...,...,...,...,...,...
74,16000.0,0.192176,0.391471,0.0,0.0,0.0,0.0,1.000000
75,16000.0,0.200468,0.397567,0.0,0.0,0.0,0.0,1.000000
76,16000.0,0.142714,0.337908,0.0,0.0,0.0,0.0,1.000000
77,16000.0,0.078396,0.256439,0.0,0.0,0.0,0.0,1.000000


In [9]:
# Split de los datos en train / cal / test (por ejemplo 70/15/15) manteniendo estratificación:

train, temp = train_test_split(data, test_size=0.3, stratify=data['diabetes'], random_state=42)
cal, test  = train_test_split(temp, test_size=0.5, stratify=temp['diabetes'], random_state=42)

# Construcción del modelo
# número de layers sugeridos [2, 3, 4]
# dropouts sugeridos [0.1, 0.2, 0.3]
# Mejores métricas 
# d_model = 64
# num_heads = 4
# d_ff = 4 * d_model
# num_experts = 8
# num_layers = 2
# dropout = 0.2
d_model = 64
model = MOEConformalTransformer.build_moe_transformer_tabular(train, numerical_cols, binary_cols, categorical_cols,
                                      d_model=d_model, num_heads=4, d_ff=4 * d_model,
                                      num_experts=8, num_layers=2, dropout=0.2) 


# make class weights
cw = MOEConformalTransformer.make_class_weight(y)

model, hist = MOEConformalTransformer.train(model,
                    df_train=train, df_val=cal,
                    cont_cols=numerical_cols, bin_cols=binary_cols, cat_cols=categorical_cols,
                    target_col='diabetes',
                    batch_size=512, epochs=30,
                    class_weight=cw, lr=3e-4, focal_loss=False)

Epoch 1/30

Epoch 1: val_auc improved from -inf to 0.50000, saving model to model.weights.best.weights.h5
137/137 - 30s - 216ms/step - accuracy: 0.7574 - auc: 0.5127 - loss: 12.4594 - precision: 0.0925 - recall: 0.2103 - val_accuracy: 0.9150 - val_auc: 0.5000 - val_loss: 1.3702 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 2/30

Epoch 2: val_auc improved from 0.50000 to 0.53035, saving model to model.weights.best.weights.h5
137/137 - 15s - 106ms/step - accuracy: 0.8004 - auc: 0.5069 - loss: 13.7970 - precision: 0.0904 - recall: 0.1487 - val_accuracy: 0.1406 - val_auc: 0.5304 - val_loss: 13.7017 - val_precision: 0.0900 - val_recall: 1.0000
Epoch 3/30

Epoch 3: val_auc did not improve from 0.53035
137/137 - 14s - 103ms/step - accuracy: 0.8649 - auc: 0.5060 - loss: 14.2956 - precision: 0.0994 - recall: 0.0731 - val_accuracy: 0.9150 - val_auc: 0.5000 - val_loss: 1.3702 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 4/30

Epoch 4: val_auc did not improve from 0.5303

Calibrar conformal (split-conformal sobre cal):

In [None]:
qhat, conf_threshold = MOEConformalTransformer.calibrate_conformal(model, cal,
                                           numerical_cols, binary_cols, categorical_cols,
                                           target_col=y, alpha=0.1)
print("Conformal threshold p >=", conf_threshold)

Evaluar en el conjunto test:

In [None]:
# usar umbral conforme
results_df, cm = MOEConformalTransformer.evaluate(model, test, numerical_cols, binary_cols, categorical_cols, target_col=y,
                          threshold=conf_threshold, thresholds=[0.3, 0.35, 0.4, 0.45, 0.5, conf_threshold])
print(results_df.to_markdown(index=False))
print("matriz confusion:", cm)

## Resultados y *Benchmark*

In [15]:
print("\n" + "="*100)
print(" "*35 + "TABLA COMPARATIVA DE RESULTADOS")
print("="*100)
results_df.style.hide(axis="index").format({
    "Precision": "{:.3f}",
    "Recall": "{:.3f}",
    "F1-Score": "{:.3f}",
    "Accuracy": "{:.3f}",
    "ROC-AUC": "{:.3f}"
})


                                   TABLA COMPARATIVA DE RESULTADOS


Model,Precision,Recall,F1-Score,Accuracy,ROC-AUC,Best Params,TN,TP,FP,FN
KNN,0.987,0.522,0.683,0.959,0.929,"{'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'distance'}",27428,1331,18,1219
LogisticRegression,0.875,0.628,0.732,0.961,0.96,"{'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}",27218,1602,228,948
DecisionTree,0.971,0.676,0.797,0.971,0.974,"{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}",27394,1725,52,825
RandomForest,0.998,0.666,0.799,0.971,0.964,"{'max_depth': None, 'min_samples_split': 10, 'n_estimators': 300}",27442,1699,4,851
XGBoost,0.985,0.677,0.803,0.972,0.978,"{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}",27420,1727,26,823
SVM,0.922,0.597,0.725,0.961,0.959,"{'C': 10, 'gamma': 'scale', 'kernel': 'linear'}",27318,1522,128,1028
MLP,0.938,0.682,0.79,0.969,0.972,"{'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (50,), 'solver': 'adam'}",27332,1738,114,812


In [20]:
# Agrupar por métrica y seleccionar el índice del máximo en la columna 'Valor'
idx = best_config_df.groupby("Métrica")["Valor"].idxmax()

# Filtrar el DataFrame con esos índices
best_overall_df = best_config_df.loc[idx].reset_index(drop=True)

print("\n" + "="*90)
print(" "*30 + "TABLA MEJORES CONFIGURACIONES")
print("="*90)
best_overall_df.style.hide(axis="index").format({
    "Valor": "{:.3f}"
})


                              TABLA MEJORES CONFIGURACIONES


Modelo,Métrica,Valor,Parámetros
XGBoost,Accuracy,0.972,"{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}"
XGBoost,F1-Score,0.803,"{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}"
RandomForest,Precision,0.998,"{'max_depth': None, 'min_samples_split': 10, 'n_estimators': 300}"
XGBoost,ROC-AUC,0.978,"{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}"
MLP,Recall,0.682,"{'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (50,), 'solver': 'adam'}"


In [17]:
# Entreaniemto de modelo con algoritmo de Regresión Logística y evaluación con thresholds entre 0.30 y 0.48

# Definición de umbrales para la evaluación.
thresholds = [round(0.30 + i *0.02, 2) for i in range(10)]

results_df, best_configs = train_LogisticRegression(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    preprocessor=preprocessor,
    thresholds=thresholds,
    param_grid={
        'classifier__C': [0.01, 0.1, 1, 10, 100],
        'classifier__solver': ['liblinear', 'lbfgs']
    },
    cv_folds=5,
    max_iter=1000,
    random_state=101
)

In [18]:
results_df

Unnamed: 0,Threshold,Precision,Recall,F1-Score,Accuracy,ROC-AUC,Best Params,TN,TP,FP,FN
0,0.3,0.714342,0.70902,0.711671,0.95116,0.960556,C: 0.1,26723,1808,723,742
1,0.32,0.733664,0.695686,0.714171,0.95266,0.960556,C: 0.1,26802,1774,644,776
2,0.34,0.751398,0.685098,0.716718,0.953961,0.960556,C: 0.1,26868,1747,578,803
3,0.36,0.768,0.677647,0.72,0.955194,0.960556,C: 0.1,26924,1728,522,822
4,0.38,0.783809,0.668235,0.721423,0.956127,0.960556,C: 0.1,26976,1704,470,846
5,0.4,0.804379,0.662745,0.726725,0.957628,0.960556,C: 0.1,27035,1690,411,860
6,0.42,0.820098,0.656078,0.728976,0.958528,0.960556,C: 0.1,27079,1673,367,877
7,0.44,0.836198,0.648627,0.730565,0.959328,0.960556,C: 0.1,27122,1654,324,896
8,0.46,0.850731,0.639216,0.72996,0.959795,0.960556,C: 0.1,27160,1630,286,920
9,0.48,0.862179,0.632941,0.729986,0.960195,0.960556,C: 0.1,27188,1614,258,936


In [19]:
best_configs

Unnamed: 0,Métrica,Mejor Threshold,Valor,Parámetros
0,Precision,0.48,0.862179,C: 0.1
1,Recall,0.3,0.70902,C: 0.1
2,F1-Score,0.44,0.730565,C: 0.1
3,Accuracy,0.48,0.960195,C: 0.1
4,ROC-AUC,0.3,0.960556,C: 0.1


# Evaluación de Métricas de Performance

In [11]:
print("=== DIAGNÓSTICO DE COLUMNAS ===")
print("Columnas en train:", train.columns.tolist())
print("\nColumnas numéricas definidas:", numerical_cols)
print("Columnas binarias definidas:", binary_cols)  
print("Columnas categóricas definidas:", categorical_cols)

# Verificar si las columnas definidas existen en el dataset
print("\n=== VERIFICACIÓN DE EXISTENCIA ===")
all_defined_cols = numerical_cols + binary_cols + categorical_cols
missing_cols = [col for col in all_defined_cols if col not in train.columns]
extra_cols = [col for col in train.columns if col not in all_defined_cols and col != 'diabetes']

print("Columnas definidas pero no en dataset:", missing_cols)
print("Columnas en dataset pero no definidas:", extra_cols)

=== DIAGNÓSTICO DE COLUMNAS ===
Columnas en train: ['year', 'gender', 'age', 'location', 'race:AfricanAmerican', 'race:Asian', 'race:Caucasian', 'race:Hispanic', 'race:Other', 'hypertension', 'heart_disease', 'smoking_history', 'bmi', 'hbA1c_level', 'blood_glucose_level', 'diabetes']

Columnas numéricas definidas: ['age', 'bmi', 'hbA1c_level', 'blood_glucose_level']
Columnas binarias definidas: ['race:AfricanAmerican', 'race:Asian', 'race:Caucasian', 'race:Hispanic', 'race:Other', 'hypertension', 'heart_disease']
Columnas categóricas definidas: ['year', 'gender', 'location', 'smoking_history']

=== VERIFICACIÓN DE EXISTENCIA ===
Columnas definidas pero no en dataset: []
Columnas en dataset pero no definidas: []
