In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.distributed as dist

# Utilidades pre-procesamiento, pipelines y automatización de entrenamiento
from sklearn.model_selection import (
    train_test_split, 
    cross_val_score, 
    KFold, 
    StratifiedKFold, 
    GridSearchCV
    )

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Load the raw diabetes dataset; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='../../data/diabetes_dataset.csv')

In [3]:
# Preview the first five rows to sanity-check the columns that were loaded.
data.head(n=5)

Unnamed: 0,year,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
0,2020,Female,32.0,Alabama,0,0,0,0,1,0,0,never,27.32,5.0,100,0
1,2015,Female,29.0,Alabama,0,1,0,0,0,0,0,never,19.95,5.0,90,0
2,2015,Male,18.0,Alabama,0,0,0,0,1,0,0,never,23.76,4.8,160,0
3,2015,Male,41.0,Alabama,0,0,1,0,0,0,0,never,27.32,4.0,159,0
4,2016,Female,52.0,Alabama,1,0,0,0,0,0,0,never,23.75,6.5,90,0


In [4]:
# =================================== PREPROCESSING ===================================

# --------------------------- Remove duplicated observations ----------------------------
# Rows that are exact duplicates across every column are dropped; `data` is rebound
# to the de-duplicated frame, so later steps never see the duplicated rows.
data = data.drop_duplicates()

# Report the remaining row count and the full (rows, columns) shape.
print(f"Nuevo total de registros: {data.shape[0]}")
print(data.shape)

# 1. -------- Name of the target column (single source of truth for every later step). ----------
TARGET_COL = 'diabetes'

# NOTE: X / y are intentionally NOT built here. They are derived further down from the
# integer-encoded frame (`data_num`); building them from the raw frame as well was dead
# code, because the values were overwritten before anything read them.

# 2. Classify the columns by variable type so each group gets the right pre-processing.
# Continuous features.
numerical_cols = ["age", "bmi", "hbA1c_level", "blood_glucose_level"]

# 0/1 indicator features.
binary_cols = ['race:AfricanAmerican', 'race:Asian', 'race:Caucasian',
               'race:Hispanic', 'race:Other', 'hypertension', 'heart_disease']

# Everything that is neither numerical, binary, nor the target is treated as categorical.
# The target is excluded through TARGET_COL (not a repeated string literal) so renaming the
# target only requires changing one line.
non_categorical_cols = {TARGET_COL, *numerical_cols, *binary_cols}
categorical_cols = [col for col in data.columns if col not in non_categorical_cols]

# 3. --------------- Categorical vocabularies for the featurizer ('index' mode) -------------
# One slot per distinct observed value plus one extra slot: index 0 is reserved for
# unknown / missing values (UNK). `nunique()` does not count NaN.
cat_vocab_sizes = {
    col: data[col].nunique() + 1  # +1 for UNK (index 0)
    for col in categorical_cols
}
print("\nVocabularios categóricos (cat_vocab_sizes):")
print(cat_vocab_sizes)

# Integer-encode the categorical columns on a copy so the raw frame stays untouched.
data_num = data.copy()

for col in categorical_cols:
    # `cat.codes` yields 0..n-1 for observed categories and -1 for missing values.
    # Shifting by +1 keeps the encoding consistent with `cat_vocab_sizes`:
    # real categories occupy 1..n and missing values land on 0 (UNK) instead of the
    # invalid index -1, so every code is a valid index into a table of size n + 1.
    data_num[col] = data_num[col].astype("category").cat.codes.astype("int64") + 1

# ============================================================
# 4. Separate X / y using the NUMERIC (integer-encoded) frame,
#    then split into Train / Calibration / Test
# ============================================================

y = data_num[TARGET_COL]
X = data_num.drop(columns=TARGET_COL)

# 4.1 Train vs. a temporary hold-out (30% of the rows), stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# 4.2 Split the hold-out in half: Calibration vs. Test, again stratified on the target.
X_cal, X_test, y_cal, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

# Report the shape of each feature matrix.
print("\nTamaños:")
for split_name, split_features in (
    ("Train", X_train),
    ("Calibration", X_cal),
    ("Test", X_test),
):
    print(f"{split_name}:{split_features.shape}")

# 5. -------------------------- sklearn preprocessor for baseline models ----------------------------
# One (name, transformer, columns) triple per column group:
#   - categorical columns are one-hot encoded,
#   - numerical columns are standardized,
#   - binary columns are passed through unchanged.
# NOTE(review): `drop="first"` together with `handle_unknown='ignore'` is only accepted by
# newer scikit-learn releases, and an unseen category is then encoded like the dropped one —
# confirm this matches the installed version and the intended behavior.
baseline_transformers = [
    ("cat", OneHotEncoder(drop="first", handle_unknown='ignore'), categorical_cols),
    ("num", StandardScaler(), numerical_cols),
    ("bin", "passthrough", binary_cols),
]
preprocessor = ColumnTransformer(transformers=baseline_transformers)

# 6. --------------------- Rebuild full DataFrames for the MoE pipeline ---------------------
# Each split gets its target column re-attached to a copy of its feature frame:
#   6.1 train_df -> training frame
#   6.2 cal_df   -> calibration frame
#   6.3 test_df  -> frame consumed by the evaluation step
# `assign` works on a copy, so X_train / X_cal / X_test are left unmodified.
train_df, cal_df, test_df = (
    features.assign(**{TARGET_COL: labels})
    for features, labels in (
        (X_train, y_train),
        (X_cal, y_cal),
        (X_test, y_test),
    )
)

print("Preprocesamiento terminado.")

Nuevo total de registros: 99986
(99986, 16)

Vocabularios categóricos (cat_vocab_sizes):
{'year': 8, 'gender': 4, 'location': 56, 'smoking_history': 7}

Tamaños:
Train:(69990, 15)
Calibration:(14998, 15)
Test:(14998, 15)
Preprocesamiento terminado.


In [5]:
from src import spartam
import inspect
from src.spartam import pipeline, distributed_trainer, featurizer

In [6]:
# Shared embedding width; the feed-forward width below is derived from it.
d_model = 128

# Parameters used to build the model architecture.
# The column groups and vocabulary sizes come from the preprocessing cell.
model_params = dict(
    cont_cols=numerical_cols,
    bin_cols=binary_cols,
    cat_cols=categorical_cols,
    cat_vocab_sizes=cat_vocab_sizes,
    d_model=d_model,
    num_heads=4,
    d_ff=d_model * 4,
    num_experts=4,
    num_layers=2,
    dropout=0.1,
    k=2,  # presumably the number of experts selected per input — TODO confirm against the model code
    conformal_predictor='LabelConditionalConformalPredictor',
)

# Parameters for the training process.
training_params = dict(
    # Focal-loss configuration:
    loss_type="focal",
    focal_gamma=2.0,
    focal_alpha=0.9,  # Extra weight on the positive class to compensate for class imbalance.
    loss_reduction="mean",
    lr=3e-4,
    epochs=30,  # Can be lowered for a faster cross-validation run.
    batch_size=256,
    target_col=TARGET_COL,
    # The column lists, grouped together in one mapping.
    col_groups=dict(
        cont_cols=numerical_cols,
        bin_cols=binary_cols,
        cat_cols=categorical_cols,
        target_col=TARGET_COL,
    ),
)

In [7]:
def setup_distributed_training():
    """Configure this process for distributed training and return its CUDA device.

    Reads the ``LOCAL_RANK`` environment variable, selects that CUDA device for
    the current process, and initializes the default process group with the
    NCCL backend unless a process group is already initialized.

    Returns:
        torch.device: the device ``cuda:<LOCAL_RANK>``.

    Raises:
        KeyError: if ``LOCAL_RANK`` is not set. This is raised before any
            process group is created, so nothing is left half-initialized.
    """
    # Resolve the rank first: a missing variable must fail fast, before the
    # process group exists.
    local_rank = int(os.environ['LOCAL_RANK'])

    # Select the device before creating the process group so the group is
    # initialized with this process already bound to its own GPU.
    torch.cuda.set_device(local_rank)

    # Notebook cells get re-run; initializing the default group twice raises,
    # so only initialize when it has not been done yet.
    if not dist.is_initialized():
        dist.init_process_group(backend='nccl')

    return torch.device(f'cuda:{local_rank}')


In [8]:

# Arguments for the project's cross-validation + calibration + evaluation pipeline.
cv_run_config = dict(
    df_trainval=train_df,
    cal_df=cal_df,
    test_df=test_df,
    model_params=model_params,
    training_params=training_params,
    k_folds=5,
    featurizer_mode="index",        # recommended for neural networks
    featurizer_fit_scope="global",  # or "in_fold" to avoid any leakage
)

# The pipeline returns four results, unpacked in this order.
best_model, cv_summary, history, eval_report = pipeline.run_cv_calibration_evaluation(
    **cv_run_config
)

In [9]:
# Print each pipeline result under its own heading, in the original order.
report_sections = (
    ("\n=== Resumen CV ===", cv_summary),
    ("\n=== Historial entrenamiento (por época) ===", history),
    ("\n=== Reporte de evaluación CP (cal + test) ===", eval_report),
)
for heading, payload in report_sections:
    print(heading)
    print(payload)


=== Resumen CV ===
{'cv_best_roc_auc': 0.9604003734535663}

=== Historial entrenamiento (por época) ===
[{'fold': 0, 'val_metrics': {'loss': 0.015511903293769468, 'accuracy': 0.9210601514502071, 'f1': 0.6411172458590452, 'roc_auc': 0.9530791548349509}}, {'fold': 1, 'val_metrics': {'loss': 0.013413635023276914, 'accuracy': 0.9337762537505357, 'f1': 0.6695187165775401, 'roc_auc': 0.9575487549798184}}, {'fold': 2, 'val_metrics': {'loss': 0.011969652992080559, 'accuracy': 0.921488784112016, 'f1': 0.6501114294810569, 'roc_auc': 0.9604003734535663}}, {'fold': 3, 'val_metrics': {'loss': 0.013048823160881346, 'accuracy': 0.9242034576368052, 'f1': 0.6476253736300233, 'roc_auc': 0.9467766338265475}}, {'fold': 4, 'val_metrics': {'loss': 0.015276300229809501, 'accuracy': 0.9191313044720675, 'f1': 0.6355441081777206, 'roc_auc': 0.9562370419748162}}]

=== Reporte de evaluación CP (cal + test) ===
{'cal_metrics': {'accuracy': 0.9203227096946259, 'f1': 0.6423226578868603, 'roc_auc': 0.957300167316070