# Implementación de Modelos

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
import sys
import os

# Añadir carpeta raíz del proyecto al path
root_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Añadir root al sys.path
if root_dir not in sys.path:
    sys.path.append(root_dir)

# Utilidades pre-procesamiento, pipelines y automatización de entrenamiento
from sklearn.model_selection import (
    train_test_split, 
    cross_val_score, 
    KFold, 
    StratifiedKFold, 
    GridSearchCV
    )

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer

# Métricas de performance
from sklearn.metrics import (
    classification_report,
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    roc_auc_score, 
    roc_curve, auc,
    confusion_matrix
)

# Modelos
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVR, SVC
from sklearn.neural_network import MLPClassifier

from src.trainClassifiers import (
    train_LogisticRegression    # Custom, para evaluación con tresholds diferentes de 0.5 (valor por defecto)
)

# Utilidades
import missingno as msno
from functools import wraps
import time
from datetime import datetime
from typing import Dict, List, Tuple, Optional, Any, Union
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Importación de los datos
url = 'https://raw.githubusercontent.com/tuliorozco/applied-statistics/refs/heads/main/data/diabetes_dataset.csv'
data = pd.read_csv(url)

In [3]:
data.head()

Unnamed: 0,year,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
0,2020,Female,32.0,Alabama,0,0,0,0,1,0,0,never,27.32,5.0,100,0
1,2015,Female,29.0,Alabama,0,1,0,0,0,0,0,never,19.95,5.0,90,0
2,2015,Male,18.0,Alabama,0,0,0,0,1,0,0,never,23.76,4.8,160,0
3,2015,Male,41.0,Alabama,0,0,1,0,0,0,0,never,27.32,4.0,159,0
4,2016,Female,52.0,Alabama,1,0,0,0,0,0,0,never,23.75,6.5,90,0


In [4]:
# Eliminación de observaciones duplicadas
data1 = data.drop_duplicates()
print(f"Nuevo total de registros: {len(data)}")
data1.shape

Nuevo total de registros: 100000


(99986, 16)

In [5]:
# =================================== PREPROCESAMIENTO ===================================
# 1. Separar las features predictoras (X) de la variable objetivo/target (y).
X = data1.drop('diabetes', axis=1)
y = data1['diabetes']

# 2. Clasificar los campos por tipo de variable para el pre-procesamiento adecuado
numerical_cols = ["age", "bmi", "hbA1c_level", "blood_glucose_level"] 
categorical_cols = [col for col in data1.columns if col not in numerical_cols]
categorical_cols.remove('diabetes') 


# 3. División Train/Test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)


# 4. Pre-procesamiento de los datos
preprocessor = ColumnTransformer(transformers=[
    ("cat", OneHotEncoder(drop="first", handle_unknown='ignore'), categorical_cols),  
    ("num", StandardScaler(), numerical_cols) 
])


Algoritmos a evaluar: 

* `KNeighborsClassifier`
* `LogisticRegression`
* `DecisionTreeClassifier`
* `Random Forest`
* `XGBoost`
* `SVC` (Support Vector Machine/Máquinas de Vectores de Soporte)
* `MLP` (Multilayer Perceptron/Red Neuronal Multicapa)
* `MOE Conformal Transformer` (Mixture of Experts-Conformal Predictors Transformed Based)

### Especificación del Grid de Hiperparámetros por Tipo de Modelo

Teniendo en cuenta que los modelos a entrenar y analizar pueden ser ejecutados de forma automática mediante `GridSearchCV` (A excepción del algoritmo propuesto), se especificará un diccionario *model_config* estructurado de la siguiente forma:

* Key (clave): Identificación del tipo de algoritmo, ejemplo: KNN, LogisticRegresion, DecisionTree, etc.
* Value (valor): El conjunto de funciones (la implementación de cada algortimo en los paquetes `scikit-learn` o `xgboost`), los parámetros de inicialización del algoritmo (si los tiene) y sus respectivos hiperparámetros a evaluar en el entrenamiento. 

En la especificación se muestra de la siguiente manera:

```python
models_config = {
        'KNN': {
            'function': KNeighborsClassifier,
            'param_grid': {
            'classifier__n_neighbors': [3, 5, 7, 9, 11, 13, 15],
            'classifier__weights': ['uniform', 'distance'],
            'classifier__metric': ['euclidean']
            }   
        } ,
         'LogisticRegression': {
            'function': LogisticRegression,
            'init_params': {'max_iter': 1000, 'random_state': random_state}, 
            'param_grid': {
                'classifier__C': [0.01, 0.1, 1, 10, 100],
                'classifier__penalty': ['l1', 'l2'],
                'classifier__solver': ['liblinear', 'saga']
            }
         },
    # Y así sucesivamente ...
    'MLP': {
        'function': MLPClassifier,
        'init_params': {
            'max_iter': 500,
            'random_state': 101
        },
        'param_grid': {
            'classifier__hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'classifier__activation': ['relu', 'tanh'],
            'classifier__solver': ['adam', 'sgd'],
            'classifier__alpha': [0.0001, 0.001, 0.01]
        }
    }
}
```
Así mismo, se define una variable tipo `Dict` denominada *common_params* con las siguientes *key-values*:

* `X_train`: Variables predictoras del conjunto de entrenamiento.
* `X_test`: Variables predictoras del conjunto de test.
* `y_train`: Variable *target* del conjunto de entrenamiento.
* `y_test`: Variable *target* del conjunto de test.
* `cv_folds`: Número de pliegues para la validación cruzada (cross validation).
* `random_state`: Semilla de reproducibilidad.

In [None]:
# =================================== ENTRENAMIENTO ===================================
random_state = 101

models_config = {
        'KNN': {
            'function': KNeighborsClassifier,
            'param_grid': {
            # 'classifier__n_neighbors': [3, 5, 7, 9, 11, 13, 15],
            'classifier__n_neighbors': [3, 5],
            'classifier__weights': ['uniform', 'distance'],
            'classifier__metric': ['euclidean']
            }   
        } ,
         'LogisticRegression': {
            'function': LogisticRegression,
            'init_params': {'max_iter': 1000, 'random_state': random_state}, 
            'param_grid': {
                'classifier__C': [0.01, 0.1, 1, 10, 100],
                'classifier__penalty': ['l1', 'l2'],
                'classifier__solver': ['liblinear', 'saga']
            }
         },
        'DecisionTree': {
            'function': DecisionTreeClassifier,
            'init_params': {'random_state': random_state}, 
            'param_grid': {
                'classifier__max_depth': [3, 5, 7, 10, None],
                'classifier__min_samples_split': [2, 5, 10],
                'classifier__min_samples_leaf': [1, 2, 4]
            }
        },
        'RandomForest': {
            'function': RandomForestClassifier, 
            'init_params': {'random_state': random_state}, 
            'param_grid': {
                'classifier__n_estimators': [100, 200, 300],
                'classifier__max_depth': [3, 5, 7, 10, None],
                'classifier__min_samples_split': [2, 5, 10]
            }
        } ,
        'XGBoost': {
            'function': XGBClassifier,
            'init_params': {   # 👈 parámetros por defecto especiales de XGBoost
                'eval_metric': 'logloss',
                'use_label_encoder': False,
                'random_state': random_state
            },
            'param_grid': {
                'classifier__n_estimators': [100, 200, 300],
                'classifier__learning_rate': [0.01, 0.1, 0.2],
                'classifier__max_depth': [3, 4, 5, 6],
                'classifier__subsample': [0.8, 1.0]
            }
        },
    'SVM': {
        'function': SVC,
        'init_params': {   # Necesario porque SVC no devuelve proba por defecto
            'probability': True,
            'random_state': 101
        },
        'param_grid': {
            'classifier__C': [0.1, 1, 10],
            'classifier__kernel': ['linear', 'rbf', 'poly'],
            'classifier__gamma': ['scale', 'auto']
        }
    },
    'MLP': {
        'function': MLPClassifier,
        'init_params': {
            'max_iter': 500,
            'random_state': 101
        },
        'param_grid': {
            'classifier__hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'classifier__activation': ['relu', 'tanh'],
            'classifier__solver': ['adam', 'sgd'],
            'classifier__alpha': [0.0001, 0.001, 0.01]
        }
    }
}

# Parámetros comunes para el entrenamiento
common_params = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'cv_folds': 5,
    'random_state': 101
}

metrics_rows = []
best_configs_list = []
cv_results_dfs = []   # lista para guardar todos los resultados de GridSearchCV
roc_curves = {}       # diccionario para guardar las curvas ROC

for model_name, config in models_config.items():
    print(f"\nEntrenando modelo: {model_name}")
    
    # 1. Construcción del pipeline
    steps = []
    steps.append(("preprocessing", preprocessor))

    # Usar init_params si existen, si no dict vacío
    init_params = config.get('init_params', {})
    steps.append(("classifier", config['function'](**init_params)))
    pipeline = Pipeline(steps)
    
    # 2. Cross-validation
    skf = StratifiedKFold(n_splits=common_params['cv_folds'], shuffle=True, random_state=common_params['random_state'])
    grid_search = GridSearchCV(pipeline, config['param_grid'], cv=skf, scoring='roc_auc', n_jobs=-1, verbose=1)
    grid_search.fit(common_params['X_train'], common_params['y_train'])

    # Guardar resultados completos de GridSearchCV
    cv_results_df = pd.DataFrame(grid_search.cv_results_)
    cv_results_df["Model"] = model_name  # para identificar de qué modelo son
    cv_results_dfs.append(cv_results_df)
    
    # 3. Predicciones
    y_pred = grid_search.predict(common_params['X_test'])
    y_proba = grid_search.predict_proba(common_params['X_test'])[:, 1]
    
    # 4. Resultados principales
    cm = confusion_matrix(common_params['y_test'], y_pred)
    metrics_row = {
        'Model': model_name,
        'Precision': precision_score(common_params['y_test'], y_pred, zero_division=0),
        'Recall': recall_score(common_params['y_test'], y_pred, zero_division=0),
        'F1-Score': f1_score(common_params['y_test'], y_pred, zero_division=0),
        'Accuracy': accuracy_score(common_params['y_test'], y_pred),
        'ROC-AUC': roc_auc_score(common_params['y_test'], y_proba),
        'Best Params': {k.replace("classifier__", ""): v for k, v in grid_search.best_params_.items()},
        'TN': int(cm[0, 0]),
        'TP': int(cm[1, 1]),
        'FP': int(cm[0, 1]),
        'FN': int(cm[1, 0])
    }
    metrics_rows.append(metrics_row)
    
    # 5) Guardar las configuraciones de mejores métricas en la lista
    for metric in ['Precision', 'Recall', 'F1-Score', 'Accuracy', 'ROC-AUC']:
        best_configs_list.append({
            'Modelo': model_name,
            'Métrica': metric,
            'Valor': metrics_row[metric],
            'Parámetros': metrics_row['Best Params']
        })

    # 6. Guardar curva ROC
    fpr, tpr, _ = roc_curve(common_params['y_test'], y_proba)
    roc_auc = auc(fpr, tpr)
    roc_curves[model_name] = (fpr, tpr, roc_auc)
    
    print("Entrenamiento completado.")

col_order = [
    'Model', 'Precision', 'Recall', 'F1-Score', 'Accuracy', 'ROC-AUC',
    'Best Params', 'TN', 'TP', 'FP', 'FN'
]
results_df = pd.DataFrame(metrics_rows)[col_order]

best_config_df = pd.DataFrame(best_configs_list)[['Modelo', 'Métrica', 'Valor', 'Parámetros']]


Entrenando modelo: KNN
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Entrenamiento completado.

Entrenando modelo: LogisticRegression
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Entrenamiento completado.

Entrenando modelo: DecisionTree
Fitting 5 folds for each of 45 candidates, totalling 225 fits
Entrenamiento completado.

Entrenando modelo: RandomForest
Fitting 5 folds for each of 45 candidates, totalling 225 fits
Entrenamiento completado.

Entrenando modelo: XGBoost
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Entrenamiento completado.

Entrenando modelo: SVM
Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [None]:
print("\n" + "="*100)
print(" "*35 + "TABLA COMPARATIVA DE RESULTADOS")
print("="*100)
results_df.style.hide(axis="index").format({
    "Precision": "{:.3f}",
    "Recall": "{:.3f}",
    "F1-Score": "{:.3f}",
    "Accuracy": "{:.3f}",
    "ROC-AUC": "{:.3f}"
})

In [None]:
print("\n" + "="*90)
print(" "*30 + "TABLA MEJORES CONFIGURACIONES")
print("="*90)
best_config_df.style.hide(axis="index").format({
    "Valor": "{:.3f}"
})

In [None]:
# Entreaniemto de modelo con algoritmo de Regresión Logística

thresholds = [0.3, 0.35, 0.4, 0.45, 0.5]

results_df, best_configs = train_LogisticRegression(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    preprocessor=preprocessor,
    thresholds=thresholds,
    param_grid={
        'classifier__C': [0.01, 0.1, 1, 10, 100],
        'classifier__solver': ['liblinear', 'lbfgs']
    },
    cv_folds=5,
    max_iter=1000,
    random_state=101
)

# Evaluación de Métricas de Performance