# Implementación de Modelos

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
import sys
import os

# Añadir carpeta raíz del proyecto al path
root_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Añadir root al sys.path
if root_dir not in sys.path:
    sys.path.append(root_dir)

# Utilidades pre-procesamiento, pipelines y automatización de entrenamiento
from sklearn.model_selection import (
    train_test_split, 
    cross_val_score, 
    KFold, 
    StratifiedKFold, 
    GridSearchCV
    )

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Métricas de performance
from sklearn.metrics import (
    mean_squared_error, 
    mean_absolute_error, 
    r2_score, 
    mean_absolute_percentage_error, 
    classification_report,
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    roc_auc_score, 
    confusion_matrix, 
    RocCurveDisplay
)

# Modelos
from src.trainClassifiers import (
    train_KNN,
    train_LogisticRegression,
    train_DecisionTree,
    train_RandomForest
)

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVR, SVC



# Utilidades
import logging
import missingno as msno
from functools import wraps
import time
from datetime import datetime
from typing import Dict, List, Tuple, Optional, Any, Union
import warnings
warnings.filterwarnings('ignore')

In [16]:
# Importación de los datos
url = 'https://raw.githubusercontent.com/tuliorozco/applied-statistics/refs/heads/main/data/diabetes_dataset.csv'
data = pd.read_csv(url)

In [17]:
data.head()

Unnamed: 0,year,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
0,2020,Female,32.0,Alabama,0,0,0,0,1,0,0,never,27.32,5.0,100,0
1,2015,Female,29.0,Alabama,0,1,0,0,0,0,0,never,19.95,5.0,90,0
2,2015,Male,18.0,Alabama,0,0,0,0,1,0,0,never,23.76,4.8,160,0
3,2015,Male,41.0,Alabama,0,0,1,0,0,0,0,never,27.32,4.0,159,0
4,2016,Female,52.0,Alabama,1,0,0,0,0,0,0,never,23.75,6.5,90,0


In [18]:
# Eliminación de observaciones duplicadas
data1 = data.drop_duplicates()
print(f"Nuevo total de registros: {len(data)}")
data1.shape

Nuevo total de registros: 100000


(99986, 16)

In [19]:
# =================================== PREPROCESAMIENTO ===================================
# 1. Separar las features predictoras (X) de la variable objetivo/target (y).
X = data1.drop('diabetes', axis=1)
y = data1['diabetes']

# 2. Clasificar los campos por tipo de variable para el pre-procesamiento adecuado
numerical_cols = ["age", "bmi", "hbA1c_level", "blood_glucose_level"] 
categorical_cols = [col for col in data1.columns if col not in numerical_cols]
categorical_cols.remove('diabetes') 


# 3. División Train/Test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)


# 4. Pre-procesamiento de los datos
preprocessor = ColumnTransformer(transformers=[
    ("cat", OneHotEncoder(drop="first", handle_unknown='ignore'), categorical_cols),  
    ("num", StandardScaler(), numerical_cols) 
])

## Algoritmos de Vecinos y Distancias

### `KNeighborsClassifier`

In [20]:
# Entrenamiento de modelo con algoritmo de KNN

results_df_KNN, *best_configs_KNN = train_KNN(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    preprocessor=preprocessor,
    param_grid={
            'classifier__n_neighbors': [3, 5, 7, 9, 11, 13, 15],
            'classifier__weights': ['uniform', 'distance'],
            'classifier__metric': ['euclidean']
        },
    cv_folds=5,
    random_state=101
)

Iniciando entrenamiento del modelo KNN...
Fitting 5 folds for each of 14 candidates, totalling 70 fits

Evaluando cada valor de neighbors en el conjunto de test...
  Evaluando k=3...
  Evaluando k=5...
  Evaluando k=7...
  Evaluando k=9...
  Evaluando k=11...
  Evaluando k=13...
  Evaluando k=15...
Entrenamiento completado.


In [21]:
print("\n" + "="*90)
print(" "*20 + "TABLA COMPARATIVA DE RESULTADOS: KNeighborsClassifier")
print("="*90)
results_df_KNN.style.hide(axis="index").format({
    "Threshold": "{:.2f}",
    "Precision": "{:.3f}",
    "Recall": "{:.3f}",
    "F1-Score": "{:.3f}",
    "Accuracy": "{:.3f}",
    "ROC-AUC": "{:.3f}"
})


                    TABLA COMPARATIVA DE RESULTADOS: KNeighborsClassifier


Neighbors,Precision,Recall,F1-Score,Accuracy,ROC-AUC,Best Params,TN,TP,FP,FN
3,0.832,0.563,0.672,0.953,0.844,"weights: distance, metric: euclidean",27157,1435,289,1115
5,0.91,0.545,0.682,0.957,0.878,"weights: distance, metric: euclidean",27309,1390,137,1160
7,0.937,0.534,0.68,0.957,0.894,"weights: distance, metric: euclidean",27355,1362,91,1188
9,0.956,0.53,0.682,0.958,0.906,"weights: distance, metric: euclidean",27384,1351,62,1199
11,0.97,0.526,0.682,0.958,0.915,"weights: distance, metric: euclidean",27405,1341,41,1209
13,0.978,0.521,0.68,0.958,0.923,"weights: distance, metric: euclidean",27416,1329,30,1221
15,0.987,0.522,0.683,0.959,0.929,"weights: distance, metric: euclidean",27428,1331,18,1219


In [22]:
if isinstance(best_configs_KNN, list) and len(best_configs_KNN) == 1:
    best_configs_KNN = best_configs_KNN[0]

print("\n" + "="*90)
print(" "*30 + "TABLA MEJORES CONFIGURACIONES")
print("="*90)
best_configs_KNN.style.hide(axis="index").format({
    # "Mejor Threshold": "{:.2f}",
    "Valor": "{:.3f}"
})


                              TABLA MEJORES CONFIGURACIONES


Métrica,Mejor n_neighbors,Valor,Parámetros
Precision,15,0.987,"weights: distance, metric: euclidean"
Recall,3,0.563,"weights: distance, metric: euclidean"
F1-Score,15,0.683,"weights: distance, metric: euclidean"
Accuracy,15,0.959,"weights: distance, metric: euclidean"
ROC-AUC,15,0.929,"weights: distance, metric: euclidean"


## Algoritmos Basados en Modelos Lineales

### `LogisticRegression`

In [23]:
# Entreaniemto de modelo con algoritmo de Regresión Logística

thresholds = [0.3, 0.35, 0.4, 0.45, 0.5]

results_df, best_configs = train_LogisticRegression(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    preprocessor=preprocessor,
    thresholds=thresholds,
    param_grid={
        'classifier__C': [0.01, 0.1, 1, 10, 100],
        'classifier__solver': ['liblinear', 'lbfgs']
    },
    cv_folds=5,
    max_iter=1000,
    random_state=101
)

In [24]:
print("\n" + "="*90)
print(" "*20 + "TABLA COMPARATIVA DE RESULTADOS: LogisticRegression")
print("="*90)
results_df.style.hide(axis="index").format({
    "Threshold": "{:.2f}",
    "Precision": "{:.3f}",
    "Recall": "{:.3f}",
    "F1-Score": "{:.3f}",
    "Accuracy": "{:.3f}",
    "ROC-AUC": "{:.3f}"
})


                    TABLA COMPARATIVA DE RESULTADOS: LogisticRegression


Threshold,Precision,Recall,F1-Score,Accuracy,ROC-AUC,Best Params,TN,TP,FP,FN
0.3,0.714,0.709,0.712,0.951,0.961,C: 0.1,26723,1808,723,742
0.35,0.759,0.68,0.717,0.954,0.961,C: 0.1,26896,1734,550,816
0.4,0.804,0.663,0.727,0.958,0.961,C: 0.1,27035,1690,411,860
0.45,0.845,0.642,0.73,0.96,0.961,C: 0.1,27145,1638,301,912
0.5,0.877,0.627,0.731,0.961,0.961,C: 0.1,27221,1600,225,950


In [25]:
print("\n" + "="*90)
print(" "*30 + "TABLA MEJORES CONFIGURACIONES")
print("="*90)
best_configs.style.hide(axis="index").format({
    "Mejor Threshold": "{:.2f}",
    "Valor": "{:.3f}"
})


                              TABLA MEJORES CONFIGURACIONES


Métrica,Mejor Threshold,Valor,Parámetros
Precision,0.5,0.877,C: 0.1
Recall,0.3,0.709,C: 0.1
F1-Score,0.5,0.731,C: 0.1
Accuracy,0.5,0.961,C: 0.1
ROC-AUC,0.3,0.961,C: 0.1


## Algoritmos Basados en Árboles de Decisión

### `DecisionTreeClassifier`

In [26]:
results_dt, best_dt = train_DecisionTree(
    X_train, X_test, y_train, y_test,
    preprocessor=preprocessor,
    max_depth_list=[3, 5, 7, 9, None],  # lista explícita
    cv_folds=5,
    random_state=101
)

In [27]:
print("\n" + "="*90)
print(" "*20 + "TABLA COMPARATIVA DE RESULTADOS: DecisionTreeClassifier")
print("="*90)
results_dt.style.hide(axis="index").format({
    "Precision": "{:.3f}",
    "Recall": "{:.3f}",
    "F1-Score": "{:.3f}",
    "Accuracy": "{:.3f}",
    "ROC-AUC": "{:.3f}"
})


                    TABLA COMPARATIVA DE RESULTADOS: DecisionTreeClassifier


Max Depth,Precision,Recall,F1-Score,Accuracy,ROC-AUC,Best Params,TN,TP,FP,FN
3.0,1.0,0.664,0.798,0.971,0.901,{'classifier__criterion': 'entropy'},27446,1694,0,856
5.0,1.0,0.664,0.798,0.971,0.959,{'classifier__criterion': 'entropy'},27446,1694,0,856
7.0,1.0,0.664,0.798,0.971,0.97,{'classifier__criterion': 'entropy'},27446,1694,0,856
9.0,0.989,0.673,0.801,0.972,0.972,{'classifier__criterion': 'entropy'},27427,1715,19,835
,0.726,0.73,0.728,0.954,0.852,{'classifier__criterion': 'entropy'},26743,1861,703,689


In [28]:
print("\n" + "="*90)
print(" "*30 + "TABLA MEJORES CONFIGURACIONES")
print("="*90)
best_dt.style.hide(axis="index").format({
    "Valor": "{:.3f}"
})


                              TABLA MEJORES CONFIGURACIONES


Métrica,Mejor max_depth,Valor,Parámetros
Precision,3.0,1.0,{'classifier__criterion': 'entropy'}
Recall,,0.73,{'classifier__criterion': 'entropy'}
F1-Score,9.0,0.801,{'classifier__criterion': 'entropy'}
Accuracy,9.0,0.972,{'classifier__criterion': 'entropy'}
ROC-AUC,9.0,0.972,{'classifier__criterion': 'entropy'}


### `Random Forest`

### `XGBoost`

## Algoritmos Basados en `Support Vector Machines (SVM)`

### `SVM con Kernel Lineal`

### `SVM con Kernel RBF`

# Evaluación de Métricas de Performance