In [1]:
# Importando todos las Librerias
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Modelos a experimentar
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Librerías para MLflow (importante para el seguimiento de experimentos)
import mlflow
import mlflow.sklearn
import logging
logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

# Configurar MLflow (opcional, si quieres un server remoto o un archivo específico)
mlflow.set_tracking_uri("http://127.0.0.1:5000/") # Si tienes un servidor MLflow local
mlflow.set_experiment("Absenteeism_Prediction_Experiments")



<Experiment: artifact_location='mlflow-artifacts:/383079581240596013', creation_time=1760290200678, experiment_id='383079581240596013', last_update_time=1760290200678, lifecycle_stage='active', name='Absenteeism_Prediction_Experiments', tags={'mlflow.sharedViewState.3894e7dac091113a949e1a0b144bdfbf23f857b1cfb2b6251e919052fe25b155': 'deflate;eJxdUtFO4zAQ/JWTnysEr3kLpdwhWoRSqE6qTq1rb8lKjh1519Ac6r+zbgIpPO7szOx61u+KQEdT36JjiKpQaqJCtBCvu3vopNbMEXeJgS6IdeQNYwMjqSSjir12BBN16j/ldqHK+VxIDvdgOuPgy740jK9ZbzVrAqbPzvrfRDXBgltBJAx+VDj3q0qeREPgwDDYaXCpEaRYn6+3XYYUDWyFeI4usin9RG/68VslY6O4zw6t9hasKt6PPfKI3ud6PTD+oLXgx3qFhDt0yN1Ct6Osp+WhsvvtXbV82lxdbqrnh6Vs8IrwttAHbPB/th5yE9kciT8HDChSaYykLFlUcEr7TJIIfseQWrAr7RLQnZ/WEr5EwjFJv0YLs6bl7jv8kjXXclafnBvKH0/XiUMF+whUz7zeuYwPYhd22s3Rw8l0GvweX0SkDuUBqf8txNCe7u5h2YTAtQeS6Zfj7f5m8gLkEqaXqOPxA/ej5lc='}>

In [2]:
# Ruta del dataset procesad
ruta_dataset = '../data/interim/absenteeism_eda_fe_intermediate.csv'

try:
    df = pd.read_csv(ruta_dataset)
    print(f"✅ Dataset cargado correctamente desde: {ruta_dataset}")
    print(f"Dimensiones del dataset: {df.shape}")
    print("\nPrimeras 5 filas del dataset:")
    display(df.head()) # Usa display() en Jupyter para una mejor visualización
except Exception as e:
    logger.exception(f"No se pudo cargar el dataset. Asegúrate de que la ruta sea correcta y el archivo exista. Error: {e}")

✅ Dataset cargado correctamente desde: ../data/interim/absenteeism_eda_fe_intermediate.csv
Dimensiones del dataset: (514, 79)

Primeras 5 filas del dataset:


Unnamed: 0,ID,Transportation expense,Distance from Residence to Work,Service time,Age,Work load Average/day,Hit target,Disciplinary failure,Son,Social drinker,...,Education_High school,Education_Postgraduate,BMI_category_Obese,BMI_category_Overweight,Age_group_Middle-aged,Age_group_Senior,Service_group_Medium,Service_group_Long,Distance_group_Moderate,Distance_group_Far
0,-0.589167,1.298575,0.372291,0.043166,-0.550156,-0.81621,0.664538,-0.195918,0.819848,0.969341,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,-1.307203,-0.673158,1.435389,1.502873,0.427646,-0.81621,0.664538,-0.195918,-0.86576,0.969341,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
2,-0.948185,1.119326,-1.824778,0.335108,0.623206,-0.81621,0.664538,-0.195918,0.819848,0.969341,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,-0.589167,1.298575,0.372291,0.043166,-0.550156,-0.81621,0.664538,-0.195918,0.819848,0.969341,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
4,-1.307203,-0.673158,1.435389,1.502873,0.427646,-0.81621,0.664538,-0.195918,-0.86576,0.969341,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


## Preparación para las variables

In [None]:
# --- Definir la Variable Objetivo (Target) ---
# La columna 'Absenteeism time in hours' es nuestro punto de partida.
# Vamos a crear un problema de clasificación binaria para predecir si una ausencia será larga o corta.
# Usaremos la mediana como umbral para hacer esta división.

# Calcular la mediana de las horas de ausentismo
try:
    median_hours = df['Absenteeism time in hours'].median()
    print(f"La mediana de las horas de ausentismo es: {median_hours:.2f} horas.")
    print("Crearemos una variable objetivo binaria usando este umbral.")

    # Crear la variable objetivo: 1 si es mayor que la mediana, 0 en caso contrario.
    df['Target_Binary'] = (df['Absenteeism time in hours'] > median_hours).astype(int)
    print("\nDistribución de la nueva variable objetivo 'Target_Binary':")
    print(df['Target_Binary'].value_counts(normalize=True))

except KeyError:
    print("Error: Asegúrate de que la columna 'Absenteeism time in hours' exista en tu DataFrame.")
    # Detener la ejecución si la columna no existe.
    # Y luego re-intentar las operaciones.


# --- Paso 3 Definir Características (X) y Variable Objetivo (y) ---

# Columnas a eliminar por ser el target o por causar fuga de datos
COLUMNAS_A_ELIMINAR = [
    'ID', 
    'Absenteeism time in hours', # La usamos para crear el target
    'Target_Binary',             # Es nuestro target
    'Reliability_score',         # ¡NUEVO! Causa Fuga de Datos.
    'Reliability_score_norm'     # ¡NUEVO! Causa Fuga de Datos.
]

# 'y' sigue siendo la misma.
y = df['Target_Binary']

# 'X' ahora excluye las columnas con fugas de datos.
X = df.drop(columns=COLUMNAS_A_ELIMINAR)

print(f"Eliminando {len(COLUMNAS_A_ELIMINAR) - 3} característica(s) con fuga de datos.")
print("\nNuevas características (X) usadas para el entrenamiento:", X.shape)
print(X.columns.tolist())


# --- El resto de tu código en la celda (train_test_split, StandardScaler) sigue igual ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n✅ Datos listos para re-entrenar sin la fuga de datos.")

La mediana de las horas de ausentismo es: 3.00 horas.
Crearemos una variable objetivo binaria usando este umbral.

Distribución de la nueva variable objetivo 'Target_Binary':
Target_Binary
0    0.595331
1    0.404669
Name: proportion, dtype: float64
Eliminando 2 característica(s) con fuga de datos.

Nuevas características (X) usadas para el entrenamiento: (514, 75)
['Transportation expense', 'Distance from Residence to Work', 'Service time', 'Age', 'Work load Average/day', 'Hit target', 'Disciplinary failure', 'Son', 'Social drinker', 'Social smoker', 'Pet', 'Weight', 'Height', 'Body mass index', 'Lifestyle_risk_score', 'Dependents_count', 'Has_dependents', 'Has_family_or_pets', 'Healthy_lifestyle', 'Penalty_risk_score', 'Workload_deviation', 'Reason for absence_1 Certain infectious and parasitic diseases', 'Reason for absence_10 Diseases of the respiratory system', 'Reason for absence_11 Diseases of the digestive system', 'Reason for absence_12 Diseases of the skin and subcutaneous ti

In [4]:
# Configurar el experimento de MLflow
mlflow.set_experiment("Absenteeism_Prediction_Experiments")

# Diccionario de modelos que vamos a probar
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

print("Iniciando experimentación de modelos...")

for name, model in models.items():
    # Iniciar una nueva ejecución en MLflow para cada modelo
    with mlflow.start_run(run_name=name):
        print(f"\n--- Entrenando y evaluando: {name} ---")

        # 1. Registrar el nombre del modelo como un parámetro
        mlflow.log_param("model_name", name)

        # 2. Entrenar el modelo con los datos escalados
        model.fit(X_train_scaled, y_train)

        # 3. Realizar predicciones sobre el conjunto de prueba
        y_pred = model.predict(X_test_scaled)
        y_proba = model.predict_proba(X_test_scaled)[:, 1] # Probabilidades para ROC AUC

        # 4. Calcular las métricas de evaluación
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_proba)

        # 5. Registrar las métricas en MLflow para poder compararlas después
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("roc_auc", roc_auc)
        
        # 6. Registrar los hiperparámetros del modelo
        mlflow.log_params(model.get_params())

        # 7. Registrar el modelo entrenado como un "artefacto" en MLflow
        mlflow.sklearn.log_model(model, f"model_{name.replace(' ', '_')}")

        print(f"  Accuracy: {accuracy:.4f}")
        print(f"  F1-Score: {f1:.4f}")
        print(f"  ROC AUC: {roc_auc:.4f}")

print("\n✅ Experimentación de modelos finalizada.")

Iniciando experimentación de modelos...

--- Entrenando y evaluando: Logistic Regression ---




  Accuracy: 0.7184
  F1-Score: 0.6329
  ROC AUC: 0.8158
🏃 View run Logistic Regression at: http://127.0.0.1:5000/#/experiments/383079581240596013/runs/c3feaafc2c1c406ca8a8ff278263dd67
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/383079581240596013

--- Entrenando y evaluando: Decision Tree ---




  Accuracy: 0.7184
  F1-Score: 0.6420
  ROC AUC: 0.6967
🏃 View run Decision Tree at: http://127.0.0.1:5000/#/experiments/383079581240596013/runs/8919fe1f9f734a66a58ee63c5b117713
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/383079581240596013

--- Entrenando y evaluando: Random Forest ---




  Accuracy: 0.7379
  F1-Score: 0.6494
  ROC AUC: 0.7539
🏃 View run Random Forest at: http://127.0.0.1:5000/#/experiments/383079581240596013/runs/76ea399f8fee447ca156f0f092eb2f3b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/383079581240596013

--- Entrenando y evaluando: Gradient Boosting ---




  Accuracy: 0.7184
  F1-Score: 0.6420
  ROC AUC: 0.8165
🏃 View run Gradient Boosting at: http://127.0.0.1:5000/#/experiments/383079581240596013/runs/a70221d57ee74048b47b8b17b57a17da
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/383079581240596013

✅ Experimentación de modelos finalizada.


In [5]:
# --- Paso de Diagnóstico: Investigar Correlaciones ---
# Vamos a calcular la correlación de todas las columnas con nuestra variable objetivo.
# Una correlación muy cercana a 1 o -1 es una señal de fuga de datos.

print("Calculando correlaciones con la variable objetivo...")
correlations = df.corr()['Target_Binary'].sort_values(ascending=False)

print("\nTop 10 correlaciones más altas con 'Target_Binary':")
print(correlations.head(10))

print("\nTop 10 correlaciones más bajas (más negativas) con 'Target_Binary':")
print(correlations.tail(10))

# Identificar columnas con correlación perfecta (¡las culpables!)
leaky_features = correlations[abs(correlations) > 0.99].index.tolist()

# El Target_Binary siempre tendrá una correlación de 1.0 consigo mismo, así que lo removemos de la lista.
if 'Target_Binary' in leaky_features:
    leaky_features.remove('Target_Binary')

if leaky_features:
    print(f"\n🚨 ¡Fuga de datos detectada! La(s) siguiente(s) característica(s) son probablemente la causa: {leaky_features}")
else:
    print("\n✅ No se detectó una fuga de datos obvia por correlación. El overfitting podría deberse a la complejidad del modelo.")

Calculando correlaciones con la variable objetivo...

Top 10 correlaciones más altas con 'Target_Binary':
Target_Binary                                    1.000000
Absenteeism time in hours                        0.818868
Reason for absence_26 Unjustified absence        0.296590
Son                                              0.285075
Lifestyle_risk_score                             0.238655
Service_group_Long                               0.235744
Age_group_Middle-aged                            0.232805
Transportation expense                           0.223499
Reason for absence_22 Patient follow-up (CID)    0.210285
Distance_group_Moderate                          0.202714
Name: Target_Binary, dtype: float64

Top 10 correlaciones más bajas (más negativas) con 'Target_Binary':
Month of absence_January                     -0.130176
ID                                           -0.134605
Month of absence_February                    -0.135235
Penalty_risk_score                          

Para validar creamos un diagnostico de correlacines para ver si hay data leakage pero vemos que hay en esos modelos overfitting por la naturaleza del modelo

Ahora regularemos los modelos limitando la profundidad e iteracion del modelo

In [6]:
# Configurar el experimento de MLflow
mlflow.set_experiment("Absenteeism_Prediction_Experiments_Regularized") # Nuevo nombre de experimento

# Diccionario de modelos con versiones regularizadas para combatir el overfitting
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree (Regularized)': DecisionTreeClassifier(
        random_state=42,
        max_depth=5,            # Límite de profundidad
        min_samples_leaf=10     # Mínimo de muestras por hoja
    ),
    'Random Forest (Regularized)': RandomForestClassifier(
        random_state=42,
        max_depth=8,            # Límite de profundidad para cada árbol
        min_samples_leaf=5,     # Mínimo de muestras por hoja
        n_estimators=150        # Número de árboles
    ),
    'Gradient Boosting (Regularized)': GradientBoostingClassifier(
        random_state=42,
        max_depth=4,            # Límite de profundidad para cada árbol
        n_estimators=100
    )
}

print("Iniciando experimentación con modelos regularizados...")

for name, model in models.items():
    with mlflow.start_run(run_name=name):
        print(f"\n--- Entrenando y evaluando: {name} ---")

        mlflow.log_param("model_name", name)
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_proba = model.predict_proba(X_test_scaled)[:, 1]

        # Calcular y registrar métricas
        metrics = {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred),
            "recall": recall_score(y_test, y_pred),
            "f1_score": f1_score(y_test, y_pred),
            "roc_auc": roc_auc_score(y_test, y_proba)
        }
        mlflow.log_metrics(metrics)
        mlflow.log_params(model.get_params())
        mlflow.sklearn.log_model(model, f"model_{name.replace(' ', '_')}")

        print(f"  Accuracy: {metrics['accuracy']:.4f}")
        print(f"  F1-Score: {metrics['f1_score']:.4f}")
        print(f"  ROC AUC: {metrics['roc_auc']:.4f}")

print("\n✅ Experimentación finalizada.")

Iniciando experimentación con modelos regularizados...

--- Entrenando y evaluando: Logistic Regression ---




  Accuracy: 0.7184
  F1-Score: 0.6329
  ROC AUC: 0.8158
🏃 View run Logistic Regression at: http://127.0.0.1:5000/#/experiments/541240116825496883/runs/7a32057e1a8f46e49344664fc8fde427
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/541240116825496883

--- Entrenando y evaluando: Decision Tree (Regularized) ---




  Accuracy: 0.7767
  F1-Score: 0.7356
  ROC AUC: 0.8374
🏃 View run Decision Tree (Regularized) at: http://127.0.0.1:5000/#/experiments/541240116825496883/runs/d28fc987a4d1430d8557218907adf12f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/541240116825496883

--- Entrenando y evaluando: Random Forest (Regularized) ---




  Accuracy: 0.7184
  F1-Score: 0.6027
  ROC AUC: 0.7838
🏃 View run Random Forest (Regularized) at: http://127.0.0.1:5000/#/experiments/541240116825496883/runs/eb25efc4e2e34a09973951ed925c223d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/541240116825496883

--- Entrenando y evaluando: Gradient Boosting (Regularized) ---




  Accuracy: 0.7379
  F1-Score: 0.6747
  ROC AUC: 0.8206
🏃 View run Gradient Boosting (Regularized) at: http://127.0.0.1:5000/#/experiments/541240116825496883/runs/37ce86f0922f4e92810055332b5ec7e1
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/541240116825496883

✅ Experimentación finalizada.
