# 1. Cargamos librerías

In [1]:
# Importaciones
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import time
import joblib
import mlflow
from mlflow.tracking import MlflowClient

from loguru import logger

# Import modules from the product_development package
from product_development.config import (
    TARGET, FEATURES, CATEGORICAL_VARS, RAW_DATA_DIR, MODELS_DIR,
    FEATURE_PIPELINE_FILE, PIPELINE_FILE, TRAIN_TEST_SPLIT_RATIO, RANDOM_STATE,
    MLFLOW_EXPERIMENT_NAME, MLFLOW_MODEL_NAME, MLFLOW_TRACKING_URI,
    MLFLOW_CHAMPION_ALIAS, MLFLOW_CHALLENGER_ALIAS
)
from product_development.dataset import load_raw_data, prepare_dataset, temporal_train_test_split
from product_development.features import load_feature_pipeline, transform_features
from product_development.modeling.train import (
    get_model_configurations, train_and_evaluate_models,
    create_full_pipeline, train_final_model, save_pipeline,
    setup_mlflow, register_model_to_mlflow, get_champion_model,
    compare_with_champion, promote_challenger_to_champion, get_experiment_summary,
    calculate_metrics
)

[32m2025-11-29 12:33:44.845[0m | [1mINFO    [0m | [36mproduct_development.config[0m:[36m<module>[0m:[36m17[0m - [1mPROJ_ROOT path is: C:\Users\fjgon\OneDrive - Universidad Galileo\Trimestre 8\product_development[0m


Configuración de modelos, logging y rutas

In [2]:
# 2. Model and MLflow configuration
# TRAINING_MODE accepts "fast" (quick run, 4 models) or "full" (complete run, 15 models)
TRAINING_MODE = "fast"  # switch to "full" for the complete evaluation
USE_MLFLOW = True  # enables/disables MLflow tracking throughout the notebook

# Set up the MLflow experiment only when tracking is enabled
if USE_MLFLOW:
    experiment_id = setup_mlflow(MLFLOW_EXPERIMENT_NAME, MLFLOW_TRACKING_URI)
    print(f"MLflow Experiment ID: {experiment_id}")
    print(f"MLflow Tracking URI: {MLFLOW_TRACKING_URI}")

models_configurations = get_model_configurations(mode=TRAINING_MODE)

# Report the selected mode and the name of every configured model
print(f"\nModo de entrenamiento: {TRAINING_MODE}")
print(f"Total de modelos a evaluar: {len(models_configurations)}")
print("\nModelos configurados:")
for config_name in models_configurations:
    print(f"  - {config_name}")

[32m2025-11-29 12:33:45.615[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36msetup_mlflow[0m:[36m155[0m - [1mMLflow configurado - Experimento: sales_prediction[0m
MLflow Experiment ID: 781360344278088554
MLflow Tracking URI: mlruns

Modo de entrenamiento: fast
Total de modelos a evaluar: 4

Modelos configurados:
  - LinearRegression
  - RandomForest
  - GradientBoosting
  - XGBoost


  return FileStore(store_uri, store_uri)


# 3. Entrenamiento y selección de modelos


In [3]:
# 3. Load and prepare the data through the package modules
train_csv_path = RAW_DATA_DIR / "train.csv"
raw_data = load_raw_data(train_csv_path)
dataset = prepare_dataset(raw_data)

print(f"Total registros: {len(dataset)}")

# Temporal train/validation split; the train fraction comes from TRAIN_TEST_SPLIT_RATIO
split_parts = temporal_train_test_split(dataset, TRAIN_TEST_SPLIT_RATIO)
X_train, X_val, y_train, y_val = split_parts

print(f"Train: {X_train.shape}, Val: {X_val.shape}")

# Load the preprocessing pipeline and fit it on the training split only
preproc_pipeline = load_feature_pipeline(FEATURE_PIPELINE_FILE)
preproc_pipeline.fit(X_train, y_train)

# Apply the fitted preprocessing to both splits
X_train_proc = transform_features(preproc_pipeline, X_train)
X_val_proc = transform_features(preproc_pipeline, X_val)

print(f"Shape X_train_proc: {X_train_proc.shape}")
print(f"Shape X_val_proc: {X_val_proc.shape}")

[32m2025-11-29 12:33:45.627[0m | [1mINFO    [0m | [36mproduct_development.dataset[0m:[36mload_raw_data[0m:[36m49[0m - [1mCargando datos crudos desde C:\Users\fjgon\OneDrive - Universidad Galileo\Trimestre 8\product_development\data\raw\train.csv[0m
[32m2025-11-29 12:33:45.920[0m | [1mINFO    [0m | [36mproduct_development.dataset[0m:[36mload_raw_data[0m:[36m51[0m - [1mCargados 913000 registros[0m
[32m2025-11-29 12:33:45.920[0m | [1mINFO    [0m | [36mproduct_development.dataset[0m:[36madd_temporal_features[0m:[36m69[0m - [1mAgregando caracter√≠sticas temporales[0m
[32m2025-11-29 12:33:46.059[0m | [1mINFO    [0m | [36mproduct_development.dataset[0m:[36mconvert_categorical_types[0m:[36m92[0m - [1mConvirtiendo columnas categ√≥ricas a tipo object[0m
Total registros: 913000
[32m2025-11-29 12:33:46.114[0m | [1mINFO    [0m | [36mproduct_development.dataset[0m:[36mtemporal_train_test_split[0m:[36m140[0m - [1mDividiendo datos con 80% para

  X[var] = X[var].fillna(self.fill_value).astype(str)
  X[var] = X[var].fillna(self.fill_value).astype(str)


[32m2025-11-29 12:33:47.336[0m | [1mINFO    [0m | [36mproduct_development.features[0m:[36mtransform_features[0m:[36m126[0m - [1mTransformando 730400 registros[0m


  X[var] = X[var].fillna(self.fill_value).astype(str)
  X[var] = X[var].fillna(self.fill_value).astype(str)


[32m2025-11-29 12:33:48.100[0m | [1mINFO    [0m | [36mproduct_development.features[0m:[36mtransform_features[0m:[36m126[0m - [1mTransformando 182600 registros[0m
Shape X_train_proc: (730400, 5)
Shape X_val_proc: (182600, 5)


  X[var] = X[var].fillna(self.fill_value).astype(str)
  X[var] = X[var].fillna(self.fill_value).astype(str)


In [4]:
# 4. Train and evaluate every configured model (MLflow tracking controlled by USE_MLFLOW)
# perf_counter() is a monotonic clock, so the elapsed time cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start = time.perf_counter()

results, best_model_name, best_model = train_and_evaluate_models(
    X_train_proc, y_train,
    X_val_proc, y_val,
    models_configurations,
    use_mlflow=USE_MLFLOW
)

elapsed_time = round(time.perf_counter() - start, 2)

print(f"\nTiempo total de entrenamiento: {elapsed_time} segundos")
print("\nResultados de todos los modelos:")
for model_name, metrics in results.items():
    print(f"  {model_name}: RMSE={metrics['rmse']}, R2={metrics['r2']}")

# Look the winner's metrics up once instead of indexing `results` per print
best_metrics = results[best_model_name]
print(f"\nüèÜ Mejor modelo: {best_model_name}")
print(f"   RMSE: {best_metrics['rmse']}")
print(f"   R2: {best_metrics['r2']}")

[32m2025-11-29 12:33:51.104[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_and_evaluate_models[0m:[36m245[0m - [1mIniciando entrenamiento y evaluaci√≥n de modelos[0m
[32m2025-11-29 12:33:51.118[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36msetup_mlflow[0m:[36m155[0m - [1mMLflow configurado - Experimento: sales_prediction[0m


                                                         

Entrenando modelos:   0%|          | 0/4 [00:00<?, ?it/s]

[32m2025-11-29 12:33:51.120[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_and_evaluate_models[0m:[36m252[0m - [1mEntrenando modelo: LinearRegression[0m


Entrenando modelos:  25%|‚ñà‚ñà‚ñå       | 1/4 [00:00<00:01,  2.60it/s]

[32m2025-11-29 12:33:51.497[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_and_evaluate_models[0m:[36m281[0m - [1mModelo LinearRegression - RMSE: 30.0452, R2: 0.0931[0m
[32m2025-11-29 12:33:51.504[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_and_evaluate_models[0m:[36m252[0m - [1mEntrenando modelo: RandomForest[0m


Entrenando modelos:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 2/4 [00:04<00:05,  2.67s/it]

[32m2025-11-29 12:33:55.762[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_and_evaluate_models[0m:[36m281[0m - [1mModelo RandomForest - RMSE: 27.4841, R2: 0.2411[0m
[32m2025-11-29 12:33:55.778[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_and_evaluate_models[0m:[36m252[0m - [1mEntrenando modelo: GradientBoosting[0m


Entrenando modelos:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 3/4 [00:23<00:09,  9.90s/it]

[32m2025-11-29 12:34:14.275[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_and_evaluate_models[0m:[36m281[0m - [1mModelo GradientBoosting - RMSE: 27.1293, R2: 0.2605[0m
[32m2025-11-29 12:34:14.289[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_and_evaluate_models[0m:[36m252[0m - [1mEntrenando modelo: XGBoost[0m


Entrenando modelos: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:24<00:00,  6.10s/it]

[32m2025-11-29 12:34:15.491[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_and_evaluate_models[0m:[36m281[0m - [1mModelo XGBoost - RMSE: 26.9235, R2: 0.2717[0m
[32m2025-11-29 12:34:15.507[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_and_evaluate_models[0m:[36m301[0m - [1mMejor modelo: XGBoost con RMSE = 26.9235[0m

Tiempo total de entrenamiento: 24.4 segundos

Resultados de todos los modelos:
  LinearRegression: RMSE=30.0452, R2=0.0931
  RandomForest: RMSE=27.4841, R2=0.2411
  GradientBoosting: RMSE=27.1293, R2=0.2605
  XGBoost: RMSE=26.9235, R2=0.2717

üèÜ Mejor modelo: XGBoost
   RMSE: 26.9235
   R2: 0.2717





# 4. Importamos Pipeline de preproc y Agregamos Modelo Ganador


In [5]:
# 5. Build the complete pipeline from the preprocessing pipeline and the best model
full_pipeline = create_full_pipeline(preproc_pipeline, best_model)

# List each pipeline step alongside the class that implements it
print("Pipeline completo creado:")
for label, component in full_pipeline.steps:
    component_type = type(component).__name__
    print(f"  - {label}: {component_type}")

[32m2025-11-29 12:34:21.192[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mcreate_full_pipeline[0m:[36m325[0m - [1mCreando pipeline completo con modelo[0m
Pipeline completo creado:
  - cat_missing_imputation: SimpleCategoricalImputer
  - num_median_imputation: MeanMedianImputer
  - cat_freq_encoder: CountFrequencyEncoder
  - dayofweek_mapper: Mapper
  - feature_scaler: MinMaxScaler
  - regressor: XGBRegressor


# 5. Reentrenamos el pipeline con el Modelo Ganador


In [6]:
# 6. Retrain on the complete dataset
# Select the feature columns and cast the categorical ones to object dtype in a
# single astype call (astype returns a new frame, so `dataset` is left untouched)
categorical_dtypes = {column: "O" for column in CATEGORICAL_VARS}
X_full = dataset[FEATURES].astype(categorical_dtypes)
y_full = dataset[TARGET].copy()

full_pipeline = train_final_model(full_pipeline, X_full, y_full)

print("Pipeline final entrenado correctamente.")

[32m2025-11-29 12:34:24.159[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_final_model[0m:[36m357[0m - [1mEntrenando modelo final con dataset completo[0m


  X[var] = X[var].fillna(self.fill_value).astype(str)
  X[var] = X[var].fillna(self.fill_value).astype(str)


[32m2025-11-29 12:34:26.473[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_final_model[0m:[36m359[0m - [1mEntrenamiento del modelo final completado[0m
Pipeline final entrenado correctamente.


In [7]:
# 7. Save the trained full pipeline locally at PIPELINE_FILE
save_pipeline(full_pipeline, PIPELINE_FILE)
print(f"Pipeline completo guardado en: {PIPELINE_FILE}")

[32m2025-11-29 12:34:28.446[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36msave_pipeline[0m:[36m379[0m - [1mPipeline guardado en C:\Users\fjgon\OneDrive - Universidad Galileo\Trimestre 8\product_development\models\sales_pipeline.pkl[0m
Pipeline completo guardado en: C:\Users\fjgon\OneDrive - Universidad Galileo\Trimestre 8\product_development\models\sales_pipeline.pkl


# 8. Registro en MLflow Model Registry y Gestión Champion/Challenger

In [8]:
# 8.1 Compute the final metrics of the complete pipeline
if USE_MLFLOW:
    # Reuse the validation split returned by temporal_train_test_split rather than
    # re-slicing `dataset` by position. The positional slice is not guaranteed to
    # select the same rows as the split helper, which makes these metrics
    # incomparable with the model-selection metrics computed earlier.
    # Assumes X_val still holds the untransformed FEATURES columns — TODO confirm.
    X_val_raw = X_val[FEATURES].copy()
    y_val_raw = y_val.copy()

    # Cast categorical columns to object dtype, as done for the full training set
    for col in CATEGORICAL_VARS:
        X_val_raw[col] = X_val_raw[col].astype("O")

    # NOTE(review): full_pipeline was refit on the whole dataset, which includes
    # these validation rows, so the metrics below are in-sample and optimistic.
    y_pred_final = full_pipeline.predict(X_val_raw)
    final_metrics = calculate_metrics(y_val_raw, y_pred_final)

    print("üìä M√©tricas finales del pipeline completo:")
    print(f"   RMSE: {final_metrics['rmse']}")
    print(f"   MAE: {final_metrics['mae']}")
    print(f"   R2: {final_metrics['r2']}")
    print(f"   MSE: {final_metrics['mse']}")

üìä M√©tricas finales del pipeline completo:
   RMSE: 24.1248
   MAE: 20.1097
   R2: 0.0946
   MSE: 582.0082


  X[var] = X[var].fillna(self.fill_value).astype(str)
  X[var] = X[var].fillna(self.fill_value).astype(str)


In [9]:
# 8.2 Compare against the current Champion model (if one exists)
if USE_MLFLOW:
    is_better, champion_metrics = compare_with_champion(
        final_metrics,
        model_name=MLFLOW_MODEL_NAME,
        metric_key="rmse"
    )

    if not champion_metrics:
        # No champion metrics came back, so there is nothing to compare against
        print("\nüìù No hay modelo Champion registrado. Este ser√° el primero.")
    else:
        print("\nüìà Comparaci√≥n con el modelo Champion actual:")
        print(f"   Champion RMSE: {champion_metrics.get('rmse', 'N/A')}")
        print(f"   Challenger RMSE: {final_metrics['rmse']}")

        # Pick the verdict message from the comparison flag
        outcome_message = (
            "\n‚úÖ El nuevo modelo es MEJOR que el Champion actual!"
            if is_better
            else "\n‚ö†Ô∏è El nuevo modelo NO supera al Champion actual"
        )
        print(outcome_message)

[32m2025-11-29 12:34:33.851[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36msetup_mlflow[0m:[36m155[0m - [1mMLflow configurado - Experimento: sales_prediction[0m
[32m2025-11-29 12:34:33.895[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mcompare_with_champion[0m:[36m533[0m - [1mChampion rmse: 24.1248[0m
[32m2025-11-29 12:34:33.895[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mcompare_with_champion[0m:[36m534[0m - [1mChallenger rmse: 24.1248[0m

üìà Comparaci√≥n con el modelo Champion actual:
   Champion RMSE: 24.1248
   Challenger RMSE: 24.1248

‚ö†Ô∏è El nuevo modelo NO supera al Champion actual


  return FileStore(store_uri)


In [10]:
# 8.3 Register the model in the MLflow Model Registry
if USE_MLFLOW:
    # Small sample of untransformed rows, passed as input_example for the model signature
    input_example_df = X_val_raw.head(5)

    # Champion when there is no champion yet or the new model beats it;
    # otherwise the model is registered as challenger for later comparison
    register_as_champion = bool(champion_metrics is None or is_better)

    version = register_model_to_mlflow(
        pipeline=full_pipeline,
        model_name=MLFLOW_MODEL_NAME,
        best_model_name=best_model_name,
        metrics=final_metrics,
        register_as_champion=register_as_champion,
        input_example=input_example_df
    )

    if register_as_champion:
        print(f"\nüèÜ Modelo registrado como CHAMPION (versi√≥n {version})")
    else:
        print(f"\nü•à Modelo registrado como CHALLENGER (versi√≥n {version})")

[32m2025-11-29 12:34:36.599[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36msetup_mlflow[0m:[36m155[0m - [1mMLflow configurado - Experimento: sales_prediction[0m
[32m2025-11-29 12:34:36.721[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mregister_model_to_mlflow[0m:[36m430[0m - [1mPipeline registrado en MLflow como artefacto[0m
[32m2025-11-29 12:34:36.893[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mregister_model_to_mlflow[0m:[36m450[0m - [1mModelo registrado como versi√≥n: 5[0m
[32m2025-11-29 12:34:36.970[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mregister_model_to_mlflow[0m:[36m463[0m - [1mModelo versi√≥n 5 marcado como 'challenger'[0m

ü•à Modelo registrado como CHALLENGER (versi√≥n 5)


# 9. Resumen del Experimento MLflow

In [11]:
# 9.1 Summarize every run of the experiment
if USE_MLFLOW:
    experiment_df = get_experiment_summary(MLFLOW_EXPERIMENT_NAME)

    if experiment_df.empty:
        print("No hay runs registrados en el experimento")
    else:
        print("üìã Resumen de todos los runs del experimento:")
        print(f"   Total de runs: {len(experiment_df)}")

        # Keep only the headline metric columns
        metric_cols = [col for col in experiment_df.columns if 'rmse' in col or 'r2' in col]

        # 'run_name' alone left the table with no run identifier. The summary
        # appears to use MLflow's search_runs column layout (metrics.* columns),
        # where the run name is stored under 'tags.mlflow.runName' — TODO confirm.
        # Candidates missing from the frame are dropped by the filter below.
        display_cols = ['run_name', 'tags.mlflow.runName', 'status'] + metric_cols
        available_cols = [col for col in display_cols if col in experiment_df.columns]

        display(experiment_df[available_cols].head(10))

[32m2025-11-29 12:34:39.770[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36msetup_mlflow[0m:[36m155[0m - [1mMLflow configurado - Experimento: sales_prediction[0m
üìã Resumen de todos los runs del experimento:
   Total de runs: 38


Unnamed: 0,status,metrics.r2,metrics.rmse
0,FINISHED,0.0946,24.1248
1,FINISHED,0.2717,26.9235
2,FINISHED,0.2605,27.1293
3,FINISHED,0.2411,27.4841
4,FINISHED,0.0931,30.0452
5,FAILED,-0.0051,25.4187
6,FAILED,0.2717,26.9235
7,FAILED,0.2605,27.1293
8,FAILED,0.2411,27.4841
9,FAILED,0.0931,30.0452


In [12]:
# 9.2 Inspect the registered model in the Model Registry
if USE_MLFLOW:
    client = MlflowClient()

    try:
        # Fetch every registered version of the model
        versions = client.search_model_versions(f"name='{MLFLOW_MODEL_NAME}'")

        print(f"\nüì¶ Modelo: {MLFLOW_MODEL_NAME}")
        print(f"   Total de versiones: {len(versions)}")

        # Current champion. Only MLflow errors (e.g. alias not found) are reported
        # as "not assigned"; a bare `except:` would also swallow KeyboardInterrupt
        # and hide unrelated bugs behind that message.
        try:
            champion_version = client.get_model_version_by_alias(MLFLOW_MODEL_NAME, MLFLOW_CHAMPION_ALIAS)
            champion_run = client.get_run(champion_version.run_id)
            print(f"\nüèÜ Champion actual (versi√≥n {champion_version.version}):")
            print(f"   RMSE: {champion_run.data.metrics.get('rmse', 'N/A')}")
            print(f"   R2: {champion_run.data.metrics.get('r2', 'N/A')}")
        except mlflow.exceptions.MlflowException:
            print("\n   No hay modelo Champion asignado")

        # Current challenger, with the same error handling as the champion lookup
        try:
            challenger_version = client.get_model_version_by_alias(MLFLOW_MODEL_NAME, MLFLOW_CHALLENGER_ALIAS)
            challenger_run = client.get_run(challenger_version.run_id)
            print(f"\nü•à Challenger actual (versi√≥n {challenger_version.version}):")
            print(f"   RMSE: {challenger_run.data.metrics.get('rmse', 'N/A')}")
            print(f"   R2: {challenger_run.data.metrics.get('r2', 'N/A')}")
        except mlflow.exceptions.MlflowException:
            print("\n   No hay modelo Challenger asignado")

    except Exception as e:
        # Any other failure is reported here instead of aborting the notebook
        print(f"Error obteniendo informaci√≥n del modelo: {e}")


üì¶ Modelo: sales_prediction_model
   Total de versiones: 5

üèÜ Champion actual (versi√≥n 1):
   RMSE: 24.1248
   R2: 0.0946

ü•à Challenger actual (versi√≥n 5):
   RMSE: 24.1248
   R2: 0.0946


In [13]:
# 9.3 (Optional) Manually promote the Challenger to Champion
# Uncomment the lines below to promote the current challenger to champion by hand

# if USE_MLFLOW:
#     success = promote_challenger_to_champion(model_name=MLFLOW_MODEL_NAME)
#     if success:
#         print("‚úÖ Challenger promovido a Champion exitosamente!")
#     else:
#         print("‚ùå Error al promover Challenger")

In [14]:
# 9.4 Instructions for opening the MLflow UI
separator = "=" * 60  # horizontal rule reused around each section of the message

print(separator)
print("üñ•Ô∏è  Para visualizar los experimentos en MLflow UI ejecuta:")
print(separator)
print("\n   mlflow ui --backend-store-uri mlruns")
print("\n   Luego abre en tu navegador: http://localhost:5000")
print("\n" + separator)
print("üìä En la UI podr√°s ver:")
print("   - Todos los experimentos y runs")
print("   - Comparaci√≥n de m√©tricas entre modelos")
print("   - Hiperpar√°metros de cada modelo")
print("   - Model Registry con versiones Champion/Challenger")
print(separator)

üñ•Ô∏è  Para visualizar los experimentos en MLflow UI ejecuta:

   mlflow ui --backend-store-uri mlruns

   Luego abre en tu navegador: http://localhost:5000

üìä En la UI podr√°s ver:
   - Todos los experimentos y runs
   - Comparaci√≥n de m√©tricas entre modelos
   - Hiperpar√°metros de cada modelo
   - Model Registry con versiones Champion/Challenger
