# 1. Cargamos librerías

In [1]:
# Importaciones
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import time
import joblib

from loguru import logger

# Importar módulos del paquete product_development
from product_development.config import (
    TARGET, FEATURES, CATEGORICAL_VARS, RAW_DATA_DIR, MODELS_DIR,
    FEATURE_PIPELINE_FILE, PIPELINE_FILE, TRAIN_TEST_SPLIT_RATIO, RANDOM_STATE
)
from product_development.dataset import load_raw_data, prepare_dataset, temporal_train_test_split
from product_development.features import load_feature_pipeline, transform_features
from product_development.modeling.train import (
    get_model_configurations, train_and_evaluate_models,
    create_full_pipeline, train_final_model, save_pipeline
)

[32m2025-11-29 10:00:20.596[0m | [1mINFO    [0m | [36mproduct_development.config[0m:[36m<module>[0m:[36m17[0m - [1mPROJ_ROOT path is: C:\Users\fjgon\OneDrive - Universidad Galileo\Trimestre 8\product_development[0m


# 2. Configuración de modelos, logging y rutas

In [2]:
# 2. Model configuration via the train module.
# TRAINING_MODE selects the candidate set returned by get_model_configurations:
#   "fast" -> quick run (the recorded run of this notebook evaluated 4 models)
#   "full" -> exhaustive run (original note says 15 models -- TODO confirm
#             against get_model_configurations)
TRAINING_MODE = "fast"  # switch to "full" for the complete evaluation

models_configurations = get_model_configurations(mode=TRAINING_MODE)

print(f"Modo de entrenamiento: {TRAINING_MODE}")
print(f"Total de modelos a evaluar: {len(models_configurations)}")
print("\nModelos configurados:")
# Iterating the mapping directly yields its keys (the model names).
for name in models_configurations:
    print(f"  - {name}")

Modo de entrenamiento: fast
Total de modelos a evaluar: 4

Modelos configurados:
  - LinearRegression
  - RandomForest
  - GradientBoosting
  - XGBoost


# 3. Entrenamiento y selección de modelos


In [3]:
# 3. Load and prepare the data with the package helpers
raw_data = load_raw_data(RAW_DATA_DIR / "train.csv")
dataset = prepare_dataset(raw_data)

print(f"Total registros: {len(dataset)}")

# Temporal split; the ratio comes from TRAIN_TEST_SPLIT_RATIO in the config
# module (the recorded run produced 730400 train / 182600 validation rows).
X_train, X_val, y_train, y_val = temporal_train_test_split(dataset, TRAIN_TEST_SPLIT_RATIO)

print(f"Train: {X_train.shape}, Val: {X_val.shape}")

# Load the preprocessing pipeline and fit it on the training partition only,
# so nothing from the validation rows is seen while fitting.
preproc_pipeline = load_feature_pipeline(FEATURE_PIPELINE_FILE)
preproc_pipeline.fit(X_train, y_train)

# Transform both partitions with the fitted pipeline (train first, then validation).
X_train_proc, X_val_proc = (
    transform_features(preproc_pipeline, partition)
    for partition in (X_train, X_val)
)

print(f"Shape X_train_proc: {X_train_proc.shape}")
print(f"Shape X_val_proc: {X_val_proc.shape}")

[32m2025-11-29 10:00:21.587[0m | [1mINFO    [0m | [36mproduct_development.dataset[0m:[36mload_raw_data[0m:[36m49[0m - [1mCargando datos crudos desde C:\Users\fjgon\OneDrive - Universidad Galileo\Trimestre 8\product_development\data\raw\train.csv[0m
[32m2025-11-29 10:00:21.895[0m | [1mINFO    [0m | [36mproduct_development.dataset[0m:[36mload_raw_data[0m:[36m51[0m - [1mCargados 913000 registros[0m
[32m2025-11-29 10:00:21.895[0m | [1mINFO    [0m | [36mproduct_development.dataset[0m:[36madd_temporal_features[0m:[36m69[0m - [1mAgregando características temporales[0m
[32m2025-11-29 10:00:22.033[0m | [1mINFO    [0m | [36mproduct_development.dataset[0m:[36mconvert_categorical_types[0m:[36m92[0m - [1mConvirtiendo columnas categóricas a tipo object[0m
Total registros: 913000
[32m2025-11-29 10:00:22.082[0m | [1mINFO    [0m | [36mproduct_development.dataset[0m:[36mtemporal_train_test_split[0m:[36m140[0m - [1mDividiendo datos con 80% para e

  X[var] = X[var].fillna(self.fill_value).astype(str)
  X[var] = X[var].fillna(self.fill_value).astype(str)


[32m2025-11-29 10:00:23.290[0m | [1mINFO    [0m | [36mproduct_development.features[0m:[36mtransform_features[0m:[36m126[0m - [1mTransformando 730400 registros[0m


  X[var] = X[var].fillna(self.fill_value).astype(str)
  X[var] = X[var].fillna(self.fill_value).astype(str)


[32m2025-11-29 10:00:24.097[0m | [1mINFO    [0m | [36mproduct_development.features[0m:[36mtransform_features[0m:[36m126[0m - [1mTransformando 182600 registros[0m
Shape X_train_proc: (730400, 5)
Shape X_val_proc: (182600, 5)


  X[var] = X[var].fillna(self.fill_value).astype(str)
  X[var] = X[var].fillna(self.fill_value).astype(str)


In [4]:
# 4. Train every configured model and evaluate it on the validation partition.
# The elapsed time is measured with time.perf_counter(): it is a monotonic,
# high-resolution clock intended for timing intervals, whereas time.time() is a
# wall clock that can jump if the system time is adjusted during the run.
start = time.perf_counter()

# Returns the per-model scores (indexed by model name below), the name of the
# best model and the best model object itself.
results, best_model_name, best_model = train_and_evaluate_models(
    X_train_proc, y_train,
    X_val_proc, y_val,
    models_configurations
)

elapsed_time = round(time.perf_counter() - start, 2)

print(f"\nTiempo total de entrenamiento: {elapsed_time} segundos")
print(f"\nMejor modelo: {best_model_name} con RMSE = {results[best_model_name]}")

[32m2025-11-29 10:00:24.303[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_and_evaluate_models[0m:[36m157[0m - [1mIniciando entrenamiento y evaluación de modelos[0m


Entrenando modelos:   0%|          | 0/4 [00:00<?, ?it/s]

[32m2025-11-29 10:00:24.305[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_and_evaluate_models[0m:[36m160[0m - [1mEntrenando modelo: LinearRegression[0m
[32m2025-11-29 10:00:24.381[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_and_evaluate_models[0m:[36m170[0m - [1mModelo LinearRegression RMSE: 30.05[0m
[32m2025-11-29 10:00:24.381[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_and_evaluate_models[0m:[36m160[0m - [1mEntrenando modelo: RandomForest[0m


Entrenando modelos:  50%|█████     | 2/4 [00:03<00:03,  1.98s/it]

[32m2025-11-29 10:00:28.259[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_and_evaluate_models[0m:[36m170[0m - [1mModelo RandomForest RMSE: 27.48[0m
[32m2025-11-29 10:00:28.259[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_and_evaluate_models[0m:[36m160[0m - [1mEntrenando modelo: GradientBoosting[0m


Entrenando modelos:  75%|███████▌  | 3/4 [00:21<00:08,  8.67s/it]

[32m2025-11-29 10:00:46.291[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_and_evaluate_models[0m:[36m170[0m - [1mModelo GradientBoosting RMSE: 27.13[0m
[32m2025-11-29 10:00:46.291[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_and_evaluate_models[0m:[36m160[0m - [1mEntrenando modelo: XGBoost[0m


Entrenando modelos: 100%|██████████| 4/4 [00:22<00:00,  5.75s/it]

[32m2025-11-29 10:00:47.288[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_and_evaluate_models[0m:[36m170[0m - [1mModelo XGBoost RMSE: 26.92[0m
[32m2025-11-29 10:00:47.293[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_and_evaluate_models[0m:[36m181[0m - [1mMejor modelo: XGBoost con RMSE = 26.92[0m

Tiempo total de entrenamiento: 22.99 segundos

Mejor modelo: XGBoost con RMSE = 26.92





# 4. Combinamos el pipeline de preprocesamiento con el modelo ganador


In [5]:
# 5. Build the complete pipeline by combining the preprocessing pipeline with the best model
full_pipeline = create_full_pipeline(preproc_pipeline, best_model)

print("Pipeline completo creado:")
# List every step as "<step name>: <class name of the step object>"
for step_name, step in full_pipeline.steps:
    print(f"  - {step_name}: {type(step).__name__}")

[32m2025-11-29 10:00:47.305[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mcreate_full_pipeline[0m:[36m205[0m - [1mCreando pipeline completo con modelo[0m
Pipeline completo creado:
  - cat_missing_imputation: SimpleCategoricalImputer
  - num_median_imputation: MeanMedianImputer
  - cat_freq_encoder: CountFrequencyEncoder
  - dayofweek_mapper: Mapper
  - feature_scaler: MinMaxScaler
  - regressor: XGBRegressor


# 5. Reentrenamos el pipeline con el modelo ganador sobre el dataset completo


In [6]:
# 6. Refit on the complete dataset (no train/validation split)
# Select the feature columns and cast the categorical ones to object dtype in a
# single astype call; astype returns a new frame, so `dataset` is left untouched.
X_full = dataset[FEATURES].astype({col: "O" for col in CATEGORICAL_VARS})
y_full = dataset[TARGET].copy()

# Rebind full_pipeline to whatever train_final_model returns after fitting.
full_pipeline = train_final_model(full_pipeline, X_full, y_full)

print("Pipeline final entrenado correctamente.")

[32m2025-11-29 10:00:47.467[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_final_model[0m:[36m237[0m - [1mEntrenando modelo final con dataset completo[0m


  X[var] = X[var].fillna(self.fill_value).astype(str)
  X[var] = X[var].fillna(self.fill_value).astype(str)
  X[var] = X[var].fillna(self.fill_value).astype(str)


[32m2025-11-29 10:00:49.679[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36mtrain_final_model[0m:[36m239[0m - [1mEntrenamiento del modelo final completado[0m
Pipeline final entrenado correctamente.


In [7]:
# 7. Persist the complete pipeline to PIPELINE_FILE (path imported from product_development.config)
save_pipeline(full_pipeline, PIPELINE_FILE)

# Echo the destination so the saved artifact is easy to locate from the notebook output
print(f"Pipeline completo guardado en: {PIPELINE_FILE}")

[32m2025-11-29 10:00:49.698[0m | [1mINFO    [0m | [36mproduct_development.modeling.train[0m:[36msave_pipeline[0m:[36m259[0m - [1mPipeline guardado en C:\Users\fjgon\OneDrive - Universidad Galileo\Trimestre 8\product_development\models\sales_pipeline.pkl[0m
Pipeline completo guardado en: C:\Users\fjgon\OneDrive - Universidad Galileo\Trimestre 8\product_development\models\sales_pipeline.pkl
