# 1. Cargamos librerías

In [None]:

import pandas as pd
import numpy as np
import os
import time
import logging

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import joblib


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class SimpleCategoricalImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in the selected columns and cast those columns to ``str``.

    The transformer is stateless: ``fit`` learns nothing from the data and
    ``transform`` only touches the configured columns that are actually
    present in the input frame.

    Parameters
    ----------
    variables : str or list of str
        Column name(s) to impute. A single name may be given as a plain
        string; it is treated as a one-element list.
    fill_value : object, default="Missing"
        Value written in place of missing entries before the cast to ``str``.
    """

    def __init__(self, variables, fill_value="Missing"):
        # Parameters are stored exactly as received; any normalisation happens
        # at transform time so the constructor arguments stay inspectable.
        self.variables = variables
        self.fill_value = fill_value

    def fit(self, X, y=None):
        """Do nothing and return ``self``; present for pipeline compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns imputed and cast to ``str``.

        Configured columns that are not present in ``X`` are skipped silently.
        """
        X = X.copy()
        # Iterating a bare string would yield single characters, so a lone
        # column name would never match and the imputation would be skipped.
        if isinstance(self.variables, str):
            variables = [self.variables]
        else:
            variables = self.variables
        for var in variables:
            if var in X.columns:
                X[var] = X[var].fillna(self.fill_value).astype(str)
        return X

# 2. Configuración de modelos, logging y rutas

In [None]:
# 2.1 Candidate models: 5 estimator families x 3 settings each (15 in total).
# Keys follow the "<Family>_<n>" pattern and are inserted family by family,
# in the order listed below.

models_configurations = {
    # LinearRegression: defaults, no intercept, non-negative coefficients
    **{
        f"LinearRegression_{idx}": LinearRegression(**extra)
        for idx, extra in enumerate(
            ({}, {"fit_intercept": False}, {"positive": True}), start=1
        )
    },
    # RandomForest: (n_estimators, max_depth)
    **{
        f"RandomForest_{idx}": RandomForestRegressor(
            n_estimators=trees, max_depth=depth, random_state=2025, n_jobs=-1
        )
        for idx, (trees, depth) in enumerate(
            ((100, 10), (200, 20), (300, None)), start=1
        )
    },
    # GradientBoosting: (n_estimators, learning_rate)
    **{
        f"GradientBoosting_{idx}": GradientBoostingRegressor(
            n_estimators=stages, learning_rate=rate, random_state=2025
        )
        for idx, (stages, rate) in enumerate(
            ((100, 0.1), (200, 0.05), (300, 0.03)), start=1
        )
    },
    # SVR (RBF kernel): increasing regularisation parameter C
    # NOTE(review): kernel SVR fit time grows faster than quadratically with
    # the number of samples; with the ~730k training rows reported by this
    # notebook these three entries may be impractically slow — confirm.
    **{
        f"SVR_{idx}": SVR(kernel="rbf", C=penalty, epsilon=0.1)
        for idx, penalty in enumerate((1.0, 10.0, 100.0), start=1)
    },
    # XGBoost: (n_estimators, learning_rate, max_depth)
    **{
        f"XGBoost_{idx}": XGBRegressor(
            n_estimators=rounds,
            learning_rate=rate,
            max_depth=depth,
            random_state=2025,
            n_jobs=-1,
        )
        for idx, (rounds, rate, depth) in enumerate(
            ((100, 0.1, 5), (200, 0.05, 7), (300, 0.03, 9)), start=1
        )
    },
}


# Append INFO-and-above records to system.log (UTF-8) as
# "<timestamp>, <level>, <message>" lines.
# force=True removes and closes any handlers already attached to the root
# logger before configuring it. Without it basicConfig() does nothing when
# the root logger already has handlers, so file logging could be silently
# skipped (e.g. if something configured logging before this cell ran).
logging.basicConfig(
    filename="system.log",
    encoding="utf-8",
    filemode="a",
    level=logging.INFO,
    format="{asctime}, {levelname}, {message}",
    style="{",
    datefmt="%Y-%m-%d %H:%M",
    force=True,
)



# 3. Entrenamiento y selección de modelos


In [None]:
# Chronological 80/20 hold-out: rows are ordered by date, the first 80% train
# the candidates and the remaining (most recent) 20% validate them.

dataset = (
    pd.read_csv("../data/raw/train.csv")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values("date")
    .reset_index(drop=True)
    # Calendar features derived from the parsed date
    .assign(
        year=lambda frame: frame["date"].dt.year,
        month=lambda frame: frame["date"].dt.month,
        day_of_week_name=lambda frame: frame["date"].dt.day_name(),
    )
)

FEATURES = ["store", "item", "year", "month", "day_of_week_name"]
TARGET = "sales"

X_all = dataset[FEATURES].copy()
y_all = dataset[TARGET].copy()

# Positional cut-off: everything before n_train is training data
n_samples = len(dataset)
n_train = int(n_samples * 0.8)

X_train, X_val = X_all.iloc[:n_train].copy(), X_all.iloc[n_train:].copy()
y_train, y_val = y_all.iloc[:n_train].copy(), y_all.iloc[n_train:].copy()

print(f"Total registros: {n_samples}")
print(f"Train: {X_train.shape}, Val: {X_val.shape}")

# joblib.load unpickles the file, so it must come from a trusted source.
preproc_pipeline = joblib.load("../models/feature_engineering_pipeline.pkl")

# Fit the preprocessing on the training split only, then apply the fitted
# transformation to both splits.
preproc_pipeline.fit(X_train, y_train)

X_train_proc = preproc_pipeline.transform(X_train)
X_val_proc = preproc_pipeline.transform(X_val)

print("Shape X_train_proc:", X_train_proc.shape)
print("Shape X_val_proc:", X_val_proc.shape)



# 3.3 Train every candidate and score it with RMSE on the validation split

# model name -> validation RMSE at full precision. Rounding happens only for
# display, so near-tied candidates are ranked by their real score rather than
# by their position in the dictionary.
results = {}
logging.info("-------- Iniciando entrenamiento y evaluación de modelos --------")

# perf_counter() is a monotonic clock, so the measured duration is not affected
# by system clock adjustments during a long run.
start = time.perf_counter()

# All candidates. For a quick smoke test, slice the list instead,
# e.g. list(models_configurations.items())[:3].
subset_models = list(models_configurations.items())

for model_name, model in subset_models:
    print(f"\nEntrenando modelo: {model_name}...")

    try:
        # Fit on the training split
        model.fit(X_train_proc, y_train)

        # Predict on the validation split
        y_val_pred = model.predict(X_val_proc)

        # RMSE is derived from the MSE by hand (the 'squared=False' argument
        # is deliberately not used).
        mse = mean_squared_error(y_val, y_val_pred)
        rmse = float(np.sqrt(mse))

        results[model_name] = rmse

        rmse_rounded = round(rmse, 2)
        print(f"RMSE del Modelo {model_name} es {rmse_rounded}")
        logging.info(f"RMSE del Modelo {model_name}: {rmse_rounded}, ENTRENAMIENTO")

    except Exception as ex:
        # Best effort: a failing candidate is reported and skipped so the
        # remaining ones are still evaluated.
        print(f"Error en modelo {model_name}: {ex}")
        logging.error(f"Error en entrenamiento del modelo {model_name}: {ex}, ENTRENAMIENTO")

end = time.perf_counter()
elapsed_time = round(end - start, 2)
logging.info(f"Tiempo de Entrenamiento: {elapsed_time}, ENTRENAMIENTO")
print(f"\nTiempo total de entrenamiento: {elapsed_time} segundos")



Total registros: 913000
Train: (730400, 5), Val: (182600, 5)
Shape X_train_proc: (730400, 5)
Shape X_val_proc: (182600, 5)

Entrenando modelo: LinearRegression_1...
RMSE del Modelo LinearRegression_1 es 30.05

Entrenando modelo: RandomForest_1...
RMSE del Modelo RandomForest_1 es 27.48

Entrenando modelo: GradientBoosting_1...
RMSE del Modelo GradientBoosting_1 es 27.13

Tiempo total de entrenamiento: 63.92 segundos


# 4. Importamos el pipeline de preprocesamiento y agregamos el modelo ganador


In [None]:
# Select the candidate with the lowest validation RMSE

# If every candidate failed to train, `results` is empty and min() would raise
# an opaque "min() arg is an empty sequence". Fail with an explicit message
# instead (still a ValueError).
if not results:
    empty_results_msg = (
        "Ningún modelo se entrenó correctamente: 'results' está vacío. "
        "Revise los errores de entrenamiento en system.log"
    )
    logging.error(empty_results_msg)
    raise ValueError(empty_results_msg)

best_model_name = min(results, key=results.get)
best_rmse = results[best_model_name]

print(f"Mejor modelo: {best_model_name} con RMSE = {best_rmse}")
logging.info(f"Mejor modelo seleccionado: {best_model_name} con RMSE = {best_rmse}")

# This is the estimator instance stored in models_configurations, not a copy.
best_model_config = models_configurations[best_model_name]

# Reload the preprocessing pipeline from disk (joblib.load unpickles the file,
# so it must come from a trusted source).
sales_preproc_pipeline = joblib.load("../models/feature_engineering_pipeline.pkl")

# Append the winning model as the final "regressor" step of the pipeline
model_step = ("regressor", best_model_config)
sales_preproc_pipeline.steps.append(model_step)

sales_preproc_pipeline


Mejor modelo: GradientBoosting_1 con RMSE = 27.13


0,1,2
,steps,"[('cat_missing_imputation', ...), ('num_median_imputation', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,variables,"['store', 'item']"
,fill_value,'Missing'

0,1,2
,imputation_method,'median'
,variables,"['year', 'month']"

0,1,2
,encoding_method,'frequency'
,variables,"['store', 'item']"
,missing_values,'raise'
,ignore_format,False
,unseen,'ignore'

0,1,2
,mappings,"{'Friday': 4, 'Monday': 0, 'Saturday': 5, 'Sunday': 6, ...}"
,variables,['day_of_week_name']

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,loss,'squared_error'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


# 5. Reentrenamos el pipeline con el modelo ganador


In [None]:
# Reload the complete dataset (no hold-out) and rebuild the model features

dataset = (
    pd.read_csv("../data/raw/train.csv")
    # Later keyword arguments of assign() can read the columns set by earlier
    # ones, so the calendar features use the already-parsed date.
    .assign(
        date=lambda frame: pd.to_datetime(frame["date"]),
        year=lambda frame: frame["date"].dt.year,
        month=lambda frame: frame["date"].dt.month,
        day_of_week_name=lambda frame: frame["date"].dt.day_name(),
    )
    # Categorical inputs are handed to the pipeline as object dtype
    .astype({"store": "O", "item": "O", "day_of_week_name": "O"})
)

X_full = dataset[FEATURES].copy()
y_full = dataset[TARGET].copy()

# Fit preprocessing and regressor together on every available row
sales_preproc_pipeline.fit(X_full, y_full)

print("Pipeline final entrenado correctamente.")


  X[var] = X[var].fillna(self.fill_value).astype(str)
  X[var] = X[var].fillna(self.fill_value).astype(str)


Pipeline final entrenado correctamente.


In [8]:
# Persist the complete pipeline (preprocessing steps plus the appended
# regressor) and report the save on stdout and in the log.
# NOTE(review): the pipeline appears to contain SimpleCategoricalImputer, which
# is defined inside this notebook; code that loads this file will presumably
# need that class importable under the same module path — confirm.
saved_msg = "Pipeline completo guardado en ../models/sales_pipeline.pkl"

joblib.dump(sales_preproc_pipeline, "../models/sales_pipeline.pkl")

print(saved_msg)
logging.info(saved_msg)


Pipeline completo guardado en ../models/sales_pipeline.pkl
