In [2]:
lags_elegidos = [1, 7, 14, 28]
# Lista de procesos y la operación aplicada (código seguido de criterio):

# i) Imputación de variables numéricas (sales & lag features)
num_imputer = SimpleImputer(strategy='median')

num_cols = ['sales'] + [f'sales_lag_{lag}' for lag in lags_elegidos] + ['roll_mean_7','roll_mean_28']

# Imputación global como ejemplo (en producción: por grupo sería mejor)
df[num_cols] = num_imputer.fit_transform(df[num_cols])

# ii) Imputación de variables categóricas (store)
df['store'] = df['store'].cat.add_categories(['unknown']).fillna('unknown')

# iii) Codificación de variables categóricas
item_freq = df['item'].value_counts(normalize=True)
df['item_freq_enc'] = df['item'].map(item_freq)

# One-hot para store verificar si se cambia por otro tipo de categorización
top_stores = df['store'].value_counts().nlargest(10).index.tolist()
for s in top_stores:
    df[f'store_is_{s}'] = (df['store'] == s).astype(int)

# iv) Tratamiento de outliers (sales)
Q1 = df['sales'].quantile(0.25)
Q3 = df['sales'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

df['sales_clipped'] = df['sales'].clip(lower=lower, upper=upper)

# v) Transformación de variables numéricas
df['sales_log1p'] = np.log1p(df['sales_clipped'])

# vi) Escalado de características
scale_cols = ['item_freq_enc','sales_lag_1','sales_lag_7','roll_mean_7','sales_log1p']
scaler = StandardScaler()
df[scale_cols] = scaler.fit_transform(df[scale_cols])

print('Transformaciones aplicadas. Muestra:')
print(df[scale_cols + ['sales']].head())

# Guardar dataset transformado para siguiente etapa (pipeline/operadores)
out_cols = ['date','store','item','sales'] + [f'sales_lag_{lag}' for lag in lags_elegidos] + ['roll_mean_7','roll_mean_28','item_freq_enc','sales_clipped','sales_log1p'] + [f'store_is_{s}' for s in top_stores]

df_out = df[out_cols].copy()
df_out.to_parquet('train_feature_exploration.parquet', index=False)
print('\nGuardado: train_feature_exploration.parquet con shape', df_out.shape)

NameError: name 'SimpleImputer' is not defined

In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from feature_engine.imputation import MeanMedianImputer
from feature_engine.encoding import CountFrequencyEncoder

from sklearn.base import BaseEstimator, TransformerMixin

import operators  


import joblib 

ModuleNotFoundError: No module named 'operators'

In [None]:
data_train = pd.read_csv('../data/raw/train.csv', parse_dates=['date'])

data_train.head()

In [None]:
class SimpleCategoricalImputer(BaseEstimator, TransformerMixin):
    """
    Imputador categórico sencillo:
    - Rellena NaNs con un string (por defecto 'Missing').
    - No revisa tipos, solo hace fillna().
    """
    def __init__(self, variables, fill_value="Missing"):
        self.variables = variables
        self.fill_value = fill_value

    def fit(self, X, y=None):
        # no aprende nada, solo dejamos el objeto listo
        return self

    def transform(self, X):
        X = X.copy()
        for var in self.variables:
            if var in X.columns:
                X[var] = X[var].fillna(self.fill_value).astype(str)
        return X


In [None]:
# 3. Crear variables de calendario a partir de `date`
data_train['date'] = pd.to_datetime(data_train['date'])

data_train['year']  = data_train['date'].dt.year
data_train['month'] = data_train['date'].dt.month
data_train['day_of_week_name'] = data_train['date'].dt.day_name()

# tratamos store e item como categóricas
data_train['store'] = data_train['store'].astype('O')
data_train['item']  = data_train['item'].astype('O')

data_train.head()


In [None]:
# 3. Definición de variables

TARGET = "sales"

FEATURES = ["store", "item", "year", "month", "day_of_week_name"]

# categóricas
CATEGORICAL_VARS = ["store", "item", "day_of_week_name"]
CATEGORICAL_VARS_IMPUTE = ["store", "item"]   # para imputador categórico
CATEGORICAL_VARS_FREQ   = ["store", "item"]   # para CountFrequencyEncoder

# numéricas
NUMERICAL_VARS = ["year", "month"]

# mapping para día de la semana -> número
DAY_OF_WEEK_MAPPING = {
    "Monday": 0,
    "Tuesday": 1,
    "Wednesday": 2,
    "Thursday": 3,
    "Friday": 4,
    "Saturday": 5,
    "Sunday": 6
}

# Aseguramos tipos categóricos (similar al ejemplo de MSSubClass, GarageCars, etc.)
for col in CATEGORICAL_VARS:
    data_train[col] = data_train[col].astype("O")


In [None]:
# 4. Train-Test Split

X = data_train[FEATURES].copy()
y = data_train[TARGET].copy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=2025
)

X_train.head()


In [None]:
# 5. Construcción del Pipeline de ingeniería de características

sales_feature_pipeline = Pipeline([

    # 1. Imputación de variables categóricas (store, item)
    ("cat_missing_imputation", SimpleCategoricalImputer(
        variables=CATEGORICAL_VARS_IMPUTE,
        fill_value="Missing"
    )),

    # 2. Imputación de variables numéricas (year, month) con mediana
    ("num_median_imputation", MeanMedianImputer(
        imputation_method="median",
        variables=NUMERICAL_VARS
    )),

    # 3. Codificación de variables categóricas store e item por frecuencia
    ("cat_freq_encoder", CountFrequencyEncoder(
        encoding_method="frequency",
        variables=CATEGORICAL_VARS_FREQ
    )),

    # 4. Mapeo de variable categórica ordinal (día de la semana) con Mapper
    ("dayofweek_mapper", operators.Mapper(
        mappings=DAY_OF_WEEK_MAPPING,
        variables=["day_of_week_name"]
    )),

    # 5. Normalización de variables (MinMaxScaler)
    ("feature_scaler", MinMaxScaler())

])

sales_feature_pipeline


In [None]:
# 6. Aplicamos preprocesamiento a los datos

# Ajustar pipeline
sales_feature_pipeline.fit(X_train, y_train)

# Transformar datos de entrenamiento
X_train_transformed = sales_feature_pipeline.transform(X_train)

# Lo pasamos a DataFrame solo para inspección / guardado
X_train_transformed_df = pd.DataFrame(
    X_train_transformed,
    columns=FEATURES
)

X_train_transformed_df.head()


In [None]:
# 7. Guardar datos procesados y pipeline

# Guardamos datos transformados
os.makedirs("../data/interim", exist_ok=True)
preproc_train_path = "../data/interim/preproc_train.csv"
Xy_train_preproc = pd.concat(
    [X_train_transformed_df.reset_index(drop=True),
     y_train.reset_index(drop=True)],
    axis=1
)
Xy_train_preproc.to_csv(preproc_train_path, index=False)

print(f"Datos de entrenamiento preprocesados guardados en: {preproc_train_path}")


In [None]:
pipeline_path = "../models/feature_engineering_pipeline.pkl"
joblib.dump(sales_feature_pipeline, pipeline_path)

print(f"Pipeline de ingeniería de características guardado en: {pipeline_path}")
