lags_elegidos = [1, 7, 14, 28]
# Lista de procesos y la operación aplicada (código seguido de criterio):

# i) Imputación de variables numéricas (sales & lag features)
num_imputer = SimpleImputer(strategy='median')

num_cols = ['sales'] + [f'sales_lag_{lag}' for lag in lags_elegidos] + ['roll_mean_7','roll_mean_28']

# Imputación global como ejemplo (en producción: por grupo sería mejor)
df[num_cols] = num_imputer.fit_transform(df[num_cols])

# ii) Imputación de variables categóricas (store)
df['store'] = df['store'].cat.add_categories(['unknown']).fillna('unknown')

# iii) Codificación de variables categóricas
item_freq = df['item'].value_counts(normalize=True)
df['item_freq_enc'] = df['item'].map(item_freq)

# One-hot para store verificar si se cambia por otro tipo de categorización
top_stores = df['store'].value_counts().nlargest(10).index.tolist()
for s in top_stores:
    df[f'store_is_{s}'] = (df['store'] == s).astype(int)

# iv) Tratamiento de outliers (sales)
Q1 = df['sales'].quantile(0.25)
Q3 = df['sales'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

df['sales_clipped'] = df['sales'].clip(lower=lower, upper=upper)

# v) Transformación de variables numéricas
df['sales_log1p'] = np.log1p(df['sales_clipped'])

# vi) Escalado de características
scale_cols = ['item_freq_enc','sales_lag_1','sales_lag_7','roll_mean_7','sales_log1p']
scaler = StandardScaler()
df[scale_cols] = scaler.fit_transform(df[scale_cols])

print('Transformaciones aplicadas. Muestra:')
print(df[scale_cols + ['sales']].head())

# Guardar dataset transformado para siguiente etapa (pipeline/operadores)
out_cols = ['date','store','item','sales'] + [f'sales_lag_{lag}' for lag in lags_elegidos] + ['roll_mean_7','roll_mean_28','item_freq_enc','sales_clipped','sales_log1p'] + [f'store_is_{s}' for s in top_stores]

df_out = df[out_cols].copy()
df_out.to_parquet('train_feature_exploration.parquet', index=False)
print('\nGuardado: train_feature_exploration.parquet con shape', df_out.shape)

In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from feature_engine.imputation import MeanMedianImputer
from feature_engine.encoding import CountFrequencyEncoder

from sklearn.base import BaseEstimator, TransformerMixin

import operators  


import joblib 

In [2]:
data_train = pd.read_csv('../data/raw/train.csv', parse_dates=['date'])

data_train.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [None]:
class SimpleCategoricalImputer(BaseEstimator, TransformerMixin):
  
    def __init__(self, variables, fill_value="Missing"):
        self.variables = variables
        self.fill_value = fill_value

    def fit(self, X, y=None):
       
        return self

    def transform(self, X):
        X = X.copy()
        for var in self.variables:
            if var in X.columns:
                X[var] = X[var].fillna(self.fill_value).astype(str)
        return X


In [None]:

data_train['date'] = pd.to_datetime(data_train['date'])

data_train['year']  = data_train['date'].dt.year
data_train['month'] = data_train['date'].dt.month
data_train['day_of_week_name'] = data_train['date'].dt.day_name()

# categóricas
data_train['store'] = data_train['store'].astype('O')
data_train['item']  = data_train['item'].astype('O')

data_train.head()


Unnamed: 0,date,store,item,sales,year,month,day_of_week_name
0,2013-01-01,1,1,13,2013,1,Tuesday
1,2013-01-02,1,1,11,2013,1,Wednesday
2,2013-01-03,1,1,14,2013,1,Thursday
3,2013-01-04,1,1,13,2013,1,Friday
4,2013-01-05,1,1,10,2013,1,Saturday


In [None]:
# 3. Definición de variables

TARGET = "sales"

FEATURES = ["store", "item", "year", "month", "day_of_week_name"]

# categóricas
CATEGORICAL_VARS = ["store", "item", "day_of_week_name"]
CATEGORICAL_VARS_IMPUTE = ["store", "item"]   
CATEGORICAL_VARS_FREQ   = ["store", "item"]   

# numéricas
NUMERICAL_VARS = ["year", "month"]


DAY_OF_WEEK_MAPPING = {
    "Monday": 0,
    "Tuesday": 1,
    "Wednesday": 2,
    "Thursday": 3,
    "Friday": 4,
    "Saturday": 5,
    "Sunday": 6
}


for col in CATEGORICAL_VARS:
    data_train[col] = data_train[col].astype("O")


In [6]:
# 4. Train-Test Split

X = data_train[FEATURES].copy()
y = data_train[TARGET].copy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=2025
)

X_train.head()


Unnamed: 0,store,item,year,month,day_of_week_name
686013,6,38,2016,6,Friday
706742,8,39,2013,3,Friday
641860,2,36,2015,7,Friday
186225,2,11,2017,12,Tuesday
462179,4,26,2013,7,Sunday


In [None]:
# 5. Construccin del Pipeline 

sales_feature_pipeline = Pipeline([

    #Imputación de variables categóricas
    ("cat_missing_imputation", SimpleCategoricalImputer(
        variables=CATEGORICAL_VARS_IMPUTE,
        fill_value="Missing"
    )),

    #Imputación de variables numéricas 
    ("num_median_imputation", MeanMedianImputer(
        imputation_method="median",
        variables=NUMERICAL_VARS
    )),

    #Codificación de variables categóricas 
    ("cat_freq_encoder", CountFrequencyEncoder(
        encoding_method="frequency",
        variables=CATEGORICAL_VARS_FREQ
    )),

    #Mapeo de variable categórica 
    ("dayofweek_mapper", operators.Mapper(
        mappings=DAY_OF_WEEK_MAPPING,
        variables=["day_of_week_name"]
    )),

    #Normalización de variables (MinMaxScaler)
    ("feature_scaler", MinMaxScaler())

])

sales_feature_pipeline


0,1,2
,steps,"[('cat_missing_imputation', ...), ('num_median_imputation', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,variables,"['store', 'item']"
,fill_value,'Missing'

0,1,2
,imputation_method,'median'
,variables,"['year', 'month']"

0,1,2
,encoding_method,'frequency'
,variables,"['store', 'item']"
,missing_values,'raise'
,ignore_format,False
,unseen,'ignore'

0,1,2
,mappings,"{'Friday': 4, 'Monday': 0, 'Saturday': 5, 'Sunday': 6, ...}"
,variables,['day_of_week_name']

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [None]:
# preprocesamiento a los datos


sales_feature_pipeline.fit(X_train, y_train)


X_train_transformed = sales_feature_pipeline.transform(X_train)

X_train_transformed_df = pd.DataFrame(
    X_train_transformed,
    columns=FEATURES
)

X_train_transformed_df.head()


  X[var] = X[var].fillna(self.fill_value).astype(str)
  X[var] = X[var].fillna(self.fill_value).astype(str)
  X[var] = X[var].fillna(self.fill_value).astype(str)
  X[var] = X[var].fillna(self.fill_value).astype(str)


Unnamed: 0,store,item,year,month,day_of_week_name
0,0.610209,0.539683,0.75,0.454545,0.666667
1,0.053364,0.48254,0.0,0.181818,0.666667
2,0.259861,0.0,0.5,0.545455,0.666667
3,0.259861,0.425397,1.0,1.0,0.166667
4,0.529002,0.4,0.0,0.545455,1.0


In [None]:
preproc_train_path = "../data/processed/preproc_train.csv"
Xy_train_preproc = pd.concat(
    [X_train_transformed_df.reset_index(drop=True),
     y_train.reset_index(drop=True)],
    axis=1
)
import os
os.makedirs("../data/processed", exist_ok=True)
Xy_train_preproc.to_csv(preproc_train_path, index=False)

print(f"Datos de entrenamiento preprocesados guardados en: {preproc_train_path}")

Datos de entrenamiento preprocesados guardados en: ../data/raw/preproc_train.csv


In [10]:
pipeline_path = "../models/feature_engineering_pipeline.pkl"
joblib.dump(sales_feature_pipeline, pipeline_path)

print(f"Pipeline de ingeniería de características guardado en: {pipeline_path}")


Pipeline de ingeniería de características guardado en: ../models/feature_engineering_pipeline.pkl
