# Feature Creation Notebook

Este notebook demuestra el proceso de ingeniería de características utilizando los módulos del paquete `product_development`.

## Proceso de Feature Engineering

1. **Imputación de variables categóricas**: Rellenar valores nulos con 'Missing'
2. **Imputación de variables numéricas**: Rellenar valores nulos con la mediana, que es robusta frente a valores atípicos
3. **Codificación de frecuencia**: Para variables categóricas de alta cardinalidad
4. **Mapeo de día de la semana**: Convertir nombres a valores numéricos
5. **Escalado**: MinMaxScaler para normalización

In [1]:
# Importaciones
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import os
import joblib

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from feature_engine.imputation import MeanMedianImputer
from feature_engine.encoding import CountFrequencyEncoder

# Importar módulos del paquete product_development
from product_development.config import (
    TARGET, FEATURES, CATEGORICAL_VARS, CATEGORICAL_VARS_IMPUTE,
    CATEGORICAL_VARS_FREQ, NUMERICAL_VARS, DAY_OF_WEEK_MAPPING,
    RAW_DATA_DIR, MODELS_DIR, PROCESSED_DATA_DIR
)
from product_development.transformers import Mapper, SimpleCategoricalImputer
from product_development.features import build_feature_pipeline, save_feature_pipeline

[32m2025-11-29 12:28:57.409[0m | [1mINFO    [0m | [36mproduct_development.config[0m:[36m<module>[0m:[36m17[0m - [1mPROJ_ROOT path is: C:\Users\fjgon\OneDrive - Universidad Galileo\Trimestre 8\product_development[0m


In [2]:
# 1. Load the training data
# The 'date' column is parsed into datetimes while the CSV is read.
train_csv_path = RAW_DATA_DIR / 'train.csv'
data_train = pd.read_csv(train_csv_path, parse_dates=['date'])

print(f"Dataset cargado: {data_train.shape}")
data_train.head()

Dataset cargado: (913000, 4)


Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [3]:
# 2. Prepare the data: derive calendar features and cast column types
# Normalise the date column once and reuse it for every derived feature.
dates = pd.to_datetime(data_train['date'])
data_train = data_train.assign(
    date=dates,
    year=dates.dt.year,
    month=dates.dt.month,
    day_of_week_name=dates.dt.day_name(),
)

# Cast every variable listed in CATEGORICAL_VARS to object dtype
data_train = data_train.astype({col: 'O' for col in CATEGORICAL_VARS})

print(f"Features utilizadas: {FEATURES}")
print(f"Target: {TARGET}")
data_train.head()

Features utilizadas: ['store', 'item', 'year', 'month', 'day_of_week_name']
Target: sales


Unnamed: 0,date,store,item,sales,year,month,day_of_week_name
0,2013-01-01,1,1,13,2013,1,Tuesday
1,2013-01-02,1,1,11,2013,1,Wednesday
2,2013-01-03,1,1,14,2013,1,Thursday
3,2013-01-04,1,1,13,2013,1,Friday
4,2013-01-05,1,1,10,2013,1,Saturday


In [4]:
# 3. Train/test split
# Separate predictors and target (as independent copies) before splitting.
X, y = data_train[FEATURES].copy(), data_train[TARGET].copy()

# 30% of the rows are held out; the shuffle is seeded so the split is repeatable.
split_options = dict(test_size=0.3, shuffle=True, random_state=2025)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Train size: {X_train.shape}, Test size: {X_test.shape}")
X_train.head()

Train size: (639100, 5), Test size: (273900, 5)


Unnamed: 0,store,item,year,month,day_of_week_name
686013,6,38,2016,6,Friday
706742,8,39,2013,3,Friday
641860,2,36,2015,7,Friday
186225,2,11,2017,12,Tuesday
462179,4,26,2013,7,Sunday


In [5]:
# 4. Build the pipeline with the package's modules
# The pipeline object comes from build_feature_pipeline in the features module.
sales_feature_pipeline = build_feature_pipeline()

# Summarise the pipeline: one line per step with its name and class
step_summaries = [
    f"  - {step_name}: {type(step).__name__}"
    for step_name, step in sales_feature_pipeline.steps
]
print("Pipeline de Feature Engineering:", *step_summaries, sep="\n")

sales_feature_pipeline

[32m2025-11-29 12:28:58.076[0m | [1mINFO    [0m | [36mproduct_development.features[0m:[36mbuild_feature_pipeline[0m:[36m48[0m - [1mConstruyendo pipeline de ingeniería de características[0m
[32m2025-11-29 12:28:58.076[0m | [1mINFO    [0m | [36mproduct_development.features[0m:[36mbuild_feature_pipeline[0m:[36m75[0m - [1mPipeline de características construido exitosamente[0m
Pipeline de Feature Engineering:
  - cat_missing_imputation: SimpleCategoricalImputer
  - num_median_imputation: MeanMedianImputer
  - cat_freq_encoder: CountFrequencyEncoder
  - dayofweek_mapper: Mapper
  - feature_scaler: MinMaxScaler


0,1,2
,steps,"[('cat_missing_imputation', ...), ('num_median_imputation', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,variables,"['store', 'item']"
,fill_value,'Missing'

0,1,2
,imputation_method,'median'
,variables,"['year', 'month']"

0,1,2
,encoding_method,'frequency'
,variables,"['store', 'item']"
,missing_values,'raise'
,ignore_format,False
,unseen,'ignore'

0,1,2
,mappings,"{'Friday': 4, 'Monday': 0, 'Saturday': 5, 'Sunday': 6, ...}"
,variables,['day_of_week_name']

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [6]:
# 5. Fit the pipeline on the training split, then transform that same split
fitted_pipeline = sales_feature_pipeline.fit(X_train, y_train)
X_train_transformed = fitted_pipeline.transform(X_train)

# Wrap the transformed values in a DataFrame labelled with the FEATURES names.
X_train_transformed_df = pd.DataFrame(X_train_transformed, columns=FEATURES)

print(f"Datos transformados: {X_train_transformed_df.shape}")
X_train_transformed_df.head()

  X[var] = X[var].fillna(self.fill_value).astype(str)
  X[var] = X[var].fillna(self.fill_value).astype(str)
  X[var] = X[var].fillna(self.fill_value).astype(str)
  X[var] = X[var].fillna(self.fill_value).astype(str)


Datos transformados: (639100, 5)


Unnamed: 0,store,item,year,month,day_of_week_name
0,0.610209,0.539683,0.75,0.454545,0.666667
1,0.053364,0.48254,0.0,0.181818,0.666667
2,0.259861,0.0,0.5,0.545455,0.666667
3,0.259861,0.425397,1.0,1.0,0.166667
4,0.529002,0.4,0.0,0.545455,1.0


In [7]:
# 6. Save the preprocessed training data
preproc_train_path = PROCESSED_DATA_DIR / "preproc_train.csv"

# Both parts get a fresh 0..n-1 index so they line up row by row when joined.
features_part = X_train_transformed_df.reset_index(drop=True)
target_part = y_train.reset_index(drop=True)
Xy_train_preproc = pd.concat([features_part, target_part], axis=1)

# Create the output directory if needed, then write the CSV without the index.
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)
Xy_train_preproc.to_csv(preproc_train_path, index=False)

print(f"Datos de entrenamiento preprocesados guardados en: {preproc_train_path}")

Datos de entrenamiento preprocesados guardados en: C:\Users\fjgon\OneDrive - Universidad Galileo\Trimestre 8\product_development\data\processed\preproc_train.csv


In [8]:
# 7. Save the pipeline through the package helper
save_feature_pipeline(sales_feature_pipeline)

# NOTE(review): this path is rebuilt here, not returned by save_feature_pipeline;
# confirm it matches the location the helper actually writes to.
pipeline_path = MODELS_DIR / 'feature_engineering_pipeline.pkl'
print(f"Pipeline de ingeniería de características guardado en: {pipeline_path}")

[32m2025-11-29 12:29:02.573[0m | [1mINFO    [0m | [36mproduct_development.features[0m:[36msave_feature_pipeline[0m:[36m147[0m - [1mPipeline de características guardado en C:\Users\fjgon\OneDrive - Universidad Galileo\Trimestre 8\product_development\models\feature_engineering_pipeline.pkl[0m
Pipeline de ingeniería de características guardado en: C:\Users\fjgon\OneDrive - Universidad Galileo\Trimestre 8\product_development\models\feature_engineering_pipeline.pkl
