# Feature engineering

## lag features 

In [1]:
from pathlib import Path

import pandas as pd

# Load the training table, the test table and the business-unit feature table
train_df = pd.read_parquet('../data/train.parquet')
test_df = pd.read_parquet('../data/test.parquet')
bu_feat_df = pd.read_parquet('../data/bu_feat.parquet')

# Merge train_df / test_df with bu_feat_df for additional store information.
# validate='many_to_one' makes pandas raise a MergeError if bu_feat_df holds
# more than one row per business unit; without it, duplicate keys would
# silently multiply the sales rows during the left join.
train_df = train_df.merge(
    bu_feat_df, on='but_num_business_unit', how='left', validate='many_to_one'
)
test_df = test_df.merge(
    bu_feat_df, on='but_num_business_unit', how='left', validate='many_to_one'
)

In [2]:
# Parse the raw 'day_id' values into a datetime 'date' column on both frames
train_df['date'], test_df['date'] = map(
    pd.to_datetime, (train_df['day_id'], test_df['day_id'])
)

In [3]:
# Preview the first five rows of the merged training frame
train_df.head(5)

Unnamed: 0,day_id,but_num_business_unit,dpt_num_department,turnover,but_postcode,but_latitude,but_longitude,but_region_idr_region,zod_idr_zone_dgr,date
0,2017-09-30,64,127,580.308443,16400,45.625172,0.111939,70,10,2017-09-30
1,2017-09-30,119,127,1512.995918,74100,46.195037,6.254448,51,4,2017-09-30
2,2017-09-30,4,88,668.593556,6600,43.600994,7.07816,55,10,2017-09-30
3,2017-09-30,425,127,0.0,59000,50.617921,3.084186,33,3,2017-09-30
4,2017-09-30,513,73,0.0,33610,44.717366,-0.733429,33,3,2017-09-30


# Date feature function

In [4]:
def create_datetime_features(df, datetime_col):
    """Add calendar columns derived from a datetime column.

    The columns 'year', 'month', 'week', 'day', 'dayofweek' and 'is_weekend'
    are written onto ``df`` itself (the frame is modified in place) and the
    same object is returned for convenience.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the datetime column; receives the new columns.
    datetime_col : str
        Name of a datetime-typed column in ``df``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    'week' is the ISO calendar week, so dates around New Year can carry a
    week number that belongs to the neighbouring ISO year.
    """
    # (output column, extractor) pairs, applied in this order so the new
    # columns are appended in a stable sequence.
    extractors = (
        ('year', lambda stamps: stamps.dt.year),
        ('month', lambda stamps: stamps.dt.month),
        ('week', lambda stamps: stamps.dt.isocalendar().week),
        ('day', lambda stamps: stamps.dt.day),
        ('dayofweek', lambda stamps: stamps.dt.dayofweek),
    )
    for feature_name, extract in extractors:
        df[feature_name] = extract(df[datetime_col])

    # dayofweek counts from Monday=0, so 5 and 6 are Saturday and Sunday
    weekend_mask = df['dayofweek'].isin([5, 6])
    df['is_weekend'] = weekend_mask.astype(int)
    return df

In [5]:
# Derive the calendar columns from 'date' on the training frame, then the test frame
train_df, test_df = (
    create_datetime_features(frame, 'date') for frame in (train_df, test_df)
)

In [6]:
import numpy as np

def create_cyclical_features(df, col, max_val):
    """Encode a periodic column as a sine/cosine pair.

    Writes ``'<col>_sin'`` and ``'<col>_cos'`` onto ``df`` (modified in place)
    and returns the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to encode.
    col : str
        Name of the numeric column to encode.
    max_val : number
        Length of one full cycle; ``col`` is divided by this value before
        being mapped onto the unit circle.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Angle on the unit circle, computed once and shared by both projections
    angle = 2 * np.pi * df[col] / max_val
    df[f'{col}_sin'] = np.sin(angle)
    df[f'{col}_cos'] = np.cos(angle)
    return df

In [7]:
# Encode the day of the week (period 7) as sine/cosine on both frames
train_df, test_df = [
    create_cyclical_features(frame, 'dayofweek', 7)
    for frame in (train_df, test_df)
]

In [8]:
def create_interaction_features(df, bu_col, dept_col):
    """Add a combined business-unit/department key column.

    Both columns are converted to strings and joined with an underscore into
    the new column 'bu_depart_interaction'. ``df`` is modified in place and
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both identifier columns.
    bu_col : str
        Name of the business-unit column.
    dept_col : str
        Name of the department column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    bu_as_text = df[bu_col].astype(str)
    dept_as_text = df[dept_col].astype(str)
    df['bu_depart_interaction'] = bu_as_text + '_' + dept_as_text
    return df

In [9]:
# Build the combined business-unit/department key on both frames
train_df = create_interaction_features(
    train_df,
    bu_col='but_num_business_unit',
    dept_col='dpt_num_department',
)
test_df = create_interaction_features(
    test_df,
    bu_col='but_num_business_unit',
    dept_col='dpt_num_department',
)

In [10]:
# Drop every training row that has a missing value in any column.
# Rebinding the name instead of using inplace=True keeps the step explicit and
# avoids mutating a frame object that other names may still reference.
# NOTE(review): test_df is not filtered here - confirm that missing values in
# the test frame are handled downstream.
train_df = train_df.dropna()

In [17]:
# Separate features and target
X_train = train_df.drop(columns=['turnover'])
y_train = train_df['turnover']

# Process the test set (used as-is; nothing is split off here)
X_test = test_df

# Save processed data to csv for reproducibility.
# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists before writing.
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

X_train.to_csv(processed_dir / 'train_processed.csv', index=False)
X_test.to_csv(processed_dir / 'test_processed.csv', index=False)

# The feature file above no longer contains 'turnover', and rows may have been
# dropped since loading, so persist the row-aligned target next to it.
y_train.to_csv(processed_dir / 'train_target.csv', index=False)

# Feature engineering complete - ready for model training

In [15]:
# Display the engineered training features (pandas abbreviates the middle rows of a long frame)
X_train

Unnamed: 0,day_id,but_num_business_unit,dpt_num_department,but_postcode,but_latitude,but_longitude,but_region_idr_region,zod_idr_zone_dgr,date,year,month,week,day,dayofweek,is_weekend,dayofweek_sin,dayofweek_cos,bu_depart_interaction
0,2017-09-30,64,127,16400,45.625172,0.111939,70,10,2017-09-30,2017,9,39,30,5,1,-0.974928,-0.222521,64_127
1,2017-09-30,119,127,74100,46.195037,6.254448,51,4,2017-09-30,2017,9,39,30,5,1,-0.974928,-0.222521,119_127
2,2017-09-30,4,88,6600,43.600994,7.078160,55,10,2017-09-30,2017,9,39,30,5,1,-0.974928,-0.222521,4_88
3,2017-09-30,425,127,59000,50.617921,3.084186,33,3,2017-09-30,2017,9,39,30,5,1,-0.974928,-0.222521,425_127
4,2017-09-30,513,73,33610,44.717366,-0.733429,33,3,2017-09-30,2017,9,39,30,5,1,-0.974928,-0.222521,513_73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277714,2012-12-29,131,73,69760,45.822363,4.767741,8,4,2012-12-29,2012,12,52,29,5,1,-0.974928,-0.222521,131_73
277715,2012-12-29,237,127,13500,43.430995,5.047929,71,10,2012-12-29,2012,12,52,29,5,1,-0.974928,-0.222521,237_127
277716,2012-12-29,129,117,14124,49.154936,-0.287441,30,6,2012-12-29,2012,12,52,29,5,1,-0.974928,-0.222521,129_117
277717,2012-12-29,468,127,94320,48.757857,2.385381,75,6,2012-12-29,2012,12,52,29,5,1,-0.974928,-0.222521,468_127
