# Feature Selection for Gradient Boosting Model

In [8]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    mean_absolute_percentage_error,
    explained_variance_score
)

In [9]:
DATA_PATH = 'Data_for_taining_18072025.csv'
TARGET = 'cogs_eur'
FEATURES = [
'aov_eur',
'available_stock_value_after_discount_complete_eur',
'avg_temp',
'cpc',
'cr_tracked_%',
'email_recipients',
'email_visits',
'internalWeeks_until_SeasonalSaleStart',
'internal_Week_of_FW_Season',
'internal_Week_of_SS_Season',
'is_Peak_Driving_Public_Holiday_week',
'is_Sun_to_Mon_Shift_week',
'is_black_week_event',
'is_email_campaign_type_deal',
'is_email_campaign_type_liveshop',
'is_email_campaign_type_newsletter',
'is_percentage_on_top',
'is_percentage_on_top_applicable',
'is_season_sale_event',
'is_temp_drop_flag',
'number_days_after_last_event',
'number_days_till_next_event',
'number_orders',
'number_visits',
'sku_with_discount_%',
'stock_discount_rate_total_%',
'target_cpr'
]
df = pd.read_csv(DATA_PATH)
df.rename(columns={'Unnamed: 0': 'date'}, inplace=True)
df['date'] = pd.to_datetime(df['date'])
df.sort_values('date', inplace=True)
df.set_index('date', inplace=True)
df = df.iloc[1:-1]  # drop first and last week

number_of_weeks_to_be_consider = 78
val = df.iloc[-8:]
val_start = val.index[0]
train_1y = df.loc[val_start - pd.DateOffset(weeks=number_of_weeks_to_be_consider) : val_start - pd.DateOffset(weeks=1)]

X_train = train_1y[FEATURES]
y_train = train_1y[TARGET]
X_valid = val[FEATURES]
y_valid = val[TARGET]

print('Training data range', X_train.index.min(), X_train.index.max())
print('Validation data range', X_valid.index.min(), X_valid.index.max())

Training data range 2023-11-06 00:00:00 2025-04-28 00:00:00
Validation data range 2025-05-05 00:00:00 2025-06-23 00:00:00


## Forward Feature Selection

In [10]:
# --- feature selection setup ---
from tabnanny import verbose


base_model = GradientBoostingRegressor(random_state=42, min_samples_split=0.04, n_estimators= 300, max_depth= 7, learning_rate=0.2)

# forward selection: add features one at a time
sfs_forward = SequentialFeatureSelector(
    base_model,
    n_features_to_select='auto',        # let it choose until score stops improving
    direction='forward',
    scoring='neg_mean_squared_error',
    cv=5,                               # 5‑fold CV inside the selector
    n_jobs=-1
)
sfs_forward.fit(X_train, y_train)
selected_forward = list(X_train.columns[sfs_forward.get_support()])
print("Forward‑selected features:", selected_forward)

Forward‑selected features: ['aov_eur', 'cpc', 'internalWeeks_until_SeasonalSaleStart', 'internal_Week_of_FW_Season', 'is_Peak_Driving_Public_Holiday_week', 'is_Sun_to_Mon_Shift_week', 'is_email_campaign_type_deal', 'is_email_campaign_type_liveshop', 'is_percentage_on_top_applicable', 'is_season_sale_event', 'is_temp_drop_flag', 'number_orders', 'sku_with_discount_%']


## Backward Feature Selection

In [11]:
# backward selection: remove features one at a time
sfs_backward = SequentialFeatureSelector(
    base_model,
    n_features_to_select='auto',
    direction='backward',
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1
)
sfs_backward.fit(X_train, y_train)
selected_backward = list(X_train.columns[sfs_backward.get_support()])
print("Backward‑selected features:", selected_backward)

Backward‑selected features: ['aov_eur', 'available_stock_value_after_discount_complete_eur', 'email_visits', 'internal_Week_of_FW_Season', 'is_Sun_to_Mon_Shift_week', 'is_email_campaign_type_deal', 'is_email_campaign_type_liveshop', 'is_percentage_on_top', 'is_season_sale_event', 'is_temp_drop_flag', 'number_days_till_next_event', 'number_orders', 'sku_with_discount_%', 'stock_discount_rate_total_%']


## Combining both features

In [12]:
 # add both 
all_feature_selected_by_selector = selected_forward + selected_backward
all_feature_selected_by_selector = list(set(all_feature_selected_by_selector))
print('Total features', len(all_feature_selected_by_selector))
all_feature_selected_by_selector

Total features 18


['internal_Week_of_FW_Season',
 'aov_eur',
 'number_orders',
 'is_percentage_on_top_applicable',
 'sku_with_discount_%',
 'email_visits',
 'is_email_campaign_type_liveshop',
 'stock_discount_rate_total_%',
 'available_stock_value_after_discount_complete_eur',
 'cpc',
 'is_Peak_Driving_Public_Holiday_week',
 'is_Sun_to_Mon_Shift_week',
 'is_email_campaign_type_deal',
 'is_season_sale_event',
 'number_days_till_next_event',
 'is_temp_drop_flag',
 'is_percentage_on_top',
 'internalWeeks_until_SeasonalSaleStart']

In [13]:
# --- evaluate both on the hold‑out validation set ---
def train_and_eval(base_model, features):
    result_dict = {}
    mdl = base_model.fit(X_train[features], y_train)
    preds = mdl.predict(X_valid[features])
    mse = mean_squared_error(y_valid, preds)
    r2 = r2_score(y_valid, preds)
    result_dict['r2'] = r2
    result_dict['mse'] = mse
    return result_dict

all_features = train_and_eval(base_model, FEATURES)
mse_fwd = train_and_eval(base_model, selected_forward)
mse_bwd = train_and_eval(base_model, selected_backward)
fwd_bwd = train_and_eval(base_model, all_feature_selected_by_selector)

print(f"Validation MSE (all_features): {all_features['mse']:.4f}")
print(f"Validation R2 (all_features): {all_features['r2']:.4f}")

print(f"Validation MSE (forward): {mse_fwd['mse']:.4f}")
print(f"Validation R2 (forward): {mse_fwd['r2']:.4f}")

print(f"Validation MSE (backward): {mse_bwd['mse']:.4f}")
print(f"Validation R2 (backward): {mse_bwd['r2']:.4f}")

print(f"Validation MSE (fwd_bwd): {fwd_bwd['mse']:.4f}")
print(f"Validation R2 (fwd_bwd): {fwd_bwd['r2']:.4f}")


Validation MSE (all_features): 46468388323.0920
Validation R2 (all_features): 0.5378
Validation MSE (forward): 23729628322.6419
Validation R2 (forward): 0.7640
Validation MSE (backward): 25664420836.5939
Validation R2 (backward): 0.7448
Validation MSE (fwd_bwd): 31796418733.5991
Validation R2 (fwd_bwd): 0.6838
