# Feature Selection for Gradient Boosting Model

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import mean_squared_error

In [2]:
DATA_PATH = 'Data_for_taining_14072025.csv'
TARGET = 'marketing_cost'
FEATURES = [
        'aov_eur',
        'available_stock_value_after_discount_complete_eur',
        'avg_temp',
        'cpc',
        'cr_tracked_%',
        'email_recipients',
        'email_visits',
        'internalWeeks_until_SeasonalSaleStart',
        'internal_Week_of_FW_Season',
        'internal_Week_of_SS_Season',
        'is_Peak_Driving_Public_Holiday_week',
        'is_Sun_to_Mon_Shift_week',
        'is_black_week_event',
        'is_email_campaign_type_deal',
        'is_email_campaign_type_liveshop',
        'is_email_campaign_type_newsletter',
        'is_percentage_on_top',
        'is_percentage_on_top_applicable',
        'is_season_sale_event',
        'is_temp_drop_flag',
        'number_days_after_last_event',
        'number_days_till_next_event',
        'number_orders',
        'number_visits',
        'sku_with_discount_%',
        'stock_discount_rate_total_%',
        'target_cpr'
]

df = pd.read_csv(DATA_PATH)
df.rename(columns={'Unnamed: 0': 'date'}, inplace=True)
df['date'] = pd.to_datetime(df['date'])
df.sort_values('date', inplace=True)
df.set_index('date', inplace=True)
df = df.iloc[1:-1]  # drop first and last week

number_of_weeks_to_be_consider = 104
val = df.iloc[-8:]
val_start = val.index[0]
train_1y = df.loc[val_start - pd.DateOffset(weeks=number_of_weeks_to_be_consider) : val_start - pd.DateOffset(weeks=1)]

X_train = train_1y[FEATURES]
y_train = train_1y[TARGET]
X_valid = val[FEATURES]
y_valid = val[TARGET]

print('Training data range', X_train.index.min(), X_train.index.max())
print('Validation data range', X_valid.index.min(), X_valid.index.max())

Training data range 2023-05-08 00:00:00 2025-04-28 00:00:00
Validation data range 2025-05-05 00:00:00 2025-06-23 00:00:00


In [3]:
# --- feature selection setup ---
from tabnanny import verbose
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

base_model = GradientBoostingRegressor(random_state=42, n_estimators=300, max_depth=4, min_samples_split=0.03, learning_rate=0.2)

# forward selection: add features one at a time
sfs_forward = SequentialFeatureSelector(
    base_model,
    n_features_to_select='auto',        # let it choose until score stops improving
    direction='forward',
    scoring='neg_mean_squared_error',
    cv=5,                               # 5‑fold CV inside the selector
    n_jobs=-1
)
sfs_forward.fit(X_train, y_train)
selected_forward = list(X_train.columns[sfs_forward.get_support()])
print("Forward‑selected features:", selected_forward)

Forward‑selected features: ['available_stock_value_after_discount_complete_eur', 'avg_temp', 'cpc', 'is_Peak_Driving_Public_Holiday_week', 'is_black_week_event', 'is_email_campaign_type_deal', 'is_percentage_on_top', 'is_percentage_on_top_applicable', 'is_season_sale_event', 'is_temp_drop_flag', 'number_days_after_last_event', 'number_visits', 'sku_with_discount_%']


In [4]:
# backward selection: remove features one at a time
sfs_backward = SequentialFeatureSelector(
    base_model,
    n_features_to_select='auto',
    direction='backward',
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1
)
sfs_backward.fit(X_train, y_train)
selected_backward = list(X_train.columns[sfs_backward.get_support()])
print("Backward‑selected features:", selected_backward)

Backward‑selected features: ['available_stock_value_after_discount_complete_eur', 'cpc', 'email_visits', 'internalWeeks_until_SeasonalSaleStart', 'internal_Week_of_FW_Season', 'is_Peak_Driving_Public_Holiday_week', 'is_Sun_to_Mon_Shift_week', 'is_email_campaign_type_newsletter', 'is_percentage_on_top', 'is_percentage_on_top_applicable', 'is_season_sale_event', 'number_orders', 'number_visits', 'stock_discount_rate_total_%']


In [6]:
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    mean_absolute_percentage_error,
    explained_variance_score
)

In [7]:
# --- evaluate both on the hold‑out validation set ---
def train_and_eval(features):
    result_dict = {}
    mdl = GradientBoostingRegressor(random_state=42, n_estimators=300, max_depth=4, min_samples_split=0.03, learning_rate=0.2).fit(X_train[features], y_train)
    preds = mdl.predict(X_valid[features])
    mse = mean_squared_error(y_valid, preds)
    r2 = r2_score(y_valid, preds)
    result_dict['r2'] = r2
    result_dict['mse'] = mse
    return result_dict

mse_fwd = train_and_eval(selected_forward)
mse_bwd = train_and_eval(selected_backward)

print(f"Validation MSE (forward): {mse_fwd['mse']:.4f}")
print(f"Validation R2 (forward): {mse_fwd['r2']:.4f}")

print(f"Validation MSE (backward): {mse_bwd['mse']:.4f}")
print(f"Validation R2 (backward): {mse_bwd['r2']:.4f}")


Validation MSE (forward): 1189384635.8166
Validation R2 (forward): 0.4221
Validation MSE (backward): 883037418.3451
Validation R2 (backward): 0.5710


In [None]:
# taking intersection of forward and backward
list(set(selected_forward).intersection(selected_backward))

['internal_Week_of_SS_Season',
 'is_season_sale_event',
 'is_percentage_on_top',
 'number_visits',
 'internalWeeks_until_SeasonalSaleStart',
 'is_percentage_on_top_applicable',
 'number_days_after_last_event']

In [None]:
# add both 
all_feature_selected_by_selector = selected_forward + selected_backward
all_feature_selected_by_selector = list(set(all_feature_selected_by_selector))
print('Total features', len(all_feature_selected_by_selector))
all_feature_selected_by_selector

Total features 20


['number_orders',
 'avg_temp',
 'is_season_sale_event',
 'cr_tracked_%',
 'number_days_after_last_event',
 'internal_Week_of_SS_Season',
 'available_stock_value_after_discount_complete_eur',
 'aov_eur',
 'is_Peak_Driving_Public_Holiday_week',
 'stock_discount_rate_total_%',
 'is_percentage_on_top_applicable',
 'is_percentage_on_top',
 'is_black_week_event',
 'number_days_till_next_event',
 'internalWeeks_until_SeasonalSaleStart',
 'cpc',
 'is_Sun_to_Mon_Shift_week',
 'email_recipients',
 'is_temp_drop_flag',
 'number_visits']

In [8]:
df.isna().sum()

aov_eur                                              0
available_stock_value_after_discount_complete_eur    0
avg_temp                                             0
cpc                                                  0
cr_tracked_%                                         0
email_recipients                                     0
email_visits                                         0
internalWeeks_until_SeasonalSaleStart                0
internal_Week_of_FW_Season                           0
internal_Week_of_SS_Season                           0
is_Peak_Driving_Public_Holiday_week                  0
is_Sun_to_Mon_Shift_week                             0
is_black_week_event                                  0
is_email_campaign_type_deal                          0
is_email_campaign_type_liveshop                      0
is_email_campaign_type_newsletter                    0
is_percentage_on_top                                 0
is_percentage_on_top_applicable                      0
is_season_

In [9]:
df[all_feature_selected_by_selector].info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 171 entries, 2022-03-21 to 2025-06-23
Data columns (total 20 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   number_orders                                      171 non-null    float64
 1   avg_temp                                           171 non-null    float64
 2   is_season_sale_event                               171 non-null    float64
 3   cr_tracked_%                                       171 non-null    float64
 4   number_days_after_last_event                       171 non-null    float64
 5   internal_Week_of_SS_Season                         171 non-null    float64
 6   available_stock_value_after_discount_complete_eur  171 non-null    float64
 7   aov_eur                                            171 non-null    float64
 8   is_Peak_Driving_Public_Holiday_week                171 non-null    floa

In [10]:
# Debug: what types are in your feature list?
for f in all_feature_selected_by_selector:
    print(f, type(f))

number_orders <class 'str'>
avg_temp <class 'str'>
is_season_sale_event <class 'str'>
cr_tracked_% <class 'str'>
number_days_after_last_event <class 'str'>
internal_Week_of_SS_Season <class 'str'>
available_stock_value_after_discount_complete_eur <class 'str'>
aov_eur <class 'str'>
is_Peak_Driving_Public_Holiday_week <class 'str'>
stock_discount_rate_total_% <class 'str'>
is_percentage_on_top_applicable <class 'str'>
is_percentage_on_top <class 'str'>
is_black_week_event <class 'str'>
number_days_till_next_event <class 'str'>
internalWeeks_until_SeasonalSaleStart <class 'str'>
cpc <class 'str'>
is_Sun_to_Mon_Shift_week <class 'str'>
email_recipients <class 'str'>
is_temp_drop_flag <class 'str'>
number_visits <class 'str'>


In [11]:
print(df[all_feature_selected_by_selector].dtypes)

number_orders                                        float64
avg_temp                                             float64
is_season_sale_event                                 float64
cr_tracked_%                                         float64
number_days_after_last_event                         float64
internal_Week_of_SS_Season                           float64
available_stock_value_after_discount_complete_eur    float64
aov_eur                                              float64
is_Peak_Driving_Public_Holiday_week                  float64
stock_discount_rate_total_%                          float64
is_percentage_on_top_applicable                      float64
is_percentage_on_top                                 float64
is_black_week_event                                  float64
number_days_till_next_event                          float64
internalWeeks_until_SeasonalSaleStart                float64
cpc                                                  float64
is_Sun_to_Mon_Shift_week

- Duplicates features are there

In [12]:
len(list(set(all_feature_selected_by_selector)))

16

In [13]:
list(set(all_feature_selected_by_selector))

['is_email_campaign_type_deal',
 'is_season_sale_event',
 'marketing_budget',
 'is_black_week_event',
 'is_email_campaign_type_liveshop',
 'number_visits',
 'avg_temp',
 'cpc',
 'email_visits',
 'number_days_till_next_event',
 'cr_tracked_%',
 'number_orders',
 'is_percentage_on_top',
 'is_percentage_on_top_applicable',
 'email_recipients',
 'number_days_after_last_event']

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def load_and_prepare_data(path, features, target):
    # Load dataframe
    df = pd.read_csv(path)
    df.rename(columns={'Unnamed: 0': 'date'}, inplace=True)
    df['date'] = pd.to_datetime(df['date'])
    df.sort_values('date', inplace=True)
    df.set_index('date', inplace=True)
    # Drop first and last week
    df = df.iloc[1:-1]
    return df

# Constants
DATA_PATH = 'Data_for_taining_09072025.csv'
TARGET = 'marketing_cost'
FEATURES = [
    'aov_eur',
    'available_stock_value_after_discount_complete_eur',
    'avg_temp',
    'cpc',
    'cr_tracked_%',
    'email_recipients',
    'email_visits',
    'is_black_week_event',
    'is_email_campaign_type_deal',
    'is_email_campaign_type_liveshop',
    'is_email_campaign_type_newsletter',
    'is_percentage_on_top',
    'is_percentage_on_top_applicable',
    'is_season_sale_event',
    'number_days_after_last_event',
    'number_days_till_next_event',
    'number_orders',
    'number_visits',
    'sku_with_discount_%',
    'target_cpr'
]

# Load data
df = load_and_prepare_data(DATA_PATH, FEATURES, TARGET)

# Split train/validation by time
number_of_weeks_to_be_consider = 54
val = df.iloc[-8:]
val_start = val.index[0]
train_1y = df.loc[val_start - pd.DateOffset(weeks=number_of_weeks_to_be_consider) : val_start - pd.DateOffset(weeks=1)]

X_train = train_1y[FEATURES]
y_train = train_1y[TARGET]
X_valid = val[FEATURES]
y_valid = val[TARGET]

print('Training data range', X_train.index.min(), X_train.index.max())
print('Validation data range', X_valid.index.min(), X_valid.index.max())

# Define the model and parameter grid
gb = GradientBoostingRegressor(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
    'max_features': ['sqrt', 'log2', None],
    'min_samples_split' : [0.03, 0.04, 0.05]
}

# TimeSeriesSplit for cross-validation
tscv = TimeSeriesSplit(n_splits=5)

grid_search = GridSearchCV(
    estimator=gb,
    param_grid=param_grid,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2
)

# Fit GridSearch
print("Starting GridSearchCV...")
grid_search.fit(X_train, y_train)

# Best parameters and score
best_params = grid_search.best_params_
best_score = np.sqrt(-grid_search.best_score_)
print(f"Best parameters: {best_params}")
print(f"Best CV RMSE: {best_score:.4f}")

# Evaluate on validation set
best_model = grid_search.best_estimator_
pred_valid = best_model.predict(X_valid)

rmse_val = np.sqrt(mean_squared_error(y_valid, pred_valid))
mae_val = mean_absolute_error(y_valid, pred_valid)
r2_val = r2_score(y_valid, pred_valid)

print("Validation Metrics:")
print(f"RMSE: {rmse_val:.4f}")
print(f"MAE: {mae_val:.4f}")
print(f"R2: {r2_val:.4f}")

# Optionally, save the model
# import joblib
# joblib.dump(best_model, 'best_gb_model.pkl')


Training data range 2024-04-22 00:00:00 2025-04-28 00:00:00
Validation data range 2025-05-05 00:00:00 2025-06-23 00:00:00
Starting GridSearchCV...
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
[CV] END learning_rate=0.01, max_depth=3, max_features=sqrt, min_samples_split=0.03, n_estimators=100, subsample=0.6; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, max_features=sqrt, min_samples_split=0.03, n_estimators=100, subsample=0.6; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, max_features=sqrt, min_samples_split=0.03, n_estimators=100, subsample=0.6; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, max_features=sqrt, min_samples_split=0.03, n_estimators=100, subsample=0.6; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, max_features=sqrt, min_samples_split=0.03, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, max_features=sqrt, min_samples_split=0.03, n_estimators=100, s