# Feature Selection for Linear Regression Model

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import mean_squared_error

In [2]:
DATA_PATH = 'Data_for_taining_14072025.csv'
TARGET = 'marketing_cost'
FEATURES = [
        'aov_eur',
        'available_stock_value_after_discount_complete_eur',
        'avg_temp',
        'cpc',
        'cr_tracked_%',
        'email_recipients',
        'email_visits',
        'internalWeeks_until_SeasonalSaleStart',
        'internal_Week_of_FW_Season',
        'internal_Week_of_SS_Season',
        'is_Peak_Driving_Public_Holiday_week',
        'is_Sun_to_Mon_Shift_week',
        'is_black_week_event',
        'is_email_campaign_type_deal',
        'is_email_campaign_type_liveshop',
        'is_email_campaign_type_newsletter',
        'is_percentage_on_top',
        'is_percentage_on_top_applicable',
        'is_season_sale_event',
        'is_temp_drop_flag',
        'number_days_after_last_event',
        'number_days_till_next_event',
        'number_orders',
        'number_visits',
        'sku_with_discount_%',
        'stock_discount_rate_total_%',
        'target_cpr'
]

df = pd.read_csv(DATA_PATH)
df.rename(columns={'Unnamed: 0': 'date'}, inplace=True)
df['date'] = pd.to_datetime(df['date'])
df.sort_values('date', inplace=True)
df.set_index('date', inplace=True)
df = df.iloc[1:-1]  # drop first and last week

number_of_weeks_to_be_consider = 104
val = df.iloc[-8:]
val_start = val.index[0]
train_1y = df.loc[val_start - pd.DateOffset(weeks=number_of_weeks_to_be_consider) : val_start - pd.DateOffset(weeks=1)]

X_train = train_1y[FEATURES]
y_train = train_1y[TARGET]
X_valid = val[FEATURES]
y_valid = val[TARGET]

print('Training data range', X_train.index.min(), X_train.index.max())
print('Validation data range', X_valid.index.min(), X_valid.index.max())

Training data range 2023-05-08 00:00:00 2025-04-28 00:00:00
Validation data range 2025-05-05 00:00:00 2025-06-23 00:00:00


In [3]:
# --- feature selection setup ---
from tabnanny import verbose


base_model = LinearRegression()

# forward selection: add features one at a time
sfs_forward = SequentialFeatureSelector(
    base_model,
    n_features_to_select='auto',        # let it choose until score stops improving
    direction='forward',
    scoring='neg_mean_squared_error',
    cv=5,                               # 5‑fold CV inside the selector
    n_jobs=-1
)
sfs_forward.fit(X_train, y_train)
selected_forward = list(X_train.columns[sfs_forward.get_support()])
print("Forward‑selected features:", selected_forward)

Forward‑selected features: ['available_stock_value_after_discount_complete_eur', 'avg_temp', 'cpc', 'email_visits', 'internal_Week_of_FW_Season', 'internal_Week_of_SS_Season', 'is_black_week_event', 'is_email_campaign_type_deal', 'is_percentage_on_top_applicable', 'is_temp_drop_flag', 'number_orders', 'number_visits', 'stock_discount_rate_total_%']


In [4]:
# backward selection: remove features one at a time
sfs_backward = SequentialFeatureSelector(
    base_model,
    n_features_to_select='auto',
    direction='backward',
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1
)
sfs_backward.fit(X_train, y_train)
selected_backward = list(X_train.columns[sfs_backward.get_support()])
print("Backward‑selected features:", selected_backward)

Backward‑selected features: ['available_stock_value_after_discount_complete_eur', 'cpc', 'cr_tracked_%', 'email_recipients', 'email_visits', 'internal_Week_of_FW_Season', 'internal_Week_of_SS_Season', 'is_black_week_event', 'is_percentage_on_top_applicable', 'is_season_sale_event', 'is_temp_drop_flag', 'number_orders', 'stock_discount_rate_total_%', 'target_cpr']


In [5]:
# --- evaluate both on the hold‑out validation set ---
def train_and_eval(features):
    mdl = LinearRegression().fit(X_train[features], y_train)
    preds = mdl.predict(X_valid[features])
    mse = mean_squared_error(y_valid, preds)
    return mse

mse_fwd = train_and_eval(selected_forward)
mse_bwd = train_and_eval(selected_backward)

print(f"Validation MSE (forward): {mse_fwd:.4f}")
print(f"Validation MSE (backward): {mse_bwd:.4f}")


Validation MSE (forward): 219859172.8814
Validation MSE (backward): 420744212.1680


- Backward is performing better

In [6]:
# taking intersection of forward and backward
list(set(selected_forward).intersection(selected_backward))

['stock_discount_rate_total_%',
 'email_visits',
 'internal_Week_of_FW_Season',
 'is_temp_drop_flag',
 'number_orders',
 'cpc',
 'available_stock_value_after_discount_complete_eur',
 'internal_Week_of_SS_Season',
 'is_black_week_event',
 'is_percentage_on_top_applicable']

In [7]:
# add both 
all_feature_selected_by_selector = selected_forward + selected_backward
all_feature_selected_by_selector = list(set(all_feature_selected_by_selector))
print('Total features', len(all_feature_selected_by_selector))
all_feature_selected_by_selector

Total features 17


['target_cpr',
 'stock_discount_rate_total_%',
 'is_email_campaign_type_deal',
 'email_visits',
 'internal_Week_of_FW_Season',
 'is_temp_drop_flag',
 'number_orders',
 'email_recipients',
 'cpc',
 'available_stock_value_after_discount_complete_eur',
 'avg_temp',
 'internal_Week_of_SS_Season',
 'is_black_week_event',
 'is_percentage_on_top_applicable',
 'number_visits',
 'is_season_sale_event',
 'cr_tracked_%']

In [8]:
# taking intersection of forward and backward
list(set(selected_forward).intersection(selected_backward))


['stock_discount_rate_total_%',
 'email_visits',
 'internal_Week_of_FW_Season',
 'is_temp_drop_flag',
 'number_orders',
 'cpc',
 'available_stock_value_after_discount_complete_eur',
 'internal_Week_of_SS_Season',
 'is_black_week_event',
 'is_percentage_on_top_applicable']

In [9]:
# add both 
all_feature_selected_by_selector = selected_forward + selected_backward
print('Total features', len(all_feature_selected_by_selector))
all_feature_selected_by_selector

Total features 27


['available_stock_value_after_discount_complete_eur',
 'avg_temp',
 'cpc',
 'email_visits',
 'internal_Week_of_FW_Season',
 'internal_Week_of_SS_Season',
 'is_black_week_event',
 'is_email_campaign_type_deal',
 'is_percentage_on_top_applicable',
 'is_temp_drop_flag',
 'number_orders',
 'number_visits',
 'stock_discount_rate_total_%',
 'available_stock_value_after_discount_complete_eur',
 'cpc',
 'cr_tracked_%',
 'email_recipients',
 'email_visits',
 'internal_Week_of_FW_Season',
 'internal_Week_of_SS_Season',
 'is_black_week_event',
 'is_percentage_on_top_applicable',
 'is_season_sale_event',
 'is_temp_drop_flag',
 'number_orders',
 'stock_discount_rate_total_%',
 'target_cpr']

In [10]:
df.isna().sum()

aov_eur                                              0
available_stock_value_after_discount_complete_eur    0
avg_temp                                             0
cpc                                                  0
cr_tracked_%                                         0
email_recipients                                     0
email_visits                                         0
internalWeeks_until_SeasonalSaleStart                0
internal_Week_of_FW_Season                           0
internal_Week_of_SS_Season                           0
is_Peak_Driving_Public_Holiday_week                  0
is_Sun_to_Mon_Shift_week                             0
is_black_week_event                                  0
is_email_campaign_type_deal                          0
is_email_campaign_type_liveshop                      0
is_email_campaign_type_newsletter                    0
is_percentage_on_top                                 0
is_percentage_on_top_applicable                      0
is_season_

In [11]:
df[all_feature_selected_by_selector].info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 171 entries, 2022-03-21 to 2025-06-23
Data columns (total 27 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   available_stock_value_after_discount_complete_eur  171 non-null    float64
 1   avg_temp                                           171 non-null    float64
 2   cpc                                                171 non-null    float64
 3   email_visits                                       171 non-null    float64
 4   internal_Week_of_FW_Season                         171 non-null    float64
 5   internal_Week_of_SS_Season                         171 non-null    float64
 6   is_black_week_event                                171 non-null    float64
 7   is_email_campaign_type_deal                        171 non-null    float64
 8   is_percentage_on_top_applicable                    171 non-null    floa

In [12]:
# Debug: what types are in your feature list?
for f in all_feature_selected_by_selector:
    print(f, type(f))

available_stock_value_after_discount_complete_eur <class 'str'>
avg_temp <class 'str'>
cpc <class 'str'>
email_visits <class 'str'>
internal_Week_of_FW_Season <class 'str'>
internal_Week_of_SS_Season <class 'str'>
is_black_week_event <class 'str'>
is_email_campaign_type_deal <class 'str'>
is_percentage_on_top_applicable <class 'str'>
is_temp_drop_flag <class 'str'>
number_orders <class 'str'>
number_visits <class 'str'>
stock_discount_rate_total_% <class 'str'>
available_stock_value_after_discount_complete_eur <class 'str'>
cpc <class 'str'>
cr_tracked_% <class 'str'>
email_recipients <class 'str'>
email_visits <class 'str'>
internal_Week_of_FW_Season <class 'str'>
internal_Week_of_SS_Season <class 'str'>
is_black_week_event <class 'str'>
is_percentage_on_top_applicable <class 'str'>
is_season_sale_event <class 'str'>
is_temp_drop_flag <class 'str'>
number_orders <class 'str'>
stock_discount_rate_total_% <class 'str'>
target_cpr <class 'str'>


In [13]:
print(df[all_feature_selected_by_selector].dtypes)

available_stock_value_after_discount_complete_eur    float64
avg_temp                                             float64
cpc                                                  float64
email_visits                                         float64
internal_Week_of_FW_Season                           float64
internal_Week_of_SS_Season                           float64
is_black_week_event                                  float64
is_email_campaign_type_deal                          float64
is_percentage_on_top_applicable                      float64
is_temp_drop_flag                                    float64
number_orders                                        float64
number_visits                                        float64
stock_discount_rate_total_%                          float64
available_stock_value_after_discount_complete_eur    float64
cpc                                                  float64
cr_tracked_%                                         float64
email_recipients        

- Duplicates features are there

In [14]:
len(list(set(all_feature_selected_by_selector)))

17

In [15]:
list(set(all_feature_selected_by_selector))

['target_cpr',
 'stock_discount_rate_total_%',
 'is_email_campaign_type_deal',
 'email_visits',
 'internal_Week_of_FW_Season',
 'is_temp_drop_flag',
 'number_orders',
 'email_recipients',
 'cpc',
 'available_stock_value_after_discount_complete_eur',
 'avg_temp',
 'internal_Week_of_SS_Season',
 'is_black_week_event',
 'is_percentage_on_top_applicable',
 'number_visits',
 'is_season_sale_event',
 'cr_tracked_%']