#  Model Training

### Imports

In [1]:
# Imports
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV, KFold, train_test_split
from sklearn.metrics import mean_absolute_percentage_error

# Read the preprocessed train and test frames (train first, then test).
# NOTE: unpickling can execute arbitrary code -- only load pickle files you trust.
df_train, df_test = map(
    pd.read_pickle,
    ('data/train_processed.pkl', 'data/test_processed.pkl'),
)

In [2]:
# Preview the first five rows of the test frame (5 is head()'s default).
df_test.head(n=5)

Unnamed: 0,holiday,shops_closed,winter_school_holidays,school_holidays,id,date_year,date_month,date_day,date_day_of_week,date_week_of_year,...,holiday_name_Memorial Day of the Republic,holiday_name_Memorial day of the 1956 Revolution,holiday_name_NaN,holiday_name_National Defense Day,holiday_name_New Years Day,holiday_name_Peace Festival in Augsburg,holiday_name_Reformation Day,holiday_name_State Foundation Day,holiday_name_Whit monday,holiday_name_Whit sunday
13681,1,1,0,0,Brno_1_2024-05-08,2024,5,8,2,19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13678,1,1,0,0,Prague_1_2024-05-08,2024,5,8,2,19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13679,1,1,0,0,Prague_2_2024-05-08,2024,5,8,2,19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13684,1,1,0,0,Prague_3_2024-05-08,2024,5,8,2,19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13419,1,1,0,0,Brno_1_2024-04-01,2024,4,1,0,14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Disabled cell: everything below is wrapped in a string literal, so none of it runs;
# the cell only evaluates (and echoes) the string itself.
# The wrapped code would rebuild a `date` column from date_year/date_month/date_day by
# temporarily renaming them to year/month/day for pd.to_datetime, then restoring the names.
# NOTE(review): presumably kept for the disabled lag/rolling implementation further down,
# which sorts by 'date' -- confirm before deleting.
''' 
# Reconstruct the date column
df_train = df_train.rename(columns={'date_year': 'year', 'date_month': 'month', 'date_day': 'day'})
df_test = df_test.rename(columns={'date_year': 'year', 'date_month': 'month', 'date_day': 'day'})

df_train['date'] = pd.to_datetime(df_train[['year', 'month', 'day']])
df_test['date'] = pd.to_datetime(df_test[['year', 'month', 'day']])

# Restore the original column names
df_train = df_train.rename(columns={'year': 'date_year', 'month': 'date_month', 'day': 'date_day'})
df_test = df_test.rename(columns={'year': 'date_year', 'month': 'date_month', 'day': 'date_day'})
'''

" \n# Reconstruct the date column\ndf_train = df_train.rename(columns={'date_year': 'year', 'date_month': 'month', 'date_day': 'day'})\ndf_test = df_test.rename(columns={'date_year': 'year', 'date_month': 'month', 'date_day': 'day'})\n\ndf_train['date'] = pd.to_datetime(df_train[['year', 'month', 'day']])\ndf_test['date'] = pd.to_datetime(df_test[['year', 'month', 'day']])\n\n# Restore the original column names\ndf_train = df_train.rename(columns={'year': 'date_year', 'month': 'date_month', 'day': 'date_day'})\ndf_test = df_test.rename(columns={'year': 'date_year', 'month': 'date_month', 'day': 'date_day'})\n"

In [3]:
# Standardise the calendar-derived features. The scaler is fitted on the training
# frame only and then applied, unchanged, to both the training and the test frame.
features_to_scale = [
    'date_year',
    'date_month',
    'date_day',
    'date_day_of_week',
    'date_day_of_year',
    'date_week_of_year',
    'date_quarter',
    'month_sin',
    'month_cos',
    'day_sin',
    'day_cos',
    # Lag / rolling features are currently excluded from scaling:
    # 'lag_7_working', 'lag_14_working', 'lag_21_working',
    # 'lag_28_working', 'rolling_mean_7_working', 'rolling_mean_14_working',
    # 'rolling_mean_21_working', 'rolling_mean_28_working'
]

feature_scaler = StandardScaler().fit(df_train[features_to_scale])

# Both frames are overwritten in place with their standardised values.
for frame in (df_train, df_test):
    frame[features_to_scale] = feature_scaler.transform(frame[features_to_scale])

In [4]:
# Training side: 'orders' is the target, every other column is a predictor.
y = df_train['orders']
X = df_train.drop('orders', axis='columns')

# Test side: keep the row identifiers aside and predict from the remaining columns.
ids = df_test['id']
X_pred = df_test.drop('id', axis='columns')

In [21]:
# Tried training separate models per warehouse, but it did not improve the score
# warehouse_cols_order = [
#     'warehouse_Prague_1', 'warehouse_Brno_1', 'warehouse_Prague_2', 
#     'warehouse_Prague_3', 'warehouse_Munich_1', 'warehouse_Frankfurt_1', 
#     'warehouse_Budapest_1'
# ]
# 
# # Prepare data for modeling
# X_train_list = [df_train[df_train[col] == 1].drop(columns=warehouse_cols_order + ['orders']) for col in warehouse_cols_order]
# y_train_list = [df_train[df_train[col] == 1]['orders'] for col in warehouse_cols_order]
# X_test_list = [df_test[df_test[col] == 1].drop(columns=warehouse_cols_order + ['id']) for col in warehouse_cols_order]
# ids_list = [df_test[df_test[col] == 1]['id'] for col in warehouse_cols_order]

In [58]:
# Hold out 20% of the training rows for a final validation check.
# NOTE(review): this split and the KFold below both shuffle rows. If the rows form a
# time series, shuffled splits can give optimistic scores -- consider a time-ordered
# split; confirm against how the test period is defined.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate models, each paired with its search space. The hyperparameters are fixed
# (taken from earlier, extensive tuning), so every search space is empty.
models = {
    'cbr': (CatBoostRegressor(verbose=False, iterations=10000, depth=4, learning_rate=0.1, l2_leaf_reg=0.1), {
    }),
    'hgbr': (HistGradientBoostingRegressor(max_iter=10000, learning_rate=0.1, max_depth=4, l2_regularization=1), {
    }),
    'xgbr': (XGBRegressor(n_estimators=10000, learning_rate=0.1, max_depth=4, reg_lambda=1), {
    })
}

kfold = KFold(n_splits=10, shuffle=True, random_state=42)


def _fit_search(model, dist, X_fit, y_fit):
    """Cross-validate `model` on (X_fit, y_fit) and refit it on all of those rows.

    With an empty `dist` and n_iter=1 the "search" evaluates exactly one
    configuration: the estimator as constructed. It is scored with 10-fold CV
    (negative MAPE) and then refitted on the full (X_fit, y_fit).

    Returns the fitted RandomizedSearchCV; use `.predict(...)` for predictions and
    `-.best_score_` for the mean cross-validated MAPE.
    """
    rscv = RandomizedSearchCV(
        model,
        param_distributions=dist,
        n_iter=1,
        scoring='neg_mean_absolute_percentage_error',
        n_jobs=-1,
        cv=kfold,
        random_state=42,
    )
    return rscv.fit(X_fit, y_fit)


# Stage 1: fit on the 80% split and report CV, training and hold-out MAPE.
for name, (model, dist) in models.items():
    search = _fit_search(model, dist, X_train, y_train)

    train_mape = mean_absolute_percentage_error(y_train, search.predict(X_train))
    val_mape = mean_absolute_percentage_error(y_val, search.predict(X_val))

    print(f'Model: {name}')
    # The scorer is negated MAPE, so flip the sign. This score was previously computed
    # (it is the expensive part of the fit) but never reported.
    print(f'CV MAPE: {-search.best_score_}')
    print(f'Training MAPE: {train_mape}')
    print(f'Validation MAPE: {val_mape}')
    print('==========================')

# Stage 2: refit every model on all training rows and predict the test set.
# One column per model, one row per test sample.
df_model_predictions = pd.DataFrame()
for name, (model, dist) in models.items():
    search = _fit_search(model, dist, X, y)
    df_model_predictions[name] = search.predict(X_pred)

Model: cbr
Training MAPE: 0.017921237333216013
Validation MAPE: 0.0338973646597381
Model: hgbr
Training MAPE: 0.017566420788738213
Validation MAPE: 0.035766979143804424
Model: xgbr
Training MAPE: 0.009456837428049092
Validation MAPE: 0.0363445090117793


In [134]:
''' Old Implementation for lag and rolling features

# Train and evaluate models
df_model_predictions = pd.DataFrame()
kfold = KFold(n_splits=10, shuffle=True, random_state=429)

# Train and evaluate models
for name, (model, dist) in models.items():
    rscv = RandomizedSearchCV(model, param_distributions=dist, n_iter=200, scoring='neg_mean_absolute_percentage_error', n_jobs=28, cv=kfold, random_state=42)
    search = rscv.fit(X, y)
    best_model = search.best_estimator_
    
    # Initialize predictions DataFrame
    df_test_predictions = df_test.copy()
    
    # Identify warehouse columns
    warehouse_cols = [
        'warehouse_Brno_1', 'warehouse_Budapest_1', 'warehouse_Frankfurt_1',
        'warehouse_Munich_1', 'warehouse_Prague_1', 'warehouse_Prague_2', 'warehouse_Prague_3'
    ]
    
    # Iterate over each warehouse
    for warehouse_col in warehouse_cols:
        
        # Filter the test data for the current warehouse
        df_warehouse_test = df_test_predictions[df_test_predictions[warehouse_col] == 1].copy()
        
        # Get corresponding training data for the current warehouse
        df_warehouse_train = df_train[df_train[warehouse_col] == 1].copy()
        
        # Concatenate train and test data to ensure continuity
        df_warehouse_combined = pd.concat([df_warehouse_train, df_warehouse_test], ignore_index=True)
        
        # Sort by date
        df_warehouse_combined = df_warehouse_combined.sort_values('date')
        
        # Calculate initial lag and rolling features
        def calculate_lag_and_rolling_features(df):
            for lag in [7, 14, 21, 28]:
                df[f'lag_{lag}_working'] = df['orders'].shift(lag)
            for window in [7, 14, 21, 28]:
                df[f'rolling_mean_{window}_working'] = df['orders'].rolling(window).mean()
            return df
        
        df_warehouse_combined = calculate_lag_and_rolling_features(df_warehouse_combined)
        
        # Fill NaN values in lag and rolling features with -1
        lag_features = [f'lag_{lag}_working' for lag in [7, 14, 21, 28]]
        rolling_features = [f'rolling_mean_{window}_working' for window in [7, 14, 21, 28]]
        df_warehouse_combined[lag_features + rolling_features] = df_warehouse_combined[lag_features + rolling_features].fillna(-1)
        
        # Iteratively predict and update features
        for i in range(len(df_warehouse_train), len(df_warehouse_combined)):
            # Predict the next order
            X_next = df_warehouse_combined.iloc[i:i+1].drop(columns=['id', 'orders', 'date'])
            y_next_pred = best_model.predict(X_next)[0]
            
            # Update the DataFrame with the new prediction
            df_warehouse_combined.at[i, 'orders'] = y_next_pred
            
            # Update lag and rolling features for the next prediction
            for lag in [7, 14, 21, 28]:
                if i + lag < len(df_warehouse_combined):
                    df_warehouse_combined.at[i + lag, f'lag_{lag}_working'] = df_warehouse_combined['orders'].iloc[i]
            
            for window in [7, 14, 21, 28]:
                if i + window < len(df_warehouse_combined):
                    window_data = df_warehouse_combined['orders'].iloc[i+1:i+window+1]
                    if not window_data.isna().all():
                        df_warehouse_combined.at[i + window, f'rolling_mean_{window}_working'] = window_data.mean()
        
        # Align indices before updating df_test_predictions
        test_indices = df_warehouse_test.index
        combined_indices = df_warehouse_combined.index[len(df_warehouse_train):]
        df_test_predictions.loc[test_indices, 'orders'] = df_warehouse_combined.loc[combined_indices, 'orders'].values
    
    # Store predictions for the model
    df_model_predictions[name] = df_test_predictions['orders']
    
    # Print model performance
    score = search.best_score_ * -1
    print(f'Model: {name}\nMAPE: {score}\n==========================')
'''



Model: xgbr
MAPE: 0.03558733591983194




Model: cbr
MAPE: 0.035465007079825764
Model: gbr
MAPE: 0.0357215977655198
Model: hgbr
MAPE: 0.0350670563714717


In [135]:
# # Assuming df_model_predictions is your DataFrame
# df_model_predictions_sorted = df_model_predictions.sort_values(by=['id'])
# 
# # Group by 'id' and use first() to combine rows for each model type
# df_combined = df_model_predictions_sorted.groupby(df_model_predictions_sorted.index).first()

In [59]:
# Sanity check on the predictions frame: one row per test prediction, one column per model.
df_model_predictions.shape

(397, 3)

In [60]:
# Show the whole predictions frame. The row count is read from the frame itself instead
# of being hard-coded, so the cell stays correct if the number of test rows changes.
# (pandas still truncates the rendered output according to its display options.)
df_model_predictions.head(len(df_model_predictions))

Unnamed: 0,cbr,hgbr,xgbr
0,9319.817014,10298.813141,8905.837891
1,10226.274378,10955.450499,9677.518555
2,6607.920590,7361.713165,5811.672363
3,5753.572734,6577.110171,5117.410156
4,7262.061826,7073.643628,6655.518066
...,...,...,...
392,6959.441056,6957.643964,6549.690918
393,7150.867065,7043.421271,6867.246094
394,8266.672757,7628.952972,7809.608887
395,7619.185634,7160.413647,7000.818848


In [61]:
# Count missing predictions per model column; every count is expected to be zero.
df_model_predictions.isna().sum(axis=0)

cbr     0
hgbr    0
xgbr    0
dtype: int64

In [62]:
# Persist the per-model test predictions. The path is defined once so the file that is
# written and the path reported in the message cannot drift apart.
predictions_path = 'data/model_predictions.pkl'
df_model_predictions.to_pickle(predictions_path)
print(f"Model predictions saved to '{predictions_path}'")

Model predictions saved to 'data/model_predictions.pkl'


In [123]:
# y_pred_actual = target_scaler.inverse_transform(y_pred.reshape(-1, 1))

In [63]:
# Model subsets to blend; each inner list names columns of df_model_predictions
# whose predictions are averaged into one submission.
ensemble_combinations = [
    ['cbr', 'hgbr', 'xgbr'],  # all three models
    ['hgbr'],                 # HistGradientBoostingRegressor alone
    ['cbr'],                  # CatBoostRegressor alone
]

In [64]:
# Write one submission CSV per ensemble: the row-wise mean of the selected model
# columns, paired with the test ids.
# NOTE(review): numbering starts at 42, presumably to continue after earlier
# submission files -- confirm before re-running.
for submission_no, model_names in enumerate(ensemble_combinations, start=42):
    ensemble_predictions = df_model_predictions[model_names].mean(axis=1)

    # Two-column frame in the order id, orders.
    submission = ensemble_predictions.to_frame(name='orders')
    submission.insert(0, 'id', df_test['id'].values)

    submission.to_csv(f'data/submission_{submission_no}.csv', index=False)
    print(f"Submission_{submission_no} saved with models: {', '.join(model_names)}")

print("All submissions have been saved.")

Submission_42 saved with models: cbr, hgbr, xgbr
Submission_43 saved with models: hgbr
Submission_44 saved with models: cbr
All submissions have been saved.
