In [28]:
# Import necessary libraries for data manipulation and analysis
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split # For splitting data (though not used in the final chronological split)
from sklearn.model_selection import RandomizedSearchCV, KFold # For hyperparameter tuning and cross-validation
from sklearn.compose import ColumnTransformer # For applying different transformations to different columns
from sklearn.pipeline import Pipeline # For creating a sequence of data processing steps
from sklearn.preprocessing import StandardScaler, OneHotEncoder # For scaling numerical features and encoding categorical features
from sklearn.impute import SimpleImputer # For handling missing values
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor # Importing the tree-based models
from xgboost import XGBRegressor # Importing XGBoost regressor
import xgboost as xgb # Importing xgboost library
from sklearn.metrics import mean_squared_error, r2_score # For evaluating model performance

In [29]:
# Install the xgboost library
%pip install xgboost



In [30]:
# Load the engineered retail dataset from a CSV file into a pandas DataFrame
# Using sep=',' to explicitly specify the comma separator, although the file might have other formatting issues.
merged_df = pd.read_csv("/content/engineered_retail_dataset__for_ml.csv", sep=',')
# Display the first 5 rows of the DataFrame to get a preview of the data
display(merged_df.head())

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,Weekly_Sales_Lag2,Weekly_Sales_Lag3,Weekly_Sales_MA4,Weekly_Sales_STD4,Weekly_Sales_MA12,Weekly_Sales_STD12,IsHoliday_Lag1,IsHoliday_Lead1,Rolling_4,Rolling_12
0,1,1,2010-02-05,24924.5,False,42.31,2.572,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,False,True,0.0,0.0
1,1,1,2010-02-12,46039.49,True,38.51,2.548,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,False,False,0.0,0.0
2,1,1,2010-02-19,41595.55,False,39.93,2.514,0.0,0.0,0.0,...,24924.5,0.0,0.0,0.0,0.0,0.0,True,False,0.0,0.0
3,1,1,2010-02-26,19403.54,False,46.63,2.561,0.0,0.0,0.0,...,46039.49,24924.5,0.0,0.0,0.0,0.0,False,False,0.0,0.0
4,1,1,2010-03-05,21827.9,False,46.5,2.625,0.0,0.0,0.0,...,41595.55,46039.49,32990.77,12832.106391,0.0,0.0,False,False,32990.77,0.0


In [31]:
# Define features (X) and target (y) for the machine learning model
target = "Weekly_Sales" # The column we want to predict
features = [col for col in merged_df.columns if col not in ["Weekly_Sales", "Date"]] # All other columns except the target and Date

X = merged_df[features] # Feature DataFrame
y = merged_df[target] # Target Series

In [32]:
# Convert 'Date' to datetime objects to enable time-based splitting
merged_df["Date"] = pd.to_datetime(merged_df["Date"])
# Determine the date threshold for the split (80% of the data)
split_date = merged_df["Date"].quantile(0.8)
# Create training and testing sets based on the chronological split
# Data before or on the split_date goes into the training set
X_train = merged_df[merged_df["Date"] <= split_date][features]
y_train = merged_df[merged_df["Date"] <= split_date][target]
# Data after the split_date goes into the testing set
X_test  = merged_df[merged_df["Date"] > split_date][features]
y_test  = merged_df[merged_df["Date"] > split_date][target]

# Print the shapes of the training and testing feature sets to verify the split
print(X_train.shape, X_test.shape)

(338738, 32) (82832, 32)


In [33]:
# Cross-validation strategy: 5 folds, shuffled, with a fixed random state for reproducibility
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the hyperparameters to tune for the Random Forest model
rf_params = {
    "model__n_estimators": [100, 200], # Number of trees in the forest
    "model__max_depth": [10, 20], # Maximum depth of the trees
    "model__max_features": ["sqrt", "log2"], # Number of features to consider when looking for the best split
    "model__min_samples_split": [2, 5], # Minimum number of samples required to split an internal node
    "model__min_samples_leaf": [1, 2], # Minimum number of samples required to be at a leaf node
}

# Define the hyperparameters to tune for the Gradient Boosting model
gb_params = {
    "model__n_estimators": [100, 200], # Number of boosting stages to perform
    "model__learning_rate": [0.01, 0.05, 0.1], # Step size shrinkage used in the update to prevent overfitting
    "model__max_depth": [3, 5], # Maximum depth of the individual regression estimators
    "model__subsample": [0.8, 1.0], # Fraction of samples to be used for fitting the individual base learners
    "model__min_samples_split": [2, 5], # Minimum number of samples required to split an internal node
}

In [34]:
# Import necessary modules for preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Identify numerical and categorical features based on data types
numerical_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(include='object').columns.tolist()

# Create preprocessing pipelines for numerical and categorical features
# Numerical pipeline: Impute missing values with the median and then scale the features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical pipeline: Impute missing values with the most frequent value and then one-hot encode the features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')) # handle_unknown='ignore' allows for unseen categories during prediction
])

# Combine preprocessing pipelines using ColumnTransformer
# This applies the numerical_transformer to numerical_features and categorical_transformer to categorical_features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [35]:
# Set up Randomized Search Cross-Validation for Random Forest
# Create a pipeline that first preprocesses the data and then applies the Random Forest Regressor model
rf_pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", RandomForestRegressor(random_state=42))])
# Initialize RandomizedSearchCV to find the best hyperparameters for the Random Forest model
rf_search = RandomizedSearchCV(rf_pipe, rf_params, n_iter=10, cv=cv, scoring="neg_root_mean_squared_error",
                               random_state=42, n_jobs=-1, verbose=2) # n_iter: number of parameter settings that are sampled, cv: cross-validation strategy, scoring: evaluation metric, n_jobs=-1: use all available CPU cores, verbose: control the verbosity output

In [36]:
# Set up Randomized Search Cross-Validation for Gradient Boosting
# Create a pipeline that first preprocesses the data and then applies the Gradient Boosting Regressor model
gb_pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", GradientBoostingRegressor(random_state=42))])
# Initialize RandomizedSearchCV to find the best hyperparameters for the Gradient Boosting model
gb_search = RandomizedSearchCV(gb_pipe, gb_params, n_iter=10, cv=cv, scoring="neg_root_mean_squared_error",
                               random_state=42, n_jobs=-1, verbose=2) # n_iter: number of parameter settings that are sampled, cv: cross-validation strategy, scoring: evaluation metric, n_jobs=-1: use all available CPU cores, verbose: control the verbosity output

In [37]:
# Fit the RandomizedSearchCV objects to the training data to find the best hyperparameters for each model
print("Tuning Random Forest...")
rf_search.fit(X_train, y_train)

print("Tuning Gradient Boosting...")
gb_search.fit(X_train, y_train)

Tuning Random Forest...
Fitting 5 folds for each of 10 candidates, totalling 50 fits


KeyboardInterrupt: 

In [24]:
print("Tuning Gradient Boosting...")
gb_search.fit(X_train, y_train)

Tuning Gradient Boosting...
Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [25]:
# Display the best hyperparameters found and the corresponding cross-validation scores for each model
print("Best RF params:", rf_search.best_params_)
# The best_score_ is the mean cross-validated score, which is negative RMSE because we used neg_root_mean_squared_error
print("Best RF CV RMSE:", -rf_search.best_score_)

print("Best GB params:", gb_search.best_params_)
print("Best GB CV RMSE:", -gb_search.best_score_)

Best RF params: {'model__n_estimators': 200, 'model__min_samples_split': 2, 'model__min_samples_leaf': 1, 'model__max_features': 'sqrt', 'model__max_depth': 20}
Best RF CV RMSE: 5191.583182205182
Best GB params: {'model__subsample': 1.0, 'model__n_estimators': 200, 'model__min_samples_split': 2, 'model__max_depth': 5, 'model__learning_rate': 0.1}
Best GB CV RMSE: 4688.508664009984
