In [1]:
# Import necessary libraries for data manipulation and analysis
import pandas as pd
import numpy as np

In [2]:
# Read the engineered retail dataset (CSV) into a DataFrame.
# The comma delimiter is spelled out explicitly even though it is the pandas default.
merged_df = pd.read_csv(
    "/content/engineered_retail_dataset__for_ml.csv",
    sep=",",
)
# Show the first five rows as a quick sanity check of the load.
merged_df.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,Weekly_Sales_Lag2,Weekly_Sales_Lag3,Weekly_Sales_MA4,Weekly_Sales_STD4,Weekly_Sales_MA12,Weekly_Sales_STD12,IsHoliday_Lag1,IsHoliday_Lead1,Rolling_4,Rolling_12
0,1,1,2010-02-05,24924.5,False,42.31,2.572,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,False,True,0.0,0.0
1,1,1,2010-02-12,46039.49,True,38.51,2.548,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,False,False,0.0,0.0
2,1,1,2010-02-19,41595.55,False,39.93,2.514,0.0,0.0,0.0,...,24924.5,0.0,0.0,0.0,0.0,0.0,True,False,0.0,0.0
3,1,1,2010-02-26,19403.54,False,46.63,2.561,0.0,0.0,0.0,...,46039.49,24924.5,0.0,0.0,0.0,0.0,False,False,0.0,0.0
4,1,1,2010-03-05,21827.9,False,46.5,2.625,0.0,0.0,0.0,...,41595.55,46039.49,32990.77,12832.106391,0.0,0.0,False,False,32990.77,0.0


In [3]:
# Print a structural summary of merged_df: index range, column names,
# non-null counts and dtypes. This is the place to spot columns that are
# still stored as generic `object` (e.g. Date) before any modelling.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421570 entries, 0 to 421569
Data columns (total 34 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Store               421570 non-null  int64  
 1   Dept                421570 non-null  int64  
 2   Date                421570 non-null  object 
 3   Weekly_Sales        421570 non-null  float64
 4   IsHoliday           421570 non-null  bool   
 5   Temperature         421570 non-null  float64
 6   Fuel_Price          421570 non-null  float64
 7   MarkDown1           421570 non-null  float64
 8   MarkDown2           421570 non-null  float64
 9   MarkDown3           421570 non-null  float64
 10  MarkDown4           421570 non-null  float64
 11  MarkDown5           421570 non-null  float64
 12  CPI                 421570 non-null  float64
 13  Unemployment        421570 non-null  float64
 14  Type                421570 non-null  int64  
 15  Size                421570 non-nul

In [4]:
from sklearn.model_selection import train_test_split

# Column to predict. Every other column except Date is used as a model input.
target = "Weekly_Sales"
features = [name for name in merged_df.columns if name not in (target, "Date")]

# Feature matrix and target vector over the full (unsplit) dataset.
X, y = merged_df[features], merged_df[target]

In [5]:
# This is an empty cell, potentially for future use or notes.

In [6]:
import pandas as pd

# Load the data
# Re-read the CSV so the Date conversion below starts from the raw file contents.
merged_df = pd.read_csv("/content/engineered_retail_dataset__for_ml.csv", sep=',')

# Chronological split: rows dated up to the 80th-percentile date are used for
# training; strictly later rows are held out for testing.
# Convert 'Date' to datetime objects to enable time-based comparison.
merged_df["Date"] = pd.to_datetime(merged_df["Date"])
split_date = merged_df["Date"].quantile(0.8)

# Build each boolean mask once and reuse it for both the features and the target.
train_mask = merged_df["Date"] <= split_date
test_mask = merged_df["Date"] > split_date

# A single .loc selection followed by .copy() yields independent frames, so
# later column assignments on X_train / X_test neither raise
# SettingWithCopyWarning nor write back into merged_df.
X_train = merged_df.loc[train_mask, features].copy()
y_train = merged_df.loc[train_mask, target].copy()
X_test = merged_df.loc[test_mask, features].copy()
y_test = merged_df.loc[test_mask, target].copy()

# Every row must land in exactly one partition; a row with a missing Date
# would satisfy neither mask and be dropped silently otherwise.
assert len(X_train) + len(X_test) == len(merged_df), (
    "train/test split does not cover every row (missing Date values?)"
)

# Print the shapes of the training and testing feature sets
print(X_train.shape, X_test.shape)

(338738, 32) (82832, 32)


In [7]:
# import necessary libraries for ML
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import xgboost as xgb
from xgboost import XGBRegressor

In [8]:
# Replace every Period-typed feature column with separate "<col>_year" and
# "<col>_month" integer columns, applied identically to train and test.
# The list of Period columns is collected up front (from X_train) because
# X_train / X_test are rebound inside the loop.
period_cols = [
    col for col in X_train.columns
    if isinstance(X_train[col].dtype, pd.PeriodDtype)
]

for col in period_cols:
    # Convert Period data to timestamps first so .dt.year / .dt.month apply.
    train_ts = X_train[col].dt.to_timestamp()
    test_ts = X_test[col].dt.to_timestamp()

    # drop() and assign() both return new frames, so nothing is written into
    # a slice of merged_df (the in-place column assignment used previously can
    # raise SettingWithCopyWarning). New columns are appended in the same
    # order as before: year, then month.
    X_train = X_train.drop(columns=[col]).assign(
        **{col + "_year": train_ts.dt.year, col + "_month": train_ts.dt.month}
    )
    X_test = X_test.drop(columns=[col]).assign(
        **{col + "_year": test_ts.dt.year, col + "_month": test_ts.dt.month}
    )

In [9]:
# Show the dtype of every training feature. The following cell splits the
# columns into categorical vs. numeric purely by dtype, so this is the view
# to check before trusting that split.
X_train.dtypes

Unnamed: 0,0
Store,int64
Dept,int64
IsHoliday,bool
Temperature,float64
Fuel_Price,float64
MarkDown1,float64
MarkDown2,float64
MarkDown3,float64
MarkDown4,float64
MarkDown5,float64


In [10]:
# Partition the training features by dtype: object- and category-typed columns
# are treated as categorical; every remaining column is treated as numeric.
# NOTE(review): integer-coded identifiers such as Store and Dept therefore end
# up in numeric_cols — confirm that is intended.
categorical_cols = [
    name
    for name, dtype in X_train.dtypes.items()
    if dtype == "object" or str(dtype).startswith("category")
]
numeric_cols = [name for name in X_train.columns if name not in categorical_cols]

In [11]:
# Numeric branch: fill missing values with the column median, then standardise.
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode; categories not seen during fitting are ignored at transform
# time instead of raising.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its branch.
preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, numeric_cols),
    ("cat", categorical_pipeline, categorical_cols),
])

In [12]:
# Candidate regressors keyed by the display name used when reporting results.
# The three ensemble models share n_estimators=100 and random_state=42.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42, n_estimators=100),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42, n_estimators=100),
    "XGBoost": XGBRegressor(
        objective="reg:squarederror",
        random_state=42,
        n_estimators=100,
    ),
}

In [13]:
# Evaluation metrics per model, keyed by the model's display name:
# {"<name>": {"RMSE": float, "R2": float}}
results = {}

# Fit and score every candidate model on the same train/test split.
for name, model in models.items():
    # clone() gives each pipeline its own unfitted copy of the preprocessing
    # steps. Without it, every pipeline would hold (and refit in place) the
    # single shared `preprocessor` object.
    pipe = Pipeline(steps=[
        ("preprocessor", clone(preprocessor)),
        ("model", model),
    ])

    # Train on the training partition, then predict the held-out partition.
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)

    # Root Mean Squared Error (same units as the target) and R-squared.
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)

    # Record and report the scores for this model.
    results[name] = {"RMSE": rmse, "R2": r2}
    print(f"{name}: RMSE={rmse:.2f}, R2={r2:.2f}")

Linear Regression: RMSE=3400.66, R2=0.98
Random Forest: RMSE=2986.33, R2=0.98
Gradient Boosting: RMSE=3202.19, R2=0.98
XGBoost: RMSE=3152.35, R2=0.98


## Conclusion
#### Baseline Results (no tuning; scored on the chronological hold-out set — the most recent ~20% of rows)

- Linear Regression → RMSE = 3400.66, R² = 0.98

- Random Forest → RMSE = 2986.33, R² = 0.98

- Gradient Boosting → RMSE = 3202.19, R² = 0.98

- XGBoost → RMSE = 3152.35, R² = 0.98

#### At baseline, Random Forest performed best (lowest RMSE). All four models round to R² = 0.98, so the ranking rests on RMSE alone.