# Import libraries & load data

In [2]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression

import joblib

In [3]:
# Read the raw transaction export from the working directory. The bare `df` on
# the last line makes the notebook render a truncated preview of the frame.
df = pd.read_csv("online_retail.csv")
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


# data cleaning 

In [4]:
# Print each column's dtype and non-null count to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [5]:
# Count the missing values in every column.
df.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [6]:
# CustomerID is not used anywhere later in this notebook, so drop it.
# errors="ignore" keeps the cell re-runnable once the column is already gone.
df = df.drop(columns="CustomerID", errors="ignore")

# After the drop, Description is the only column reported with missing values.
# Fill it with a text placeholder so the column stays string-typed; a blanket
# fillna(0) would mix integer zeros into a text column and would silently turn
# a missing InvoiceDate into the 1970 epoch once it is parsed as a date.
df = df.fillna({"Description": "UNKNOWN"})


In [7]:
# Keep only rows with a strictly positive quantity and unit price
# (non-positive values are presumably returns/cancellations/adjustments — verify
# against the data source).
# .copy() gives an independent frame, so adding columns to it later does not
# write onto a slice of the unfiltered data (SettingWithCopyWarning).
df = df.loc[(df["Quantity"] > 0) & (df["UnitPrice"] > 0)].copy()

In [8]:
# Parse the timestamp strings and add the line revenue (quantity x unit price).
# assign() builds a new frame rather than writing columns onto the filtered
# slice, so the cell raises no SettingWithCopyWarning and can be re-run safely.
df = df.assign(
    InvoiceDate=pd.to_datetime(df["InvoiceDate"]),
    Sales=df["Quantity"] * df["UnitPrice"],
)

# Daily sales compilation for each product

In [9]:
# Calendar day of every invoice line, named "Date" so the grouped output
# already carries the final column name.
invoice_day = df["InvoiceDate"].dt.date.rename("Date")

# One row per (product, day) holding the total quantity sold on that day.
daily_sales = (
    df.groupby(["StockCode", invoice_day])["Quantity"]
    .sum()
    .rename("DailyQuantity")
    .reset_index()
)

# The group keys are plain `date` objects; convert them back to datetimes so
# the .dt accessor works on the column.
daily_sales["Date"] = pd.to_datetime(daily_sales["Date"])

# Feature Engineering

In [10]:
# Order rows chronologically inside each product so that shift/rolling always
# look backwards in time.
daily_sales = daily_sales.sort_values(["StockCode", "Date"])

qty_by_product = daily_sales.groupby("StockCode")["DailyQuantity"]

# NOTE: lags and windows count *rows*. daily_sales only has a row for a day on
# which the product was actually sold, so "lag_7" means "seven selling days
# ago", which can span more than seven calendar days.
daily_sales = daily_sales.assign(
    lag_1=qty_by_product.shift(1),  # quantity on the previous selling day
    lag_7=qty_by_product.shift(7),  # quantity seven selling days earlier
    # Mean of the 7 selling days before the current row. Both the shift and the
    # window are applied inside each product's own series, so a window can never
    # mix quantities from two different StockCodes. shift(1) excludes the
    # current day's quantity (the target) from its own feature.
    rolling_7=qty_by_product.transform(
        lambda qty: qty.shift(1).rolling(7).mean()
    ),
    day_of_week=daily_sales["Date"].dt.dayofweek,
    month=daily_sales["Date"].dt.month,
)

# Rows without a full lag/rolling history contain NaN and cannot be used for
# training; drop them.
model_df = daily_sales.dropna()


# Preparing X & y

In [11]:
# Model inputs: the lagged quantities plus two calendar features.
features = [
    "lag_1",
    "lag_7",
    "rolling_7",
    "day_of_week",
    "month",
]

# Feature matrix and target vector share model_df's index.
X = model_df.loc[:, features]
y = model_df.loc[:, "DailyQuantity"]

# Train / Test Split

In [12]:
# Cut-off date: the 80th percentile of the row dates. Rows dated on or before
# it are used for training, rows after it for testing.
split_date = model_df["Date"].quantile(0.8)

# Build each boolean mask once and reuse it for both the features and the target.
in_train = model_df["Date"] <= split_date
in_test = model_df["Date"] > split_date

X_train, X_test = X[in_train], X[in_test]
y_train, y_test = y[in_train], y[in_test]

# A standard train_test_split would shuffle the rows randomly, so observations from the future could end up in the training set.

# That leakage would make the evaluation unrealistically optimistic, because the model would be learning from data dated after the period it is asked to predict.

# Model training (RandomForestRegressor, XGBRegressor, LinearRegression)

In [13]:
# Candidate regressors, keyed by the label used when reporting results.
# Insertion order is the order in which they are trained and printed.
models = dict(
    RandomForest=RandomForestRegressor(
        n_estimators=100, max_depth=10, random_state=42, n_jobs=-1
    ),
    XGBoost=XGBRegressor(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    ),
    LinearRegression=LinearRegression(),
)


In [14]:
def _fit_and_score(estimator):
    """Fit `estimator` on the training split and return (MAE, RMSE) on the test split."""
    estimator.fit(X_train, y_train)
    predicted = estimator.predict(X_test)
    return (
        mean_absolute_error(y_test, predicted),
        np.sqrt(mean_squared_error(y_test, predicted)),
    )


# Train every candidate, keeping the fitted estimator next to its test metrics.
results = {}
for name, model in models.items():
    mae, rmse = _fit_and_score(model)
    results[name] = dict(model=model, MAE=mae, RMSE=rmse)
    print(f"{name} → MAE: {mae:.2f}, RMSE: {rmse:.2f}")


RandomForest → MAE: 18.69, RMSE: 60.18
XGBoost → MAE: 18.40, RMSE: 60.35
LinearRegression → MAE: 19.63, RMSE: 60.14


In [24]:
# Choose the model with the lowest test MAE (the first one wins on a tie).
mae_by_model = {label: entry["MAE"] for label, entry in results.items()}
best_model_name = min(mae_by_model, key=mae_by_model.get)
best_model = results[best_model_name]["model"]

# Save the model

In [25]:
# Persist the selected estimator to disk. joblib is already imported in the
# notebook's import cell, so no extra import is needed here.
# compress=3 sets joblib's compression level for the written file.
# The call returns the list of files written, which the notebook echoes.
joblib.dump(
    best_model,
    "sales_forecast_model.joblib",
    compress=3,
)


['sales_forecast_model.joblib']