<a href="https://colab.research.google.com/github/sultanmr/retail_demand_forecast/blob/main/Grocery_Sales_Forecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Initialize Libraries

In [None]:
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import gdown
import pickle
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error

required_packages = ["xgboost", "tensorflow", "mlflow", "pyngrok"]

for package in required_packages:
    try:
        __import__(package)
    except ImportError:
        print(f"{package} not found, installing now...")
        !pip install {package}


# Step 2: Loading Data Faster

In [None]:
# Published Google Sheets CSV exports, one per lookup table.
holidays_events_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQFI6-mQt7_iVBbY8XgYrv1Y0Qq-cVjY91K-N0CXGxVPDFZ0Vp41x5kyRoaEcGB836cjXvbT5zLB0nj/pub?gid=446609883&single=true&output=csv"
stores_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSS4Jk3_oWrRikXPVa73ZbSk48j4XN2NuyHij8HkH_68Ma_xmwLn-Omzmb_ka35vYmYp4gawr4LgygC/pub?gid=1518879967&single=true&output=csv"
items_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSMtQyDxmnC1bIiaznXtquMxYEe0A1rFin-CBFh2SZd2C7Tm9Qr8QxGbh1cI6XprZZ-2TTNv1oNsO_p/pub?gid=322268302&single=true&output=csv"
transactions_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQc3XGa4BZLwERgPXpykRX1XZjrX0MM-53xz-v17AycH-a-KNV1T_ZyNB-PQJBE0Ho6Z-Lr2k11HQNY/pub?gid=1224454227&single=true&output=csv"
oil_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vRamDmxeiATZkEI2Ywe5kGisXO4GXi7RWcE8a31MUpSzXJuOehZeb2RdKoEhO5ZEu8okaTebH4rQVWf/pub?gid=310218469&single=true&output=csv"

# Read the tables in the same order as before; the order of the URLs below
# matches the order of the target names on the left.
df_holidays_events, df_stores, df_items, df_transactions, df_oil = [
    pd.read_csv(csv_url)
    for csv_url in (holidays_events_url, stores_url, items_url, transactions_url, oil_url)
]

In [None]:
# Fetch the training extract from Google Drive into the working directory,
# then parse the downloaded file.
train_url = "https://drive.google.com/uc?id=1BUcG6vUSAmduBQS3_VlUsdQB_mQ_zdn2"
gdown.download(url=train_url, output="train.csv", quiet=True)
df_train = pd.read_csv("train.csv")

In [None]:
# Preview the first rows of the freshly loaded training data.
df_train.head()

# Step 2 (alternative): Loading All the Data from Google Drive

---



In [None]:
# Fallback loader: only runs when the oil table loaded earlier has no rows.
# It re-reads every table from a mounted Google Drive folder and rebuilds the
# filtered training extract from the full train.csv.
if len(df_oil) == 0:
  from google.colab import drive
  drive.mount('/content/drive')

  root_path = "/content/drive/MyDrive/ML/datasets/retail_kaggle_data/"

  # Lookup tables
  df_stores = pd.read_csv(root_path + 'stores.csv')
  df_items = pd.read_csv(root_path + 'items.csv')
  df_transactions = pd.read_csv(root_path + 'transactions.csv')
  df_oil = pd.read_csv(root_path + 'oil.csv')
  df_holidays_events = pd.read_csv(root_path + 'holidays_events.csv')

  # Store numbers whose 'state' is 'Pichincha'
  in_pichincha = df_stores['state'] == 'Pichincha'
  store_ids = df_stores.loc[in_pichincha, 'store_nbr'].unique()

  # Rows per chunk; adjust to the memory available
  chunk_size = 10 ** 6

  # Stream train.csv chunk by chunk and keep only the rows of the selected
  # stores, so the full file is never held in memory at once.
  train_reader = pd.read_csv(root_path + 'train.csv', chunksize=chunk_size)
  df_train = pd.concat(
      [piece[piece['store_nbr'].isin(store_ids)] for piece in train_reader],
      ignore_index=True,
  )

  # Keep only the rows dated before April 2014 (same cut-off as in the lessons)
  max_date = '2014-04-01'
  df_train = df_train[df_train['date'] < max_date]

  # Persist the filtered extract
  df_train.to_csv('train_filtered.csv', index=False, header=True)

# Step 2: Checking for Missing Data

In [None]:
# Number of missing entries in each column
df_train.isna().sum()

In [None]:
# A missing 'onpromotion' flag is treated as "not on promotion", then the
# column is stored as a plain boolean.
onpromotion_filled = df_train['onpromotion'].fillna(False)
df_train['onpromotion'] = onpromotion_filled.astype(bool)

# Step 3: Handling Outliers

In [None]:
# Capture the rows with negative sales (returns) before they are overwritten,
# so they stay available for inspection.
negative_sales = df_train[df_train['unit_sales'] < 0]

# Floor sales at 0 so returns count as "no sale". `clip` is the vectorised
# equivalent of applying max(x, 0) to every element.
df_train['unit_sales'] = df_train['unit_sales'].clip(lower=0)

negative_sales.head()  # Viewing negative sales for analysis

In [None]:
# Z-score of unit_sales within each (store, item) pair, computed with
# vectorised group transforms instead of a Python lambda per group.
# A group whose standard deviation is 0 or undefined (NaN, e.g. a single
# observation) is divided by 1, so its rows get a z-score of 0 rather than NaN.
sales_by_pair = df_train.groupby(['store_nbr', 'item_nbr'])['unit_sales']
pair_std = sales_by_pair.transform('std')
pair_std = pair_std.where(pair_std > 0, 1)
df_train['z_score'] = (df_train['unit_sales'] - sales_by_pair.transform('mean')) / pair_std

In [None]:
# Rows whose sales sit more than 5 standard deviations above their
# (store, item) mean.
outliers = df_train.loc[df_train['z_score'].gt(5)]
# Report how many were found, then show a sample
outlier_count = len(outliers)
print(f"Number of outliers detected: {outlier_count}")
outliers.head()

# Step 4: Fill missing dates with zero sales

In [None]:
# Parse the 'date' column into datetimes
df_train['date'] = pd.to_datetime(df_train['date'])

# First and last day present in the data
min_date, max_date = df_train['date'].min(), df_train['date'].max()

# One row per calendar day between those bounds
full_date_range = pd.date_range(start=min_date, end=max_date, freq='D').to_frame(index=False, name='date')

# Every (store, item) pair seen in the data, crossed with every day
store_item_combinations = df_train.loc[:, ['store_nbr', 'item_nbr']].drop_duplicates()
all_combinations = pd.merge(store_item_combinations, full_date_range, how='cross')

# Attach the observed rows; days without an observation get NaN columns
df_filled = pd.merge(all_combinations, df_train, how='left', on=['store_nbr', 'item_nbr', 'date'])

# A day with no recorded sale is treated as zero sales
df_filled['unit_sales'] = df_filled['unit_sales'].fillna(0)

# Quick look at the result
print(df_filled.head())

# Step 5: Feature Engineering

In [None]:
# Make sure 'date' holds datetimes before deriving calendar features
df_train['date'] = pd.to_datetime(df_train['date'])

# Calendar parts of the date, added as columns in this order:
# year, month, day, day_of_week (taken from .dt.dayofweek)
date_parts = df_train['date'].dt
calendar_features = (
    ('year', 'year'),
    ('month', 'month'),
    ('day', 'day'),
    ('day_of_week', 'dayofweek'),
)
for feature_name, dt_attr in calendar_features:
    df_train[feature_name] = getattr(date_parts, dt_attr)

In [None]:
# Rolling mean of unit_sales per (item, store) over the 7 *previous* rows.
# shift(1) excludes the current row: unit_sales of the current row is the
# prediction target, so including it in the window would leak the label into
# the feature. Note that window=7 counts observations, not calendar days.
# NOTE(review): assumes rows are in chronological order within each group.
df_train['unit_sales_7d_avg'] = df_train.groupby(['item_nbr', 'store_nbr'])['unit_sales'].transform(
    lambda x: x.shift(1).rolling(window=7).mean()
)

# Step 6: Visualizing Time-Series Data

# a) Sales Over Time (Aggregated)

In [None]:
# Total unit sales per calendar date
sales_by_date = df_train.groupby('date')['unit_sales'].sum()

# Line chart of the daily totals
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(sales_by_date.index, sales_by_date.values)
ax.set_title('Total Unit Sales Over Time in Pichincha state', fontsize=20, fontweight='bold')
ax.set_xlabel('Date', fontsize=16)
ax.set_ylabel('Unit Sales', fontsize=16)
ax.tick_params(axis='x', labelsize=14, labelrotation=45)
ax.tick_params(axis='y', labelsize=14)
plt.show()

# b) Sales Trend by Year and Month

In [None]:
# Total unit sales per (year, month), reshaped so rows are years and columns
# are months.
monthly_totals = df_train.groupby(['year', 'month'])['unit_sales'].sum()
sales_by_month = monthly_totals.unstack()

In [None]:


# Heatmap styling: diverging colormap, thin white grid between cells and a
# labelled colorbar.
heatmap_style = dict(
    cmap='coolwarm',
    linewidths=0.5,
    linecolor='white',
    cbar_kws={'label': 'Sales Volume'},
)

plt.figure(figsize=(8, 5))
sns.heatmap(sales_by_month, **heatmap_style)

# Title and axis labels (labelpad adds spacing between label and ticks)
plt.title('Monthly Sales Trends Over Years', fontsize=22, fontweight='bold')
plt.xlabel('Month', fontsize=18, labelpad=10)
plt.ylabel('Year', fontsize=18, labelpad=10)

# Tick labels: rotate the month labels for readability
plt.xticks(fontsize=14, rotation=45)
plt.yticks(fontsize=14)

# Tighten the layout, then render
plt.tight_layout()
plt.show()

# Step 7: Examining the Impact of Holidays

In [None]:
# Parse the holiday dates and report the first and last day covered
df_holidays_events['date'] = pd.to_datetime(df_holidays_events['date'])
holiday_days = df_holidays_events['date'].dt.date
print("Holidays range: from", holiday_days.min(), "till", holiday_days.max())

In [None]:
# Left-join the holiday table onto the sales rows by date. Sales rows on a
# date without a holiday entry keep NaN in the holiday columns; a date with
# several holiday entries produces one output row per entry.
df_train_holiday = df_train.merge(df_holidays_events, how='left', on='date')

In [None]:
# Average unit sales per holiday type, including days with no holiday.
# After the left merge, sales rows without a matching holiday have NaN in
# 'type', and groupby drops NaN keys by default, so those rows are labelled
# explicitly to keep the non-holiday baseline in the comparison.
holiday_type = df_train_holiday['type'].fillna('No Holiday')
holiday_sales = df_train_holiday['unit_sales'].groupby(holiday_type).mean()

# Plotting holiday impact
plt.figure(figsize=(8,5))
holiday_sales.plot(kind='bar', color='lightgreen', edgecolor='black')
plt.title('Impact of Holidays on Sales', fontsize=20, fontweight='bold')
plt.ylabel('Average Unit Sales', fontsize=16)
plt.xlabel('')
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()

# Step 8: Analyzing Perishable Items

In [None]:
# Attach the item attributes to every sales row and store the 'perishable'
# flag as a boolean.
df_train_items = df_train.merge(df_items, how='left', on='item_nbr')
df_train_items = df_train_items.astype({'perishable': bool})

In [None]:
# Total unit sales for non-perishable (False) and perishable (True) items
perishable_sales = df_train_items.groupby('perishable')['unit_sales'].sum()

# Bar chart with one bar per group
fig, ax = plt.subplots(figsize=(12, 6))
perishable_sales.plot(kind='bar', color=['orange', 'green'], edgecolor='black', ax=ax)
ax.set_title('Sales of Perishable vs Non-Perishable Items', fontsize=16)
ax.set_ylabel('Total Unit Sales', fontsize=16)
ax.set_xlabel('')
# Replace the False/True tick labels with readable names, kept horizontal
ax.set_xticks([0, 1])
ax.set_xticklabels(['Non-Perishable', 'Perishable'], fontsize=16, rotation=0)
ax.tick_params(axis='y', labelsize=14)
plt.show()

# Week 2 Goals!

# Top-3 item families (by number of items)

In [None]:
# The three families with the most items. value_counts() sorts by count in
# descending order by default; with sort=False the first three rows are not
# the three largest families.
df_items['family'].value_counts().head(3)

In [None]:
# The three families to keep
item_families = ['GROCERY I', 'BEVERAGES', 'CLEANING']

# Item numbers that belong to one of those families
family_mask = df_items['family'].isin(item_families)
item_ids = df_items.loc[family_mask, 'item_nbr'].unique()

# Restrict the training data to those items
df_train = df_train.loc[df_train['item_nbr'].isin(item_ids)]

In [None]:
# Keep only the rows dated before April 2014, the same cut-off used in the
# lessons of the sprint.
max_date = '2014-04-01'
before_cutoff = df_train['date'] < max_date
df_train = df_train[before_cutoff]

In [None]:
# Enrich every sales row with its store attributes (joined on 'store_nbr')
# and then its item attributes (joined on 'item_nbr'). Both are left joins,
# so no sales row is dropped.
df_train = (
    df_train
    .merge(df_stores, on='store_nbr', how='left')
    .merge(df_items, on='item_nbr', how='left')
)


In [None]:
# Preview the training data after the store and item attributes were merged in.
df_train.head()

# Split the dataset into training and testing portions.

In [None]:
# Chronological order first, so the split below separates past from future
df_train = df_train.sort_values(by='date')

# Row position that separates the first 80% from the last 20%
split_index = int(len(df_train) * 0.8)

# Earlier rows train the model, later rows test it
train_data, test_data = df_train.iloc[:split_index], df_train.iloc[split_index:]


# Split the target variable

In [None]:
# List the columns available before choosing which ones to drop as features.
train_data.columns

In [None]:
# Columns removed from the feature matrix: identifiers, the raw date, the
# target itself, and 'z_score'. 'z_score' is computed from the same row's
# unit_sales in the outlier step, so keeping it as a feature would leak the
# target into the model.
cols_to_drop = ['id', 'date', 'unit_sales', 'onpromotion', 'store_nbr', 'item_nbr', 'z_score']
label_col = 'unit_sales'

# Target and features for the training portion
y_train = train_data[label_col]
X_train = train_data.drop(cols_to_drop, axis=1)

# Target and features for the test portion
y_test = test_data[label_col]
X_test = test_data.drop(cols_to_drop, axis=1)

# Cleaning

In [None]:
# Rows without a full rolling window have NaN in the rolling average; use 0.
# Assign the result back instead of calling fillna(..., inplace=True) on a
# column selection: the in-place form acts on a temporary object and does not
# update the frame under pandas copy-on-write.
X_train['unit_sales_7d_avg'] = X_train['unit_sales_7d_avg'].fillna(0)
X_test['unit_sales_7d_avg'] = X_test['unit_sales_7d_avg'].fillna(0)

categorical_cols = ['city', 'state', 'type', 'family']

# Integer-encode each categorical column. Every encoder is fitted on the
# training data only and reused for the test data; `transform` raises a
# ValueError if the test data contains a label not seen during fitting.
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    X_train[col] = label_encoders[col].fit_transform(X_train[col])
    X_test[col] = label_encoders[col].transform(X_test[col])

# Initialize MLFlow

In [None]:
from pyngrok import ngrok

In [None]:
import os
from getpass import getpass

from pyngrok import ngrok
import mlflow

# The ngrok auth token is a credential and must not be stored in the notebook.
# Read it from the NGROK_AUTH_TOKEN environment variable, or prompt for it.
# NOTE(review): a token was previously committed in this cell; it should be
# revoked in the ngrok dashboard.
ngrok_auth_token = os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok auth token: ")
ngrok.set_auth_token(ngrok_auth_token)

# Stop any tunnel left over from an earlier run, then start the MLflow server
# in the background on port 5000, backed by a local directory.
ngrok.kill()
mlflow_storage_path = "/content/mlruns"
get_ipython().system_raw("mlflow server --backend-store-uri {} --host 0.0.0.0 --port 5000 &".format(mlflow_storage_path))

# Expose the server through an ngrok tunnel and show the public URL
mlflow_url = ngrok.connect(5000).public_url
print("MLflow Tracking UI:", mlflow_url)

# Runs are written straight to the same directory the server uses as its
# backend store, rather than going through the server's HTTP endpoint.
mlflow.set_tracking_uri(f"file:{mlflow_storage_path}")
mlflow.set_experiment("Grocery Sales Forecasting")

# XGBoost with RandomizedSearchCV

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
import mlflow
import mlflow.xgboost
from mlflow.models import infer_signature

# Candidate values for each XGBoost hyperparameter
param_dist = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.7, 0.8, 1.0],
    colsample_bytree=[0.7, 0.8, 1.0],
    gamma=[0.0, 0.1, 0.2],
    reg_alpha=[0.001, 0.01, 0.1],
    reg_lambda=[0.001, 0.01, 0.1],
)

# Base regressor whose hyperparameters are searched
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# 10 random combinations, each scored by 3-fold cross-validated negative MSE
search_settings = dict(
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
    n_jobs=2,
    random_state=42,
)
random_search = RandomizedSearchCV(xgb_model, param_dist, **search_settings)

# Run the search and keep the winning combination
random_search.fit(X_train, y_train)
best_params = random_search.best_params_


# Log model and metrics to MLflow

In [None]:
from mlflow.models import infer_signature

# Evaluate the best estimator found by the random search on the test set and
# record parameters, metrics and the model itself in one MLflow run.
with mlflow.start_run(run_name="XGBoost Model"):
    print(f"Training with best params: {best_params}")
    # best_estimator_ is already refitted on the whole training set
    xgb_model = random_search.best_estimator_
    y_pred_xgb = xgb_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred_xgb)
    rmse = np.sqrt(mse)

    mlflow.log_params(best_params)
    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("RMSE", rmse)

    # The input example is stored with the model, so log a handful of rows
    # rather than the whole test set.
    input_example = X_test.head(5)
    signature = infer_signature(X_train, xgb_model.predict(X_train))

    mlflow.xgboost.log_model(
        xgb_model,
        artifact_path= "XGBoost_Best_Model",
        signature=signature,
        input_example=input_example
    )

    print(f"Run logged to MLflow - RMSE: {rmse:.4f}")

# Also keep a local pickle of the fitted model
with open('xgb_model.pkl', 'wb') as file:
    pickle.dump(xgb_model, file)


print("Open MLflow UI here:", mlflow_url)

In [None]:
# RMSE of the XGBoost predictions on the test set (square root of the MSE)
mse_xgb_test = mean_squared_error(y_test, y_pred_xgb)
rmse = mse_xgb_test ** 0.5
print(f'Root Mean Squared Error (RMSE): {rmse:.4f}')

In [None]:
# Work on a copy: test_data is a slice of df_train, and assigning a column
# into a slice triggers pandas' SettingWithCopyWarning.
df_results = test_data.copy()
df_results['date'] = pd.to_datetime(df_results['date'])
df_results['actual_sales'] = y_test
df_results['predicted_sales'] = y_pred_xgb

# Only the date and the two sales columns are needed from here on
df_results = df_results[['date', 'actual_sales', 'predicted_sales']]
df_results = df_results.sort_values(by='date')

# Aggregate by month
# NOTE(review): pandas >= 2.2 deprecates the 'M' alias in favour of 'ME';
# switch once the minimum supported pandas version allows it.
df_monthly = df_results.resample('M', on='date').sum()

# Plot
plt.figure(figsize=(14, 6))
sns.set_style("whitegrid")

plt.plot(df_monthly.index, df_monthly['actual_sales'], label="Actual Sales", color="blue", marker='o', linestyle='dashed')
plt.plot(df_monthly.index, df_monthly['predicted_sales'], label="Predicted Sales", color="red", marker='s', linestyle='dashed')


plt.xlabel("Date (Monthly)")
plt.ylabel("Unit Sales")
plt.title("Actual vs. Predicted Unit Sales (Monthly Aggregation)")
plt.xticks(rotation=45)  # Rotate x-axis labels for readability
plt.legend()
plt.grid(True)
plt.show()


# LSTM

In [None]:
import tensorflow as tf
import mlflow
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterSampler
from pyngrok import ngrok
from mlflow.models import infer_signature

def create_lstm_model(units=50, learning_rate=0.001, input_shape=None):
    """Build and compile a single-layer LSTM regressor.

    Args:
        units: Number of LSTM units.
        learning_rate: Learning rate of the Adam optimizer.
        input_shape: ``(timesteps, features)`` of one input window. When
            omitted, it is taken from the global ``X_train_lstm`` array, which
            must exist by the time this function is called.

    Returns:
        A compiled Keras ``Sequential`` model (MSE loss) that outputs one
        value per input window.
    """
    if input_shape is None:
        input_shape = (X_train_lstm.shape[1], X_train_lstm.shape[2])

    model = Sequential([
        # return_sequences=False: emit only the last timestep's output, so the
        # Dense layer yields one prediction per window. With True, the model
        # would emit one prediction per timestep, which matches neither the
        # one-value-per-window targets nor the 2-D array that
        # scaler.inverse_transform expects.
        LSTM(units, activation='relu', return_sequences=False, input_shape=input_shape),
        Dense(1)
    ])
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse')

    return model

In [None]:
#RELOADING DATA (IN CASE OF CRASH)
import os
import gdown

# Local file that holds the training extract
train_file = "train.csv"

# Download the file only when it is not already on disk
if os.path.exists(train_file):
    print("train.csv already exists. Skipping download.")
else:
    print("train.csv not found. Downloading...")
    train_url = "https://drive.google.com/uc?id=1BUcG6vUSAmduBQS3_VlUsdQB_mQ_zdn2"
    gdown.download(train_url, train_file, quiet=True)
    print("Download complete.")
df_train = pd.read_csv(train_file)

# Chronological order, then an 80/20 split by row position
df_train = df_train.sort_values(by='date')
split_index = int(len(df_train) * 0.8)

df_train['date'] = pd.to_datetime(df_train['date'])
train_data, test_data = df_train.iloc[:split_index], df_train.iloc[split_index:]

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Both portions in chronological order
train_data, test_data = (
    frame.sort_values(by="date") for frame in (train_data, test_data)
)

# Scale unit_sales into [0, 1]. The scaler learns its range from the training
# portion only and is then applied unchanged to both portions.
scaler = MinMaxScaler(feature_range=(0,1))
scaler.fit(train_data[['unit_sales']])
train_data['unit_sales_scaled'] = scaler.transform(train_data[['unit_sales']])
test_data['unit_sales_scaled'] = scaler.transform(test_data[['unit_sales']])

def create_sequences(data, seq_length=10):
    """Turn a series into sliding windows and their next-step targets.

    Window ``i`` is ``data[i:i + seq_length]`` and its target is
    ``data[i + seq_length]``. The windows are built with a single vectorised
    indexing operation instead of a Python loop over every position.

    Args:
        data: Array-like of observations, indexed along its first axis.
        seq_length: Number of consecutive observations per window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: ``X`` holds
        ``len(data) - seq_length`` windows and ``y`` the value following each
        window. Both are empty 1-D arrays when ``data`` is not longer than
        ``seq_length``.
    """
    data = np.asarray(data)
    n_windows = len(data) - seq_length
    if n_windows <= 0:
        return np.array([]), np.array([])

    # Row i of window_idx holds the positions i, i+1, ..., i+seq_length-1
    window_idx = np.arange(n_windows)[:, None] + np.arange(seq_length)[None, :]
    # Copy the targets so the result does not share memory with the input
    return data[window_idx], data[seq_length:].copy()

# Number of past observations fed to the model for each prediction
seq_length = 10

# Windows and next-step targets from the scaled series of each portion
train_scaled = train_data['unit_sales_scaled'].values
test_scaled = test_data['unit_sales_scaled'].values
X_train_lstm, y_train_lstm = create_sequences(train_scaled, seq_length)
X_test_lstm, y_test_lstm = create_sequences(test_scaled, seq_length)

# Add a trailing feature axis: (samples, timesteps) -> (samples, timesteps, 1)
n_train_windows, n_train_steps = X_train_lstm.shape
X_train_lstm = X_train_lstm.reshape(n_train_windows, n_train_steps, 1)
n_test_windows, n_test_steps = X_test_lstm.shape
X_test_lstm = X_test_lstm.reshape(n_test_windows, n_test_steps, 1)

In [None]:
# Build the model and train it for one epoch, validating on the test windows
lstm_model = create_lstm_model()
fit_options = dict(
    epochs=1,
    batch_size=128,
    validation_data=(X_test_lstm, y_test_lstm),
    verbose=1,
)
lstm_model.fit(X_train_lstm, y_train_lstm, **fit_options)

# Predictions for the test windows, still in the scaled [0, 1] range
scaled_predictions = lstm_model.predict(X_test_lstm)

# Map predictions and targets back to unit sales.
# Note: this rebinds `y_test`, which held the XGBoost test target before.
y_pred_lstm = scaler.inverse_transform(scaled_predictions)
y_test = scaler.inverse_transform(y_test_lstm.reshape(-1, 1))

In [None]:
#_, y_test2 = create_sequences(test_data['unit_sales_scaled'].values, seq_length)
import matplotlib.pyplot as plt
# Dates of the rows that have a prediction: the first `seq_length` rows only
# serve as history for the first window.
lstm_test_dates = test_data['date'][seq_length:]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(lstm_test_dates, y_test, label="Actual Sales")
ax.plot(lstm_test_dates, y_pred_lstm, label="Predicted Sales", linestyle="dashed")
ax.set_xlabel("Date")
ax.set_ylabel("Unit Sales")
ax.set_title("LSTM Forecasting for Unit Sales")
ax.legend()
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error

# Record the LSTM architecture and its test error in an MLflow run.
with mlflow.start_run(run_name="Best LSTM Model"):

    for layer in lstm_model.layers:
        if hasattr(layer, "units"):
            mlflow.log_param(f"{layer.name}_units", layer.units)
        if hasattr(layer, "activation"):
            # Log the activation's name rather than the function object's repr
            activation = layer.activation
            mlflow.log_param(
                f"{layer.name}_activation",
                getattr(activation, "__name__", str(activation)),
            )

    # y_pred_lstm is already back in unit-sales scale while y_test_lstm is
    # still scaled to [0, 1]; undo the scaling on the targets so both sides
    # of the error are in the same units.
    y_true_lstm = scaler.inverse_transform(y_test_lstm.reshape(-1, 1))
    mse = mean_squared_error(y_true_lstm, y_pred_lstm)
    rmse = np.sqrt(mse)
    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("RMSE", rmse)

# Also keep a local pickle of the fitted model
with open('lstm_model.pkl', 'wb') as file:
    pickle.dump(lstm_model, file)


In [None]:
# Display the RMSE computed in the LSTM logging cell above.
rmse

# Compare LSTM vs. XGBoost

In [None]:

# XGBoost RMSE, taken from df_results, which still holds the actual and
# predicted sales of the XGBoost test set. `y_test` cannot be used here: it
# was rebound to the LSTM targets, whose length differs from y_pred_xgb.
# np.sqrt is used instead of `squared=False`, which newer scikit-learn
# versions no longer accept.
rmse_xgb = np.sqrt(
    mean_squared_error(df_results['actual_sales'], df_results['predicted_sales'])
)

# LSTM RMSE in unit-sales scale: undo the scaling on the targets so they are
# comparable with the already unscaled predictions.
rmse_lstm = np.sqrt(
    mean_squared_error(scaler.inverse_transform(y_test_lstm.reshape(-1, 1)), y_pred_lstm)
)

print(f'XGBoost RMSE: {rmse_xgb:.4f}')
print(f'LSTM RMSE: {rmse_lstm:.4f}')

In [None]:
# The two models were evaluated on different test frames (XGBoost on the
# filtered, feature-engineered rows kept in df_results; the LSTM on the
# reloaded train.csv), so their row-level predictions have different lengths
# and cannot share one x-axis. Each model is therefore shown against its own
# actuals, aggregated to daily totals, in its own panel.
xgb_daily = df_results.groupby('date')[['actual_sales', 'predicted_sales']].sum()

lstm_daily = (
    pd.DataFrame({
        'date': test_data['date'].iloc[seq_length:].to_numpy(),
        'actual_sales': np.ravel(scaler.inverse_transform(y_test_lstm.reshape(-1, 1))),
        'predicted_sales': np.ravel(y_pred_lstm),
    })
    .groupby('date')[['actual_sales', 'predicted_sales']]
    .sum()
)

fig, (ax_xgb, ax_lstm) = plt.subplots(2, 1, figsize=(14, 10), sharex=True)
panels = (
    (ax_xgb, xgb_daily, "XGBoost Predictions", "red", 's'),
    (ax_lstm, lstm_daily, "LSTM Predictions", "green", '^'),
)
for ax, daily, pred_label, pred_color, pred_marker in panels:
    ax.plot(daily.index, daily['actual_sales'], label="Actual Sales", color="blue", marker='o', linestyle='dashed')
    ax.plot(daily.index, daily['predicted_sales'], label=pred_label, color=pred_color, marker=pred_marker, linestyle='dashed')
    ax.set_ylabel("Unit Sales")
    ax.legend()
    ax.grid(True)

# Formatting
ax_lstm.set_xlabel("Date")
fig.suptitle("Actual vs. Predicted Sales (XGBoost vs LSTM)")

plt.show()


# Hyperparameter Tuning

In [None]:
# Fresh base regressor for the wider search
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Wider candidate grids than in the first search
param_dist = dict(
    n_estimators=np.arange(100, 1000, 100),
    max_depth=np.arange(3, 12, 1),
    learning_rate=np.linspace(0.01, 0.3, 10),
    subsample=np.linspace(0.5, 1.0, 5),
    colsample_bytree=np.linspace(0.5, 1.0, 5),
    gamma=np.linspace(0, 0.5, 5),
    reg_alpha=np.logspace(-3, 1, 5),
    reg_lambda=np.logspace(-3, 1, 5),
)

# 25 random combinations, 3-fold cross-validation, all available cores
tuning_settings = dict(
    n_iter=25,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42,
)
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    **tuning_settings,
)

random_search.fit(X_train, y_train)

print("Best hyperparameters:", random_search.best_params_)

In [None]:

best_params = random_search.best_params_

# Train the final XGBoost model with the best parameters
final_xgb_model = xgb.XGBRegressor(**best_params, objective='reg:squarederror', random_state=42)
final_xgb_model.fit(X_train, y_train)

# Make predictions
y_pred = final_xgb_model.predict(X_test)

# `y_test` was rebound to the LSTM targets in the LSTM section, so it no
# longer matches X_test. Restore the XGBoost test target from df_results,
# selecting by X_test's index labels so the rows line up with y_pred.
y_test = df_results.loc[X_test.index, 'actual_sales']

# Evaluate performance
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Final Model RMSE: {rmse:.4f}")


In [None]:
# Scatter of actual against predicted sales; points on the dashed red
# diagonal are perfect predictions.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.6, ax=ax)
ax.set_xlabel("Actual Unit Sales")
ax.set_ylabel("Predicted Unit Sales")
ax.set_title("Actual vs. Predicted Unit Sales")
ax.axline([0, 0], slope=1, color='red', linestyle="--")
plt.show()
