<a href="https://colab.research.google.com/github/summerolmstead/Sales-Prediction/blob/main/Pizza_Sales_ML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Team Project | Pizza Prediction Sales

Summer, Jason, Victoria, Regan

#Importing Data in

In [151]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("rhonarosecortez/pizza-sales-dataset") #importing from kaggle

print("Path to dataset files:", path) # path to the file

Path to dataset files: /root/.cache/kagglehub/datasets/rhonarosecortez/pizza-sales-dataset/versions/2


In [152]:
import os
dataset_files = os.listdir(path)
print("Dataset files:", dataset_files)

Dataset files: ['Pizza Sales Dataset.csv']


In [153]:
import pandas as pd
csv_file_path = os.path.join(path, 'Pizza Sales Dataset.csv')
df = pd.read_csv(csv_file_path)

print(df.head()) # seeing structure of data

   pizza_id  order_id  pizza_name_id  quantity order_date order_day  \
0         1         1     hawaiian_m         1   1/1/2015  Thursday   
1         2         2  classic_dlx_m         1   1/1/2015  Thursday   
2         3         2  five_cheese_l         1   1/1/2015  Thursday   
3         4         2    ital_supr_l         1   1/1/2015  Thursday   
4         5         2     mexicana_m         1   1/1/2015  Thursday   

  order_time  unit_price  total_price pizza_size pizza_category  \
0   11:38:36       13.25        13.25          M        Classic   
1   11:57:40       16.00        16.00          M        Classic   
2   11:57:40       18.50        18.50          L         Veggie   
3   11:57:40       20.75        20.75          L        Supreme   
4   11:57:40       16.00        16.00          M         Veggie   

                                   pizza_ingredients  \
0           Sliced Ham, Pineapple, Mozzarella Cheese   
1  Pepperoni, Mushrooms, Red Onions, Red Peppers,...   
2 

In [154]:
# Print all unique values in the 'order_day' column
print(df['order_day'].unique())

# Print the first few rows of the dataframe to inspect
print(df.head())

['Thursday' 'Friday' 'Saturday' 'Sunday' 'Monday' 'Tuesday' 'Wednesday']
   pizza_id  order_id  pizza_name_id  quantity order_date order_day  \
0         1         1     hawaiian_m         1   1/1/2015  Thursday   
1         2         2  classic_dlx_m         1   1/1/2015  Thursday   
2         3         2  five_cheese_l         1   1/1/2015  Thursday   
3         4         2    ital_supr_l         1   1/1/2015  Thursday   
4         5         2     mexicana_m         1   1/1/2015  Thursday   

  order_time  unit_price  total_price pizza_size pizza_category  \
0   11:38:36       13.25        13.25          M        Classic   
1   11:57:40       16.00        16.00          M        Classic   
2   11:57:40       18.50        18.50          L         Veggie   
3   11:57:40       20.75        20.75          L        Supreme   
4   11:57:40       16.00        16.00          M         Veggie   

                                   pizza_ingredients  \
0           Sliced Ham, Pineapple, Mozzar

# Basic Check of the Data

In [155]:
df.describe() # seeing summary statistics

Unnamed: 0,pizza_id,order_id,quantity,unit_price,total_price
count,48620.0,48620.0,48620.0,48620.0,48620.0
mean,24310.5,10701.479761,1.019622,16.494132,16.821474
std,14035.529381,6180.11977,0.143077,3.621789,4.437398
min,1.0,1.0,1.0,9.75,9.75
25%,12155.75,5337.0,1.0,12.75,12.75
50%,24310.5,10682.5,1.0,16.5,16.5
75%,36465.25,16100.0,1.0,20.25,20.5
max,48620.0,21350.0,4.0,35.95,83.0


# Data Processing

In [156]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from datetime import timedelta

# Preprocess the date-related columns
df['order_date'] = pd.to_datetime(df['order_date'])
df['order_day'] = pd.to_datetime(df['order_day'], format='%A').dt.dayofweek

# Feature engineering for model input
df['month'] = df['order_date'].dt.month
df['day'] = df['order_date'].dt.day
df['year'] = df['order_date'].dt.year
df['weekday'] = df['order_date'].dt.weekday

# **New Step**: Aggregate pizza sales by pizza type and date
pizza_sales_by_day = df.groupby(['order_date', 'pizza_name']).agg(
    total_sales=('total_price', 'sum'),
    total_quantity=('quantity', 'sum'),
    month=('month', 'first'),
    day=('day', 'first'),
    year=('year', 'first'),
    weekday=('weekday', 'first')
).reset_index()

# **New Step**: Create lag features based on past month's sales
pizza_sales_by_day['lag_1'] = pizza_sales_by_day['total_sales'].shift(1)  # Lag 1: Previous month's sales
pizza_sales_by_day['lag_2'] = pizza_sales_by_day['total_sales'].shift(2)  # Lag 2: Two months back sales

# Fill NA values resulting from shifting (they'll appear at the start of the dataset)
pizza_sales_by_day.fillna(0, inplace=True)

# **New Step**: Create pizza ingredient mapping
pizza_ingredients_map = df.groupby('pizza_name')['pizza_ingredients'].apply(lambda x: ', '.join(x.unique())).to_dict()

# Prepare the feature matrix (X) and target variable (y) for predicting total sales and ingredients
X = pizza_sales_by_day[['month', 'day', 'year', 'weekday', 'total_quantity', 'lag_1', 'lag_2']]  # Added lags
y_sales = pizza_sales_by_day['total_sales']  # Target: total sales

# **New Step**: Prepare ingredient predictions
ingredient_predictions = []

for _, row in pizza_sales_by_day.iterrows():
    pizza_name = row['pizza_name']
    ingredients = pizza_ingredients_map.get(pizza_name, '')
    ingredient_predictions.append({'order_date': row['order_date'], 'pizza_name': pizza_name, 'ingredients': ingredients})

ingredient_df = pd.DataFrame(ingredient_predictions)

# Print the first few rows to verify
print("Pizza Sales by Day (with Lags):")
print(pizza_sales_by_day.head())
print("\nIngredient Predictions:")
print(ingredient_df.head())

Pizza Sales by Day (with Lags):
  order_date                    pizza_name  total_sales  total_quantity  \
0 2015-01-01    The Barbecue Chicken Pizza       204.25              11   
1 2015-01-01            The Big Meat Pizza        60.00               5   
2 2015-01-01           The Calabrese Pizza        16.25               1   
3 2015-01-01  The California Chicken Pizza        71.00               4   
4 2015-01-01     The Chicken Alfredo Pizza        29.50               2   

   month  day  year  weekday   lag_1   lag_2  
0      1    1  2015        3    0.00    0.00  
1      1    1  2015        3  204.25    0.00  
2      1    1  2015        3   60.00  204.25  
3      1    1  2015        3   16.25   60.00  
4      1    1  2015        3   71.00   16.25  

Ingredient Predictions:
  order_date                    pizza_name  \
0 2015-01-01    The Barbecue Chicken Pizza   
1 2015-01-01            The Big Meat Pizza   
2 2015-01-01           The Calabrese Pizza   
3 2015-01-01  The Californ

Hyper Parameter Tuning

In [157]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Define the parameter distribution
param_dist = {
    'learning_rate': np.arange(0.01, 0.2, 0.01),
    'max_depth': [3, 5, 7, 10],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

# Define the model
model = xgb.XGBRegressor(objective='reg:squarederror')

# RandomizedSearchCV with detailed output
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist,
                                   n_iter=10, cv=5, scoring='neg_mean_absolute_error',
                                   random_state=42, n_jobs=-1, verbose=3)

# Fit the model
random_search.fit(X, y_sales)  # X and y_sales are your features and target for sales prediction

# Get the best hyperparameters
print(f"Best hyperparameters: {random_search.best_params_}")

# Use the best model
best_model = random_search.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


KeyboardInterrupt: 

Training Model

In [None]:
# Train-test split for sales forecasting
X_train, X_test, y_train, y_test = train_test_split(X, y_sales, test_size=0.2, random_state=42)

# Train XGBoost model for total sales forecasting
model_sales = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=200, max_depth=5, learning_rate=0.13)
model_sales.fit(X_train, y_train)

# Predicting on test data for sales
y_pred_sales = model_sales.predict(X_test)
mae_sales = mean_absolute_error(y_test, y_pred_sales)
print(f'Mean Absolute Error for Sales Prediction: {mae_sales}')

# Forecasting the next 30 days of pizza sales
future_dates = pd.date_range(pizza_sales_by_day['order_date'].max(), periods=31, freq='D')[1:]  # Next 30 days
future_features = pd.DataFrame({
    'month': future_dates.month,
    'day': future_dates.day,
    'year': future_dates.year,
    'weekday': future_dates.weekday,
    'total_quantity': np.zeros(30),  # You can set this to zeros or an estimated value
    'lag_1': np.zeros(30),  # You can set this based on previous month's prediction
    'lag_2': np.zeros(30),  # Similarly, based on two months prior
})

# Predict future sales
future_sales = model_sales.predict(future_features)

# Prepare a DataFrame for the future sales predictions
future_sales_df = pd.DataFrame({
    'order_date': future_dates,
    'predicted_total_sales': future_sales
})

# Display the predicted future sales
print(future_sales_df)

# Step 1: Calculate the historical sales for each pizza type because we want dynamic percentage points
# Assuming you have a DataFrame `pizza_sales_by_day` with 'pizza_name' and 'sales' columns
pizza_sales_by_pizza = pizza_sales_by_day.groupby('pizza_name')['total_sales'].sum()  # Use 'total_sales' column

# Calculate the total sales across all pizzas
total_sales = pizza_sales_by_pizza.sum()

# Now calculate the percentage of total sales for each pizza
pizza_sales_percentage = pizza_sales_by_pizza / total_sales

# Check the calculated percentages for each pizza
print(pizza_sales_percentage)

# Predict ingredients for the future dates
ingredient_predictions_future = []

for date in future_dates:
    # Get the sales forecast for each pizza type
    daily_ingredients = []
    predicted_sales = future_sales_df.loc[future_sales_df['order_date'] == date, 'predicted_total_sales'].values[0]

    # Loop over each pizza in the historical pizza list
    for pizza_name in pizza_ingredients_map:
        # Get the historical sales percentage for this pizza
        pizza_percentage = pizza_sales_percentage.get(pizza_name, 0)  # Default to 0 if not found

        # Predicted sales for this pizza is the historical percentage * predicted total sales
        predicted_sales_for_pizza = pizza_percentage * predicted_sales

        ingredients = pizza_ingredients_map.get(pizza_name, '')
        if ingredients:
            daily_ingredients.append(f"{pizza_name}: {ingredients} ({pizza_percentage:.2%} of sales, {predicted_sales_for_pizza:.2f} sales)")

    ingredient_predictions_future.append({'date': date, 'ingredients': ', '.join(daily_ingredients)})

# Create a DataFrame to view the ingredient predictions for future dates
ingredient_df_future = pd.DataFrame(ingredient_predictions_future)

# Display the predicted ingredients
print(ingredient_df_future)

In [None]:
# Check the columns of future_sales_df to see if the date information is already in a single column
print(future_sales_df.columns)


In [None]:
# Convert 'order_date' column to datetime format
future_sales_df['order_date'] = pd.to_datetime(future_sales_df['order_date'])
# Merge sales data with ingredient predictions for future dates (e.g., January 2016)
merged_df = pd.merge(future_sales_df[['order_date', 'predicted_total_sales']], ingredient_df_future, left_on='order_date', right_on='date', how='left')

# Drop the 'date' column since it's redundant after merging
merged_df = merged_df.drop(columns=['date'])

# Display the merged data (sales and ingredient predictions)
print(merged_df)

In [None]:
import matplotlib.pyplot as plt

# Plot predicted sales for January 2016
plt.figure(figsize=(10, 6))
plt.bar(merged_df['order_date'], merged_df['predicted_total_sales'], color='skyblue')
plt.xlabel('Date')
plt.ylabel('Predicted Total Sales')
plt.title('Predicted Total Sales for January 2016')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Ensure the 'order_date' column is in datetime format
pizza_sales_by_day['order_date'] = pd.to_datetime(pizza_sales_by_day['order_date'])

# Add month and year columns for grouping
pizza_sales_by_day['month'] = pizza_sales_by_day['order_date'].dt.month
pizza_sales_by_day['year'] = pizza_sales_by_day['order_date'].dt.year

# Filter the actual sales data for 2015
actual_sales_2015 = pizza_sales_by_day[pizza_sales_by_day['year'] == 2015].groupby(['month']).agg(
    actual_sales=('total_sales', 'mean')  # Using total_sales as actual sales
).reset_index()

# Filter the predicted sales data for 2016
predicted_sales_2016 = merged_df[merged_df['year'] == 2016].groupby(['month']).agg(
    predicted_sales=('predicted_total_sales', 'mean')
).reset_index()

# Merge the two datasets on the month column to plot them together
monthly_sales = pd.merge(actual_sales_2015, predicted_sales_2016, on='month', how='outer')

# Plot the monthly average sales for 2015 and 2016
plt.figure(figsize=(12, 6))

# Plot actual sales for 2015
plt.plot(monthly_sales['month'], monthly_sales['actual_sales'], label='Actual Sales (2015)', color='blue', marker='o')

# Plot predicted sales for 2016
plt.plot(monthly_sales['month'], monthly_sales['predicted_sales'], label='Predicted Sales (2016)', color='red', marker='o')

# Customize the plot
plt.xlabel('Month')
plt.ylabel('Average Sales')
plt.title('Monthly Average Sales: 2015 vs Predicted 2016')
plt.xticks(range(1, 13))  # Set x-axis to show month numbers
plt.legend()
plt.tight_layout()
plt.grid(True)
plt.show()
