In [1]:
!pip install pandas
!pip install pystan==2.19.1.1
!pip install prophet
!pip install boto3

Collecting pystan==2.19.1.1
  Using cached pystan-2.19.1.1.tar.gz (16.2 MB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting Cython!=0.25.1,>=0.22 (from pystan==2.19.1.1)
  Using cached cython-3.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.5 kB)
Using cached cython-3.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
Building wheels for collected packages: pystan
  Building wheel for pystan (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[3 lines of output][0m
  [31m   [0m   self.version = node.value.s
  [31m   [0m Cython>=0.22 and NumPy are required.
  [31m   [0m [31m[end of output][0m
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
[31m  ERROR: Failed building wheel for pystan[0m[31m
[

In [2]:
import pandas as pd
from prophet import Prophet
import boto3
import numpy as np
import os

In [3]:
bucket = 'thedogspaw-small-forecast-data'  # <--- change to your S3 bucket

# Connect to S3
s3 = boto3.client('s3')

def s3_get(key, local):
    s3.download_file(bucket, key, local)
    
s3_get('datasets/thedogspaw_phppos_sales.csv',         '/tmp/sales.csv')
s3_get('datasets/thedogspaw_phppos_sales_items.csv',   '/tmp/sales_items.csv')
s3_get('datasets/thedogspaw_phppos_variations_combined.csv', '/tmp/variations_combined.csv')

In [7]:
sales_df = pd.read_csv(
    '/tmp/sales.csv',
    usecols=['sale_id', 'sale_time',  'location_id'],
    parse_dates=['sale_time'],
    low_memory=False
)

sales_items_df = pd.read_csv(
    '/tmp/sales_items.csv',
    usecols=['sale_id', 'item_variation_id', 'quantity_purchased'],
    low_memory=False
)

variations_combined_df = pd.read_csv('/tmp/variations_combined.csv')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)   # set to None to show all rows if needed

display(sales_items_df.head())

Unnamed: 0,sale_id,item_variation_id,quantity_purchased
0,1,,1.0
1,2,,1.0
2,3,,1.0
3,4,,1.0
4,5,,1.0


In [6]:
print(sales_df.columns)
print(sales_items_df.columns)

Index(['sale_time', 'sale_id', 'location_id'], dtype='object')
Index(['sale_id', 'item_variation_id', 'quantity_purchased'], dtype='object')


## No preprocessing columns needed for sales.csv and sales_items.csv NaN columns to Str
## Since we dont even load them to the notebook

# Variations_combined is good, but we also need the data from sales_items_df and sales_df

## We take quantity_purchased from sales_items_df We take sale_time from sales_df

In [8]:
# Only join sale_time (not location_id)
sales_items_with_time = sales_items_df.merge(
    sales_df[['sale_id', 'sale_time']],
    on='sale_id',
    how='left'
)

In [11]:
# Add location and names from variations_combined_df
combined_df= sales_items_with_time.merge(
    variations_combined_df[['item_variation_id', 'location_id', 'variation_name', 'name']].drop_duplicates(),
    on='item_variation_id',
    how='left'
)

In [12]:
# Create a daily date column
combined_df['sale_date'] = pd.to_datetime(combined_df['sale_time']).dt.date

In [13]:
# Aggregate quantity sold per day, per variation, per location, with readable names
agg_df = combined_df.groupby(
    ['sale_date', 'item_variation_id', 'location_id', 'variation_name', 'name']
)['quantity_purchased'].sum().reset_index()

recent_daily_var_sales = agg_df.rename(
    columns={
        'sale_date': 'date',
        'item_variation_id': 'variation_id',
        'quantity_purchased': 'y'
    }
)

In [14]:
# History check
var_day_counts = (
    recent_daily_var_sales.groupby(['location_id', 'variation_id'])['date']
    .nunique()
    .reset_index()
    .rename(columns={'date': 'num_days_with_sales'})
)
var_day_counts['enough_history'] = var_day_counts['num_days_with_sales'] >= 20

In [15]:
from prophet import Prophet
from tqdm.notebook import tqdm
import numpy as np

lead_time_days = 7
z = 1.65
results = []

for _, row in tqdm(var_day_counts.iterrows(), total=len(var_day_counts)):
    loc = row['location_id']
    var = row['variation_id']
    enough = row['enough_history']
    var_sales_history = recent_daily_var_sales[
        (recent_daily_var_sales['location_id'] == loc) &
        (recent_daily_var_sales['variation_id'] == var)
    ].copy()
    var_sales_history = var_sales_history.rename(columns={'date': 'ds', 'y': 'y'})
    var_sales_history['ds'] = pd.to_datetime(var_sales_history['ds'])

    cutoff_in_loop = var_sales_history['ds'].max() - pd.DateOffset(months=12)
    var_sales_history = var_sales_history[var_sales_history['ds'] >= cutoff_in_loop]

    reorder_level = None
    replenish_level = None

    if enough and len(var_sales_history) >= 20:
        try:
            m = Prophet(daily_seasonality=True)
            m.fit(var_sales_history)
            future = m.make_future_dataframe(periods=lead_time_days)
            forecast = m.predict(future)
            lead_forecast = forecast.tail(lead_time_days)
            demand_lt = lead_forecast['yhat'].sum()
            sigma_lt = (lead_forecast['yhat_upper'].sum() - lead_forecast['yhat_lower'].sum()) / 3.29
            safety_stock = z * sigma_lt
            reorder_level = int(np.round(demand_lt + safety_stock))
            replenish_level = int(np.round(reorder_level + demand_lt))
        except Exception as e:
            last_week = var_sales_history.sort_values('ds').tail(7)
            avg_daily = last_week['y'].mean() if len(last_week) else 1
            demand_lt = avg_daily * lead_time_days
            reorder_level = int(np.round(demand_lt))
            replenish_level = int(np.round(demand_lt * 2))
    else:
        last_week = var_sales_history.sort_values('ds').tail(7)
        avg_daily = last_week['y'].mean() if len(last_week) else 1
        demand_lt = avg_daily * lead_time_days
        reorder_level = int(np.round(demand_lt))
        replenish_level = int(np.round(demand_lt * 2))

    results.append({
        'location_id': loc,
        'variation_id': var,
        'reorder_level': reorder_level,
        'replenish_level': replenish_level,
        'enough_history': enough
    })

results_df = pd.DataFrame(results)


  0%|          | 0/28 [00:00<?, ?it/s]

22:35:58 - cmdstanpy - INFO - Chain [1] start processing
22:35:58 - cmdstanpy - INFO - Chain [1] done processing
22:35:58 - cmdstanpy - INFO - Chain [1] start processing
22:35:58 - cmdstanpy - INFO - Chain [1] done processing
22:35:58 - cmdstanpy - INFO - Chain [1] start processing
22:35:58 - cmdstanpy - INFO - Chain [1] done processing
22:35:58 - cmdstanpy - INFO - Chain [1] start processing
22:35:58 - cmdstanpy - INFO - Chain [1] done processing
22:35:58 - cmdstanpy - INFO - Chain [1] start processing
22:35:58 - cmdstanpy - INFO - Chain [1] done processing
22:35:58 - cmdstanpy - INFO - Chain [1] start processing
22:35:58 - cmdstanpy - INFO - Chain [1] done processing
22:35:59 - cmdstanpy - INFO - Chain [1] start processing
22:35:59 - cmdstanpy - INFO - Chain [1] done processing
22:35:59 - cmdstanpy - INFO - Chain [1] start processing
22:35:59 - cmdstanpy - INFO - Chain [1] done processing
22:35:59 - cmdstanpy - INFO - Chain [1] start processing
22:35:59 - cmdstanpy - INFO - Chain [1]

In [16]:
# Merge in names from agg_df
results_df = results_df.merge(
    agg_df[['item_variation_id', 'variation_name', 'name']].drop_duplicates(),
    left_on='variation_id',
    right_on='item_variation_id',
    how='left'
)


In [17]:
# Optionally add last sale date
last_sale_dates = (
    recent_daily_var_sales.groupby(['location_id', 'variation_id'])['date']
    .max()
    .reset_index()
    .rename(columns={'date': 'last_sale_date'})
)

In [18]:
results_df = results_df.merge(last_sale_dates, on=['location_id', 'variation_id'], how='left')

# Clean up column names for final output
clean_df = results_df[[
    'location_id', 'variation_id', 'reorder_level', 'replenish_level', 'enough_history',
    'variation_name', 'name', 'last_sale_date'
]].rename(columns={'name': 'item_name'})

display(clean_df.head())

Unnamed: 0,location_id,variation_id,reorder_level,replenish_level,enough_history,variation_name,item_name,last_sale_date
0,1.0,1.0,26,52,False,Prices: .99,Bakery,2025-02-09
1,1.0,2.0,61,90,True,Prices: 1.99,Bakery,2025-05-23
2,1.0,3.0,43,63,True,Prices: 2.99,Bakery,2025-05-23
3,1.0,4.0,32,49,True,Prices: 3.99,Bakery,2025-05-20
4,1.0,5.0,38,57,True,Prices: 4.99,Bakery,2025-05-23


In [19]:
clean_df.to_csv('/tmp/variation_reorder_report.csv', index=False)

In [20]:
s3 = boto3.client('s3')

bucket = 'thedogspaw-small-forecast-data'
s3_key = 'results/variation_reorder_report.csv'  # S3 "folder" + file name

s3.upload_file('/tmp/variation_reorder_report.csv', bucket, s3_key)

print(f"✅ File uploaded to s3://{bucket}/{s3_key}")


✅ File uploaded to s3://thedogspaw-small-forecast-data/results/variation_reorder_report.csv
