# the_dogs_paw

In [None]:
!pip install pandas
!pip install pystan==2.19.1.1
!pip install prophet
!pip install boto3

import pandas as pd
import numpy as np
from prophet import Prophet
import boto3
import os

In [None]:
import sagemaker

# Print the execution role reported by the SageMaker SDK for this session.
execution_role = sagemaker.get_execution_role()
print(execution_role)


## S3 Setup

In [None]:
# S3 bucket that holds the input datasets and receives the results file.
bucket = 'thedogspaw-small-forecast-data'  # <--- change to your S3 bucket

# Single S3 client shared by the download helper and the final upload.
s3 = boto3.client('s3')


def s3_get(key: str, local: str) -> None:
    """Download the object stored under `key` in `bucket` to the path `local`."""
    s3.download_file(Bucket=bucket, Key=key, Filename=local)

# Fetch every required dataset into /tmp (S3 key -> local destination).
dataset_downloads = {
    'datasets/thedogspaw_phppos_items.csv': '/tmp/items.csv',
    'datasets/thedogspaw_phppos_locations.csv': '/tmp/locations.csv',
    'datasets/thedogspaw_phppos_location_items.csv': '/tmp/location_items.csv',
    'datasets/thedogspaw_phppos_sales.csv': '/tmp/sales.csv',
    'datasets/thedogspaw_phppos_sales_items.csv': '/tmp/sales_items.csv',
}
for dataset_key, dest_path in dataset_downloads.items():
    s3_get(dataset_key, dest_path)


In [None]:
# Notebook display settings: show every column, cap printed rows at 50.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)   # set to None to show all rows if needed

# First-pass load of the downloaded CSVs with pandas' default dtype inference.
items_df = pd.read_csv('/tmp/items.csv')
locations_df = pd.read_csv('/tmp/locations.csv')
locations_item_df = pd.read_csv('/tmp/location_items.csv')
sales_df = pd.read_csv('/tmp/sales.csv', parse_dates=['sale_time'])
sales_items_df = pd.read_csv('/tmp/sales_items.csv')

# Quick visual check of the items table.
items_preview = items_df.head()
display(items_preview)


## Re-read sales.csv, items.csv and sales_items.csv with their NaN-containing columns parsed as str


In [None]:
# Zero-based positions of the sales.csv columns that are forced to str.
# NOTE(review): presumably these are the columns that produce mixed-type
# warnings under default inference -- confirm against the first-pass load.
sales_cols_as_str = [5, 29, 37, 38, 40, 41, 53, 54, 63]  # zero-based columns

# Positional dtype overrides per file (column names would also work).
sales_str_dtypes = dict.fromkeys(sales_cols_as_str, str)
items_str_dtypes = {56: str, 74: str}
sales_items_str_dtypes = {6: str, 23: str}

# Re-read the three tables in one pass each (low_memory=False) with the
# overrides applied; these replace the frames loaded earlier.
sales_df = pd.read_csv(
    '/tmp/sales.csv',
    parse_dates=['sale_time'],
    dtype=sales_str_dtypes,
    low_memory=False,
)
items_df = pd.read_csv('/tmp/items.csv', dtype=items_str_dtypes, low_memory=False)
sales_items_df = pd.read_csv(
    '/tmp/sales_items.csv',
    dtype=sales_items_str_dtypes,
    low_memory=False,
)

## Sanity Check

In [None]:
# --- Duplicate item_ids: keep every row whose item_id occurs more than once.
id_is_duplicated = items_df.duplicated(subset='item_id', keep=False)
duplicates = items_df[id_is_duplicated]
duplicated_id_count = duplicates['item_id'].nunique()
print(f"Found {duplicated_id_count} duplicated item_ids.")
print(duplicates.sort_values('item_id'))

# --- Duplicate item names: same check keyed on the name column.
name_is_duplicated = items_df.duplicated(subset='name', keep=False)
name_dupes = items_df[name_is_duplicated]
duplicated_name_count = name_dupes['name'].nunique()
print(f"Found {duplicated_name_count} duplicated item names.")
print(name_dupes.sort_values('name'))

# --- Names that appear on more than one row, via value_counts.
name_counts = items_df['name'].value_counts()
repeated_names = name_counts[name_counts > 1]
dupe_names = repeated_names.index.tolist()
print(f"Found {len(dupe_names)} item names with duplicates.")
print(dupe_names[:20])

# --- First 30 rows sharing a repeated name, sorted so duplicates sit together.
has_repeated_name = items_df['name'].isin(dupe_names)
dupes = items_df[has_repeated_name].sort_values('name')
dupe_preview_cols = ['item_id', 'name', 'category_id', 'supplier_id']
print(dupes[dupe_preview_cols].head(30))


## Merge Sales Items with Sales (Add date/store info to item sales) 

In [None]:
# Attach each sale's timestamp and store to its line items. A left join
# keeps every sale line even when no matching sale header exists.
sale_header_cols = ['sale_id', 'sale_time', 'location_id']
item_sales = pd.merge(
    sales_items_df,
    sales_df[sale_header_cols],
    how='left',
    on='sale_id',
)

# Calendar date (no time component) used for the daily aggregation.
sale_timestamps = pd.to_datetime(item_sales['sale_time'])
item_sales['date'] = sale_timestamps.dt.date

preview_cols = ['sale_id', 'item_id', 'location_id', 'date', 'quantity_purchased']
print(item_sales[preview_cols].tail(10))


## Aggregate Daily Sales Per Item Per Store

In [None]:
# Total quantity sold per store, item and calendar day, ordered by those keys.
daily_keys = ['location_id', 'item_id', 'date']
quantity_by_day = item_sales.groupby(daily_keys)['quantity_purchased'].sum()
daily_item_sales = quantity_by_day.reset_index().sort_values(by=daily_keys)
print(daily_item_sales.tail(10))

## Flag items with enough history for forecasting (at least 20 days with sales after filtering)

In [None]:
# This cell reads `recent_daily_item_sales`, which no earlier cell defines, so
# a top-to-bottom run failed with NameError. Build it here: the daily sales
# restricted to the 12 months leading up to the latest sale date in the data.
# NOTE(review): a single global cutoff is assumed; the forecasting loop applies
# its 12-month cutoff relative to each item's own latest sale -- confirm which
# of the two is intended.
recent_history_months = 12
min_days_with_sales = 20

# `date` holds plain date objects; convert before comparing with a Timestamp.
daily_sale_dates = pd.to_datetime(daily_item_sales['date'])
recent_cutoff = daily_sale_dates.max() - pd.DateOffset(months=recent_history_months)
recent_daily_item_sales = daily_item_sales[daily_sale_dates >= recent_cutoff]

# Number of distinct days with sales per (store, item) inside the window.
item_day_counts = (
    recent_daily_item_sales.groupby(['location_id', 'item_id'])['date']
    .nunique()
    .reset_index()
    .rename(columns={'date': 'num_days_with_sales'})
)

# Items below the threshold are sized with the simple fallback, not a forecast.
item_day_counts['enough_history'] = (
    item_day_counts['num_days_with_sales'] >= min_days_with_sales
)
print(item_day_counts.head(10))
print(f"Forecastable items: {item_day_counts['enough_history'].sum()} / {len(item_day_counts)}")


## Main step: forecast (with fallback) per item per store

In [None]:
import logging

from tqdm.notebook import tqdm

logger = logging.getLogger(__name__)

lead_time_days = 7   # days of demand the reorder level must cover
z = 1.65             # safety-stock multiplier applied to the lead-time sigma
results = []

history_window_months = 12   # history kept per item, counted back from its last sale
min_history_rows = 20        # minimum rows in the window before Prophet is tried

# The interval width is converted to a standard deviation by dividing by 3.29,
# the span of a central 90 % normal interval (2 * 1.645). Prophet's default
# interval is 80 %, so the model must be asked for 90 % explicitly for that
# divisor to be correct.
prophet_interval_width = 0.90
interval_sigma_span = 3.29


def _levels_from_recent_average(history, lead_days):
    """Fallback sizing from the mean of the last (up to) 7 rows of `history`.

    Args:
        history: frame with `ds` (datetime) and `y` (quantity) columns.
        lead_days: number of days the reorder level must cover.

    Returns:
        `(reorder_level, replenish_level)` as ints; replenish is twice reorder
        demand. An empty history assumes one unit per day.
    """
    # NOTE(review): `daily_item_sales` only has rows for days with at least one
    # sale, so these are the last 7 *sales* days, not the last 7 calendar
    # days -- confirm that is the intended average.
    last_week = history.sort_values('ds').tail(7)
    avg_daily = last_week['y'].mean() if len(last_week) else 1
    # Net-negative quantities must not produce a negative stock level.
    demand_lt = max(avg_daily * lead_days, 0)
    return int(np.round(demand_lt)), int(np.round(demand_lt * 2))


def _levels_from_prophet(history, lead_days, service_z):
    """Size reorder/replenish levels from a Prophet forecast over the lead time.

    Args:
        history: frame with `ds` (datetime) and `y` (quantity) columns.
        lead_days: forecast horizon in days.
        service_z: multiplier applied to the lead-time sigma for safety stock.

    Returns:
        `(reorder_level, replenish_level)` as ints.

    Raises:
        Whatever Prophet raises while fitting or predicting; the caller falls
        back to `_levels_from_recent_average`.
    """
    model = Prophet(daily_seasonality=True, interval_width=prophet_interval_width)
    model.fit(history)
    future = model.make_future_dataframe(periods=lead_days)
    lead_forecast = model.predict(future).tail(lead_days)

    # A negative forecast total is treated as zero demand.
    demand_lt = max(lead_forecast['yhat'].sum(), 0)
    interval_span = lead_forecast['yhat_upper'].sum() - lead_forecast['yhat_lower'].sum()
    sigma_lt = interval_span / interval_sigma_span
    safety_stock = service_z * sigma_lt

    reorder_level = int(np.round(demand_lt + safety_stock))
    replenish_level = int(np.round(reorder_level + demand_lt))
    return reorder_level, replenish_level


# Split the daily sales once per (store, item) instead of boolean-filtering the
# whole frame on every iteration.
history_by_pair = {
    pair: frame
    for pair, frame in daily_item_sales.groupby(['location_id', 'item_id'])
}
empty_history = daily_item_sales.iloc[0:0]

for row in tqdm(item_day_counts.itertuples(index=False), total=len(item_day_counts)):
    loc = row.location_id
    item = row.item_id
    enough = row.enough_history

    item_sales_history = history_by_pair.get((loc, item), empty_history).rename(
        columns={'date': 'ds', 'quantity_purchased': 'y'}
    )
    item_sales_history['ds'] = pd.to_datetime(item_sales_history['ds'])

    # Keep only the 12 months leading up to this item's most recent sale.
    cutoff_in_loop = item_sales_history['ds'].max() - pd.DateOffset(months=history_window_months)
    item_sales_history = item_sales_history[item_sales_history['ds'] >= cutoff_in_loop]

    levels = None
    if enough and len(item_sales_history) >= min_history_rows:
        try:
            levels = _levels_from_prophet(item_sales_history, lead_time_days, z)
        except Exception:
            # Best effort: one item failing to fit must not abort the run,
            # but the failure is recorded instead of silently swallowed.
            logger.warning(
                "Prophet failed for location %s, item %s; using recent-average fallback",
                loc, item, exc_info=True,
            )
    if levels is None:
        levels = _levels_from_recent_average(item_sales_history, lead_time_days)
    reorder_level, replenish_level = levels

    results.append({
        'location_id': loc,
        'item_id': item,
        'reorder_level': reorder_level,
        'replenish_level': replenish_level,
        'enough_history': enough,
    })

results_df = pd.DataFrame(results)
print(results_df.head(10))
print("-----------------")
print(results_df.tail(10))


In [None]:
# Merge item names. Duplicate item_id rows in items_df would multiply result
# rows in the join, so only the first name per item_id is kept.
item_names = items_df[['item_id', 'name']].drop_duplicates(subset='item_id')
merged_results = results_df.merge(item_names, on='item_id', how='left')

# Last sale date for each (location, item). This is taken from
# `daily_item_sales` because `recent_daily_item_sales` is not defined by any
# earlier cell; a pair's most recent date is the same whether or not older
# rows are filtered out first, and the left merge below ignores pairs that are
# not in the results.
last_sale_dates = (
    daily_item_sales.groupby(['location_id', 'item_id'])['date']
    .max()
    .reset_index()
    .rename(columns={'date': 'last_sale_date'})
)

merged_results = merged_results.merge(last_sale_dates, on=['location_id', 'item_id'], how='left')

print(merged_results[['location_id', 'item_id', 'name', 'reorder_level', 'replenish_level', 'last_sale_date']].head(20))


In [None]:
# Destination key in the bucket and the scratch file used to stage the CSV.
results_key = 'results/thedogspaw_reorder_replenish_results.csv'
local_results_path = '/tmp/thedogspaw_reorder_replenish_results.csv'

# Write locally, push to S3, then delete the scratch copy.
merged_results.to_csv(local_results_path, index=False)
s3.upload_file(local_results_path, bucket, results_key)
os.remove(local_results_path)
print(f"✅ Reorder & replenish results written to s3://{bucket}/{results_key}")
