In [None]:


# Import Python packages used throughout the notebook
import streamlit as st
import pandas as pd

# Snowpark gives access to the tables of the notebook's active Snowflake session
from snowflake.snowpark.context import get_active_session

# Session handle shared by every cell below that reads or writes tables
session = get_active_session()



     

In [None]:

# Pull each source table into pandas, in the same order as the names on the left.
sales, purchase, invoice, end_inv, beg_inv = (
    session.table(table_name).to_pandas()
    for table_name in ("SALES", "PURCHASES", "INVOICE_PURCHASE", "END_INV", "BEG_INV")
)

# Preview the beginning and ending inventory tables.
display(beg_inv.head(), end_inv.head())


In [None]:

# Distinct-value counts per column for the two inventory tables, side by side.
nuniques = {
    "beg_inv": beg_inv.nunique(),
    "end_inv": end_inv.nunique(),
}
display(
    pd.DataFrame(nuniques).T.loc[
        :,
        [
            "INVENTORYID",
            "STORE",
            "CITY",
            "BRAND",
            "DESCRIPTION",
            "SIZE",
            "ONHAND",
            "BEGINDATE",
            "ENDDATE",
        ],
    ]
)

# Compare the number of distinct BRAND values with the number of distinct
# "DESCRIPTION SIZE" strings in each table.
print(
    f"BEG_INV BRAND nunique: {beg_inv.BRAND.nunique()}, "
    f"desc + size nunique: {(beg_inv['DESCRIPTION'] + ' ' + beg_inv['SIZE']).nunique()}"
    "??? Might need cleaning"
)
print(
    f"end_inv Brand nunique: {end_inv.BRAND.nunique()}, "
    f"desc + size nunique: {(end_inv['DESCRIPTION'] + ' ' + end_inv['SIZE']).nunique()}"
)
# Build DESC_SIZE on a new frame via .assign so beg_inv itself is never touched
# (assigning a column onto `beg_inv.loc[:]` can raise SettingWithCopyWarning on
# some pandas versions).
beg_inv_brand = beg_inv.assign(
    DESC_SIZE=beg_inv["DESCRIPTION"] + " " + beg_inv["SIZE"]
)

# For every "DESCRIPTION SIZE" string, collect the distinct BRAND values it maps to.
group_desc = (
    beg_inv_brand[["BRAND", "DESC_SIZE"]].groupby("DESC_SIZE")["BRAND"].unique()
)

# Show the descriptions that map to more than one BRAND. This expression is not
# the last one in the cell, so it must be displayed explicitly to be visible.
display(group_desc.loc[group_desc.apply(len) > 1])

# Preview the sales table.
display(sales.head())

# Spot-check a single inventory record: BRAND 1004 with inventory id "1_HARDERSFIELD_1004".
display(
    beg_inv.loc[
        beg_inv["BRAND"].eq(1004) & beg_inv["INVENTORYID"].eq("1_HARDERSFIELD_1004")
    ]
)
# Working hypothesis (to be confirmed): INVENTORYID looks like store_city_brand and
# BRAND identifies a description + size pair. If so, INVENTORYID can be used to look
# up the on-hand quantity at the beginning and at the end of the period.
print(sales["CLASSIFICATION"].unique())

## 1. Aggregate the data from the tables

- **Group** the `sales` DataFrame by the `"SALES_DATE"` column.  
- **Aggregate** by summing `"SALES_QUANTITY"` to get the total quantity sold per day.  
- The result is stored in a new DataFrame, `sales_quantity_price`.



In [None]:
# Group by date and sum the sales quantity to get the total quantity sold per day.
# The result is indexed by SALES_DATE with a single SALES_QUANTITY column.
sales_quantity_price = sales.groupby("SALES_DATE").agg({"SALES_QUANTITY": "sum"})

# Summary statistics of the daily totals. The original call was neither the last
# expression of the cell nor displayed, so its result was silently dropped.
display(sales_quantity_price.describe())

# Preview the daily totals with rich display instead of printing the whole frame.
display(sales_quantity_price.head())



## 2. Native Snowflake ML Forecasting

Snowflake provides built-in time-series forecasting as a SQL object that you train and invoke entirely in SQL.

In [None]:
# Turn the SALES_DATE index back into a regular column before upload.
df_to_write = sales_quantity_price.reset_index()

# Upload the daily totals and (re)create the SALES_QUANTITY_PRICE table from them.
snowpark_df = session.create_dataframe(df_to_write)
(
    snowpark_df.write
    .mode("overwrite")
    .save_as_table("SALES_QUANTITY_PRICE")
)


In [None]:
-- Step 1: train the forecasting model on the daily sales table
-- (requires the CREATE SNOWFLAKE.ML.FORECAST privilege).
CREATE OR REPLACE SNOWFLAKE.ML.FORECAST inventory_forecast_model(
    INPUT_DATA => TABLE(sales_quantity_price),
    TIMESTAMP_COLNAME => 'SALES_DATE',
    TARGET_COLNAME => 'SALES_QUANTITY'
);



In [None]:
-- Step 2: run the trained model for 30 forecasting periods and store the
-- output in the sales_quantity_price_forecast table.
CREATE OR REPLACE TABLE sales_quantity_price_forecast AS
SELECT *
FROM TABLE(
    inventory_forecast_model!FORECAST(FORECASTING_PERIODS => 30)
);

In [None]:
# NOTE(review): this cell only prints an empty line; it looks like a leftover
# spacer cell and can probably be deleted.
print()

In [None]:
# ── 2) Load your tables into pandas (using SALES_DATE & SALES_QUANTITY) ─────────

sq_df = (
    session
    .table("SALES_QUANTITY_PRICE")
    .select("SALES_DATE", "SALES_QUANTITY")
    .to_pandas()
)

# Ensure the date column is datetime. The original read from `sales_df`, a name
# that is never defined in this notebook, so the cell failed with a NameError.
sq_df["SALES_DATE"] = pd.to_datetime(sq_df["SALES_DATE"])

# Keep the date-indexed, chronologically sorted frame (set_index returns a new
# frame, so the result has to be assigned) and preview it.
sq_df = sq_df.set_index("SALES_DATE").sort_index()
sq_df.head()


### Summary

- **Built-in FORECAST** only returns future forecasts, not fitted training values.  
- To get in-sample predictions you must either **wrap** an external ARIMA implementation in a Python UDF or run your ARIMA entirely in a Container Runtime notebook and persist the `fittedvalues` yourself.  
- Snowflake does not currently expose a table function for fitted (in-sample) values from `SNOWFLAKE.ML.FORECAST`.  

In [None]:
# Load the daily totals, parse SALES_DATE as datetime, and index the frame by
# date in chronological order.
sq_df = (
    session.table("SALES_QUANTITY_PRICE")
    .select("SALES_DATE", "SALES_QUANTITY")
    .to_pandas()
    .assign(SALES_DATE=lambda frame: pd.to_datetime(frame["SALES_DATE"]))
    .set_index("SALES_DATE")
    .sort_index()
)

# Render with Streamlit; only the SALES_QUANTITY series is plotted.
st.title("📈 Raw vs Aggregated Sales Quantity")
st.line_chart(sq_df["SALES_QUANTITY"])

# Load historical and forecast tables into Pandas ─────────────────────────

In [None]:
import pandas as pd

# Historical daily sales, with SALES_DATE parsed as datetime.
sp_df = (
    session.table("SALES_QUANTITY_PRICE")
    .select("SALES_DATE", "SALES_QUANTITY")
    .to_pandas()
)
sp_df["SALES_DATE"] = pd.to_datetime(sp_df["SALES_DATE"])

# Forecast output; its TS column is renamed to SALES_DATE so both frames share
# the same join key, then parsed as datetime.
spf_df = (
    session.table("SALES_QUANTITY_PRICE_FORECAST")
    .select("TS", "FORECAST", "LOWER_BOUND", "UPPER_BOUND")
    .to_pandas()
)
spf_df = spf_df.rename(columns={"TS": "SALES_DATE"})
spf_df["SALES_DATE"] = pd.to_datetime(spf_df["SALES_DATE"])

# Outer join keeps every date from either frame. The result is ordered by date
# and indexed by SALES_DATE, with columns:
#   - SALES_QUANTITY (historical)
#   - FORECAST, LOWER_BOUND, UPPER_BOUND (forecasted)
merged_df = (
    sp_df.merge(spf_df, on="SALES_DATE", how="outer")
    .sort_values("SALES_DATE")
    .set_index("SALES_DATE")
)
print(merged_df.head())


In [None]:
# Streamlit view of the merged history + forecast frame.
st.title("📊 Actual vs Forecast Sales Quantity with Confidence Bounds")

# Table of the merged data.
st.subheader("Merged Data Table")
st.dataframe(merged_df)

# Line chart with one series each for the actuals, the forecast and both bounds.
st.subheader("Time Series Plot")
st.line_chart(
    merged_df.loc[:, ["SALES_QUANTITY", "FORECAST", "LOWER_BOUND", "UPPER_BOUND"]],
    height=400,
    use_container_width=True,
)








## We want to integrate the process into Cortex to see the power of DATA + ML + LLM

In [None]:
-- Optional, does not need to run every time:
--   CALL SNOWFLAKE.MODELS.CORTEX_BASE_MODELS_REFRESH();
-- List the models available in SNOWFLAKE.MODELS.
SHOW MODELS IN SNOWFLAKE.MODELS;

In [None]:
# merged_df is indexed by SALES_DATE, and a pandas index is not uploaded as a
# table column. Reset it so SALES_FORECAST_FULL keeps the SALES_DATE column
# that the Cortex prompt describes.
sp_forecast_df = session.create_dataframe(merged_df.reset_index())

# (Re)create SALES_FORECAST_FULL from the merged history + forecast frame.
sp_forecast_df.write.mode("overwrite").save_as_table("SALES_FORECAST_FULL");

In [None]:
-- Ask the LLAMA3-8B model for a written analysis and store the single response
-- in CORTEX_OUT.analysis.
-- NOTE(review): the prompt below is a fixed string that only names the table
-- and its columns; no rows from SALES_FORECAST_FULL are passed to the model,
-- so the response cannot be based on the actual values. Consider embedding
-- the data (or an aggregate of it) in the prompt.
CREATE or REPLACE TABLE CORTEX_OUT AS
SELECT
  SNOWFLAKE.CORTEX.COMPLETE(
    'LLAMA3-8B',
    $$  
    I have a table SALES_FORECAST_FULL with columns:
      • SALES_DATE  
      • SALES_QUANTITY (actual or imputed forecast)  
      • FORECAST  
      • LOWER_BOUND  
      • UPPER_BOUND  
      
    Please summarize the key trends in the forecast, comment on how the model performed during the historical period, and highlight any notable patterns or anomalies in the 30-day forecast.  
    $$
  ) AS analysis;

In [None]:
-- Show the stored model response.
SELECT * FROM CORTEX_OUT

Based on the provided table, I'll summarize the key trends in the forecast, comment on the model's performance during the historical period, and highlight any notable patterns or anomalies in the 30-day forecast.

**Key Trends:**

1. **Trend in Sales Quantity:** The trend in sales quantity over time can be analyzed by plotting the actual sales quantity against the forecasted sales quantity. This will help identify if the model is capturing the underlying trend in sales.
2. **Seasonality:** The presence of seasonality can be checked by analyzing the sales quantity and forecasted sales quantity over different days of the week, months, or quarters. This will help identify if the model is capturing seasonal patterns.
3. **Day-of-the-Week Effect:** The day-of-the-week effect can be analyzed by plotting the sales quantity and forecasted sales quantity for each day of the week. This will help identify if the model is capturing the typical patterns of sales on different days of the week.
4. **Month-of-the-Year Effect:** The month-of-the-year effect can be analyzed by plotting the sales quantity and forecasted sales quantity for each month of the year. This will help identify if the model is capturing the typical patterns of sales during different months of the year.

**Model Performance:**

1. **Mean Absolute Error (MAE):** The MAE can be calculated to measure the average difference between the actual sales quantity and the forecasted sales quantity. A lower MAE indicates better model performance.
2. **Mean Squared Error (MSE):** The MSE can be calculated to measure the average squared difference between the actual sales quantity and the forecasted sales quantity. A lower MSE indicates better model performance.
3. **Root Mean Squared Percentage Error (RMSPE):** The RMSPE can be calculated to measure the average percentage difference between the actual sales quantity and the forecasted sales quantity. A lower RMSPE indicates better model performance.

**Notable Patterns or Anomalies in the 30-day Forecast:**

1. **Outliers:** Any extreme values in the 30-day forecast can be identified and investigated to determine if they are anomalies or if they indicate a change in the underlying trend.
2. **Trend Breaks:** Any changes in the trend of the sales quantity or forecasted sales quantity can be identified and investigated to determine if they are anomalies or if they indicate a change in the underlying trend.
3. **Seasonal Patterns:** Any changes in the seasonal patterns of the sales quantity or forecasted sales quantity can be identified and investigated to determine if they are anomalies or if they indicate a change in the underlying trend.

To perform these analyses, you can use various statistical and data visualization techniques, such as:

1. Time series plots to visualize the trend and seasonality in the data.
2. Scatter plots to visualize the relationship between the actual sales quantity and the forecasted sales quantity.
3. Box plots to visualize the distribution of the errors between the actual sales quantity and the forecasted sales quantity.
4. Regression analysis to identify the relationship between the actual sales quantity and the forecasted sales quantity.
5. Statistical tests, such as the Augmented Dickey-Fuller test, to identify the presence of seasonality and trend breaks.

By performing these analyses, you can gain insights into the key trends in the forecast, the model's performance during the historical period, and any notable patterns or anomalies in the 30-day forecast.