In [9]:
import pandas as pd
from prophet import Prophet
import statsmodels.api as sm
import numpy as np

# Read the three input tables: sales, sell prices, and the calendar.
sales_df, price_df, calendar_df = (
    pd.read_csv(csv_path)
    for csv_path in ('sales_train_validation.csv', 'sell_prices.csv', 'calendar.csv')
)

# Step 1: Go from wide to long format. Every column that is not an identifier
# becomes one row, with the former column name stored in 'd' and its value
# stored in 'sales'.
id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
sales_melted = pd.melt(sales_df, id_vars=id_columns, var_name='d', value_name='sales')

# Step 2: Attach the calendar's 'wm_yr_wk' and 'date' columns to every sales
# row by matching on 'd'. A left join keeps every sales row.
calendar_keys = calendar_df.loc[:, ['d', 'wm_yr_wk', 'date']]
sales_with_calendar = sales_melted.merge(calendar_keys, on='d', how='left')

# Step 3: Attach the price columns by matching store, item and 'wm_yr_wk'.
# A left join leaves the price columns as NaN where no price row matches.
price_keys = ['store_id', 'item_id', 'wm_yr_wk']
merged_df = sales_with_calendar.merge(price_df, on=price_keys, how='left')


import logging
import cmdstanpy
# Raise the threshold of the 'cmdstanpy' logger to WARNING so that its
# INFO-level messages are suppressed; warnings and errors are still emitted.
logger = logging.getLogger('cmdstanpy')
logger.setLevel(logging.WARNING)  # use logging.ERROR to hide warnings as well

# Function to forecast for a single store-item combination with integer forecast
def forecast_for_item(store_id, item_id, horizon=28):
    """Forecast daily sales for one store-item combination.

    The forecast is the sum of two components:

    1. an OLS regression of sales on price (with an intercept), and
    2. a Prophet model fitted to the residuals of that regression.

    The rows for the item are read from the module-level ``merged_df``, which
    must provide the columns ``store_id``, ``item_id``, ``date``, ``sales``
    and ``sell_price``. Missing prices are replaced by the item's mean price.
    ``merged_df`` holds no prices for the forecast window, so the prices of
    the last ``horizon`` observed rows are reused as a stand-in for the
    future prices.

    Args:
        store_id: Value matched against ``merged_df['store_id']``.
        item_id: Value matched against ``merged_df['item_id']``.
        horizon: Number of days to forecast. Defaults to 28.

    Returns:
        A NumPy integer array of length ``horizon``; values are rounded and
        clipped so that none is negative.

    Raises:
        ValueError: If ``horizon`` is smaller than 1, if the item has fewer
            than ``horizon`` rows in ``merged_df`` (this includes the case of
            no rows at all), or if none of its rows has a price.
    """
    if horizon < 1:
        raise ValueError(f"horizon must be at least 1, got {horizon}")

    # Filter the big frame only once; everything below works on this slice.
    item_rows = (merged_df['store_id'] == store_id) & (merged_df['item_id'] == item_id)
    history = (
        merged_df.loc[item_rows, ['date', 'sales', 'sell_price']]
        .rename(columns={'date': 'ds', 'sales': 'y', 'sell_price': 'price'})
        .copy()
    )

    if len(history) < horizon:
        raise ValueError(
            f"{store_id} - {item_id}: {len(history)} rows of history, "
            f"need at least {horizon}"
        )
    if history['price'].isna().all():
        raise ValueError(f"{store_id} - {item_id}: no price available")

    # Fill price gaps with the item's mean price.
    history['price'] = history['price'].fillna(history['price'].mean())

    # Step 1: Fit the linear model with price as the external regressor.
    # has_constant='add' forces an intercept column even when the price never
    # changes; the default ('skip') would drop it in that case and the
    # training matrix would no longer match the two-column future matrix.
    X = sm.add_constant(history['price'], has_constant='add')
    linear_model = sm.OLS(history['y'], X).fit()

    # Steps 2-3: In-sample fit and its residuals (y - linear model forecast).
    history['linear_forecast'] = linear_model.predict(X)
    history['residuals'] = history['y'] - history['linear_forecast']

    # Step 4: Fit the Prophet model on the residuals.
    prophet_data = history[['ds', 'residuals']].rename(columns={'residuals': 'y'})
    prophet_model = Prophet()
    prophet_model.fit(prophet_data)

    # Step 5: Predict the residuals; the last `horizon` rows of the
    # prediction are the future days appended by make_future_dataframe.
    future = prophet_model.make_future_dataframe(periods=horizon)
    prophet_forecast = prophet_model.predict(future)
    future_residuals = prophet_forecast['yhat'].iloc[-horizon:].to_numpy()

    # Stand-in for future prices: the (already gap-filled) prices of the
    # last `horizon` observed rows.
    future_prices = history['price'].iloc[-horizon:].to_numpy()
    future_X = sm.add_constant(future_prices, has_constant='add')
    future_linear_forecast = linear_model.predict(future_X)

    # Combine both components, then round and clip to non-negative integers.
    final_forecast = future_linear_forecast + future_residuals
    return np.clip(np.round(final_forecast), 0, None).astype(int)

# Step 6: Forecast the selected series (best effort: a failing series is
# reported and skipped so that the remaining ones are still processed).
n_forecast_days = 28
forecast_columns = [f'F{day}' for day in range(1, n_forecast_days + 1)]  # F1, F2, ..., F28

submission_data = {}
failed_items = []

# Get the first 10 unique store-item combinations
unique_store_item_combinations = merged_df[['store_id', 'item_id', 'id']].drop_duplicates().head(10)

for row in unique_store_item_combinations.itertuples(index=False):
    store_id = row.store_id
    item_id = row.item_id
    item_id_submission = row.id  # the original 'id' used for submission
    try:
        forecast = forecast_for_item(store_id, item_id)
    except Exception as e:
        print(f"Error for {store_id} - {item_id}: {e}")
        failed_items.append(item_id_submission)
    else:
        submission_data[item_id_submission] = forecast
        print(f"Completed forecast for {store_id} - {item_id}")

# Without this guard an empty result would fail later with an unrelated
# "Length mismatch" error when the column names are assigned.
if not submission_data:
    raise RuntimeError("No forecasts were produced; submission file was not written.")

# Step 7: Prepare submission file in the required format:
# one row per id, followed by the columns F1 ... F28.
submission_df = pd.DataFrame.from_dict(submission_data, orient='index', columns=forecast_columns)
submission_df = submission_df.rename_axis('id').reset_index()

# Step 8: Create submission file for the validation (public) period, i.e. the
# 28 days after the training data (presumably d_1914 to d_1941 - confirm
# against the competition data description).
submission_filename = 'submission10.csv'
submission_df.to_csv(submission_filename, index=False)

print(f"Submission file '{submission_filename}' created successfully.")
if failed_items:
    print(f"Warning: {len(failed_items)} series missing from the submission: {failed_items}")


Completed forecast for CA_1 - HOBBIES_1_001
Completed forecast for CA_1 - HOBBIES_1_002
Completed forecast for CA_1 - HOBBIES_1_003
Completed forecast for CA_1 - HOBBIES_1_004
Completed forecast for CA_1 - HOBBIES_1_005
Completed forecast for CA_1 - HOBBIES_1_006
Completed forecast for CA_1 - HOBBIES_1_007
Completed forecast for CA_1 - HOBBIES_1_008
Completed forecast for CA_1 - HOBBIES_1_009
Completed forecast for CA_1 - HOBBIES_1_010
Submission file 'submission10.csv' created successfully.
