In [None]:
! pip install kaggle
from google.colab import drive
drive.mount('/content/drive')
! mkdir ~/.kaggle
! cp /content/drive/MyDrive/cs231n/assignments/finalproject/kaggle.json ~/.kaggle/kaggle.json
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download -c walmart-recruiting-store-sales-forecasting
! unzip walmart-recruiting-store-sales-forecasting.zip
!unzip features.csv.zip
!unzip train.csv.zip
!unzip test.csv.zip
!unzip sampleSubmission.csv.zip

In [8]:
# Authenticate this runtime with Weights & Biases before any run is started.
# When no stored credentials are found this prompts for an API key.
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtvani22[0m ([33mfinal-project-ml[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
import pandas as pd
from prophet import Prophet
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
from tqdm import tqdm

# Load the four competition tables from the current working directory.
train, test, features, stores = map(
    pd.read_csv,
    ('./train.csv', './test.csv', './features.csv', './stores.csv'),
)

def prepare_data(df):
    """Attach per-store/date features and store metadata to ``df``.

    Left-joins the module-level ``features`` table on (Store, Date) and the
    module-level ``stores`` table on Store, then parses ``Date`` to datetime.
    The input frame is not modified; a new frame is returned.
    """
    enriched = (
        df.merge(features, on=['Store', 'Date'], how='left')
          .merge(stores, on='Store', how='left')
    )
    # Dates are parsed only after the joins, so the join itself runs on the raw values.
    return enriched.assign(Date=lambda frame: pd.to_datetime(frame['Date']))

# Enrich both splits, then drop training rows with negative weekly sales.
train = prepare_data(train)
test = prepare_data(test)
train = train[train['Weekly_Sales'] >= 0]

# Where the submission CSV is written.
output_path = '/content/submission_prophet.csv'

# Submission skeleton: one row per test row, sharing `test`'s index, with a
# default prediction of 0.0 for any (Store, Dept) that is skipped or fails below.
submission = pd.DataFrame()
submission['Id'] = test['Store'].astype(str) + '_' + test['Dept'].astype(str) + '_' + test['Date'].dt.strftime('%Y-%m-%d')
submission['Weekly_Sales'] = 0.0

group_cols = ['Store', 'Dept']
train_groups = train.groupby(group_cols)
test_groups = test.groupby(group_cols)

# (store, dept, message) for every group whose fit/predict raised.
errors = []

# Fit one Prophet model per (Store, Dept) series.
for (store, dept), train_group in tqdm(train_groups, desc="Running Prophet per group"):

    # Nothing to predict for series that do not appear in the test set.
    if (store, dept) not in test_groups.groups:
        continue

    test_group = test_groups.get_group((store, dept))

    # Prophet expects the columns `ds` (timestamp) and `y` (target).
    df = train_group[['Date', 'Weekly_Sales']].rename(columns={'Date': 'ds', 'Weekly_Sales': 'y'})
    df = df.sort_values('ds')

    # Too little history: keep the 0.0 default for this group.
    if len(df) < 25:
        continue

    try:
        model = Prophet(daily_seasonality=False, yearly_seasonality=True, weekly_seasonality=True)
        model.fit(df)

        # Predict exactly the dates requested by the test set for this group.
        future = test_group[['Date']].rename(columns={'Date': 'ds'}).drop_duplicates()
        forecast = model.predict(future)[['ds', 'yhat']]

        # Left-merge keeps test_group's row order and `ds` is unique, so the
        # predictions line up one-to-one with test_group's rows.
        merged = test_group[['Date']].merge(forecast, left_on='Date', right_on='ds', how='left')

        # `submission` shares its index with `test`, so write by index label
        # instead of scanning every Id in the submission for each group.
        submission.loc[test_group.index, 'Weekly_Sales'] = merged['yhat'].to_numpy()

    except Exception as e:
        # Best effort: remember the failure and leave the default prediction in place.
        errors.append((store, dept, str(e)))
        continue

# Surface failures instead of leaving them silently in `errors`.
if errors:
    print(f"{len(errors)} group(s) failed and kept the default prediction; inspect `errors` for details")

# Write the final submission file; the message reports the path actually written.
submission.to_csv(output_path, index=False)
print(f"Submission file saved as {output_path}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
18:34:20 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
Running Prophet per group:  82%|████████▏ | 2712/3323 [08:37<01:38,  6.20it/s]DEBUG:cmdstanpy:input tempfile: /tmp/tmpz75iuf7r/k9pe5e_p.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpz75iuf7r/0ga6qpiz.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=69057', 'data', 'file=/tmp/tmpz75iuf7r/k9pe5e_p.json', 'init=/tmp/tmpz75iuf7r/0ga6qpiz.json', 'output', 'file=/tmp/tmpz75iuf7r/prophet_model8m4ru2xb/prophet_model-20250708183421.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
18:34:21 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
18:34:21 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
Ru

✅ Submission file saved as prophet_submission.csv


In [None]:
import pandas as pd
from prophet import Prophet
import numpy as np
from tqdm import tqdm
import wandb
import matplotlib.pyplot as plt


# Load the four competition tables from the working directory.
train, test, features, stores = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "features.csv", "stores.csv")
)

# Parse the Date column of every dated table; unparseable values become NaT.
for dated_frame in (train, test, features):
    dated_frame['Date'] = pd.to_datetime(dated_frame['Date'], errors='coerce')

# Start the Weights & Biases run that the rest of this cell logs to.
wandb.init(project="walmart-forecasting", name="prophet-per-store-dept")

def prepare_data(df, features_df, stores_df):
    """Attach per-store/date features and store metadata to ``df``.

    Args:
        df: Frame with at least ``Store`` and ``Date`` columns.
        features_df: Frame keyed by (``Store``, ``Date``).
        stores_df: Frame keyed by ``Store``.

    Returns:
        A new frame: ``df`` with ``Date`` parsed to datetime (unparseable
        values become NaT), left-joined with ``features_df`` on
        (Store, Date) and with ``stores_df`` on Store. None of the input
        frames is modified.

    Non-key columns that already exist in ``df`` are dropped from the
    right-hand table before each join, so ``df``'s copy keeps its original
    name instead of being split into ``<name>_x`` / ``<name>_y``.
    """
    # Work on copies so the caller's frames are left untouched.
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    # Parse the features dates too, so both join keys have the same dtype.
    features_df = features_df.assign(
        Date=pd.to_datetime(features_df['Date'], errors='coerce')
    )

    feature_keys = ['Store', 'Date']
    shared = [c for c in features_df.columns if c in df.columns and c not in feature_keys]
    df = df.merge(features_df.drop(columns=shared), on=feature_keys, how='left')

    shared = [c for c in stores_df.columns if c in df.columns and c != 'Store']
    df = df.merge(stores_df.drop(columns=shared), on='Store', how='left')
    return df

# Enrich both splits, then drop training rows with negative weekly sales.
train = prepare_data(train, features, stores)
test = prepare_data(test, features, stores)
train = train[train['Weekly_Sales'] >= 0]

# pandas renames same-named non-key columns to <name>_x / <name>_y on merge.
# If both the split and the features table carry IsHoliday, the plain name no
# longer exists after prepare_data, so resolve whichever variant is present.
holiday_col = next(
    (col for col in ('IsHoliday', 'IsHoliday_x', 'IsHoliday_y') if col in test.columns),
    None,
)

# Upper bound on forecast figures uploaded to W&B.
MAX_FORECAST_PLOTS = 5
# Where the submission CSV is written.
OUTPUT_PATH = '/content/submission_prophet.csv'

# -----------------------
# Submission
# -----------------------
# One row per test row, sharing `test`'s index; 0.0 is the default prediction
# for any (Store, Dept) that is skipped or fails below.
submission = pd.DataFrame()
submission['Id'] = test['Store'].astype(str) + '_' + test['Dept'].astype(str) + '_' + test['Date'].dt.strftime('%Y-%m-%d')
submission['Weekly_Sales'] = 0.0

group_cols = ['Store', 'Dept']
train_groups = train.groupby(group_cols)
test_groups = test.groupby(group_cols)

errors = []          # (store, dept, message) for every group that raised
group_metrics = []   # per-group WMAE, only filled when ground truth is available
plots_logged = 0     # number of forecast figures sent to W&B so far

# -----------------------
# Loop per (Store, Dept)
# -----------------------
for (store, dept), train_group in tqdm(train_groups, desc="Running Prophet per group"):
    # Nothing to predict for series that do not appear in the test set.
    if (store, dept) not in test_groups.groups:
        continue

    test_group = test_groups.get_group((store, dept))
    # Prophet expects the columns `ds` (timestamp) and `y` (target).
    df = train_group[['Date', 'Weekly_Sales']].rename(columns={'Date': 'ds', 'Weekly_Sales': 'y'}).sort_values('ds')

    # Too little history: keep the 0.0 default for this group.
    if len(df) < 25:
        continue

    try:
        model = Prophet(daily_seasonality=False, yearly_seasonality=True, weekly_seasonality=True)
        model.fit(df)

        future = test_group[['Date']].rename(columns={'Date': 'ds'}).drop_duplicates()
        # Keep the full forecast frame: the plot below is given all of its
        # columns, not just the point estimate.
        forecast = model.predict(future)

        # Left-merge keeps test_group's row order and `ds` is unique, so the
        # predictions line up one-to-one with test_group's rows.
        merged = test_group[['Date']].merge(
            forecast[['ds', 'yhat']], left_on='Date', right_on='ds', how='left'
        )
        y_pred = merged['yhat'].to_numpy()

        # Fill submission by index label (shared with `test`) instead of
        # scanning every Id in the submission for each group.
        submission.loc[test_group.index, 'Weekly_Sales'] = y_pred

        # ----- Log WMAE if ground truth is present -----
        if 'Weekly_Sales' in test_group.columns:
            y_true = test_group['Weekly_Sales'].to_numpy()
            if holiday_col is not None:
                # Holiday weeks weigh 5x in the weighted MAE.
                is_holiday = test_group[holiday_col].to_numpy().astype(bool)
                weights = np.where(is_holiday, 5, 1)
            else:
                weights = np.ones(len(test_group))
            wmae = np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)

            wandb.log({
                "group": f"{store}_{dept}",
                "WMAE": wmae,
                "train_len": len(train_group),
                "test_len": len(test_group)
            })

            group_metrics.append(wmae)

        # ----- Log plot for first few groups -----
        # Counted separately from group_metrics, which stays empty when the
        # test split has no ground truth and would otherwise never cap the plots.
        if plots_logged < MAX_FORECAST_PLOTS:
            fig = model.plot(forecast)
            try:
                fig.axes[0].set_title(f"Forecast for Store {store}, Dept {dept}")
                wandb.log({f"forecast_{store}_{dept}": wandb.Image(fig)})
            finally:
                # Release the figure so open figures do not pile up across the loop.
                plt.close(fig)
            plots_logged += 1

    except Exception as e:
        # Best effort: record the failure and move on to the next group.
        errors.append((store, dept, str(e)))
        wandb.log({"error_group": f"{store}_{dept}", "error_msg": str(e)})
        continue

# -----------------------
# Save submission file
# -----------------------
submission.to_csv(OUTPUT_PATH, index=False)
# The message reports the path that was actually written.
print(f"Submission file saved as {OUTPUT_PATH}")
wandb.save(OUTPUT_PATH)

# -----------------------
# Log summary metrics
# -----------------------
if group_metrics:
    wandb.summary["WMAE_mean"] = np.mean(group_metrics)
    wandb.summary["WMAE_median"] = np.median(group_metrics)
    wandb.summary["groups_processed"] = len(group_metrics)

# Log all errors as table
if errors:
    error_df = pd.DataFrame(errors, columns=["Store", "Dept", "Error"])
    wandb.log({"error_table": wandb.Table(dataframe=error_df)})

wandb.finish()