In [None]:
import warnings; warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import datetime
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
# import pmdarima
import pickle
import time
import os
from fbprophet import Prophet

# Kaggle dataset locations: read-only competition input and the writable working dir.
input_path = '/kaggle/input/m5-forecasting-accuracy/'
output_path = '/kaggle/working/'

# Cap how much of a wide/long frame is rendered when it is displayed.
pd.options.display.max_columns = 50
pd.options.display.max_rows = 100

# Pool and cpu_count are used in the forecasting cell to fit the per-series
# models in parallel worker processes, which speeds up the forecasting.
from multiprocessing import Pool, cpu_count

# Register pandas' date/time formatters and converters with matplotlib.
pd.plotting.register_matplotlib_converters()

### Initialisation

For submission, the evaluation period from 2016-4-25 to 2016-6-19 is split into two windows of 28 days each. The first (validation) is used for the public leaderboard, and the second (evaluation) is reserved for the true evaluation (the metric used for the final scoring of the competition). The $id$s for the validation period contain "validation", while the $id$s for the evaluation period contain "evaluation".

In [None]:
# Last day of the training history.
TRAIN_END = '2016-4-24'
# First and last day (both inclusive) of the window to forecast.
EVALUATION_START = '2016-4-25'
EVALUATION_END = '2016-6-19'
# Number of days to forecast. Both end points (1914 and 1969) are part of the
# window, so the count is end - start + 1 = 56, i.e. two 28-day windows and
# the number of calendar days from EVALUATION_START through EVALUATION_END.
# The previous `1969-1914` dropped one end point and yielded 55.
PREDICTION_PERIOD = 1969 - 1914 + 1

In [None]:
# Load the pre-built frame from the m5-eda dataset. The context manager closes
# the file handle once the object is loaded (the bare `open(...)` never did).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('/kaggle/input/m5-eda/full_df', 'rb') as full_df_file:
    full_df = pickle.load(full_df_file)

### Submission preparation

In [None]:
# Read the sample submission from the competition input directory rather than
# the current working directory, and use the `id` column as the index: the
# forecasting cell writes rows with `sub.loc[<id>] = <28 values>` and the final
# `sub.to_csv(...)` relies on the index to emit the ids.
sub = pd.read_csv(os.path.join(input_path, 'sample_submission.csv'), index_col='id')

In [None]:
# Keep only the first half of the sample rows, dropping the "evaluation" ids.
# Demand is forecast for 56 consecutive days per series; the last 28 days are
# filled into the evaluation rows later on.
n_validation_rows = sub.shape[0] // 2
sub = sub.iloc[:n_validation_rows]

### Model

In [None]:
def train_prophet(id):
    """Fit a Prophet model for one series and forecast the evaluation window.

    Parameters
    ----------
    id : str
        Value of the ``id`` column of ``full_df`` selecting the series. It is
        also searched for ``'CA'`` / ``'TX'`` to pick the SNAP regressor. The
        parameter name shadows the builtin but is kept for existing callers.

    Returns
    -------
    tuple
        ``(id, yhat)`` where ``yhat`` is the array of point forecasts for the
        rows from ``EVALUATION_START`` through ``EVALUATION_END``, with
        negative values clipped to 0.
    """
    # Work on a copy: the price column is overwritten below, and assigning
    # into a filtered slice of the shared frame is chained assignment.
    df = full_df[full_df.id == id].copy()

    # Standardise the price regressor. Skip when the spread is zero or NaN
    # (NaN > 0 is False), otherwise the division would turn the whole column
    # into NaN/inf.
    mean_price = df.sell_price.mean()
    std_price = df.sell_price.std()
    if std_price > 0:
        df['sell_price'] = (df.sell_price - mean_price) / std_price

    # Rows are sliced by label on the index, which is reset into the `date`
    # column and renamed to the `ds` / `y` names Prophet expects.
    prophet_names = {'date': 'ds', 'demand': 'y'}
    df_train = df.loc[:TRAIN_END].reset_index().rename(columns=prophet_names)
    future = df.loc[EVALUATION_START:EVALUATION_END].reset_index().rename(columns=prophet_names)

    regs = [
        'sell_price', 'event_superbowl', 'event_valentinesday', 'event_presidentsday',
        'event_lentstart', 'event_lentweek2', 'event_stpatricksday', 'event_purim end',
        'event_orthodoxeaster', 'event_pesach end', 'event_cinco de mayo',
        "event_mother's day", 'event_memorialday', 'event_nbafinalsstart',
        'event_nbafinalsend', "event_father's day", 'event_independenceday',
        'event_ramadan starts', 'event_eid al-fitr', 'event_laborday',
        'event_columbusday', 'event_halloween', 'event_eidaladha', 'event_veteransday',
        'event_thanksgiving', 'event_christmas', 'event_chanukah end', 'event_newyear',
        'event_orthodoxchristmas', 'event_martinlutherkingday', 'event_easter',
    ]

    # Only the SNAP column matching the id is added; anything that is neither
    # CA nor TX falls back to WI.
    if 'CA' in id:
        snap_reg = 'snap_CA'
    elif 'TX' in id:
        snap_reg = 'snap_TX'
    else:
        snap_reg = 'snap_WI'

    # uncertainty_samples=False: only the point forecast (yhat) is used below.
    prophet = Prophet(weekly_seasonality=True, uncertainty_samples=False, changepoint_prior_scale=0.4, changepoint_range=0.95)
    for reg in regs + [snap_reg]:
        prophet.add_regressor(reg)

    prophet.fit(df_train)

    forecast = prophet.predict(future)
    return id, forecast.yhat.values.clip(min=0)

### Forecasting

In [None]:
# One entry per series. `train_prophet` selects rows with `full_df.id == id`
# and slices the index by date, so the ids come from the `id` column; the
# index holds dates, not series ids.
ids = list(full_df['id'].unique())

In [None]:
from tqdm import tqdm

This process can take up to 8 hours in a Kaggle notebook.

In [None]:
# Fit one model per series, spread over all available CPU cores.
st = time.time()
with Pool(cpu_count()) as pool:
    results = pool.map(train_prophet, ids)
print(time.time()-st)

# Assemble the submission in one shot instead of enlarging `sub` row by row
# with `.loc` (every new row copied the whole frame). `sub` is rebuilt from
# `results`: the first `horizon` forecasts of each series form its
# "validation" row, the remaining ones its "evaluation" row. A series whose
# forecast does not split into two rows of `horizon` values raises here.
st = time.time()
forecast_cols = [col for col in sub.columns if col != 'id']
horizon = len(forecast_cols)
series_ids = [series_id for series_id, _ in results]
validation_rows = pd.DataFrame(
    [yhat[:horizon] for _, yhat in results],
    index=pd.Index(series_ids, name='id'),
    columns=forecast_cols,
)
evaluation_rows = pd.DataFrame(
    [yhat[horizon:] for _, yhat in results],
    index=pd.Index([series_id.replace('validation', 'evaluation') for series_id in series_ids], name='id'),
    columns=forecast_cols,
)
sub = pd.concat([validation_rows, evaluation_rows])
time.time()-st

### Submission

In [None]:
# Write the forecasts to the current working directory. The frame's index is
# written as the first CSV column (pandas' default, spelled out here).
sub.to_csv('submission.csv', index=True)