Copyright 2020 Matthias Anderer

Copyright for aggregation code snippets 2020 by user: https://www.kaggle.com/lebroschar (name unknown)

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

# Overall approach

We have two different inputs: 

1) Bottom level forecasts on item level (30490 signal) that are derived from a lgbm model that models a probability of this item being bought based on datatime features, price features and a few other features that are not time dependent. (Credits: https://www.kaggle.com/kyakovlev/m5-simple-fe)
2) Top level forecasts for the levels 1-5 that are created with N-Beats. 

We can now aggregate the bottom level "probabilit draws" up to the levels 1-5. By comparing/aligning the possible results we can select the most suitable probability distribution for the forecast period. ( The multiplier in the custom loss of the bottom level lgbm models seems to help adjust for trend or other effects not fully understood yet)

### Overall analysis result: 

The multiplier 0.95 seems to represent the lowest available fit so we build an ensemble with the 2 upper and 2 lower distributions to generate a robust test loss.
<br><br>
Final-11: 0.9 <br>
Final-12: 0.93 <br>
Final-17: 0.95 <br>
Final-13: 0.97 <br>
Final-16: 0.99

In [28]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import warnings
warnings.simplefilter(action='ignore')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns



## Load NBEATS reference predictions for global alignment

NBeats predictions trained and predicted on Colab with two different settings (only change in setting is num_epochs to get slightly different ensembles)

In [67]:
# nbeats_pred01_df = pd.read_csv('../input/m5alignnbeatsv01/nbeats_toplvl_forecasts1.csv')
# nbeats_pred02_df = pd.read_csv('../input/m5alignnbeatsv02/nbeats_toplvl_forecasts2.csv')

# Note: F1-F28 in indata.csv and outofdata.csv correspond to d1914-1941 and d1942-d1969, respectively. 
nbeats_pred01_df = pd.read_csv('./input/upper_out/nbeats_toplvl_forecasts2_outofdata.csv')
nbeats_pred01_df.head()

Unnamed: 0,id_str,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,all,41036.144531,37960.230469,36757.847656,36110.585938,40397.902344,49468.214844,50727.144531,39848.457031,38983.117188,...,45233.316406,53769.957031,55817.785156,43048.179688,39788.171875,38896.398438,39141.839844,43358.464844,51671.742188,53691.941406
1,CA,18014.326172,16338.359375,15914.831055,15591.859375,17406.675781,21632.677734,23304.962891,17722.083984,16970.626953,...,18163.107422,22255.035156,23753.103516,17960.953125,16125.870117,15596.844727,15884.791016,18092.335938,22028.152344,23498.333984
2,TX,11617.654297,10666.777344,10315.926758,10174.016602,11086.233398,13096.458984,13611.070312,11174.203125,10421.341797,...,11971.084961,13904.358398,14705.506836,12091.745117,10904.867188,10621.853516,10745.370117,11500.922852,13341.435547,14179.342773
3,WI,11686.342773,10963.235352,10405.642578,10602.512695,12453.588867,14823.274414,14002.179688,11132.737305,11228.587891,...,14449.547852,17859.019531,17442.394531,13444.646484,12793.322266,13252.936523,12608.693359,13733.206055,16089.298828,15243.477539
4,CA_1,4559.85791,4155.916992,4010.812988,3961.010742,4460.675781,5629.976562,6036.05127,4466.053223,4256.944336,...,4762.981445,5849.047852,6272.529785,4516.801758,4153.164062,4021.52832,4084.951172,4757.710449,5889.531738,6298.970215


## Load bottom level lgb predictions for alignment

In [29]:
# BUILD_ENSEMBLE = True
BUILD_ENSEMBLE = False # I just wanna see sample output

In [30]:
if BUILD_ENSEMBLE:
    
    pred_01_df = pd.read_csv('../input/m5-final-13/submission_v1.csv')
    pred_02_df = pd.read_csv('../input/fork-of-m5-final-11/submission_v1.csv')
    pred_03_df = pd.read_csv('../input/m5-final-12/submission_v1.csv')
    pred_04_df = pd.read_csv('../input/m5-final-17/submission_v1.csv')
    pred_05_df = pd.read_csv('../input/m5-final-16/submission_v1.csv')
    #pred_06_df = pd.read_csv('..')

    avg_pred = ( np.array(pred_01_df.values[:,1:]) 
                + np.array(pred_02_df.values[:,1:]) 
                + np.array(pred_03_df.values[:,1:])
                + np.array(pred_04_df.values[:,1:])  
                + np.array(pred_05_df.values[:,1:])  
               # + np.array(pred_06_df.values[:,1:])  
               ) /5.0
    
    ## Loading predictions
    valid_pred_df = pd.DataFrame(avg_pred, columns=pred_01_df.columns[1:])
    submission_pred_df = pd.concat([pred_01_df['id'],valid_pred_df],axis=1)
    
else:
    print('Should not submit single distibution')
    #submission_pred_df = pd.read_csv('../input/m5-final-13/submission_v1.csv')
    submission_pred_df = pd.read_csv('input/bottom_out/submission_v1.csv')

Should not submit single distibution


In [31]:
submission_pred_df

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,HOBBIES_1_002_CA_1_validation,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,HOBBIES_1_003_CA_1_validation,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,HOBBIES_1_004_CA_1_validation,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,HOBBIES_1_005_CA_1_validation,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60975,FOODS_3_823_WI_3_evaluation,0.201094,0.143796,0.124260,0.099562,0.168245,0.314227,0.297308,0.152335,0.143491,...,0.215312,0.408708,0.476055,0.260853,0.243920,0.286225,0.179313,0.206506,0.348027,0.333052
60976,FOODS_3_824_WI_3_evaluation,0.144250,0.065468,0.045932,0.021234,0.074267,0.206200,0.194997,0.519547,0.065163,...,0.109414,0.409556,0.471000,0.175172,0.259158,0.306030,0.089065,0.108346,0.235818,0.224797
60977,FOODS_3_825_WI_3_evaluation,0.799411,0.742113,0.722577,0.696095,0.749129,0.839844,0.828641,0.678052,0.740024,...,0.930880,1.239176,1.278632,0.996482,1.077524,1.094285,0.870279,0.848850,0.960285,0.945454
60978,FOODS_3_826_WI_3_evaluation,1.022139,0.964841,0.945306,0.920607,0.874500,1.169567,1.028474,1.064088,0.964536,...,0.894506,1.180875,1.217752,1.063541,1.085945,1.121424,0.994582,0.914723,1.193525,1.379877


## Fill validation rows - we have no info about validation scoring


Even though it would not make sense at all to score public validation data it might be safest to set the submission validation values to the ground truth....

Spamming the LB a bit more ... 

In [33]:
# fill the submission_pred_df on those records with "_validation" with ground truth! (come from d1914-d1941 of sales_train_evaluation.csv)
# Note: sales_train_validation.csv is (30490, d1~d1913) and sales_train_evaluation.csv is (30490, d1914~d1941)
# validation -> evaluation
validation_gt_data = pd.read_csv('input/data/sales_train_evaluation.csv')
print("Oriignal sales_train_evaluation.csv:")
display(validation_gt_data)
validation_gt_data['id'] = validation_gt_data['id'].str.replace('_evaluation','_validation')
validation_gt_data = validation_gt_data.drop(['item_id','dept_id','cat_id','store_id','state_id'],axis=1)
validation_gt_data = pd.concat([validation_gt_data[['id']],validation_gt_data.iloc[:,-28:]],axis=1)
# validation_gt_data.columns=submission_pred_df.columns.values
print("Final look of the variable validation_gt_data:")
validation_gt_data

Oriignal sales_train_evaluation.csv:


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,2,1,0,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0,2,2,...,1,0,3,0,1,1,0,0,1,1
30486,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
30487,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,6,0,2,...,0,0,1,2,0,1,0,1,0,2
30488,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,1,1,1,4,6,0,1,1,1,0


Final look of the variable validation_gt_data:


Unnamed: 0,id,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_validation,0,0,0,2,0,3,5,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_validation,0,1,0,0,0,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,0,0,1,1,0,2,1,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_validation,0,0,1,2,4,1,6,4,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_validation,1,0,2,3,1,0,3,2,3,...,0,0,0,2,1,0,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_validation,0,0,0,2,2,0,0,0,2,...,1,0,3,0,1,1,0,0,1,1
30486,FOODS_3_824_WI_3_validation,0,1,1,1,0,0,0,0,1,...,0,0,0,0,0,0,1,0,1,0
30487,FOODS_3_825_WI_3_validation,0,0,1,1,0,2,1,1,0,...,0,0,1,2,0,1,0,1,0,2
30488,FOODS_3_826_WI_3_validation,1,3,0,1,2,1,0,2,1,...,1,1,1,4,6,0,1,1,1,0


In [49]:
# continue from previous cell, so d1913-d1941 of validation can actually be seen as the next 28 days prediction period of a model trained on 
# sales_train_validation.csv (d1~d1913). 

# convert d1914-d1941 name to F1-F28 
d2f = { f"d_{1913+i}" : f"F{i}" for i in range(1, 28+1)}
validation_gt_data = validation_gt_data.rename(columns=d2f)

# replace 1st half in submission_pred_df 
submission_pred_df = pd.concat([validation_gt_data, submission_pred_df.iloc[30490:,:]],axis=0).reset_index(drop=True)
submission_pred_df

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.000000,0.000000,0.000000,2.000000,0.000000,3.000000,5.000000,0.000000,0.000000,...,2.000000,4.000000,0.000000,0.000000,0.000000,0.000000,3.000000,3.000000,0.000000,1.000000
1,HOBBIES_1_002_CA_1_validation,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,1.000000,2.000000,1.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,HOBBIES_1_003_CA_1_validation,0.000000,0.000000,1.000000,1.000000,0.000000,2.000000,1.000000,0.000000,0.000000,...,1.000000,0.000000,2.000000,0.000000,0.000000,0.000000,2.000000,3.000000,0.000000,1.000000
3,HOBBIES_1_004_CA_1_validation,0.000000,0.000000,1.000000,2.000000,4.000000,1.000000,6.000000,4.000000,0.000000,...,1.000000,1.000000,0.000000,4.000000,0.000000,1.000000,3.000000,0.000000,2.000000,6.000000
4,HOBBIES_1_005_CA_1_validation,1.000000,0.000000,2.000000,3.000000,1.000000,0.000000,3.000000,2.000000,3.000000,...,0.000000,0.000000,0.000000,2.000000,1.000000,0.000000,0.000000,2.000000,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60975,FOODS_3_823_WI_3_evaluation,0.201094,0.143796,0.124260,0.099562,0.168245,0.314227,0.297308,0.152335,0.143491,...,0.215312,0.408708,0.476055,0.260853,0.243920,0.286225,0.179313,0.206506,0.348027,0.333052
60976,FOODS_3_824_WI_3_evaluation,0.144250,0.065468,0.045932,0.021234,0.074267,0.206200,0.194997,0.519547,0.065163,...,0.109414,0.409556,0.471000,0.175172,0.259158,0.306030,0.089065,0.108346,0.235818,0.224797
60977,FOODS_3_825_WI_3_evaluation,0.799411,0.742113,0.722577,0.696095,0.749129,0.839844,0.828641,0.678052,0.740024,...,0.930880,1.239176,1.278632,0.996482,1.077524,1.094285,0.870279,0.848850,0.960285,0.945454
60978,FOODS_3_826_WI_3_evaluation,1.022139,0.964841,0.945306,0.920607,0.874500,1.169567,1.028474,1.064088,0.964536,...,0.894506,1.180875,1.217752,1.063541,1.085945,1.121424,0.994582,0.914723,1.193525,1.379877


## Only work on evaluation forecasts

In [50]:
bottom_lvl_pred_df = submission_pred_df.iloc[30490:,:].reset_index(drop=True)
bottom_lvl_pred_df

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_evaluation,0.753042,0.688188,0.688751,0.686679,0.768146,0.916658,0.874288,0.901520,0.744240,...,0.826474,0.927827,0.892749,0.766914,0.726412,0.720585,0.724602,0.799029,0.944285,0.716593
1,HOBBIES_1_002_CA_1_evaluation,0.258061,0.179185,0.179749,0.177677,0.265720,0.410616,0.366975,0.287745,0.235237,...,0.313481,0.411217,0.374869,0.268727,0.216486,0.210659,0.214676,0.296547,0.438186,0.371699
2,HOBBIES_1_003_CA_1_evaluation,0.380282,0.306656,0.300634,0.298563,0.386606,0.524676,0.481036,0.410726,0.363467,...,0.435126,0.526038,0.489690,0.391707,0.344716,0.332305,0.336321,0.418192,0.553006,0.486519
3,HOBBIES_1_004_CA_1_evaluation,1.576726,1.512710,1.513273,1.556411,1.859568,2.880035,3.226641,1.765071,1.568762,...,1.872590,2.845898,3.199796,1.546131,1.505628,1.499801,1.549028,1.846013,2.863223,2.817502
4,HOBBIES_1_005_CA_1_evaluation,1.019091,0.945465,0.946028,0.943957,1.054895,1.357839,1.305729,1.071401,1.002277,...,1.044939,1.300725,1.344590,1.057527,1.010536,1.004709,1.008726,1.112624,1.412312,1.343253
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_evaluation,0.201094,0.143796,0.124260,0.099562,0.168245,0.314227,0.297308,0.152335,0.143491,...,0.215312,0.408708,0.476055,0.260853,0.243920,0.286225,0.179313,0.206506,0.348027,0.333052
30486,FOODS_3_824_WI_3_evaluation,0.144250,0.065468,0.045932,0.021234,0.074267,0.206200,0.194997,0.519547,0.065163,...,0.109414,0.409556,0.471000,0.175172,0.259158,0.306030,0.089065,0.108346,0.235818,0.224797
30487,FOODS_3_825_WI_3_evaluation,0.799411,0.742113,0.722577,0.696095,0.749129,0.839844,0.828641,0.678052,0.740024,...,0.930880,1.239176,1.278632,0.996482,1.077524,1.094285,0.870279,0.848850,0.960285,0.945454
30488,FOODS_3_826_WI_3_evaluation,1.022139,0.964841,0.945306,0.920607,0.874500,1.169567,1.028474,1.064088,0.964536,...,0.894506,1.180875,1.217752,1.063541,1.085945,1.121424,0.994582,0.914723,1.193525,1.379877


## Reconstruct level descriptions for aggregation

In [51]:
name_cols = bottom_lvl_pred_df.id.str.split(pat='_',expand=True)
name_cols['dept_id']=name_cols[0]+'_'+name_cols[1]
name_cols['store_id']=name_cols[3]+'_'+name_cols[4]
name_cols = name_cols.rename(columns={0: "cat_id", 3: "state_id"})
name_cols = name_cols.drop([1,2,4,5],axis=1)
bottom_lvl_pred_df = pd.concat([name_cols,bottom_lvl_pred_df],axis=1)

In [53]:
bottom_lvl_pred_df

Unnamed: 0,cat_id,state_id,dept_id,store_id,id,F1,F2,F3,F4,F5,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES,CA,HOBBIES_1,CA_1,HOBBIES_1_001_CA_1_evaluation,0.753042,0.688188,0.688751,0.686679,0.768146,...,0.826474,0.927827,0.892749,0.766914,0.726412,0.720585,0.724602,0.799029,0.944285,0.716593
1,HOBBIES,CA,HOBBIES_1,CA_1,HOBBIES_1_002_CA_1_evaluation,0.258061,0.179185,0.179749,0.177677,0.265720,...,0.313481,0.411217,0.374869,0.268727,0.216486,0.210659,0.214676,0.296547,0.438186,0.371699
2,HOBBIES,CA,HOBBIES_1,CA_1,HOBBIES_1_003_CA_1_evaluation,0.380282,0.306656,0.300634,0.298563,0.386606,...,0.435126,0.526038,0.489690,0.391707,0.344716,0.332305,0.336321,0.418192,0.553006,0.486519
3,HOBBIES,CA,HOBBIES_1,CA_1,HOBBIES_1_004_CA_1_evaluation,1.576726,1.512710,1.513273,1.556411,1.859568,...,1.872590,2.845898,3.199796,1.546131,1.505628,1.499801,1.549028,1.846013,2.863223,2.817502
4,HOBBIES,CA,HOBBIES_1,CA_1,HOBBIES_1_005_CA_1_evaluation,1.019091,0.945465,0.946028,0.943957,1.054895,...,1.044939,1.300725,1.344590,1.057527,1.010536,1.004709,1.008726,1.112624,1.412312,1.343253
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS,WI,FOODS_3,WI_3,FOODS_3_823_WI_3_evaluation,0.201094,0.143796,0.124260,0.099562,0.168245,...,0.215312,0.408708,0.476055,0.260853,0.243920,0.286225,0.179313,0.206506,0.348027,0.333052
30486,FOODS,WI,FOODS_3,WI_3,FOODS_3_824_WI_3_evaluation,0.144250,0.065468,0.045932,0.021234,0.074267,...,0.109414,0.409556,0.471000,0.175172,0.259158,0.306030,0.089065,0.108346,0.235818,0.224797
30487,FOODS,WI,FOODS_3,WI_3,FOODS_3_825_WI_3_evaluation,0.799411,0.742113,0.722577,0.696095,0.749129,...,0.930880,1.239176,1.278632,0.996482,1.077524,1.094285,0.870279,0.848850,0.960285,0.945454
30488,FOODS,WI,FOODS_3,WI_3,FOODS_3_826_WI_3_evaluation,1.022139,0.964841,0.945306,0.920607,0.874500,...,0.894506,1.180875,1.217752,1.063541,1.085945,1.121424,0.994582,0.914723,1.193525,1.379877


## Build aggregates of predictions

In [54]:
# Get column groups
cat_cols = ['id', 'dept_id', 'cat_id',  'store_id', 'state_id']
ts_cols = [col for col in bottom_lvl_pred_df.columns if col not in cat_cols]
ts_dict = {t: int(t[1:]) for t in ts_cols}

# Describe data
print('  unique forecasts: %i' % bottom_lvl_pred_df.shape[0])
for col in cat_cols:
    print('   N_unique %s: %i' % (col, bottom_lvl_pred_df[col].nunique()))


  unique forecasts: 30490
   N_unique id: 30490
   N_unique dept_id: 7
   N_unique cat_id: 3
   N_unique store_id: 10
   N_unique state_id: 3


In [55]:
# 1. All products, all stores, all states (1 series)
all_sales = pd.DataFrame(bottom_lvl_pred_df[ts_cols].sum()).transpose()
all_sales['id_str'] = 'all'
all_sales = all_sales[ ['id_str'] +  [c for c in all_sales if c not in ['id_str']] ]

In [56]:
# 2. All products by state (3 series)
state_sales = bottom_lvl_pred_df.groupby('state_id',as_index=False)[ts_cols].sum()
state_sales['id_str'] = state_sales['state_id'] 
state_sales = state_sales[ ['id_str'] +  [c for c in state_sales if c not in ['id_str']] ]
state_sales = state_sales.drop(['state_id'],axis=1)

In [57]:
# 3. All products by store (10 series)
store_sales = bottom_lvl_pred_df.groupby('store_id',as_index=False)[ts_cols].sum()
store_sales['id_str'] = store_sales['store_id'] 
store_sales = store_sales[ ['id_str'] +  [c for c in store_sales if c not in ['id_str']] ]
store_sales = store_sales.drop(['store_id'],axis=1)

In [58]:
# 4. All products by category (3 series)
cat_sales = bottom_lvl_pred_df.groupby('cat_id',as_index=False)[ts_cols].sum()
cat_sales['id_str'] = cat_sales['cat_id'] 
cat_sales = cat_sales[ ['id_str'] +  [c for c in cat_sales if c not in ['id_str']] ]
cat_sales = cat_sales.drop(['cat_id'],axis=1)


In [59]:
# 5. All products by department (7 series)
dept_sales = bottom_lvl_pred_df.groupby('dept_id',as_index=False)[ts_cols].sum()
dept_sales['id_str'] = dept_sales['dept_id'] 
dept_sales = dept_sales[ ['id_str'] +  [c for c in dept_sales if c not in ['id_str']] ]
dept_sales = dept_sales.drop(['dept_id'],axis=1)

In [60]:
all_pred_agg = pd.concat([all_sales,state_sales,store_sales,cat_sales,dept_sales],ignore_index=True)


In [62]:
# recall that the aggregation is based only on evaluation part of bottom layer model. 
# so F1-F28 here correspond to d1942-d1969.
all_pred_agg 

Unnamed: 0,id_str,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,all,39828.472059,36540.376882,37137.733704,36667.581249,41318.204859,49280.056347,49654.291092,43621.843919,37285.050549,...,45266.444854,55712.339957,57228.749376,44752.586247,42118.382722,42994.526564,40360.486299,44262.026526,52879.8046,47201.402483
1,CA,17548.755748,15811.876754,15739.178651,15773.952862,17629.952666,21816.693251,22664.429894,20345.439355,16632.132013,...,19406.838213,23727.650605,24733.39976,18776.702128,17381.456862,16920.692014,16955.783968,18929.634122,23659.258225,20409.714171
2,TX,11173.295352,10021.896665,10684.51485,10083.294768,11045.593306,12776.691431,13099.49151,11669.659647,9915.190402,...,11761.379968,14471.631849,15230.467992,13281.220228,10912.635656,11832.92735,10779.77311,11740.196294,13713.166646,12501.972513
3,WI,11106.420959,10706.603463,10714.040203,10810.333618,12642.658887,14686.671665,13890.369688,11606.744917,10737.728134,...,14098.226672,17513.057504,17264.881624,12694.663891,13824.290204,14240.9072,12624.929221,13592.19611,15507.379729,14289.715799
4,CA_1,4370.664946,3772.303468,3776.930158,3774.381892,4497.034109,5534.423627,5723.548963,5067.77243,4131.759889,...,4986.540771,5990.147475,6364.461175,4753.137936,4194.361077,4157.053096,4213.281442,4871.423399,6084.327564,5241.449212
5,CA_2,4213.820543,4024.555245,4079.184238,4146.204961,4896.235826,6587.944826,6471.043476,4853.750712,4064.046529,...,5005.646455,6840.85824,6780.784917,4149.77398,4069.422536,4084.382079,4174.33042,4883.788073,6581.29433,5329.210254
6,CA_3,6083.840211,5362.281845,5303.597894,5283.753585,5545.983212,6771.145744,7453.164239,7314.104169,5640.587327,...,6645.13627,7730.593002,8348.284743,6923.661962,6427.562365,6033.73184,5963.172381,6397.57561,7853.722231,7043.933113
7,CA_4,2880.430048,2652.736196,2579.466362,2569.612425,2690.699518,2923.179054,3016.673216,3109.812044,2795.738269,...,2769.514717,3166.051888,3239.868925,2950.128249,2690.110883,2645.524999,2604.999725,2776.84704,3139.914099,2795.121592
8,TX_1,3257.780362,2985.802554,2934.752034,2942.616022,3180.656992,3806.800656,4108.912144,3385.191837,2843.929462,...,3373.644172,4269.949421,4598.956248,4002.608733,3226.853309,3487.91949,3142.814308,3412.771954,4105.153828,3972.337942
9,TX_2,3898.188136,3340.141837,4128.949554,3536.846278,3912.882785,4526.884978,4357.27968,4057.036814,3367.637464,...,4201.398024,5196.470811,5346.972709,4771.223579,3775.912322,4161.988908,3789.796959,4201.070202,4878.243256,4125.210114


# Calculating comparision metrics

## Interpretation

If prediction is bigger than "true" values error will be positive -> prediction is overshooting (pos error)

If prediction is smaller than "true" values error will be negative -> prediction is undershooting (neg error) 


## NBeats 01

In [69]:
# have a look at our upper prediction df 
nbeats_pred01_df

Unnamed: 0,id_str,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,all,41036.144531,37960.230469,36757.847656,36110.585938,40397.902344,49468.214844,50727.144531,39848.457031,38983.117188,...,45233.316406,53769.957031,55817.785156,43048.179688,39788.171875,38896.398438,39141.839844,43358.464844,51671.742188,53691.941406
1,CA,18014.326172,16338.359375,15914.831055,15591.859375,17406.675781,21632.677734,23304.962891,17722.083984,16970.626953,...,18163.107422,22255.035156,23753.103516,17960.953125,16125.870117,15596.844727,15884.791016,18092.335938,22028.152344,23498.333984
2,TX,11617.654297,10666.777344,10315.926758,10174.016602,11086.233398,13096.458984,13611.070312,11174.203125,10421.341797,...,11971.084961,13904.358398,14705.506836,12091.745117,10904.867188,10621.853516,10745.370117,11500.922852,13341.435547,14179.342773
3,WI,11686.342773,10963.235352,10405.642578,10602.512695,12453.588867,14823.274414,14002.179688,11132.737305,11228.587891,...,14449.547852,17859.019531,17442.394531,13444.646484,12793.322266,13252.936523,12608.693359,13733.206055,16089.298828,15243.477539
4,CA_1,4559.85791,4155.916992,4010.812988,3961.010742,4460.675781,5629.976562,6036.05127,4466.053223,4256.944336,...,4762.981445,5849.047852,6272.529785,4516.801758,4153.164062,4021.52832,4084.951172,4757.710449,5889.531738,6298.970215
5,CA_2,4150.600586,4021.164551,3972.109131,4091.837402,4862.009277,6599.766602,6519.443848,4289.94043,4199.371582,...,4672.666504,6284.9375,6392.365723,4103.969727,3931.375,3821.698242,4000.479492,4776.424316,6398.492676,6395.704102
6,CA_3,6339.609375,5750.206543,5538.748535,5356.137207,5600.040039,6679.956055,7537.55127,6171.797852,5838.746582,...,6061.67041,7248.727051,7777.924805,6387.514648,5619.710938,5448.716797,5333.046875,5841.853516,7007.646484,7616.036133
7,CA_4,3049.504639,2740.724365,2588.42627,2543.924805,2653.547852,3019.407959,3210.865479,2932.672852,2708.423096,...,2674.567383,2988.327637,3286.901123,2923.466797,2642.811523,2540.725586,2526.351562,2696.437988,3020.103516,3299.008789
8,TX_1,3407.058838,3136.644531,3071.433594,2944.693848,3387.39917,4062.455566,4236.333984,3274.125488,3077.330078,...,3521.450439,4175.395996,4418.091309,3561.998779,3234.827637,3162.436279,3235.495605,3480.784424,4130.553711,4411.554199
9,TX_2,4097.371094,3754.497803,3610.155273,3613.64917,3872.020264,4574.513672,4783.150879,3899.196289,3649.812744,...,4226.152832,4921.086914,5151.644043,4220.43457,3811.386719,3766.815674,3802.04248,4083.018555,4696.986816,5006.757324


In [68]:
metrics_df = nbeats_pred01_df[['id_str']]

## Calculate errors
## CAUTION: nbeats_pred_df is "truth"/actual values in this context
## the error is calculated based on d1942-d1969, ie, F1-F28 of evaluation part, ie, the real live testing in private leaderboard. 
error = ( np.array(all_pred_agg.values[:,1:]) - np.array(nbeats_pred01_df.values[:,1:]) ) 

## Calc RMSSE
successive_diff = np.diff(nbeats_pred01_df.values[:,1:]) ** 2
denom = successive_diff.mean(1)

num = error.mean(1)**2
rmsse = num / denom

metrics_df['rmsse'] = rmsse

## Not so clean Pandas action :-) - supressing warnings for now...
metrics_df['mean_error'] = error.mean(1)
metrics_df['mean_abs_error'] = np.absolute(error).mean(1)

squared_error = error **2
mean_squ_err = np.array(squared_error.mean(1), dtype=np.float64) 

metrics_df['rmse'] = np.sqrt( mean_squ_err )

metrics_df

Unnamed: 0,id_str,rmsse,mean_error,mean_abs_error,rmse
0,all,0.02691,893.031989,1795.406497,2261.246505
1,CA,0.062293,667.309805,1041.263605,1235.831042
2,TX,0.009133,121.747196,461.861706,609.657451
3,WI,0.003021,93.052207,504.778303,615.014301
4,CA_1,0.000689,19.899763,210.114854,294.554011
5,CA_2,0.013388,117.607791,207.75358,310.910808
6,CA_3,0.270656,382.350787,515.203234,578.154978
7,CA_4,0.012571,26.772701,111.510199,152.061516
8,TX_1,0.002029,-19.685295,162.321807,195.064025
9,TX_2,0.007868,40.439138,221.375099,300.213046


## NBeats 02

In [None]:
metrics_df = nbeats_pred02_df[['id_str']]

## Calculate errors
## CAUTION: nbeats_pred_df is "truth"/actual values in this context
error = ( np.array(all_pred_agg.values[:,1:]) - np.array(nbeats_pred02_df.values[:,1:]) ) 

## Calc RMSSE
successive_diff = np.diff(nbeats_pred01_df.values[:,1:]) ** 2
denom = successive_diff.mean(1)

num = error.mean(1)**2
rmsse = num / denom

metrics_df['rmsse'] = rmsse

## Not so clean Pandas action :-) - supressing warnings for now...
metrics_df['mean_error'] = error.mean(1)
metrics_df['mean_abs_error'] = np.absolute(error).mean(1)

squared_error = error **2
mean_squ_err = np.array(squared_error.mean(1), dtype=np.float64) 

metrics_df['rmse'] = np.sqrt( mean_squ_err )

metrics_df

# Visualizations

### NBeats 01

In [None]:
for i in range(0,nbeats_pred01_df.shape[0]):
    plot_df = pd.concat( [nbeats_pred01_df.iloc[i], all_pred_agg.iloc[i] ]  , axis=1, ignore_index=True)
    plot_df = plot_df.iloc[1:,]
    plot_df = plot_df.rename(columns={0:'NBeats',1:'Predictions'})
    plot_df = plot_df.reset_index()
    #plot_df
    
    plot_df.plot(x='index', y=['NBeats', 'Predictions'] ,figsize=(10,5), grid=True, title=nbeats_pred02_df.iloc[i,0]  )

## NBeats 02

In [None]:
for i in range(0,nbeats_pred02_df.shape[0]):
    plot_df = pd.concat( [nbeats_pred02_df.iloc[i], all_pred_agg.iloc[i] ]  , axis=1, ignore_index=True)
    plot_df = plot_df.iloc[1:,]
    plot_df = plot_df.rename(columns={0:'NBeats',1:'Predictions'})
    plot_df = plot_df.reset_index()
    #plot_df
    
    plot_df.plot(x='index', y=['NBeats', 'Predictions'] ,figsize=(10,5), grid=True, title=nbeats_pred02_df.iloc[i,0]  )

# Submit based on above analysis and manual selection/clearance

In [None]:
submission_pred_df

In [None]:
submission_pred_df.to_csv('m5-final-submission.csv', index=False)
