# 1. Loading Data

In [9]:
import sys
import gc
import os
import warnings
import pickle
import statsmodels.api as sm
from pylab import rcParams
import time
from  datetime import datetime, timedelta

import pandas as pd
from pandas.plotting import register_matplotlib_converters
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing, metrics

warnings.filterwarnings("ignore")

pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)

register_matplotlib_converters()
sns.set()

try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False

# # Google Colab trick to extend memory
# a = []
# while(1):
#     a.append('1')


## 1.1 Functions

In [10]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


def display_missing(df):    
    for col in df.columns.tolist():  
        if df[col].isnull().sum() != 0:
            print('{} column missing values: {}'.format(col, df[col].isnull().sum()))
    print('\n')
    


## 1.2 Loading data grid


In [11]:
# Mount google drive
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')

In [12]:
# Setting directories where data is stored and ouptut dir
if IN_COLAB:
    DATA_GRID_INPUT_DIR = './drive/My Drive/Colab Notebooks' 
    DATA_OUTPUT_DIR = './drive/My Drive/Colab Notebooks'
    !ls './drive/My Drive/Colab Notebooks'
else:
    DATA_GRID_INPUT_DIR = '.'
    DATA_OUTPUT_DIR = '.'

In [13]:
print('Loading the data...')

data = pd.read_pickle(f'{DATA_GRID_INPUT_DIR}/m5_data_direct.pkl')

Loading the data...


## 1.3 Init variables

In [25]:
h = 28 # Prediction horizon
MAX_LAGS = 150 # Max lags used
SEED = 7


In [15]:
data

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,d_1686,0.0,2015-09-10,11532,4,6,9,2015,0,0,0,0,1.0,0.0,0.0,8.257812
1,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,d_1687,2.0,2015-09-11,11532,0,7,9,2015,0,0,0,0,0.0,1.0,1.0,8.257812
2,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_1686,0.0,2015-09-10,11532,4,6,9,2015,0,0,0,0,1.0,0.0,0.0,3.970703
3,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_1687,1.0,2015-09-11,11532,0,7,9,2015,0,0,0,0,0.0,1.0,1.0,3.970703
4,HOBBIES_1_003_CA_1_validation,2,0,0,0,0,d_1686,1.0,2015-09-10,11532,4,6,9,2015,0,0,0,0,1.0,0.0,0.0,2.970703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6946239,FOODS_3_825_WI_3_validation,3046,6,9,2,2,d_1913,0.0,2016-04-24,11613,3,2,4,2016,0,0,0,0,0.0,0.0,0.0,3.980469
6946240,FOODS_3_826_WI_3_validation,3047,6,9,2,2,d_1912,1.0,2016-04-23,11613,2,1,4,2016,0,0,0,0,0.0,0.0,0.0,1.280273
6946241,FOODS_3_826_WI_3_validation,3047,6,9,2,2,d_1913,3.0,2016-04-24,11613,3,2,4,2016,0,0,0,0,0.0,0.0,0.0,1.280273
6946242,FOODS_3_827_WI_3_validation,3048,6,9,2,2,d_1912,0.0,2016-04-23,11613,2,1,4,2016,0,0,0,0,0.0,0.0,0.0,1.000000


# 2. Feature Engineering and Data Preprocessing

## Creating features


In [20]:
def create_features(df):
    print('Computing lags...')
    lags = range(29)
    lag_cols = [f"lag_t{lag}" for lag in lags ]
    
    for lag, lag_col in zip(lags, lag_cols):
        df[lag_col] = df[["id","sales"]].groupby("id")["sales"].shift(lag).astype(np.float16)
        
    print('Computing rollings...')
    wins = [3, 7, 14, 30, 60, 140]

    # Compute diffs for rolling diff_mean
    df["diff_sales"] = df[["id", "sales"]].groupby("id")["sales"].transform(lambda x : x.diff()).astype(np.float16)
    for win in wins:
        df[f"mean_{win}"] = df[["id", "sales"]].groupby("id")["sales"].transform(lambda x : x.rolling(win).mean()).astype(np.float16)
        df[f"mean_{win}_diff"] = df[["id", "sales", "diff_sales"]].groupby("id")["diff_sales"].transform(lambda x : x.rolling(win).mean()).astype(np.float16)
        df[f"mean_{win}_decay"] = df[["id", "sales"]].groupby("id")["sales"].transform(lambda x: x.ewm(span=win).mean()).astype(np.float16)
        df[f"median_{win}"] = df[["id", "sales"]].groupby("id")["sales"].transform(lambda x : x.rolling(win).median()).astype(np.float16)
        df[f"min_{win}"] = df[["id", "sales"]].groupby("id")["sales"].transform(lambda x : x.rolling(win).min()).astype(np.float16)
        df[f"max_{win}"] = df[["id", "sales"]].groupby("id")["sales"].transform(lambda x : x.rolling(win).max()).astype(np.float16)
        df[f"std_{win}"] = df[["id", "sales"]].groupby("id")["sales"].transform(lambda x : x.rolling(win).std()).astype(np.float16)            
    
    print('Computing lags for aggregated levels...')
    # Computing aggregated lags and rollings
    agg_levels = [['dept_id', 'store_id']]
    agg_level_names = ['_'.join(level) for level in agg_levels]
    
    # Create dataframes grouped by agg_levels and date
    agg_df = dict()
    for level, level_name in zip(agg_levels, agg_level_names):
        agg_df[level_name] = df[['id', 'date', 'dept_id', 'store_id', 'sales']].groupby(level + ['date'])['sales'].sum().astype(np.float16).reset_index()
    
    # Computing lags for aggregated levels
    lags = [0, 1, 2, 3, 4, 5, 6, 7, 14, 28]
    lag_cols = [f"lag_t{lag}" for lag in lags ]
    
    for lag, lag_col in zip(lags, lag_cols):
        for level, level_name in zip(agg_levels, agg_level_names):
            agg_df[level_name][level_name + '_' + lag_col] = agg_df[level_name].groupby(level)['sales'].shift(lag).astype(np.float16)
    
    # Computing rollings for aggregated levels
    print('Computing lags for aggregated levels...')
    wins = [3, 7, 14, 30, 60, 140]
    for win in wins:
        for level, level_name in zip(agg_levels, agg_level_names):
            agg_df[level_name][level_name + '_' + f'mean_{win}'] = agg_df[level_name].groupby(level)['sales'].transform(lambda x: x.rolling(win).mean().astype(np.float16))

    # Mergin aggregated lags and rollings
    for level, level_name in zip(agg_levels, agg_level_names):
        agg_df[level_name].drop(['sales'], axis=1, inplace=True)
        df = df.merge(agg_df[level_name], on=level + ['date'], copy=False)
    
    print('Computing day of week means...')
    
    # Computing day of the week rollings
    # Later we will do little hack to apply them to 1 day
    week_wins = [4, 20]
    for week_win in week_wins:
        df[f"mean_{week_win}_dow"] = df[["id", "wday", "date", "sales"]].groupby(["id", "wday"])["sales"]\
                        .transform(lambda x : x.rolling(week_win).mean()).astype(np.float16)
    
    # How much sales days in last n days
    print('Computing how much sales in last n days...')
    
    for win in [7, 14, 30, 60, 140]:
        df[f"has_sales_days_last_{win}"] = df[["id", "sales"]].groupby("id")["sales"].transform(lambda x: (x>0).rolling(win).sum()).astype(np.float16)
    
    # Drop unnecessary columns
    unused_columns = ['diff_sales']
    df.drop(columns=unused_columns, inplace=True)
    return df
    
    

In [21]:
%%time

data = create_features(data)

Computing lags...
Computing rollings...
Computing lags for aggregated levels...
Computing lags for aggregated levels...
Computing day of week means...
Computing how much sales in last n days...
CPU times: user 17min 52s, sys: 32.4 s, total: 18min 25s
Wall time: 18min 36s


In [22]:
data

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,lag_t0,lag_t1,lag_t2,lag_t3,lag_t4,lag_t5,lag_t6,lag_t7,lag_t8,lag_t9,lag_t10,lag_t11,lag_t12,lag_t13,lag_t14,lag_t15,lag_t16,lag_t17,lag_t18,lag_t19,lag_t20,lag_t21,lag_t22,lag_t23,lag_t24,lag_t25,lag_t26,lag_t27,lag_t28,mean_3,mean_3_diff,mean_3_decay,median_3,min_3,max_3,std_3,mean_7,mean_7_diff,mean_7_decay,median_7,min_7,max_7,std_7,mean_14,mean_14_diff,mean_14_decay,median_14,min_14,max_14,std_14,mean_30,mean_30_diff,mean_30_decay,median_30,min_30,max_30,std_30,mean_60,mean_60_diff,mean_60_decay,median_60,min_60,max_60,std_60,mean_140,mean_140_diff,mean_140_decay,median_140,min_140,max_140,std_140,dept_id_store_id_lag_t0,dept_id_store_id_lag_t1,dept_id_store_id_lag_t2,dept_id_store_id_lag_t3,dept_id_store_id_lag_t4,dept_id_store_id_lag_t5,dept_id_store_id_lag_t6,dept_id_store_id_lag_t7,dept_id_store_id_lag_t14,dept_id_store_id_lag_t28,dept_id_store_id_mean_3,dept_id_store_id_mean_7,dept_id_store_id_mean_14,dept_id_store_id_mean_30,dept_id_store_id_mean_60,dept_id_store_id_mean_140,mean_4_dow,mean_20_dow,has_sales_days_last_7,has_sales_days_last_14,has_sales_days_last_30,has_sales_days_last_60,has_sales_days_last_140
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,d_1686,0.0,2015-09-10,11532,4,6,9,2015,0,0,0,0,1.0,0.0,0.0,8.257812,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000000,,,,,,,0.000000,,,,,,,0.000000,,,,,,,0.000000,,,,,,,0.000000,,,,,,,0.000000,,,,,440.0,,,,,,,,,,,,,,,,,,,,,,
1,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_1686,0.0,2015-09-10,11532,4,6,9,2015,0,0,0,0,1.0,0.0,0.0,3.970703,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000000,,,,,,,0.000000,,,,,,,0.000000,,,,,,,0.000000,,,,,,,0.000000,,,,,,,0.000000,,,,,440.0,,,,,,,,,,,,,,,,,,,,,,
2,HOBBIES_1_003_CA_1_validation,2,0,0,0,0,d_1686,1.0,2015-09-10,11532,4,6,9,2015,0,0,0,0,1.0,0.0,0.0,2.970703,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.000000,,,,,,,1.000000,,,,,,,1.000000,,,,,,,1.000000,,,,,,,1.000000,,,,,,,1.000000,,,,,440.0,,,,,,,,,,,,,,,,,,,,,,
3,HOBBIES_1_004_CA_1_validation,3,0,0,0,0,d_1686,3.0,2015-09-10,11532,4,6,9,2015,0,0,0,0,1.0,0.0,0.0,4.640625,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.000000,,,,,,,3.000000,,,,,,,3.000000,,,,,,,3.000000,,,,,,,3.000000,,,,,,,3.000000,,,,,440.0,,,,,,,,,,,,,,,,,,,,,,
4,HOBBIES_1_005_CA_1_validation,4,0,0,0,0,d_1686,4.0,2015-09-10,11532,4,6,9,2015,0,0,0,0,1.0,0.0,0.0,2.880859,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.000000,,,,,,,4.000000,,,,,,,4.000000,,,,,,,4.000000,,,,,,,4.000000,,,,,,,4.000000,,,,,440.0,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6946239,FOODS_3_823_WI_3_validation,3044,6,9,2,2,d_1913,1.0,2016-04-24,11613,3,2,4,2016,0,0,0,0,0.0,0.0,0.0,2.980469,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333252,0.0,0.564453,0.0,0.0,1.0,0.577148,0.285645,0.142822,0.404785,0.0,0.0,1.0,0.488037,0.428467,0.071411,0.336182,0.0,0.0,2.0,7.558594e-01,0.199951,0.033325,0.273193,0.0,0.0,2.0,0.550781,0.250000,0.016663,0.304932,0.0,0.0,2.0,0.571289,0.585938,-0.007141,0.448486,0.0,0.0,6.0,1.032227,2232.0,2412.0,1897.0,1670.0,1579.0,1715.0,1991.0,2448.0,2574.0,2224.0,2180.0,1928.0,2082.0,2078.0,2048.0,1919.0,0.25,0.350098,2.0,4.0,4.0,11.0,47.0
6946240,FOODS_3_824_WI_3_validation,3045,6,9,2,2,d_1913,0.0,2016-04-24,11613,3,2,4,2016,0,0,0,0,0.0,0.0,0.0,2.480469,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333252,0.0,0.250244,0.0,0.0,1.0,0.577148,0.142822,0.000000,0.211792,0.0,0.0,1.0,0.377930,0.214233,-0.071411,0.221436,0.0,0.0,1.0,4.257812e-01,0.300049,0.000000,0.230347,0.0,0.0,2.0,0.535156,0.150024,0.000000,0.181152,0.0,0.0,2.0,0.404541,0.064270,0.000000,0.107056,0.0,0.0,2.0,0.273926,2232.0,2412.0,1897.0,1670.0,1579.0,1715.0,1991.0,2448.0,2574.0,2224.0,2180.0,1928.0,2082.0,2078.0,2048.0,1919.0,0.50,0.099976,1.0,3.0,8.0,8.0,8.0
6946241,FOODS_3_825_WI_3_validation,3046,6,9,2,2,d_1913,0.0,2016-04-24,11613,3,2,4,2016,0,0,0,0,0.0,0.0,0.0,3.980469,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,2.0,0.0,0.0,0.0,4.0,2.0,2.0,0.0,0.0,4.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.333252,0.0,0.301025,0.0,0.0,1.0,0.577148,0.571289,0.000000,0.467285,0.0,0.0,2.0,0.786621,0.785645,-0.142822,0.665039,0.0,0.0,4.0,1.188477e+00,0.866699,-0.066650,0.831543,0.5,0.0,4.0,1.136719,1.033203,0.000000,0.892578,1.0,0.0,4.0,1.056641,0.885742,0.000000,0.839355,1.0,0.0,4.0,0.997070,2232.0,2412.0,1897.0,1670.0,1579.0,1715.0,1991.0,2448.0,2574.0,2224.0,2180.0,1928.0,2082.0,2078.0,2048.0,1919.0,0.75,0.700195,3.0,6.0,15.0,36.0,77.0
6946242,FOODS_3_826_WI_3_validation,3047,6,9,2,2,d_1913,3.0,2016-04-24,11613,3,2,4,2016,0,0,0,0,0.0,0.0,0.0,1.280273,3.0,1.0,3.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,1.0,0.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,2.333984,1.0,2.162109,3.0,1.0,3.0,1.154297,1.142578,0.285645,1.544922,1.0,0.0,3.0,1.344727,0.928711,0.071411,1.214844,0.5,0.0,3.0,1.207031e+00,1.066406,0.066650,1.075195,1.0,0.0,4.0,1.172852,1.016602,0.049988,1.062500,1.0,0.0,4.0,1.065430,1.171875,-0.014282,1.163086,1.0,0.0,8.0,1.313477,2232.0,2412.0,1897.0,1670.0,1579.0,1715.0,1991.0,2448.0,2574.0,2224.0,2180.0,1928.0,2082.0,2078.0,2048.0,1919.0,1.75,1.700195,4.0,7.0,18.0,36.0,86.0


In [35]:
# # Test for data consistency for different timestamps
# time_range = range(1, 180)

# #print(data.loc[(data.date == LAST_TRAIN_DAY_DT), 'item_id'])
# np.array_equal(data.loc[(data.date == LAST_TRAIN_DAY_DT), 'id'].values,\
#                data.loc[(data.date == LAST_TRAIN_DAY_DT-timedelta(days=1)), 'id'].values)
# # print(data.loc[(data.date == LAST_TRAIN_DAY_DT), 'item_id']\
# #         .equals(data.loc[(data.date == LAST_TRAIN_DAY_DT-timedelta(days=1)), 'item_id']))
# arr_1 = data.loc[(data.date == LAST_TRAIN_DAY_DT), 'id'].values
# for time_step in time_range:
#     time_target = LAST_TRAIN_DAY_DT-timedelta(days=time_step)
#     arr_2 = data.loc[(data.date == time_target), 'id'].values
#     if (np.array_equal(arr_1, arr_2) == False):
#         print(time_target)
#         #print(arr_1.shape)
#         print(arr_2.shape)
#         #print(arr_2)
#         #break
# #     print(data.loc[(data.date == LAST_TRAIN_DAY_DT), 'id']\
# #         .equals(data.loc[(data.date == LAST_TRAIN_DAY_DT-timedelta(days=time_step)), 'id']))

2016-02-12 00:00:00
(30482,)
2016-02-11 00:00:00
(30482,)
2016-02-10 00:00:00
(30482,)
2016-02-09 00:00:00
(30482,)
2016-02-08 00:00:00
(30482,)
2016-02-07 00:00:00
(30482,)
2016-02-06 00:00:00
(30482,)
2016-02-05 00:00:00
(30482,)
2016-02-04 00:00:00
(30482,)
2016-02-03 00:00:00
(30482,)
2016-02-02 00:00:00
(30482,)
2016-02-01 00:00:00
(30482,)
2016-01-31 00:00:00
(30482,)
2016-01-30 00:00:00
(30482,)
2016-01-29 00:00:00
(30479,)
2016-01-28 00:00:00
(30479,)
2016-01-27 00:00:00
(30479,)
2016-01-26 00:00:00
(30479,)
2016-01-25 00:00:00
(30479,)
2016-01-24 00:00:00
(30479,)
2016-01-23 00:00:00
(30479,)
2016-01-22 00:00:00
(30477,)
2016-01-21 00:00:00
(30477,)
2016-01-20 00:00:00
(30477,)
2016-01-19 00:00:00
(30477,)
2016-01-18 00:00:00
(30477,)
2016-01-17 00:00:00
(30477,)
2016-01-16 00:00:00
(30477,)
2016-01-15 00:00:00
(30475,)
2016-01-14 00:00:00
(30475,)
2016-01-13 00:00:00
(30475,)
2016-01-12 00:00:00
(30475,)
2016-01-11 00:00:00
(30475,)
2016-01-10 00:00:00
(30475,)
2016-01-09 00:

## Data preprocessing

In [96]:
def create_dataset(df, time_point, is_test=False):
    # We do little hack, to set features for every week day
    # and to get labels for every day in prediction horizon
    # We do this because number of items differs as time goes,
    # so we will use "merge" to set appropriate features and labels
    
    # Setting dataframe for output
    out_df = df.loc[data.date == time_point].copy()
    
    # Setting wday rollings
    # Current wday is 2 (Sunday)
    # Target wday and corresponding minus value
    target_wdays = [ 1, 2,  3,  4,  5,  6,  7]
    minus_vals   = [-1, 0, -6, -5, -4, -3, -2]

    week_wins = [4, 20]
    for i, wday in enumerate(target_wdays):        
        for week_win in week_wins:
            wday_roll = data.loc[(data.date == time_point + timedelta(days=minus_vals[i])),\
                           ['id', f'mean_{week_win}_dow']].rename(columns={f'mean_{week_win}_dow': f'mean_{week_win}_dow_{wday}'})
            out_df = out_df.merge(wday_roll, on=['id'], copy=False)
    
    # Making labels using merge 
    if not is_test:
        global h # prediction horizon (28)
        for i in range(1, h + 1):
            labels = data.loc[data.date == time_point + timedelta(days=i), ['id', 'sales']]\
                             .rename(columns={'sales': f'y_{i}'})
            out_df = out_df.merge(labels, on=['id'], copy=False)
    
    # Removing unused columns
    unused_cols = ['sales']
    for week_win in week_wins:
        unused_cols.append(f'mean_{week_win}_dow')
    
    out_df.drop(columns=unused_cols, inplace=True)
    
    return out_df
    

### Define variables

In [106]:
LAST_TRAIN_DAY_DT = datetime(2016, 3, 20) # Sunday
LAST_TRAIN_VAL__DAY_DT = datetime(2016, 3, 27) # Sunday
FIRST_PRED_DAY_DT = datetime(2016, 4, 25) # Monday
TEST_SET_DAY = FIRST_PRED_DAY_DT - timedelta(days=1)
TRAINING_DAYS_NUM = 7 # Days for training

In [99]:
train_days = []
for day in range(TRAINING_DAYS_NUM):
    target_day = LAST_TRAIN_DAY_DT - timedelta(days = day * 7)
    print(target_day)
    
    train_tmp = create_dataset(data, target_day)
    train_days.append(train_tmp)
    
    # Printing null values if they exist
    print(train_tmp[train_tmp.isna()].count().sum())

train = pd.concat(train_days, axis=0)
val = create_dataset(data, LAST_TRAIN_VAL__DAY_DT)
test = create_dataset(data, TEST_SET_DAY, True)

2016-03-20 00:00:00
0
2016-03-13 00:00:00
0
2016-03-06 00:00:00
0
2016-02-28 00:00:00
0
2016-02-21 00:00:00
0
2016-02-14 00:00:00
0
2016-02-07 00:00:00
0


## Saving data

In [109]:
train = reduce_mem_usage(train)
val = reduce_mem_usage(val)
test = reduce_mem_usage(test)

train.to_pickle(f'{DATA_GRID_INPUT_DIR}/train_direct_model.pkl', protocol=3)
val.to_pickle(f'{DATA_GRID_INPUT_DIR}/val_direct_model.pkl', protocol=3)
test.to_pickle(f'{DATA_GRID_INPUT_DIR}/test_direct_model.pkl', protocol=3)

Mem. usage decreased to 66.15 Mb (0.0% reduction)
Mem. usage decreased to  9.45 Mb (0.0% reduction)
Mem. usage decreased to  7.82 Mb (0.0% reduction)
