In [None]:
# Disabled one-time environment setup: the shell commands below are wrapped in
# a string literal, so nothing is installed when this cell runs. To use them,
# run the three `!pip install` lines in a cell of their own.
'''!pip install pyyaml==5.4.1
!pip install darts
!pip install -U optuna==2.0.0'''

In [1]:
import darts

import numpy as np
import time

from darts import TimeSeries
from darts.utils.timeseries_generation import gaussian_timeseries, linear_timeseries, sine_timeseries
from darts.models import LightGBMModel, CatBoostModel, Prophet, RNNModel, TFTModel, NaiveSeasonal, ExponentialSmoothing, NHiTSModel
from darts.metrics import mape, smape, rmse, rmsle
from darts.dataprocessing import Pipeline
from darts.dataprocessing.transformers import Scaler, StaticCovariatesTransformer, MissingValuesFiller, InvertibleMapper
from darts.utils.timeseries_generation import datetime_attribute_timeseries
from darts.utils.statistics import check_seasonality, plot_acf, plot_residuals_analysis, plot_hist
from darts.utils.likelihood_models import QuantileRegression
from darts.utils.missing_values import fill_missing_values
from darts.models.filtering.moving_average_filter import MovingAverageFilter
#from darts.models import MovingAverage

import optuna
from optuna.integration import PyTorchLightningPruningCallback
from optuna.visualization import (
    plot_optimization_history,
    plot_contour,
    plot_param_importances,
)

from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from tqdm import tqdm

import sklearn
from sklearn import preprocessing

import pandas as pd
import torch
import matplotlib.pyplot as plt
import gc

%matplotlib inline
# Fix the PyTorch and NumPy global seeds so that repeated runs are reproducible.
torch.manual_seed(1)
np.random.seed(1)

In [24]:
# Train/test feature tables. No parse_dates is requested here, so their 'date'
# column keeps whatever dtype pandas infers from the CSV.
train = pd.read_csv('train_features.csv')
test = pd.read_csv('test_features.csv')

# Auxiliary tables; where a 'date' column is parsed, it is converted on load.
stores = pd.read_csv('stores.csv')
oil = pd.read_csv('oil.csv', parse_dates=['date'])
holidays = pd.read_csv('holidays_events.csv', parse_dates=['date'])
transactions = pd.read_csv('transactions.csv', parse_dates=['date'])

Create the lists of product families and stores.

In [3]:
# Distinct product families found in the training table.
family_list = pd.unique(train['family'])
family_list

array(['AUTOMOTIVE', 'BABY CARE', 'BEAUTY', 'BEVERAGES', 'BOOKS',
       'BREAD/BAKERY', 'CELEBRATION', 'CLEANING', 'DAIRY', 'DELI', 'EGGS',
       'FROZEN FOODS', 'GROCERY I', 'GROCERY II', 'HARDWARE',
       'HOME AND KITCHEN I', 'HOME AND KITCHEN II', 'HOME APPLIANCES',
       'HOME CARE', 'LADIESWEAR', 'LAWN AND GARDEN', 'LINGERIE',
       'LIQUOR,WINE,BEER', 'MAGAZINES', 'MEATS', 'PERSONAL CARE',
       'PET SUPPLIES', 'PLAYERS AND ELECTRONICS', 'POULTRY',
       'PREPARED FOODS', 'PRODUCE', 'SCHOOL AND OFFICE SUPPLIES',
       'SEAFOOD'], dtype=object)

In [4]:
# Distinct store numbers found in the training table.
store_list = pd.unique(train['store_nbr'])
store_list

array([ 1, 10, 17, 18,  2, 20,  3,  4, 44, 45, 46, 47, 48, 49,  6,  7,  8,
        9, 11, 12, 13, 14, 15, 16, 21,  5, 19, 22, 23, 50, 24, 26, 28, 29,
       30, 32, 34, 51, 27, 35, 36, 25, 31, 33, 37, 39, 42, 38, 40, 41, 43,
       52, 53, 54], dtype=int64)

Create a copy of the test dataframe without the `onpromotion` column, and another copy of it sorted by store number and then by family.

In [5]:
# Test features without the promotion column (a new frame; `test` is untouched).
test_dropped = test.drop(columns=['onpromotion'])

In [6]:
# Preview the frame; the rendered output elides the middle rows.
test_dropped

Unnamed: 0,id,date,store_nbr,family,day_of_week,weekend,month,day_of_year,week_of_year,year,dcoilwtico,city,state,type,cluster,transactions,national_day,regional_day,local_day
0,3000888,2017-08-16,1,AUTOMOTIVE,3,0,8,228,33,2017,46.80,Quito,Pichincha,D,13,0.0,0.0,0.0,0.0
1,3000889,2017-08-16,1,BABY CARE,3,0,8,228,33,2017,46.80,Quito,Pichincha,D,13,0.0,0.0,0.0,0.0
2,3000890,2017-08-16,1,BEAUTY,3,0,8,228,33,2017,46.80,Quito,Pichincha,D,13,0.0,0.0,0.0,0.0
3,3000891,2017-08-16,1,BEVERAGES,3,0,8,228,33,2017,46.80,Quito,Pichincha,D,13,0.0,0.0,0.0,0.0
4,3000892,2017-08-16,1,BOOKS,3,0,8,228,33,2017,46.80,Quito,Pichincha,D,13,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28507,3029263,2017-08-31,54,POULTRY,4,0,8,243,35,2017,47.26,El Carmen,Manabi,C,3,0.0,0.0,0.0,0.0
28508,3029264,2017-08-31,54,PREPARED FOODS,4,0,8,243,35,2017,47.26,El Carmen,Manabi,C,3,0.0,0.0,0.0,0.0
28509,3029265,2017-08-31,54,PRODUCE,4,0,8,243,35,2017,47.26,El Carmen,Manabi,C,3,0.0,0.0,0.0,0.0
28510,3029266,2017-08-31,54,SCHOOL AND OFFICE SUPPLIES,4,0,8,243,35,2017,47.26,El Carmen,Manabi,C,3,0.0,0.0,0.0,0.0


In [7]:
# Same rows as test_dropped, ordered by store first and product family second.
test_sorted = test_dropped.sort_values(['store_nbr', 'family'])

In [8]:
# Preview the sorted frame; the original row labels are kept by sort_values.
test_sorted

Unnamed: 0,id,date,store_nbr,family,day_of_week,weekend,month,day_of_year,week_of_year,year,dcoilwtico,city,state,type,cluster,transactions,national_day,regional_day,local_day
0,3000888,2017-08-16,1,AUTOMOTIVE,3,0,8,228,33,2017,46.80,Quito,Pichincha,D,13,0.0,0.0,0.0,0.0
1782,3002670,2017-08-17,1,AUTOMOTIVE,4,0,8,229,33,2017,47.07,Quito,Pichincha,D,13,0.0,0.0,0.0,0.0
3564,3004452,2017-08-18,1,AUTOMOTIVE,5,0,8,230,33,2017,48.59,Quito,Pichincha,D,13,0.0,0.0,0.0,0.0
5346,3006234,2017-08-19,1,AUTOMOTIVE,6,1,8,231,33,2017,48.59,Quito,Pichincha,D,13,0.0,0.0,0.0,0.0
7128,3008016,2017-08-20,1,AUTOMOTIVE,7,1,8,232,33,2017,48.59,Quito,Pichincha,D,13,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21383,3022139,2017-08-27,54,SEAFOOD,7,1,8,239,34,2017,47.65,El Carmen,Manabi,C,3,0.0,0.0,0.0,0.0
23165,3023921,2017-08-28,54,SEAFOOD,1,0,8,240,35,2017,46.40,El Carmen,Manabi,C,3,0.0,0.0,0.0,0.0
24947,3025703,2017-08-29,54,SEAFOOD,2,0,8,241,35,2017,46.46,El Carmen,Manabi,C,3,0.0,0.0,0.0,0.0
26729,3027485,2017-08-30,54,SEAFOOD,3,0,8,242,35,2017,45.96,El Carmen,Manabi,C,3,0.0,0.0,0.0,0.0


In the following lines of code, we extract, for each family, the TimeSeries of the 54 stores. Each TimeSeries holds the daily sales of one store for that family, together with the static covariates of the series (given via group_cols and static_cols): store_nbr, family, city, state, type, cluster.

Note that we pass fill_missing_dates=True because the sales of every December 25th are missing from the dataset.
We also pass freq='D' to state that the time series has a daily frequency (D for day). Finally, we convert the values of each TimeSeries to float32 and sort the series by store number.

In [9]:
# Maps each product family to its list of per-store sales TimeSeries.
family_TS_dict = {}

for family in family_list:
    df_family = train.loc[train['family'] == family]

    # One TimeSeries per (store_nbr, family) group. The group columns and the
    # listed store attributes are attached to each series as static covariates;
    # missing dates are inserted so every series sits on a daily grid.
    list_of_TS_family = TimeSeries.from_group_dataframe(
        df_family,
        time_col='date',
        group_cols=['store_nbr', 'family'],
        static_cols=['city', 'state', 'type', 'cluster'],
        value_cols='sales',
        fill_missing_dates=True,
        freq='D')

    # `astype` returns a new TimeSeries, so the converted objects have to be
    # collected. Rebinding the loop variable (`ts = ts.astype(...)`) left the
    # list unchanged and the float32 conversion never took effect.
    list_of_TS_family = [ts.astype(np.float32) for ts in list_of_TS_family]

    # Sort by the first static covariate, interpreted as an integer
    # (presumably store_nbr, the first group column - confirm against darts).
    list_of_TS_family = sorted(
        list_of_TS_family,
        key=lambda ts: int(ts.static_covariates_values()[0, 0]))
    family_TS_dict[family] = list_of_TS_family

Display the first time series of the first family.

In [10]:
# Inspect the first series in the AUTOMOTIVE list.
display(family_TS_dict['AUTOMOTIVE'][0])

Normalization is a technique used to improve the performance of a Machine Learning model by making it easier to train. You can refer to our article on the subject if you want to know more.

We can easily normalize a TimeSeries with the Scaler class of darts.

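Moreover, we will further help the training of the model by one-hot encoding our static covariates. We implement the one-hot encoding via the StaticCovariatesTransformer class. The pipeline below also fills missing values and applies a log transform (log1p) to the sales before scaling them.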

In [11]:
# Per-family fitted pipelines and the transformed sales series they produce.
family_pipeline_dict = {}
family_TS_transformed_dict = {}

for key in family_TS_dict:
    # A fresh set of transformers per family; the fitted pipeline is stored in
    # family_pipeline_dict under the family name.
    train_filler = MissingValuesFiller(verbose=False, n_jobs=-1, name="Fill NAs")
    static_cov_transformer = StaticCovariatesTransformer(
        verbose=False,
        transformer_cat=sklearn.preprocessing.OneHotEncoder(),
        name="Encoder")
    # log1p / expm1 form an invertible pair, so the mapping can be undone.
    log_transformer = InvertibleMapper(np.log1p, np.expm1, verbose=False, n_jobs=-1, name="Log-Transform")
    train_scaler = Scaler(verbose=False, n_jobs=-1, name="Scaling")

    # Order: fill gaps -> encode static covariates -> log transform -> scale.
    train_pipeline = Pipeline([train_filler,
                               static_cov_transformer,
                               log_transformer,
                               train_scaler])

    # Fit and transform exactly once per family. The cell used to repeat these
    # three statements, re-fitting the same pipeline on the same data.
    training_transformed = train_pipeline.fit_transform(family_TS_dict[key])
    family_pipeline_dict[key] = train_pipeline
    family_TS_transformed_dict[key] = training_transformed

We can display the first transformed TimeSeries of the first family:

In [12]:
# Inspect the first transformed series in the AUTOMOTIVE list.
display(family_TS_transformed_dict['AUTOMOTIVE'][0])

The date is a future covariate because we know the date of the coming days. It has, in many cases, an impact on the traffic of a store. For example, we can expect that on Saturday there will be more customers in the store than on Monday. But it can also be expected that during the summer vacations the store will be less busy than in normal times. Hence every little detail counts.

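In addition to the calendar attributes, we add a `linear_increase` column to these date covariates: a simple running counter (0, 1, 2, …) that grows by one for each day of the period.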

In [13]:
# `datetime_attribute_timeseries` is already imported in the notebook's import
# cell, so it is not re-imported here.

# Daily index spanning 2013-01-01 through 2017-08-31.
full_time_period = pd.date_range(start='2013-01-01', end='2017-08-31', freq='D')

# One single-component series per calendar attribute of the index.
year = datetime_attribute_timeseries(time_index=full_time_period, attribute="year")
month = datetime_attribute_timeseries(time_index=full_time_period, attribute="month")
day = datetime_attribute_timeseries(time_index=full_time_period, attribute="day")
dayofyear = datetime_attribute_timeseries(time_index=full_time_period, attribute="dayofyear")
weekday = datetime_attribute_timeseries(time_index=full_time_period, attribute="dayofweek")
weekofyear = datetime_attribute_timeseries(time_index=full_time_period, attribute="weekofyear")

# Running day counter: 0 for the first date, len(index) - 1 for the last.
timesteps = TimeSeries.from_times_and_values(times=full_time_period,
                                             values=np.arange(len(full_time_period)),
                                             columns=["linear_increase"])

# Stack everything into one multi-component series and cast it to float32.
time_cov = (
    year.stack(month)
    .stack(day)
    .stack(dayofyear)
    .stack(weekday)
    .stack(weekofyear)
    .stack(timesteps)
    .astype(np.float32)
)

This is what it gives us for the date at index 100:

In [15]:
# Show the component names, then the covariates at position 100.
# The names are passed to display() directly: wrapping them in print() made
# display() receive print's return value and render a stray `None`.
display(time_cov.components.values)
display(time_cov[100])

['year' 'month' 'day' 'dayofyear' 'dayofweek' 'weekofyear'
 'linear_increase']


None

Normalize this data, fitting the scaler only on the dates before 2017-08-16 and then applying it to the whole period:

In [16]:
# Split first: everything strictly before 2017-08-16 is used to fit the scaler.
time_cov_train, time_cov_val = time_cov.split_before(pd.Timestamp('20170816'))

# Fit on the earlier part only, then scale the complete covariate series.
time_cov_scaler = Scaler(verbose=False, n_jobs=-1, name="Scaler")
time_cov_scaler.fit(time_cov_train)
time_cov_transformed = time_cov_scaler.transform(time_cov)

In [18]:
# Show the component names, then the scaled covariates at position 100.
# The names are passed to display() directly: wrapping them in print() made
# display() receive print's return value and render a stray `None`.
display(time_cov_transformed.components.values)
display(time_cov_transformed[100])

['year' 'month' 'day' 'dayofyear' 'dayofweek' 'weekofyear'
 'linear_increase']


None

As said before, the price of oil is treated as a future covariate because it is known in advance. Here, we will not simply extract the daily oil price: we will also calculate its moving average. A moving average over a window of X days is the average of the current value and the X-1 previous values of a time series.

Calculating the moving average allows us to remove the momentary fluctuations of a value and thus to accentuate the long-term trends. The moving average is used in trading, but more generally in Time Series Analysis. In the following code, we calculate the 7-day and 28-day moving averages of the oil price. And of course, we apply a normalization:

In [21]:
# `oil` starts out as the DataFrame read from oil.csv and is rebound to a
# TimeSeries here. The isinstance guard keeps the cell re-runnable: on a second
# run `oil` is already a TimeSeries and must not go through from_dataframe again.
if isinstance(oil, pd.DataFrame):
    oil = TimeSeries.from_dataframe(oil,
                                    time_col='date',
                                    value_cols=['dcoilwtico'],
                                    freq='D')

oil = oil.astype(np.float32)

# Transform: fill missing values, then scale.
oil_filler = MissingValuesFiller(verbose=False, n_jobs=-1, name="Filler")
oil_scaler = Scaler(verbose=False, n_jobs=-1, name="Scaler")
oil_pipeline = Pipeline([oil_filler, oil_scaler])
oil_transformed = oil_pipeline.fit_transform(oil)

# Moving averages of the transformed oil price over 7- and 28-step windows.
oil_moving_average_7 = MovingAverageFilter(window=7)
oil_moving_average_28 = MovingAverageFilter(window=28)

ma_7 = oil_moving_average_7.filter(oil_transformed).astype(np.float32)
ma_7 = ma_7.with_columns_renamed(col_names=ma_7.components, col_names_new="oil_ma_7")
ma_28 = oil_moving_average_28.filter(oil_transformed).astype(np.float32)
ma_28 = ma_28.with_columns_renamed(col_names=ma_28.components, col_names_new="oil_ma_28")

# Two-component series: oil_ma_7 and oil_ma_28. (The earlier placeholder
# `oil_moving_averages = []` was dead code and has been dropped.)
oil_moving_averages = ma_7.stack(ma_28)

In [23]:
# Inspect both oil moving averages at position 100.
display(oil_moving_averages[100])

Let’s now focus on the holidays. Here, Ferdinand Berr has implemented functions to detail these holidays. In particular, he adds information about whether the holiday is Christmas day, whether it is a soccer game day, etc:

In [25]:
def holiday_list(stores, holidays_df=None):
    """Build one table of holiday indicator columns per store.

    Parameters
    ----------
    stores : pandas.DataFrame
        One row per store; the 'state' and 'city' columns are read.
    holidays_df : pandas.DataFrame, optional
        Holiday/event table with 'date', 'type', 'locale', 'locale_name' and
        'description' columns. Defaults to the notebook-level `holidays` frame,
        which is what the function always used before this parameter existed.

    Returns
    -------
    list of pandas.DataFrame
        One frame per row of `stores`, in row order, with the columns 'date',
        'national_holiday', 'earthquake_relief', 'christmas', 'football_event',
        'national_event', 'work_day' and 'local_holiday'. Every flag is 0 or 1.
    """
    if holidays_df is None:
        holidays_df = holidays

    description = holidays_df['description']
    is_earthquake = description.str.contains('Terremoto Manabi')
    is_football = description.str.contains('futbol')
    is_holiday = holidays_df['type'] == "Holiday"
    is_national = holidays_df['locale'] == "National"
    is_event = holidays_df['type'] == "Event"

    # These flags do not depend on the store, so they are computed once
    # instead of once per store.
    common_flags = pd.DataFrame({
        'date': holidays_df['date'],
        'national_holiday': np.where(is_holiday & is_national, 1, 0),
        'earthquake_relief': np.where(is_earthquake, 1, 0),
        'christmas': np.where(description.str.contains('Navidad'), 1, 0),
        'football_event': np.where(is_football, 1, 0),
        # National events other than the earthquake and football ones above.
        'national_event': np.where(is_event & is_national & ~is_earthquake & ~is_football, 1, 0),
        'work_day': np.where(holidays_df['type'] == "Work Day", 1, 0),
    })

    listofseries = []

    # Iterate over the rows positionally so a non-default index on `stores`
    # cannot cause a wrong or failed label lookup.
    for state, city in zip(stores['state'], stores['city']):
        matches_store = (holidays_df['locale_name'] == state) | (holidays_df['locale_name'] == city)
        df_holiday_dummies = common_flags.copy()
        df_holiday_dummies['local_holiday'] = np.where(is_holiday & matches_store, 1, 0)
        listofseries.append(df_holiday_dummies)

    return listofseries

Then, we have a function that removes the rows in which every holiday flag equals 0 and merges the rows that share the same date:

In [26]:
def remove_0_and_duplicates(holiday_list):
    """Drop all-zero rows and collapse duplicate dates in each holiday frame.

    Parameters
    ----------
    holiday_list : list of pandas.DataFrame
        One frame per store with a 'date' column and the seven holiday flag
        columns named in `flag_columns` below.

    Returns
    -------
    list of pandas.DataFrame
        One frame per input frame, in the same order, with 'date' as a regular
        column, one row per date, and each flag set to the maximum seen for
        that date.
    """
    flag_columns = ['national_holiday', 'earthquake_relief', 'christmas',
                    'football_event', 'national_event', 'work_day',
                    'local_holiday']

    listofseries = []

    # Read from the argument. The previous version indexed the notebook-level
    # `list_of_holidays_per_store` and only worked because the caller happened
    # to have assigned that exact name beforehand.
    for df_holiday_per_store in holiday_list:
        df_holiday_per_store = df_holiday_per_store.set_index('date')

        # Keep only the rows where at least one flag is non-zero.
        all_zero = (df_holiday_per_store == 0).all(axis=1)
        df_holiday_per_store = df_holiday_per_store.loc[~all_zero]

        # Several entries can share a date; a flag is set if any of them set it.
        df_holiday_per_store = (
            df_holiday_per_store
            .groupby('date')
            .agg(dict.fromkeys(flag_columns, 'max'))
            .reset_index()
        )

        listofseries.append(df_holiday_per_store)

    return listofseries

And finally a function that allows us to have the holidays associated to each of the 54 stores :

In [27]:
def holiday_TS_list_54(holiday_list):
    """Convert per-store holiday frames into daily float32 TimeSeries.

    Parameters
    ----------
    holiday_list : list of pandas.DataFrame
        One frame per store with a 'date' column plus holiday flag columns.

    Returns
    -------
    list of TimeSeries
        One series per input frame, in the same order, on a daily grid with
        missing dates filled with 0, restricted to 2013-01-01 .. 2017-08-31
        and cast to float32.
    """
    listofseries = []

    # Iterate over the argument itself. The previous version ignored it, read
    # the notebook-level `list_of_holidays_per_store`, and hard-coded 54
    # iterations; for the 54-store input the result is the same.
    for df_holiday_per_store in holiday_list:
        holidays_TS = TimeSeries.from_dataframe(df_holiday_per_store,
                                                time_col='date',
                                                fill_missing_dates=True,
                                                fillna_value=0,
                                                freq='D')

        holidays_TS = holidays_TS.slice(pd.Timestamp('20130101'), pd.Timestamp('20170831'))
        holidays_TS = holidays_TS.astype(np.float32)
        listofseries.append(holidays_TS)

    return listofseries

Now we just need to apply these functions:

In [30]:
# Step 1: one frame of holiday flags per store.
list_of_holidays_per_store = holiday_list(stores)
# Step 2: drop all-zero rows and merge duplicate dates (the name is rebound).
list_of_holidays_per_store = remove_0_and_duplicates(list_of_holidays_per_store)   
# Step 3: turn each frame into a daily float32 TimeSeries.
list_of_holidays_store = holiday_TS_list_54(list_of_holidays_per_store)

# Fill missing values and scale the holiday series.
holidays_filler = MissingValuesFiller(verbose=False, n_jobs=-1, name="Filler")
holidays_scaler = Scaler(verbose=False, n_jobs=-1, name="Scaler")

holidays_pipeline = Pipeline([holidays_filler, holidays_scaler])
holidays_transformed = holidays_pipeline.fit_transform(list_of_holidays_store)

We get 54 TimeSeries with 7 columns: national_holiday, earthquake_relief, christmas, football_event, national_event, work_day, local_holiday

Here is the TimeSeries index 100 for the first store:

In [31]:
# Number of series, component names of the first one, and its entry at position 100.
display(
    len(holidays_transformed),
    holidays_transformed[0].components.values,
    holidays_transformed[0][100],
)

54

array(['national_holiday', 'earthquake_relief', 'christmas',
       'football_event', 'national_event', 'work_day', 'local_holiday'],
      dtype=object)

The last future covariate to process is the onpromotion column. It gives us the number of items on promotion in a product family. Here the code is similar to the one used for the sales column. It allows us to extract, for each family, the time series of the 54 stores:

In [32]:
# Promotions are taken from train and test together.
df_promotion = pd.concat([train, test], axis=0)
df_promotion = df_promotion.sort_values(["store_nbr", "family", "date"])
# (A bare `df_promotion.tail()` used to sit here; it was not the last
# expression of the cell, so it displayed nothing and has been removed.)

# Maps each product family to its list of per-store promotion TimeSeries.
family_promotion_dict = {}

for family in family_list:
    df_family = df_promotion.loc[df_promotion['family'] == family]

    # One TimeSeries per (store_nbr, family) group on a daily grid.
    list_of_TS_promo = TimeSeries.from_group_dataframe(
        df_family,
        time_col="date",
        group_cols=["store_nbr", "family"],
        value_cols="onpromotion",
        fill_missing_dates=True,
        freq='D')

    # `astype` returns a new TimeSeries, so the converted objects have to be
    # collected. Rebinding the loop variable (`ts = ts.astype(...)`) left the
    # list unchanged and the float32 conversion never took effect.
    list_of_TS_promo = [ts.astype(np.float32) for ts in list_of_TS_promo]

    family_promotion_dict[family] = list_of_TS_promo

We can display the first TimeSeries of the first family :

In [33]:
# Inspect the first promotion series in the AUTOMOTIVE list.
display(family_promotion_dict['AUTOMOTIVE'][0])

Let’s go further by also calculating the 7-day and 28-day moving averages, as we did for the oil price:

In [35]:
# `tqdm` is already imported in the notebook's import cell, so it is not
# re-imported here.

# The filters only carry their window size, so they are built once instead of
# once per family.
promo_moving_average_7 = MovingAverageFilter(window=7)
promo_moving_average_28 = MovingAverageFilter(window=28)

# Maps each product family to its list of 3-component promotion covariates.
promotion_transformed_dict = {}

for key in tqdm(family_promotion_dict):
    promo_filler = MissingValuesFiller(verbose=False, n_jobs=-1, name="Fill NAs")
    promo_scaler = Scaler(verbose=False, n_jobs=-1, name="Scaling")

    promo_pipeline = Pipeline([promo_filler,
                               promo_scaler])

    promotion_transformed = promo_pipeline.fit_transform(family_promotion_dict[key])

    promotion_covs = []

    for ts in promotion_transformed:
        # Each moving average is rebuilt from its pandas Series before being
        # renamed - presumably to drop metadata carried by the filtered series
        # so that stacking works; TODO confirm.
        ma_7 = promo_moving_average_7.filter(ts)
        ma_7 = TimeSeries.from_series(ma_7.pd_series())
        ma_7 = ma_7.astype(np.float32)
        ma_7 = ma_7.with_columns_renamed(col_names=ma_7.components, col_names_new="promotion_ma_7")

        ma_28 = promo_moving_average_28.filter(ts)
        ma_28 = TimeSeries.from_series(ma_28.pd_series())
        ma_28 = ma_28.astype(np.float32)
        ma_28 = ma_28.with_columns_renamed(col_names=ma_28.components, col_names_new="promotion_ma_28")

        # Scaled promotion count followed by its two moving averages.
        promo_and_mas = ts.stack(ma_7).stack(ma_28)
        promotion_covs.append(promo_and_mas)

    promotion_transformed_dict[key] = promotion_covs

100%|██████████████████████████████████████████████████████████████████████████████████| 33/33 [00:46<00:00,  1.42s/it]


We obtain a normalized time series with 3 columns. We can display the index 1 of the first TimeSeries of the first family:

In [36]:
# Component names of the first AUTOMOTIVE covariate series, then its entry at position 1.
display(
    promotion_transformed_dict['AUTOMOTIVE'][0].components.values,
    promotion_transformed_dict['AUTOMOTIVE'][0][1],
)

array(['onpromotion', 'promotion_ma_7', 'promotion_ma_28'], dtype=object)