In [13]:
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random

import time

# Suppress every Python warning for the rest of the session
# (no category or module filter is given, so nothing is exempt).
warnings.filterwarnings('ignore')

In [14]:
from razor.api import project_space_path
import razor


In [15]:
def get_memory_usage():
    """Return the current process's memory figure in GiB, rounded to 2 decimals.

    The value is the first field of ``psutil``'s ``memory_info()`` for this
    process (presumably the resident set size in bytes - confirm against the
    psutil docs), divided by 2**30.
    """
    this_process = psutil.Process(os.getpid())
    first_field_bytes = this_process.memory_info()[0]
    return np.round(first_field_bytes / 2.**30, 2)
        
def sizeof_fmt(num, suffix='B'):
    """Format a quantity with binary (1024-based) unit prefixes.

    Args:
        num: The number to format (e.g. a size in bytes).
        suffix: Text appended after the unit prefix; defaults to 'B'.

    Returns:
        A string with one decimal place, e.g. ``'1.5KiB'``. Values too large
        for the 'Zi' prefix are reported with 'Yi'.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    for prefix in prefixes:
        # Stop at the first prefix for which the magnitude drops below 1024.
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, prefix, suffix)
        num /= 1024.0
    # Ran out of prefixes: whatever is left is expressed in 'Yi'.
    return "%.1f%s%s" % (num, 'Yi', suffix)

In [16]:
# Our main target column
TARGET = 'sales'
# Last day in the train set
END_TRAIN = 1913
# Columns that together identify an item row
MAIN_INDEX = ['id', 'd']

In [17]:
print('Load Main Data')

# Only the train dataset is needed to show the lags concept.
# To keep every later calculation fast, the frame is limited
# to rows of the 'CA' state right after loading.
train_df = (
    pd.read_csv(project_space_path('M5Forecasting/sales_train_validation.csv'))
      .loc[lambda frame: frame['state_id'] == 'CA']
)


Load Main Data


In [18]:
# Sanity check: (rows, columns) of the frame after restricting it to 'CA'.
train_df.shape

(12196, 1919)

In [19]:
# Preview the first ten rows of the wide (one column per day) frame.
train_df.head(10)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4
5,HOBBIES_1_006_CA_1_validation,HOBBIES_1_006,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,0,1,0,0,0,2,0,0
6,HOBBIES_1_007_CA_1_validation,HOBBIES_1_007,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,1,0,1,0,0,1,1
7,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,12,15,0,0,...,0,0,1,37,3,4,6,3,2,1
8,HOBBIES_1_009_CA_1_validation,HOBBIES_1_009,HOBBIES_1,HOBBIES,CA_1,CA,2,0,7,3,...,0,0,1,1,6,0,0,0,0,0
9,HOBBIES_1_010_CA_1_validation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,0,0,1,0,...,1,0,0,0,0,0,0,2,0,2


In [None]:
# index_columns = ['id','item_id','dept_id','cat_id','store_id','state_id']
# train_df = pd.melt(train_df, 
#                   id_vars = index_columns, 
#                   var_name = 'd', 
#                   value_name = TARGET)

# train_df[train_df['id']=='HOBBIES_1_001_CA_1_validation'].iloc[:10]


In [None]:
import razor.flow as rf
import typing as t
import logging
import pandas as pd
from razor.api import project_space_path


In [None]:
# train_df.to_csv(project_space_path('M5Forecasting/train_df.csv'), index=False)



In [None]:
@rf.block(executor=rf.ContainerExecutor(cores=2, memory=8192))
class HeavyOps():
    """Pipeline block that adds 1- to 7-day lag columns of TARGET per ``id``.

    The input CSV must contain the columns ``id``, ``d`` and TARGET. The
    result (those three columns plus one ``<TARGET>_lag_<n>`` column per lag)
    is written to ``M5Forecasting/temp_df.csv`` in the project space.
    """
    # Path of the input CSV, resolved through project_space_path().
    path: t.Any

    def run(self):
        """Load the input, compute the lag columns, report timing, save the result."""
        # The input location is a block attribute, so it has to be read
        # from self; a bare `path` is an undefined name here.
        train_df = pd.read_csv(project_space_path(self.path))

        # Or same in "compact" manner
        LAG_DAYS = list(range(1, 8))
        temp_df = train_df[['id','d',TARGET]]

        start_time = time.time()
        # Each lambda is invoked immediately by transform() while the
        # comprehension is still on the matching `l`, so late binding of the
        # loop variable is not an issue.
        temp_df = temp_df.assign(**{
                '{}_lag_{}'.format(col, l): temp_df.groupby(['id'])[col].transform(lambda x: x.shift(l))
                for l in LAG_DAYS
                for col in [TARGET]
            })

        # Parenthesise before dividing: `now - start / 60` would divide only
        # the start timestamp and yield a meaningless number.
        elapsed_min = (time.time() - start_time) / 60
        # Pass the value as a logging argument so the %0.2f placeholder is filled.
        logging.info('%0.2f min: Time for bulk shift', elapsed_min)
        print('%0.2f min: Time for bulk shift' % elapsed_min)

        temp_df.to_csv(project_space_path('M5Forecasting/temp_df.csv'), index=False)
        

In [None]:
# Instantiate the lag block on the CSV stored in the project space.
heavy_ops = HeavyOps(
    path='M5Forecasting/train_df.csv',
)



In [None]:
# Pipeline whose only target is the heavy_ops block.
p = rf.Pipeline(
    "Heavy Operation",
    targets=[heavy_ops],
)


In [None]:
# Display the pipeline built above (presumably as a graph of its blocks - TODO confirm).
p.show()

In [None]:
# Run the pipeline on the 'DS-engine' engine and keep the returned handle.
deployed_pipeline = (
    razor.api.engines('DS-engine')
    .execute(pipeline=p)
)


In [None]:
# Show the handle returned by execute() as the cell output.
deployed_pipeline

In [None]:
# Read the result back from the same project-space location HeavyOps.run
# writes it to; a bare relative path would be resolved against the kernel's
# working directory instead.
temp_df = pd.read_csv(project_space_path('M5Forecasting/temp_df.csv'))




# Original Lag Code

In [None]:
@rf.block(executor=rf.ContainerExecutor(cores=2, memory=20000))
class LagCreator:
    """Pipeline block that derives lag and rolling-window features of sales.

    For every ``id`` it produces:

    * ``sales_lag_28`` ... ``sales_lag_42``: the target shifted by 28-42 rows;
    * ``rolling_mean_<w>`` / ``rolling_std_<w>`` for w in 7, 14, 30, 60, 180,
      computed on the target shifted by 28 rows;
    * ``rolling_mean_tmp_<s>_<w>`` for shifts s in 1, 7, 14 and windows w in
      7, 14, 30, 60.

    All generated columns are stored as float16. The result is pickled to
    ``M5Forecasting/lags_df_28.pkl`` in the project space.
    """
    # Path of the pickled input grid, resolved through project_space_path().
    grid_path: t.Any

    def run(self):
        """Load the grid, build lag and rolling features, and pickle the result."""
        ########################### Apply on grid_df

        grid_df = pd.read_pickle(project_space_path(self.grid_path))

        TARGET = 'sales'         # Our main target
        MAIN_INDEX = ['id','d']  # We can identify item by these columns

        # We need only 'id','d','sales'
        # to make lags and rollings
        grid_df = grid_df[MAIN_INDEX + [TARGET]]
        SHIFT_DAY = 28

        # Lags
        # with 28 day shift
        start_time = time.time()
        self.logger.info('Create lags')

        LAG_DAYS = list(range(SHIFT_DAY, SHIFT_DAY + 15))
        # Each lambda runs inside transform() while the comprehension is still
        # on the matching `l`, so late binding of the loop variable is harmless.
        grid_df = grid_df.assign(**{
                '{}_lag_{}'.format(col, l): grid_df.groupby(['id'])[col].transform(lambda x: x.shift(l))
                for l in LAG_DAYS
                for col in [TARGET]
            })

        # Minify lag columns
        for col in list(grid_df):
            if 'lag' in col:
                grid_df[col] = grid_df[col].astype(np.float16)

        self.logger.info('%0.2f min: Lags' % ((time.time() - start_time) / 60))

        # Rollings
        # with 28 day shift
        start_time = time.time()
        self.logger.info('Create rolling aggs')

        for i in [7,14,30,60,180]:
            self.logger.info(f'Rolling period: {i}')
            grid_df['rolling_mean_'+str(i)] = grid_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).mean()).astype(np.float16)
            grid_df['rolling_std_'+str(i)]  = grid_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).std()).astype(np.float16)

        # Rollings
        # with sliding shift
        for d_shift in [1,7,14]:
            self.logger.info(f'Shifting period: {d_shift}')
            for d_window in [7,14,30,60]:
                col_name = 'rolling_mean_tmp_'+str(d_shift)+'_'+str(d_window)
                grid_df[col_name] = grid_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(d_shift).rolling(d_window).mean()).astype(np.float16)

        # This timer was restarted before the rolling features, so label it
        # accordingly instead of repeating the 'Lags' label used above.
        self.logger.info('%0.2f min: Rollings' % ((time.time() - start_time) / 60))

        self.logger.info(f'Save lags and rollings  {grid_df.shape}')
        grid_df.to_pickle(project_space_path(f'M5Forecasting/lags_df_{SHIFT_DAY}.pkl'))


In [None]:
# Instantiate the lag/rolling feature block on the first pickled grid part.
lag_creator = LagCreator(
    grid_path='M5Forecasting/m5-simple-fe/grid_part_1.pkl',
)


In [None]:
# Pipeline whose only target is the lag_creator block.
p = rf.Pipeline(
    "lag_creator",
    targets=[lag_creator],
)


In [None]:
# Display the pipeline built above (presumably as a graph of its blocks - TODO confirm).
p.show()

In [37]:
# Run the pipeline on the 'DS-engine' engine and keep the returned handle.
deployed_pipeline = (
    razor.api.engines('DS-engine')
    .execute(pipeline=p)
)


In [46]:
# Show the handle returned by execute(); the recorded output is a RunMonitor object.
deployed_pipeline

<razor_tools.backend.ipython.mime.run_monitor.RunMonitor at 0x7f15b54f3410>