In [1]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random
import time

warnings.filterwarnings('ignore')

In [2]:
def get_memory_usage():
    return np.round(psutil.Process(os.getpid()).memory_info()[0]/2.**30, 2) 
        
def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)

In [3]:
########################### Vars
#################################################################################
TARGET = 'sales'         # Our main target
END_TRAIN = 1941         # Last day in train set
MAIN_INDEX = ['id','d']  # We can identify item by these columns

In [4]:
#################################################################################
print('Load Main Data')

# We will need only train dataset
# to show lags concept
train_df = pd.read_csv('../data/sales_train_evaluation.csv')

# To make all calculations faster
# we will limit dataset by 'CA' state
train_df = train_df[train_df['state_id']=='CA']

Load Main Data


In [5]:
########################### Data Representation
#################################################################################

# Let's check our shape
print('Shape', train_df.shape)

Shape (12196, 1947)


In [6]:
index_columns = ['id','item_id','dept_id','cat_id','store_id','state_id']
train_df = pd.melt(train_df, 
                  id_vars = index_columns, 
                  var_name = 'd', 
                  value_name = TARGET)

train_df[train_df['id']=='HOBBIES_1_001_CA_1_evaluation'].iloc[:10]

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
12196,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_2,0
24392,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_3,0
36588,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_4,0
48784,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_5,0
60980,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_6,0
73176,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_7,0
85372,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_8,0
97568,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_9,0
109764,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_10,0


In [7]:
train_df['day'] = train_df['d'].apply(lambda x: x[2:]).astype(np.int16)
train_df = train_df[train_df['day']>703]
print(train_df.shape) 
train_df = train_df.iloc[:1000, :]
train_df.head()

(15098648, 9)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,day
8573788,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_704,0,704
8573789,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_704,1,704
8573790,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_704,0,704
8573791,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_704,0,704
8573792,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_704,1,704


In [8]:
train_df['d'] = train_df['d'].apply(lambda x: x[2:]).astype(np.int16)

icols = ['id','item_id','dept_id','cat_id','store_id','state_id']
for col in icols:
    train_df[col] = train_df[col].astype('category')

In [9]:
########################### Lags creation
#################################################################################

temp_df = train_df[['id', 'd', TARGET]]

start_time = time.time()
for i in range(1, 8):
    print('Shifting:', i)
    temp_df['lag_' + str(i)] = temp_df.groupby(
        ['id'])[TARGET].transform(lambda x: x.shift(i))

print('%0.2f min: Time for loops' % ((time.time() - start_time) / 60))

# Or same in "compact" manner
LAG_DAYS = [col for col in range(1, 8)]
temp_df = train_df[['id', 'd', TARGET]]

start_time = time.time()
temp_df = temp_df.assign(
    **{
        '{}_lag_{}'.format(col, l): temp_df.groupby(['id'])[col].transform(
            lambda x: x.shift(l))
        for l in LAG_DAYS for col in [TARGET]
    })

print('%0.2f min: Time for bulk shift' % ((time.time() - start_time) / 60))

Shifting: 1
Shifting: 2
Shifting: 3
Shifting: 4
Shifting: 5
Shifting: 6
Shifting: 7
0.03 min: Time for loops
0.03 min: Time for bulk shift


In [10]:
temp_df[temp_df['id']=='HOBBIES_1_001_CA_1_evaluation'].iloc[:10]

Unnamed: 0,id,d,sales,sales_lag_1,sales_lag_2,sales_lag_3,sales_lag_4,sales_lag_5,sales_lag_6,sales_lag_7
8573788,HOBBIES_1_001_CA_1_evaluation,704,0,,,,,,,


In [11]:
########################### Rolling lags
#################################################################################

temp_df = train_df[['id', 'd', 'sales']]

start_time = time.time()
for i in [14, 30, 60]:
    print('Rolling period:', i)
    temp_df['rolling_mean_' + str(i)] = temp_df.groupby(
        ['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).mean())
    temp_df['rolling_std_' + str(i)] = temp_df.groupby(
        ['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).std())

print('%0.2f min: Time for loop' % ((time.time() - start_time) / 60))

Rolling period: 14
Rolling period: 30
Rolling period: 60
0.06 min: Time for loop


In [12]:
# The result
temp_df[temp_df['id'] == 'HOBBIES_1_002_CA_1_evaluation'].iloc[:20]

# Same for NaNs values - it's normal
# because there is no data for
# 0*(rolling_period),-1*(rolling_period),-2*(rolling_period)

Unnamed: 0,id,d,sales,rolling_mean_14,rolling_std_14,rolling_mean_30,rolling_std_30,rolling_mean_60,rolling_std_60
8573789,HOBBIES_1_002_CA_1_evaluation,704,1,,,,,,


In [13]:
########################### Memory ussage
#################################################################################
print("{:>20}: {:>8}".format(
    'Original rolling df', sizeof_fmt(temp_df.memory_usage(index=True).sum())))

temp_df = temp_df.iloc[:, 3:]
print("{:>20}: {:>8}".format(
    'Values rolling df', sizeof_fmt(temp_df.memory_usage(index=True).sum())))

from scipy import sparse
temp_matrix = sparse.csr_matrix(temp_df)

# restore to df
temp_matrix_restored = pd.DataFrame(temp_matrix.todense())
restored_cols = ['roll_' + str(i) for i in list(temp_matrix_restored)]
temp_matrix_restored.columns = restored_cols

 Original rolling df: 114.2KiB
   Values rolling df:  54.7KiB


In [14]:
########################### Remove old objects
#################################################################################
del temp_df, train_df, temp_matrix, temp_matrix_restored

# 正式开始

In [15]:
grid_df = pd.read_pickle('../data/output/grid_part_1.pkl')

# We need only 'id','d','sales'
# to make lags and rollings
grid_df = grid_df[['id', 'd', 'sales']]
SHIFT_DAY = 28

start_time = time.time()
print('Create lags')

LAG_DAYS = [col for col in range(SHIFT_DAY, SHIFT_DAY + 15)]
grid_df = grid_df.assign(
    **{
        '{}_lag_{}'.format(col, l): grid_df.groupby(['id'])[col].transform(
            lambda x: x.shift(l))
        for l in LAG_DAYS for col in [TARGET]
    })

for col in list(grid_df):
    if 'lag' in col:
        grid_df[col] = grid_df[col].astype(np.float16)

print('%0.2f min: Lags' % ((time.time() - start_time) / 60))

start_time = time.time()
print('Create rolling aggs')

for i in [7, 14, 30, 60, 180]:
    print('Rolling period:', i)
    grid_df['rolling_mean_' + str(i)] = grid_df.groupby(
        ['id'])[TARGET].transform(
            lambda x: x.shift(SHIFT_DAY).rolling(i).mean()).astype(np.float16)
    grid_df['rolling_std_' + str(i)] = grid_df.groupby([
        'id'
    ])[TARGET].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).std()).astype(
        np.float16)

# Rollings
# with sliding shift
for d_shift in [1, 7, 14]:
    print('Shifting period:', d_shift)
    for d_window in [7, 14, 30, 60]:
        col_name = 'rolling_mean_tmp_' + str(d_shift) + '_' + str(d_window)
        grid_df[col_name] = grid_df.groupby(['id'])[TARGET].transform(
            lambda x: x.shift(d_shift).rolling(d_window).mean()).astype(
                np.float16)

print('%0.2f min: Lags' % ((time.time() - start_time) / 60))

Create lags
4.75 min: Lags
Create rolling aggs
Rolling period: 7
Rolling period: 14
Rolling period: 30
Rolling period: 60
Rolling period: 180
Shifting period: 1
Shifting period: 7
Shifting period: 14
12.20 min: Lags


In [16]:
#################################################################################
print('Save lags and rollings')
grid_df.to_pickle('../data/output/lags_df_' + str(SHIFT_DAY) + '.pkl')

Save lags and rollings


In [17]:
grid_df.tail()

Unnamed: 0,id,d,sales,sales_lag_28,sales_lag_29,sales_lag_30,sales_lag_31,sales_lag_32,sales_lag_33,sales_lag_34,...,rolling_mean_tmp_1_30,rolling_mean_tmp_1_60,rolling_mean_tmp_7_7,rolling_mean_tmp_7_14,rolling_mean_tmp_7_30,rolling_mean_tmp_7_60,rolling_mean_tmp_14_7,rolling_mean_tmp_14_14,rolling_mean_tmp_14_30,rolling_mean_tmp_14_60
47735392,FOODS_3_823_WI_3_evaluation,1969,,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,,,,,,,,,,
47735393,FOODS_3_824_WI_3_evaluation,1969,,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,
47735394,FOODS_3_825_WI_3_evaluation,1969,,2.0,0.0,1.0,0.0,1.0,0.0,2.0,...,,,,,,,,,,
47735395,FOODS_3_826_WI_3_evaluation,1969,,0.0,1.0,1.0,1.0,0.0,6.0,4.0,...,,,,,,,,,,
47735396,FOODS_3_827_WI_3_evaluation,1969,,1.0,5.0,2.0,2.0,0.0,4.0,5.0,...,,,,,,,,,,
