In [1]:
import numpy as np
import pandas as pd
import os
import glob
from tqdm import tqdm
import datetime
import gc
import random
# Let wide / long frames render fully in the notebook (up to 500 columns and rows).
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, 500)

In [2]:
# Load the aggregated input table from the working directory.
# Later cells read its `customer_id` and `YearWeek` columns.
AGG_CSV_PATH = "trn_art_agg_cust_ywk.csv"
data = pd.read_csv(AGG_CSV_PATH)

In [4]:
# Remap five-digit YearWeek codes (year followed by a single-digit week, e.g. 20191)
# to the zero-padded six-digit form (201901). Covers 20181 plus weeks 1-9 of 2019 and 2020.
fix_weeks = {20181: 201801}
fix_weeks.update({
    year * 10 + week: year * 100 + week
    for year in (2019, 2020)
    for week in range(1, 10)
})

data['YearWeek'] = data['YearWeek'].replace(fix_weeks)

In [5]:
def _hex_tail_to_int(raw_id):
    """Parse the last 16 characters of ``raw_id`` as a base-16 integer."""
    return int(raw_id[-16:], 16)


# Replace the string customer_id with an integer key derived from its hex suffix.
# NOTE(review): a 16-hex-digit value can exceed the signed int64 maximum; confirm how
# the astype('int64') step treats such values in the pandas version in use.
data['customer_id'] = data['customer_id'].apply(_hex_tail_to_int).astype('int64')

In [6]:
# Build a dense (customer, week) grid: every customer id paired with every week code
# in `all_weeks`, laid out customer-major (all weeks of customer 0, then customer 1, ...).
new_df = pd.DataFrame()

cust_ids = data.customer_id.unique()
cust_list = cust_ids.tolist()

# Week codes 201934..201952 followed by 202001..max(YearWeek) found in the data.
all_weeks = list(range(201934, 201953)) + list(range(202001, data.YearWeek.max() + 1))
# all_weeks = [i for i in range(data.YearWeek.min(), 201853)] + [i for i in range(201901, 201953)] + [i for i in range(202001, data.YearWeek.max()+1)]

# Vectorised construction: np.repeat gives each customer id len(all_weeks) times in a
# row, np.tile repeats the full week sequence once per customer. This yields the same
# element order as extending two Python lists inside a per-customer loop, without
# materialising tens of millions of Python int objects.
cust_week_list = np.repeat(cust_ids, len(all_weeks))
week_list = np.tile(np.asarray(all_weeks, dtype='int64'), len(cust_ids))

len(week_list), len(cust_week_list)

(77332413, 77332413)

In [7]:
# Assemble the grid frame from the two equally long sequences, then drop the
# standalone copies so only the frame keeps the data alive.
new_df = pd.DataFrame({
    'customer_id': cust_week_list,
    'YearWeek': week_list,
})

del cust_week_list, week_list

In [8]:
# Left-join the observed rows onto the full grid; grid cells with no matching row in
# `data` come back as NaN and are filled with 0.
grid_keys = ['customer_id', 'YearWeek']
new_df = new_df.merge(data, how='left', on=grid_keys).fillna(0)

# The source table is no longer needed; release it and trigger a collection pass.
del data
gc.collect()

0

In [9]:
# new_df.to_csv('trn_art_agg_cust_ywk_ts_1933.csv', index=False)

In [10]:
# Lag and rolling-mean feature engineering for the weekly model.
# NOTE: the `days` argument is a ROW offset within each customer's series, not calendar days.

def simple_feat_revenue(data, days, target):
    """Build per-customer lag and rolling-mean features for one target column.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'customer_id', 'YearWeek' and the ``target`` column.
        Shifts are positional within each customer, so rows are assumed to be
        in chronological order per customer -- TODO confirm at the call site.
    days : int
        Base offset, in rows, applied before any feature is computed. With one
        row per customer per week this is a number of weeks.
    target : str
        Name of the column to derive features from.

    Returns
    -------
    pandas.DataFrame
        A copy of the three input columns (original index preserved) plus
        'lag_ts0', 'lag_ts1', 'lag_ts2' (target shifted by days, days+1, days+2)
        and 'rolling_mean_t4', 'rolling_mean_t8', 'rolling_mean_t16',
        'rolling_mean_t32' (mean of the ``days``-shifted target over the
        trailing N rows; NaN until a full window is available).

    Progress messages are printed after each stage.
    """
    df = data[['customer_id', 'YearWeek', target]].copy()
    group = df.groupby('customer_id')[target]

    # Built-in grouped shift: same values as transform(lambda x: x.shift(n)) but
    # executed in compiled code instead of one Python call per customer.
    for extra in range(3):
        df['lag_ts{}'.format(extra)] = group.shift(days + extra)
    print('Lag FE Finished.')

    # Rolling windows run over the already shifted series. A positional RangeIndex is
    # used so the grouped result can be put back in row order whatever index the
    # caller's frame carries.
    shifted = pd.Series(df['lag_ts0'].to_numpy(), index=pd.RangeIndex(len(df)))
    shifted_by_customer = shifted.groupby(df['customer_id'].to_numpy(), sort=False)
    for window in (4, 8, 16, 32):
        rolled = shifted_by_customer.rolling(window).mean()
        # Result is indexed by (customer, position): drop the customer level and
        # realign to the original row positions before assigning.
        df['rolling_mean_t{}'.format(window)] = (
            rolled.droplevel(0).reindex(shifted.index).to_numpy()
        )
    print('Rolling FE Finished.')

    # Only rolling means are produced; std / skew / max variants can be added with the
    # same grouped-rolling pattern if they are needed.

    print('Feature Engineering Finished.')

    return df

In [11]:
# `null_counts` was deprecated in pandas 1.2 and removed in 2.0; `show_counts` is the
# replacement keyword and forces the non-null counts to be shown for this large frame.
new_df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77332413 entries, 0 to 77332412
Data columns (total 11 columns):
 #   Column                          Non-Null Count     Dtype  
---  ------                          --------------     -----  
 0   customer_id                     77332413 non-null  int64  
 1   YearWeek                        77332413 non-null  int64  
 2   Children Accessories, Swimwear  77332413 non-null  float64
 3   Children Sizes 134-170          77332413 non-null  float64
 4   Children Sizes 92-140           77332413 non-null  float64
 5   Divided                         77332413 non-null  float64
 6   Ladies Accessories              77332413 non-null  float64
 7   Ladieswear                      77332413 non-null  float64
 8   Lingeries/Tights                77332413 non-null  float64
 9   Menswear                        77332413 non-null  float64
 10  Sport                           77332413 non-null  float64
dtypes: float64(9), int64(2)
memory usage: 6.9 GB


In [12]:
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that holds their range.

    The frame is modified IN PLACE and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int16/int32/int64/float16/float32/float64 columns are
        candidates for downcasting. Columns of any other dtype are left alone.
    verbose : bool, default True
        Print the final memory footprint and the percentage saved.
    use_float16 : bool, default True
        Allow float columns to be stored as float16 when their min/max fit.
        float16 keeps only about three significant decimal digits, so pass
        False to stop at float32 when precision matters.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column whose extreme equals the dtype limit (e.g. 127
            # for int8) still fits. If min/max are NaN (empty column) no candidate
            # matches and the column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range does not fit a smaller float (or min/max is NaN/inf).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero starting footprint so the report never divides by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'
              .format(end_mem, reduction))
    return df

In [13]:
# new_df = reduce_mem_usage(new_df, verbose=True)

Mem. usage decreased to 2802.50 Mb (60.4% reduction)


In [13]:
# Revenue columns of the merged frame that can serve as feature-engineering targets.
target_list = [
    'Children Accessories, Swimwear',
    'Children Sizes 134-170',
    'Children Sizes 92-140',
    'Divided',
    'Ladies Accessories',
    'Ladieswear',
    'Lingeries/Tights',
    'Menswear',
    'Sport',
]

In [None]:
# Lag / rolling features for a single target column, using an offset of 1 row.
t1_df = simple_feat_revenue(
    data=new_df,
    days=1,
    target='Children Accessories, Swimwear',
)

In [None]:
# `null_counts` was deprecated in pandas 1.2 and removed in 2.0; `show_counts` is the
# replacement keyword. The non-null counts show how many leading rows per customer the
# lag / rolling features leave as NaN.
t1_df.info(show_counts=True)