In [1]:
from  datetime import datetime, timedelta
import pandas as pd
import pickle
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
from downcast import reduce
from numba import jit
import time
import gc
from downcast import reduce
import random
import warnings
from math import ceil
from tqdm import trange
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
warnings.filterwarnings("ignore")

In [2]:
# Large dataset, use 2 methods to save memory
def reduce_mem_usage(df, verbose=True, method='downcast'):
    """Shrink a DataFrame's memory footprint by narrowing numeric dtypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. With the manual method the narrowed columns are
        assigned back onto ``df`` itself, so the caller's frame changes too.
    verbose : bool, default True
        Print the resulting size in MB, the percentage saved and the elapsed
        wall-clock time.
    method : str, default 'downcast'
        ``'downcast'`` delegates to ``downcast.reduce``. Any other value uses
        the manual min/max range check implemented below.

    Returns
    -------
    pandas.DataFrame
        The frame with narrowed dtypes.

    Notes
    -----
    * The former ``@jit`` decorator was removed: Numba cannot compile pandas
      DataFrame code, and with Numba >= 0.59 (nopython by default) the
      decorated function fails on its first call.
    * The manual method picks float widths from the value *range* only, so
      float16/float32 may lose precision.
    """
    start = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    if method == 'downcast':
        df = reduce(df)
    else:
        for col in df.columns:
            col_type = df[col].dtypes
            if col_type not in numerics:
                continue
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # Integers that fit nowhere (e.g. empty column -> NaN min/max)
                # are left untouched.
                candidates, limits_of, target = int_candidates, np.iinfo, None
            else:
                # Floats that fit no narrower type (including NaN/inf extremes)
                # fall back to float64.
                candidates, limits_of, target = float_candidates, np.finfo, np.float64
            for candidate in candidates:
                limits = limits_of(candidate)
                # Inclusive bounds: a value equal to the type's min/max still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            if target is not None:
                df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction) and time is '.format(end_mem, saved_pct)
              + str(time.time() - start))
    return df

In [3]:
# Use the latest ~3 years of data only: FIRST_DAY is the smallest day number `d` kept.
FIRST_DAY = 846

# Columns that are never used as model inputs (identifiers, date keys and the target).
# 'item_id', 'dept_id' and 'cat_id' are deliberately NOT listed, so they stay in as features.
remove_feature = ['id',
                  'state_id',
                  'store_id',
                  'date', 'wm_yr_wk', 'd', 'sales']

# Categorical candidates, minus anything excluded above.
# An order-preserving filter is used instead of a set difference so that the
# resulting list is identical on every run (set ordering of strings depends
# on the interpreter's hash seed).
cat_var = ['item_id', 'dept_id', 'store_id', 'cat_id', 'state_id',
           'event_name_1', 'event_name_2', 'event_type_1', 'event_type_2']
cat_var = [col for col in cat_var if col not in remove_feature]

In [4]:
# Columns taken from grid part 2 (price features).
grid2_colnm = [
    'sell_price',
    'price_max', 'price_min', 'price_std', 'price_mean', 'price_norm',
    'price_nunique', 'item_nunique',
    'price_momentum', 'price_momentum_m', 'price_momentum_y',
]

# Columns taken from grid part 3 (event, SNAP and time features).
grid3_colnm = [
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    'snap_CA', 'snap_TX', 'snap_WI',
    'tm_d', 'tm_w', 'tm_m', 'tm_y', 'tm_wm', 'tm_dw', 'tm_w_end',
]

# sales_lag_28 ... sales_lag_42, followed by rolling mean/std pairs per window.
lag_colnm = (
    [f'sales_lag_{shift}' for shift in range(28, 43)]
    + [f'rolling_{stat}_{window}'
       for window in (7, 14, 30, 60)
       for stat in ('mean', 'std')]
)

# Mean/std encodings for the two column pairs that are used.
mean_enc_colnm = [
    f'enc_{pair}_{stat}'
    for pair in ('store_id_dept_id', 'item_id_state_id')
    for stat in ('mean', 'std')
]

In [None]:
%%time

# Part 1 is loaded with all of its columns; parts 2 and 3 are trimmed to the
# column lists defined earlier as soon as they are read.
grid_1 = pd.read_pickle("E:/grid_part_1.pkl")
grid_2, grid_3 = (
    pd.read_pickle(pkl_path)[keep_cols]
    for pkl_path, keep_cols in (
        ("E:/grid_part_2.pkl", grid2_colnm),
        ("E:/grid_part_3.pkl", grid3_colnm),
    )
)

In [None]:
%%time

# Stitch the three grid parts together column-wise; rows are matched on index labels.
grid_df = pd.concat([grid_1, grid_2, grid_3], axis=1)
# NOTE(review): `day` holds on to the 'd' column; no later use is visible in this
# section of the notebook — confirm it is still needed.
day = grid_df['d']
# Drop the part frames and force a collection so their memory can be reclaimed.
del grid_1, grid_2, grid_3
gc.collect()

In [None]:
# Keep only rows whose day number is at or after FIRST_DAY, then release the full grid.
recent_rows = grid_df['d'] >= FIRST_DAY
traindata = grid_df.loc[recent_rows]
del grid_df, recent_rows
gc.collect()

In [None]:
%%time

# Read the lag/rolling feature columns and keep only the rows whose index
# label is also present in `traindata`.
lag = (
    pd.read_pickle("E:/lags_df_28.pkl")[lag_colnm]
    .loc[lambda frame: frame.index.isin(traindata.index)]
)
lag

In [None]:
# Append the lag/rolling features column-wise; rows are aligned on index labels.
traindata = pd.concat([traindata, lag], axis=1)
# Narrow dtypes straight away (default 'downcast' method). Kept as a separate
# statement so the pre-concat frame is already released while this runs.
traindata = reduce_mem_usage(traindata)
# NOTE: `lag` is deleted here, so this cell cannot be re-run without re-running
# the cell that loads it.
del lag
gc.collect()

In [None]:
%%time

# Read the selected mean-encoding columns.
mean_enc = pd.read_pickle("E:/mean_encoding_df.pkl")[mean_enc_colnm]
# Keep only rows whose index label is present in `traindata`.
mean_enc = mean_enc[mean_enc.index.isin(traindata.index)]
# Append them column-wise; rows are aligned on index labels.
traindata = pd.concat([traindata, mean_enc], axis=1)    
# Release the temporary frame (this also makes the cell non re-runnable on its own).
del mean_enc
gc.collect()

In [None]:
# Final dtype narrowing after all feature groups have been attached.
traindata = reduce_mem_usage(traindata)
# Cache the assembled frame on disk; a later cell reads this pickle back.
traindata.to_pickle('E:/traindata.pkl')

In [5]:
%%time

# Reload the cached training frame. reset_index(drop=True) discards the stored
# index labels and renumbers the rows from 0.
# NOTE(review): the frame was filtered on `d >= FIRST_DAY` before it was saved, so
# the new labels presumably no longer match row positions in the grid_part files.
traindata = pd.read_pickle("E:/traindata.pkl").reset_index(drop=True)

Wall time: 5.27 s


In [6]:
########################### Model params
#################################################################################
# LightGBM settings shared by every per-store model.
lgb_params = dict(
    # objective / metric
    boosting_type='gbdt',
    objective='tweedie',
    tweedie_variance_power=1.1,
    metric='rmse',
    # row sub-sampling, re-drawn every iteration
    subsample=0.5,
    subsample_freq=1,
    # tree shape and learning speed
    learning_rate=0.015,
    num_leaves=2 ** 8 - 1,
    min_data_in_leaf=2 ** 8 - 1,
    feature_fraction=0.5,
    max_bin=100,
    n_estimators=3000,
    boost_from_average=False,
    verbose=-1,
)

In [7]:
# Key into `validation` that selects which day window is used below.
public = 'public'

# Store identifiers: four CA stores, three TX stores, three WI stores.
STORES = [
    f'{state}_{number}'
    for state, store_count in (('CA', 4), ('TX', 3), ('WI', 3))
    for number in range(1, store_count + 1)
]

# Pairs of day numbers per split name.
validation = dict(
    cv=[1885, 1913],
    public=[1913, 1941],
    private=[1941, 1969],
)

In [9]:
########################### Train Models
#################################################################################

rmsse_bycv = dict()

print('cv : day', validation[public])

pred_list = []

# Day window of the selected split:
#   train      : d <= val_start
#   validation : val_start < d <= val_end
#   predict    : d > val_end
val_start, val_end = validation[public]

for store in STORES:
    print(store, 'start')
    start = time.time()
    data = traindata[traindata['store_id'] == store]

    # Every column that is not explicitly excluded is a model input.
    model_var = data.columns[~data.columns.isin(remove_feature)]

    tr_mask = data['d'] <= val_start
    vl_mask = (data['d'] > val_start) & (data['d'] <= val_end)
    pr_mask = data['d'] > val_end

    train_data = lgb.Dataset(data.loc[tr_mask, model_var], label=data.loc[tr_mask, 'sales'])
    valid_data = lgb.Dataset(data.loc[vl_mask, model_var], label=data.loc[vl_mask, 'sales'])

    # NOTE(review): `lgb_params` also sets 'n_estimators', an alias of the number
    # of boosting rounds; LightGBM is documented to prefer the value found in
    # params over `num_boost_round` — confirm which limit is intended.
    # NOTE(review): `verbose_eval` / `early_stopping_rounds` were removed from
    # lgb.train in LightGBM 4.0 (replaced by callbacks); this call targets the
    # older API that produced the recorded output.
    m_lgb = lgb.train(lgb_params, train_data, valid_sets=[valid_data], verbose_eval=50,
                      num_boost_round=1000, early_stopping_rounds=50)
    print('time: ' + str(time.time() - start))

    # Persist the model; the context manager guarantees the file is closed.
    model_name = 'non_recur_model_' + store + '.bin'
    with open(model_name, 'wb') as model_file:
        pickle.dump(m_lgb, model_file)

    # Write one wide CSV (rows = id, columns = d) per window.
    # `id` and `d` are taken from `data` itself, so each prediction stays on its
    # own row. The previous version indexed a freshly re-read grid_part_1 with an
    # undefined name (`indice`), and those positions would not have matched
    # `traindata` after its index was reset.
    for suffix, mask in (('valid', vl_mask), ('pred', pr_mask)):
        if not mask.any():
            print(f'{store}: no rows in the {suffix} window, nothing written')
            continue
        scored = data.loc[mask, ['id', 'd']].assign(
            y_pred=m_lgb.predict(data.loc[mask, model_var])
        )
        scored.pivot(index='id', columns='d', values='y_pred') \
              .to_csv(f'submission_onlystore_{store}_{public}_{suffix}.csv')
        del scored

    del data, train_data, valid_data, m_lgb
    gc.collect()



cv : day [1913, 1941]
CA_1 start
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's rmse: 2.41937
[100]	valid_0's rmse: 1.57197
[150]	valid_0's rmse: 1.16932
[200]	valid_0's rmse: 1.00757
[250]	valid_0's rmse: 0.930772
[300]	valid_0's rmse: 0.893
[350]	valid_0's rmse: 0.872678
[400]	valid_0's rmse: 0.861239
[450]	valid_0's rmse: 0.854146
[500]	valid_0's rmse: 0.848539
[550]	valid_0's rmse: 0.844525
[600]	valid_0's rmse: 0.840868
[650]	valid_0's rmse: 0.837782
[700]	valid_0's rmse: 0.835613
[750]	valid_0's rmse: 0.835508
Early stopping, best iteration is:
[712]	valid_0's rmse: 0.835013
time: 244.079669713974
CA_2 start
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's rmse: 2.09393
[100]	valid_0's rmse: 1.34701
[150]	valid_0's rmse: 0.991399
[200]	valid_0's rmse: 0.854831
[250]	valid_0's rmse: 0.791473
[300]	valid_0's rmse: 0.760152
[350]	valid_0's rmse: 0.743049
[400]	valid_0's rmse: 0.733763
[450]	valid_0's rmse: 0.727159
[500]	vali

In [33]:
import os


def _sum_store_predictions(template, files, day_cols):
    """Add per-store prediction CSVs onto a copy of a submission template.

    Parameters
    ----------
    template : pandas.DataFrame
        Frame indexed by ``id`` with one column per forecast day.
    files : list of str
        CSV files whose first column is ``id`` followed by the day columns.
    day_cols : list of str
        Column names given to the day columns (e.g. ``F1`` ... ``F28``).

    Returns
    -------
    pandas.DataFrame
        ``template`` plus the values of every file; ids absent from a file
        contribute 0.
    """
    total = template.copy()
    total.columns = day_cols
    for file in files:
        # Keep the id column plus the first len(day_cols) day columns.
        store_pred = pd.read_csv(file).iloc[:, :len(day_cols) + 1]
        store_pred.columns = ['id'] + day_cols
        aligned = store_pred.set_index('id').reindex(total.index)
        if aligned.isna().all().all():
            # Printed rather than warned: warnings are globally silenced in this notebook.
            print(f'WARNING: no id in {file} matches the submission template')
        total += aligned.fillna(0)
    return total


# Read the sample submission once and split it at row 30490.
# NOTE(review): the first 30490 rows are presumably the validation ids and the
# remaining rows the evaluation ids — confirm against the sample file.
sample_submission = pd.read_csv(
    'D:/HKUST/MAFS6010Z AI in Fintech/project3/sample_submission.csv'
).set_index('id')
submission_valid = sample_submission.iloc[:30490]
submission_pred = sample_submission.iloc[30490:]

# Per-store files written by the training loop into the working directory.
pri = sorted(a for a in os.listdir() if 'onlystore' in a)
valid = [a for a in pri if a.endswith('_valid.csv')]
pred = [a for a in pri if a.endswith('_pred.csv')]

fcol = [f'F{i}' for i in range(1, 29)]

# The previous version built the id list from an undefined name (`submission`);
# each part is now aligned against the ids of its own template.
sub_copy = _sum_store_predictions(submission_valid, valid, fcol)
sub_copy1 = _sum_store_predictions(submission_pred, pred, fcol)

# pd.concat is a function: it must be called with a list, not subscripted.
pd.concat([sub_copy, sub_copy1]).to_csv('E:/submission_nonrecursive_store.csv')