# 1. Loading Data

In [15]:
import sys
import gc
import os
import warnings
import pickle
import statsmodels.api as sm
from pylab import rcParams
import time
from  datetime import datetime, timedelta

import pandas as pd
from pandas.plotting import register_matplotlib_converters
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing, metrics

warnings.filterwarnings("ignore")

pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)

register_matplotlib_converters()
sns.set()

try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False

# # Google Colab trick to extend memory
# a = []
# while(1):
#     a.append('1')


## 1.1 Functions

In [16]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


def display_missing(df):    
    for col in df.columns.tolist():  
        if df[col].isnull().sum() != 0:
            print('{} column missing values: {}'.format(col, df[col].isnull().sum()))
    print('\n')
    


## 1.2 Loading data 


In [17]:
# Mount google drive
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')

In [18]:
# Setting directories where data is stored and ouptut dir
if IN_COLAB:
    DATA_GRID_INPUT_DIR = './drive/My Drive/Colab Notebooks' 
    DATA_OUTPUT_DIR = './drive/My Drive/Colab Notebooks'
    !ls './drive/My Drive/Colab Notebooks'
else:
    DATA_GRID_INPUT_DIR = '.'
    DATA_OUTPUT_DIR = '.'

In [19]:
print('Loading the data...')

train = pd.read_pickle(f'{DATA_GRID_INPUT_DIR}/train_direct_model.pkl')
val = pd.read_pickle(f'{DATA_GRID_INPUT_DIR}/val_direct_model.pkl')
test = pd.read_pickle(f'{DATA_GRID_INPUT_DIR}/test_direct_model.pkl')

Loading the data...


## 1.3 Init variables

In [20]:
h = 28 # Prediction horizon
MAX_LAGS = 200 # Max lags used
SEED = 7


In [21]:
LAST_TRAIN_DAY_DT = datetime(2016, 3, 20) # Sunday
LAST_TRAIN_VAL__DAY_DT = datetime(2016, 3, 27) # Sunday
FIRST_PRED_DAY_DT = datetime(2016, 4, 25) # Monday
TEST_SET_DAY = FIRST_PRED_DAY_DT - timedelta(days=1)
TRAINING_DAYS_NUM = 14 # Days for training

# 2. Fit& Predict

In [22]:
cat_feats = ['item_id', 'dept_id','store_id', 'cat_id', 'state_id']
useless_cols = ["id", "date", "sales","d", "wm_yr_wk", "weekday", "weights"] + \
                ['wday', 'month', 'year', 'snap_CA', 'snap_TX', 'snap_WI']  # may be is better to add snaps 
targets = ['y_' + str(i) for i in range(1, h + 1)]

train_cols = train.columns[~train.columns.isin(useless_cols + targets)]

In [23]:
MAX_ROUNDS = 5000

params = {
    'num_leaves': 80,
    'objective': 'regression',
    'min_data_in_leaf': 200,
    'learning_rate': 0.02,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'metric': 'rmse',
    'num_threads': 4,
    'seed': SEED
}


# Recursive model's params
# params = {
#             'boosting_type': 'gbdt',
#             'objective': 'tweedie',
#             'tweedie_variance_power': 1.1,
#             'metric': 'rmse',
#             'subsample': 0.5,
#             'subsample_freq': 1,
#             'learning_rate': 0.03,
#             'num_leaves': 2**11-1,
#             'min_data_in_leaf': 2**12-1,
#             'feature_fraction': 0.5,
#             'max_bin': 100,
#             'n_estimators': 1400,
#             'boost_from_average': False,
#             'verbose': 1,
#             'n_jobs': 4, # For local computation optimization
#             'seed': SEED,
# } 


In [24]:
%%time

# Setting train, val and test sets
# Converting to numpy for optimal memory usage
X_train_np = train[train_cols].values.astype(np.float16)
X_val_np = val[train_cols].values.astype(np.float16)
X_test_np = test[train_cols].values.astype(np.float16)

# Initializing dataframe for later submission
final_preds = pd.DataFrame(test['id'])

# Training a models separate for each day
for day in range(1, h + 1):
#for day in range(1, 2):
    # Updating y's for every iteration
    y_train_np = train['y_' + str(day)].values.astype(np.float16)
    y_val_np = val['y_' + str(day)].values.astype(np.float16)
    
    train_data = lgb.Dataset(X_train_np, label = y_train_np, feature_name = list(train_cols), categorical_feature=cat_feats, free_raw_data=False)
    val_data = lgb.Dataset(X_val_np, label = y_val_np, feature_name = list(train_cols), categorical_feature=cat_feats, free_raw_data=False)
    
    print(f"Training day {day}:")
    
    m_lgb = lgb.train(params, train_data, valid_sets = [train_data, val_data], 
                  verbose_eval=50, early_stopping_rounds=125, num_boost_round=MAX_ROUNDS)
    
    
    feature_importance = pd.DataFrame({"Value": m_lgb.feature_importance("gain"), "Feature": m_lgb.feature_name()}) \
                    .sort_values(by="Value", ascending=False)
    
    # Printing 50 most important features
    print(feature_importance.iloc[0:50])
    
    final_preds[f'F{day}'] = m_lgb.predict(X_test_np)
    
        # May be we will need it to intermediate check of performance
#     val_preds.append(m_lgb.predict(
#         X_valid_np, num_iteration=m_lgb.best_iteration or MAX_ROUNDS))
    

final_preds.to_pickle(f'{DATA_GRID_INPUT_DIR}/final_preds.pkl', protocol=3)

Training day 1:
Training until validation scores don't improve for 125 rounds
[50]	training's rmse: 2.17246	valid_1's rmse: 1.86153
[100]	training's rmse: 1.92269	valid_1's rmse: 1.75665
[150]	training's rmse: 1.85146	valid_1's rmse: 1.77334
[200]	training's rmse: 1.8184	valid_1's rmse: 1.79038
Early stopping, best iteration is:
[101]	training's rmse: 1.92059	valid_1's rmse: 1.75616
            Value                  Feature
55   2.110292e+07            mean_14_decay
46   1.167547e+07                   mean_7
48   6.143324e+06             mean_7_decay
62   5.966830e+06            mean_30_decay
53   2.796368e+06                  mean_14
0    1.262416e+06                  item_id
107  9.412002e+05            mean_20_dow_3
39   7.186943e+05                   mean_3
10   6.747928e+05                   lag_t0
56   6.557093e+05                median_14
41   3.056819e+05             mean_3_decay
37   2.969532e+05                  lag_t27
67   2.221545e+05                  mean_60
50   2.03122

Training day 5:
Training until validation scores don't improve for 125 rounds
[50]	training's rmse: 2.30076	valid_1's rmse: 2.48871
[100]	training's rmse: 2.05721	valid_1's rmse: 2.19903
[150]	training's rmse: 1.9705	valid_1's rmse: 2.13174
[200]	training's rmse: 1.92278	valid_1's rmse: 2.11181
[250]	training's rmse: 1.89036	valid_1's rmse: 2.1042
[300]	training's rmse: 1.86514	valid_1's rmse: 2.10014
[350]	training's rmse: 1.84301	valid_1's rmse: 2.09731
[400]	training's rmse: 1.82204	valid_1's rmse: 2.09614
[450]	training's rmse: 1.80313	valid_1's rmse: 2.09463
[500]	training's rmse: 1.78569	valid_1's rmse: 2.09425
[550]	training's rmse: 1.7681	valid_1's rmse: 2.09375
[600]	training's rmse: 1.75138	valid_1's rmse: 2.09284
[650]	training's rmse: 1.73542	valid_1's rmse: 2.09416
[700]	training's rmse: 1.7202	valid_1's rmse: 2.09401
Early stopping, best iteration is:
[606]	training's rmse: 1.7494	valid_1's rmse: 2.09226
            Value                    Feature
62   2.295778e+07      

Training day 8:
Training until validation scores don't improve for 125 rounds
[50]	training's rmse: 2.2453	valid_1's rmse: 2.25456
[100]	training's rmse: 2.01449	valid_1's rmse: 2.05092
[150]	training's rmse: 1.94545	valid_1's rmse: 2.01915
[200]	training's rmse: 1.91118	valid_1's rmse: 2.01423
[250]	training's rmse: 1.88707	valid_1's rmse: 2.01201
[300]	training's rmse: 1.86693	valid_1's rmse: 2.01243
[350]	training's rmse: 1.84813	valid_1's rmse: 2.013
Early stopping, best iteration is:
[269]	training's rmse: 1.87915	valid_1's rmse: 2.01185
            Value                  Feature
62   2.358814e+07            mean_30_decay
55   8.208304e+06            mean_14_decay
46   5.803326e+06                   mean_7
0    2.912917e+06                  item_id
69   2.873630e+06            mean_60_decay
76   1.805306e+06           mean_140_decay
107  1.597744e+06            mean_20_dow_3
63   1.091096e+06                median_30
48   6.438931e+05             mean_7_decay
77   5.388271e+05    

Training day 11:
Training until validation scores don't improve for 125 rounds
[50]	training's rmse: 2.27657	valid_1's rmse: 2.24616
[100]	training's rmse: 2.05711	valid_1's rmse: 2.05898
[150]	training's rmse: 1.98995	valid_1's rmse: 2.03344
[200]	training's rmse: 1.9548	valid_1's rmse: 2.02997
[250]	training's rmse: 1.92984	valid_1's rmse: 2.02933
[300]	training's rmse: 1.90753	valid_1's rmse: 2.02752
[350]	training's rmse: 1.88719	valid_1's rmse: 2.02843
[400]	training's rmse: 1.86762	valid_1's rmse: 2.02931
Early stopping, best iteration is:
[298]	training's rmse: 1.9085	valid_1's rmse: 2.0273
            Value                  Feature
62   2.261522e+07            mean_30_decay
69   5.380068e+06            mean_60_decay
111  4.154819e+06            mean_20_dow_5
76   3.974813e+06           mean_140_decay
0    3.157822e+06                  item_id
55   2.634253e+06            mean_14_decay
115  1.197111e+06            mean_20_dow_7
53   1.120347e+06                  mean_14
107  8.7

Training day 14:
Training until validation scores don't improve for 125 rounds
[50]	training's rmse: 2.75751	valid_1's rmse: 2.77442
[100]	training's rmse: 2.43442	valid_1's rmse: 2.49247
[150]	training's rmse: 2.3421	valid_1's rmse: 2.44211
[200]	training's rmse: 2.29629	valid_1's rmse: 2.42898
[250]	training's rmse: 2.2633	valid_1's rmse: 2.42392
[300]	training's rmse: 2.23565	valid_1's rmse: 2.42077
[350]	training's rmse: 2.2104	valid_1's rmse: 2.41997
[400]	training's rmse: 2.18672	valid_1's rmse: 2.41899
[450]	training's rmse: 2.16493	valid_1's rmse: 2.4185
[500]	training's rmse: 2.1445	valid_1's rmse: 2.41756
[550]	training's rmse: 2.1251	valid_1's rmse: 2.41943
[600]	training's rmse: 2.10614	valid_1's rmse: 2.42094
Early stopping, best iteration is:
[492]	training's rmse: 2.14785	valid_1's rmse: 2.41712
            Value                  Feature
62   4.384005e+07            mean_30_decay
69   6.676236e+06            mean_60_decay
76   6.495060e+06           mean_140_decay
0    6

Training day 17:
Training until validation scores don't improve for 125 rounds
[50]	training's rmse: 2.13037	valid_1's rmse: 1.95788
[100]	training's rmse: 1.92308	valid_1's rmse: 1.79522
[150]	training's rmse: 1.85943	valid_1's rmse: 1.77171
[200]	training's rmse: 1.82708	valid_1's rmse: 1.76792
[250]	training's rmse: 1.80233	valid_1's rmse: 1.76732
[300]	training's rmse: 1.78197	valid_1's rmse: 1.76564
[350]	training's rmse: 1.76296	valid_1's rmse: 1.76512
[400]	training's rmse: 1.74612	valid_1's rmse: 1.76613
[450]	training's rmse: 1.7301	valid_1's rmse: 1.76643
Early stopping, best iteration is:
[330]	training's rmse: 1.77049	valid_1's rmse: 1.76487
            Value                  Feature
62   1.466896e+07            mean_30_decay
69   7.478102e+06            mean_60_decay
76   5.645083e+06           mean_140_decay
77   3.558082e+06               median_140
0    3.241163e+06                  item_id
55   1.950761e+06            mean_14_decay
53   1.834375e+06                  me

Training day 20:
Training until validation scores don't improve for 125 rounds
[50]	training's rmse: 2.88501	valid_1's rmse: 2.54214
[100]	training's rmse: 2.53212	valid_1's rmse: 2.26201
[150]	training's rmse: 2.42558	valid_1's rmse: 2.22074
[200]	training's rmse: 2.3732	valid_1's rmse: 2.21045
[250]	training's rmse: 2.3357	valid_1's rmse: 2.20328
[300]	training's rmse: 2.30532	valid_1's rmse: 2.19704
[350]	training's rmse: 2.27812	valid_1's rmse: 2.18957
[400]	training's rmse: 2.2519	valid_1's rmse: 2.18446
[450]	training's rmse: 2.22806	valid_1's rmse: 2.18031
[500]	training's rmse: 2.20634	valid_1's rmse: 2.17729
[550]	training's rmse: 2.18564	valid_1's rmse: 2.17519
[600]	training's rmse: 2.16575	valid_1's rmse: 2.17275
[650]	training's rmse: 2.14636	valid_1's rmse: 2.1697
[700]	training's rmse: 2.1276	valid_1's rmse: 2.16881
[750]	training's rmse: 2.10963	valid_1's rmse: 2.16616
[800]	training's rmse: 2.09295	valid_1's rmse: 2.16384
[850]	training's rmse: 2.07665	valid_1's rmse: 

Training until validation scores don't improve for 125 rounds
[50]	training's rmse: 2.17333	valid_1's rmse: 2.0866
[100]	training's rmse: 1.95813	valid_1's rmse: 1.88897
[150]	training's rmse: 1.89219	valid_1's rmse: 1.85591
[200]	training's rmse: 1.85921	valid_1's rmse: 1.8505
[250]	training's rmse: 1.83559	valid_1's rmse: 1.84888
[300]	training's rmse: 1.81539	valid_1's rmse: 1.84907
[350]	training's rmse: 1.79658	valid_1's rmse: 1.84984
[400]	training's rmse: 1.77895	valid_1's rmse: 1.85169
Early stopping, best iteration is:
[296]	training's rmse: 1.81698	valid_1's rmse: 1.84855
            Value                   Feature
69   1.728956e+07             mean_60_decay
76   8.650682e+06            mean_140_decay
62   6.302900e+06             mean_30_decay
0    3.665562e+06                   item_id
77   2.839659e+06                median_140
109  2.161836e+06             mean_20_dow_4
53   1.275114e+06                   mean_14
111  8.888394e+05             mean_20_dow_5
56   7.162991e+

Training until validation scores don't improve for 125 rounds
[50]	training's rmse: 2.36855	valid_1's rmse: 2.15407
[100]	training's rmse: 2.1002	valid_1's rmse: 1.9407
[150]	training's rmse: 2.02007	valid_1's rmse: 1.91539
[200]	training's rmse: 1.98031	valid_1's rmse: 1.91043
[250]	training's rmse: 1.95171	valid_1's rmse: 1.90873
[300]	training's rmse: 1.92859	valid_1's rmse: 1.90764
[350]	training's rmse: 1.90723	valid_1's rmse: 1.90592
[400]	training's rmse: 1.88703	valid_1's rmse: 1.90493
[450]	training's rmse: 1.86822	valid_1's rmse: 1.90398
[500]	training's rmse: 1.84984	valid_1's rmse: 1.9031
[550]	training's rmse: 1.83272	valid_1's rmse: 1.90121
[600]	training's rmse: 1.81584	valid_1's rmse: 1.9007
[650]	training's rmse: 1.8	valid_1's rmse: 1.90154
Early stopping, best iteration is:
[570]	training's rmse: 1.82586	valid_1's rmse: 1.90014
            Value                  Feature
76   1.528556e+07           mean_140_decay
62   1.401333e+07            mean_30_decay
69   9.665527

In [25]:
print(y_train_np.shape)
X_train_np.shape

(426725,)


(426725, 116)

In [26]:
# Just to be sure that we doing all right, loading sample submission file
sample_submission = pd.read_csv(f'{DATA_GRID_INPUT_DIR}/sample_submission.csv')
sample_submission = pd.DataFrame(sample_submission['id'])

# Again to be sure everything is fine, we create evaluation items
final_preds_eval = final_preds.copy()
for i in range(1, h + 1):
    final_preds_eval[f'F{i}'] = 0
final_preds_eval['id'] = final_preds_eval['id'].str.replace('validation', 'evaluation')

# Merge final_preds
final_preds = pd.concat([final_preds, final_preds_eval], axis=0)

final_submission = sample_submission.merge(final_preds, on=['id'], copy=False)

final_submission.to_csv(f"{DATA_OUTPUT_DIR}/submission_direct.csv",index=False)

In [27]:
os.system('say "Training complete"')

0

In [28]:
# m_lgb.save_model(f'{DATA_OUTPUT_DIR}/model.lgb')
# m_lgb = lgb.Booster(model_file=f'{DATA_OUTPUT_DIR}/model.lgb')