In [1]:
import numpy as np
import pandas as pd
import os
import time
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
import gc
import pickle
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the pre-computed feature table from disk.
process_df = pd.read_pickle('processed.pkl')

# Columns that are never passed to the model:
#   - identifiers and grouping keys (one model is fitted per store/department
#     pair, so store_id and dept_id are constant inside each training subset),
#   - calendar keys that are only used for filtering and splitting,
#   - 'sales', which is the prediction target, and 'revenue'.
unused_features = [
    'id', 'state_id', 'store_id', 'cat_id', 'dept_id',
    'date', 'wm_yr_wk', 'd',
    'sales', 'revenue',
]

# Every remaining column is a training feature; the result stays a pandas
# Index in the original column order.
keep_mask = ~process_df.columns.isin(unused_features)
used_features = process_df.columns[keep_mask]

# Smallest value of the `d` column that is kept; 0 keeps every row with d >= 0.
FIRST_DAY = 0

# One model is trained for every (store, department) combination below.
STORES = [
    'CA_1', 'CA_2', 'CA_3', 'CA_4',
    'TX_1', 'TX_2', 'TX_3',
    'WI_1', 'WI_2', 'WI_3',
]
DEPTS = [
    'HOBBIES_1', 'HOBBIES_2',
    'HOUSEHOLD_1', 'HOUSEHOLD_2',
    'FOODS_1', 'FOODS_2', 'FOODS_3',
]

def prepare_data(df, store, state):
    """Select the rows of ``df`` that belong to one store/department pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``d``, ``store_id`` and ``dept_id``.
        ``d`` is compared against the integer ``FIRST_DAY``, so it is assumed
        to be numeric -- TODO confirm against the preprocessing step.
    store : str
        Value matched against ``store_id``.
    state : str
        Value matched against ``dept_id``. Despite its name this is a
        department id (e.g. ``'FOODS_1'``), not a state.

    Returns
    -------
    pandas.DataFrame
        Rows with ``d >= FIRST_DAY`` for the requested store and department,
        keeping their original index labels. ``df`` itself is not modified.
    """
    late_enough = df['d'] >= FIRST_DAY
    in_store = df['store_id'] == store
    in_dept = df['dept_id'] == state
    return df[late_enough & in_store & in_dept]

# LightGBM training configuration shared by every store/department model.
params = dict(
    boosting_type='gbdt',
    metric='rmse',            # metric reported for the evaluation sets
    objective='regression',
    n_jobs=-1,
    seed=5013,                # fixed seed for reproducible runs
    learning_rate=0.01,
    bagging_fraction=0.75,    # row subsampling ratio ...
    bagging_freq=10,          # ... re-drawn every 10 iterations
    colsample_bytree=0.75,    # column subsampling ratio per tree
    force_col_wise=True,
)

# Directory that receives one pickled model per store/department pair.
model_dir = './models_store_dept/'

In [11]:
# Calendar split: rows dated up to TRAIN_END are used for fitting, the 28 days
# after it (up to and including VAL_END) are held out for early stopping.
TRAIN_END = '2016-03-27'
VAL_END = '2016-04-24'

# Boosting budget and logging cadence.
MAX_BOOST_ROUNDS = 2500
EARLY_STOPPING_ROUNDS = 50
LOG_EVERY = 100

# The `early_stopping_rounds` / `verbose_eval` keyword arguments of lgb.train
# were removed in LightGBM 4.0, so the equivalent callbacks are used instead.
# `log_evaluation` was called `print_evaluation` before LightGBM 3.3; pick
# whichever this installation provides.
make_log_callback = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

# Create the output directory up front so a finished model is never lost to a
# FileNotFoundError at save time.
os.makedirs(model_dir, exist_ok=True)

for store in STORES:
    # NOTE: `state` iterates over department ids (DEPTS), not states; the name
    # is kept because loop variables are notebook-level names.
    for state in DEPTS:
        print('starting:',store,state)

        # get train and val data
        train_df = prepare_data(process_df,store,state)
        x_train = train_df[train_df['date'] <= TRAIN_END]
        y_train = x_train['sales']
        x_val = train_df[(train_df['date'] > TRAIN_END) & (train_df['date'] <= VAL_END)]
        y_val = x_val['sales']

        # create train and val dataset
        lgb_train = lgb.Dataset(x_train[used_features], y_train)
        lgb_val = lgb.Dataset(x_val[used_features], y_val)

        # model training
        # Callbacks are rebuilt for every model: the early-stopping callback
        # keeps per-run state and must not be shared between training runs.
        model = lgb.train(
            params,
            lgb_train,
            num_boost_round=MAX_BOOST_ROUNDS,
            valid_sets=[lgb_train, lgb_val],
            callbacks=[
                lgb.early_stopping(EARLY_STOPPING_ROUNDS),
                make_log_callback(LOG_EVERY),
            ],
        )

        # store model
        # The context manager closes (and flushes) the file even if pickling
        # fails; previously the handle was left for the garbage collector.
        model_name = store+'_'+state+'.bin'
        with open(os.path.join(model_dir, model_name), 'wb') as model_file:
            pickle.dump(model, model_file)

        # Release the per-pair frames before the next iteration to keep peak
        # memory down.
        del train_df, x_train, y_train, x_val, y_val, lgb_train, lgb_val
        gc.collect()

starting: CA_1 HOBBIES_1
[LightGBM] [Info] Total Bins 3628
[LightGBM] [Info] Number of data points in the train set: 784160, number of used features: 30
[LightGBM] [Info] Start training from score 0.029370
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 2.73121	valid_1's rmse: 2.60627
[200]	training's rmse: 2.60479	valid_1's rmse: 2.48915
[300]	training's rmse: 2.57261	valid_1's rmse: 2.46351
[400]	training's rmse: 2.55553	valid_1's rmse: 2.45461
[500]	training's rmse: 2.54111	valid_1's rmse: 2.45244
Early stopping, best iteration is:
[510]	training's rmse: 2.53983	valid_1's rmse: 2.4522
starting: CA_1 HOBBIES_2
[LightGBM] [Info] Total Bins 2714
[LightGBM] [Info] Number of data points in the train set: 280865, number of used features: 30
[LightGBM] [Info] Start training from score -1.647169
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 0.723755	valid_1's rmse: 0.713867
[200]	training's rmse: 0.701454	valid_1's rm