In [1]:
import numpy as np
import pandas as pd
import os
import time
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
import gc
import pickle
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings("ignore")

In [7]:
# --- experiment constants ---------------------------------------------------
# Lower bound on the 'd' column; prepare_data keeps rows with d >= FIRST_DAY.
FIRST_DAY = 0
# One model is trained for every (store, category) pair from these two lists.
STORES = [
    'CA_1', 'CA_2', 'CA_3', 'CA_4',
    'TX_1', 'TX_2', 'TX_3',
    'WI_1', 'WI_2', 'WI_3',
]
CATEGORIES = ['HOBBIES', 'HOUSEHOLD', 'FOODS']

# --- columns that are never fed to the model --------------------------------
# Each model sees a single store and a single category, so store_id and cat_id
# are constant within its training data and carry no signal.
# dept_id still varies inside a category, so it is kept as a feature.
unused_features = [
    'id',
    'state_id',
    'store_id',
    'cat_id',
    # 'dept_id',
    'date',
    'wm_yr_wk',
    'd',
    'sales',
    'revenue',
]

# --- data --------------------------------------------------------------------
# Pre-processed frame produced elsewhere and cached as a pickle.
process_df = pd.read_pickle('processed.pkl')

# Training features = every column of process_df not listed in unused_features
# (kept as a pandas Index, in the frame's own column order).
column_is_unused = process_df.columns.isin(unused_features)
used_features = process_df.columns[~column_is_unused]

def prepare_data(df, store, category):
    """Select the rows of ``df`` that belong to one store/category pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``'d'``, ``'store_id'`` and ``'cat_id'``.
    store : value compared for equality against ``df['store_id']``.
    category : value compared for equality against ``df['cat_id']``.

    Returns
    -------
    pandas.DataFrame
        The rows where ``d >= FIRST_DAY`` and both ids match. ``FIRST_DAY`` is
        the module-level constant, looked up when the function is called.
        The input frame itself is not modified.
    """
    recent_enough = df['d'] >= FIRST_DAY
    same_store = df['store_id'] == store
    same_category = df['cat_id'] == category
    return df[recent_enough & same_store & same_category]

# LightGBM training configuration shared by every store/category model.
params = dict(
    boosting_type='gbdt',
    metric='rmse',              # evaluation metric reported during training
    objective='regression',
    n_jobs=-1,
    seed=5013,                  # fixed seed so runs are repeatable
    learning_rate=0.01,
    bagging_fraction=0.75,      # row subsampling ratio ...
    bagging_freq=10,            # ... applied every 10 iterations
    colsample_bytree=0.75,      # column subsampling ratio
    force_col_wise=True,
)

# Directory the trained models are written to.
model_dir = './models_store_cat/'

In [8]:
# Train one LightGBM model per (store, category) pair and pickle each one into
# model_dir as '<store>_<category>.bin'.

# Time-based split: rows dated up to TRAIN_END_DATE are used for fitting, the
# rows after it up to and including VAL_END_DATE are used for validation.
TRAIN_END_DATE = '2016-03-27'
VAL_END_DATE = '2016-04-24'

# Create the output directory up front; without it the first pickle.dump
# fails with FileNotFoundError on a fresh checkout.
os.makedirs(model_dir, exist_ok=True)

for store in STORES:
    for category in CATEGORIES:
        print('starting:',store,category)

        # get train and val data
        train_df = prepare_data(process_df, store, category)
        x_train = train_df[train_df['date'] <= TRAIN_END_DATE]
        y_train = x_train['sales']
        x_val = train_df[(train_df['date'] > TRAIN_END_DATE) & (train_df['date'] <= VAL_END_DATE)]
        y_val = x_val['sales']

        # create train and val dataset (only the feature columns are passed in)
        lgb_train = lgb.Dataset(x_train[used_features], y_train)
        lgb_val = lgb.Dataset(x_val[used_features], y_val)

        # model training: up to 2500 rounds, early stopping after 50 rounds
        # without improvement, metrics for both sets logged every 100 rounds.
        # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
        # arguments that LightGBM 4.0 removed from `lgb.train`; when upgrading,
        # pass `callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]`.
        model = lgb.train(params, lgb_train, num_boost_round = 2500, early_stopping_rounds = 50,
                          valid_sets = [lgb_train, lgb_val], verbose_eval = 100
                         )

        # store model; the context manager guarantees the file is flushed and
        # closed even if pickling raises.
        model_name = store+'_'+category+'.bin'
        with open(os.path.join(model_dir, model_name), 'wb') as model_file:
            pickle.dump(model, model_file)

        # free the per-iteration frames before loading the next pair
        del train_df, x_train, y_train, x_val, y_val, lgb_train, lgb_val
        gc.collect()

starting: CA_1 HOBBIES
[LightGBM] [Info] Total Bins 3743
[LightGBM] [Info] Number of data points in the train set: 1065025, number of used features: 31
[LightGBM] [Info] Start training from score 0.809019
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 2.34958	valid_1's rmse: 2.25166
[200]	training's rmse: 2.2516	valid_1's rmse: 2.16561
[300]	training's rmse: 2.22005	valid_1's rmse: 2.14872
[400]	training's rmse: 2.20136	valid_1's rmse: 2.1454
[500]	training's rmse: 2.18634	valid_1's rmse: 2.14447
Early stopping, best iteration is:
[520]	training's rmse: 2.18386	valid_1's rmse: 2.14307
starting: CA_1 HOUSEHOLD
[LightGBM] [Info] Total Bins 4012
[LightGBM] [Info] Number of data points in the train set: 1973595, number of used features: 31
[LightGBM] [Info] Start training from score 0.715687
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 1.25284	valid_1's rmse: 1.54144
[200]	training's rmse: 1.17451	valid_1's rmse: 1