In [1]:
import os, pickle, sys, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
import sklearn
from sklearn.linear_model import LinearRegression
import lightgbm as lgbm

In [2]:
# Report the name and version of each core library used by this notebook.
for module in (np, pd, sklearn, lgbm):
    print(f"{module.__name__} {module.__version__}")

numpy 1.18.1
pandas 1.0.1
sklearn 0.22.1
lightgbm 2.3.1


# Load Training Data

In [3]:
# Read the training and validation data from the HDF5 files under ../CleanData
# (each file stores its data under the key 'df').
train_df, valid_df = (
    pd.read_hdf(path, 'df')
    for path in ('../CleanData/trainDF.h5', '../CleanData/validDF.h5')
)

In [4]:
# Stack the training rows and the validation rows into a single frame with a
# fresh 0..n-1 index.  Together they constitute all the data available for training.
train_valid_df = pd.concat(
    (train_df, valid_df),
    sort=False,
    ignore_index=True,
)

In [5]:
# Pull the 'target' column out of the validation frame and out of the combined
# training + validation frame.
Y_valid, Y_train_valid = (
    frame['target'] for frame in (valid_df, train_valid_df)
)

# Light GBM

We will train LightGBM on the entire training data (the training and validation sets combined).

In [6]:
# Fit the final LightGBM regressor on the combined training + validation rows.
# No prediction is made in this cell; the fitted booster is kept in
# `lgbm_model_full` and the per-round metrics in `eval_metrics_full`.

# Input columns, in the order they are handed to LightGBM.
lgbm_features = [
    # Five lagged columns for each of the lags 1-6 and 12.
    'avg_item_price_lag_1', 'target_lag_1', 'target_shop_lag_1', 'target_item_lag_1',
    'target_item_category_lag_1',
    'avg_item_price_lag_2', 'target_lag_2', 'target_shop_lag_2', 'target_item_lag_2',
    'target_item_category_lag_2',
    'avg_item_price_lag_3', 'target_lag_3', 'target_shop_lag_3', 'target_item_lag_3',
    'target_item_category_lag_3',
    'avg_item_price_lag_4', 'target_lag_4', 'target_shop_lag_4', 'target_item_lag_4',
    'target_item_category_lag_4',
    'avg_item_price_lag_5', 'target_lag_5', 'target_shop_lag_5', 'target_item_lag_5',
    'target_item_category_lag_5',
    'avg_item_price_lag_6', 'target_lag_6', 'target_shop_lag_6', 'target_item_lag_6',
    'target_item_category_lag_6',
    'avg_item_price_lag_12', 'target_lag_12', 'target_shop_lag_12', 'target_item_lag_12',
    'target_item_category_lag_12',
    # The '*_mean' columns and the month column.
    'shop_mean', 'item_mean', 'shop_item_mean', 'item_category_mean',
    'month',
]

# categorical_feature is not passed, so it stays at LightGBM's default.
lgbm_train_data = lgbm.Dataset(train_valid_df[lgbm_features], label=Y_train_valid,
                               feature_name=lgbm_features)
# NOTE(review): valid_df is also contained in train_valid_df, so the 'valid'
# score below is computed on rows the model is fitted on.  It is not a held-out
# estimate, and early stopping on it is unlikely to trigger -- confirm this is
# intended for the final fit.
lgbm_valid_data = lgbm.Dataset(valid_df[lgbm_features], label=Y_valid,
                               feature_name=lgbm_features)

params = {
    'objective': 'regression',
    'metric': ['rmse'],
    'boosting_type': 'gbdt',
    'num_rounds': 100,            # number of boosting rounds
    'eta': 0.2,                   # learning rate
    'max_depth': 5,
    'min_data_in_leaf': 150,
    'min_gain_to_split': 0.01,
    'feature_fraction': 0.7,
    'bagging_freq': 0,            # 0 turns bagging off
    'bagging_fraction': 1.0,
    'lambda_l1': 0,
    'lambda_l2': 0.001,
    'early_stopping_round': 20,
    'verbosity': 1,
}

# Filled in by lgbm.train with the metric history of every entry in valid_sets.
eval_metrics_full = {}

# perf_counter is monotonic, so the elapsed time cannot be distorted by
# system clock adjustments the way time.time() differences can.
start = time.perf_counter()
lgbm_model_full = lgbm.train(params, lgbm_train_data, valid_sets=[lgbm_train_data, lgbm_valid_data],
                             valid_names=['train', 'valid'], evals_result=eval_metrics_full, verbose_eval=True)
end = time.perf_counter()
# Elapsed training time in seconds.
print(end - start)



[1]	train's rmse: 1.10888	valid's rmse: 1.02024
Training until validation scores don't improve for 20 rounds
[2]	train's rmse: 1.04434	valid's rmse: 0.967287
[3]	train's rmse: 0.999699	valid's rmse: 0.930433
[4]	train's rmse: 0.968809	valid's rmse: 0.904698
[5]	train's rmse: 0.935099	valid's rmse: 0.877754
[6]	train's rmse: 0.919503	valid's rmse: 0.865431
[7]	train's rmse: 0.895009	valid's rmse: 0.844792
[8]	train's rmse: 0.881292	valid's rmse: 0.832516
[9]	train's rmse: 0.869861	valid's rmse: 0.822814
[10]	train's rmse: 0.863269	valid's rmse: 0.816816
[11]	train's rmse: 0.855875	valid's rmse: 0.810763
[12]	train's rmse: 0.850413	valid's rmse: 0.806096
[13]	train's rmse: 0.847366	valid's rmse: 0.803608
[14]	train's rmse: 0.843692	valid's rmse: 0.801739
[15]	train's rmse: 0.841116	valid's rmse: 0.799824
[16]	train's rmse: 0.838802	valid's rmse: 0.797982
[17]	train's rmse: 0.83049	valid's rmse: 0.791654
[18]	train's rmse: 0.828784	valid's rmse: 0.790664
[19]	train's rmse: 0.82424	valid's

In [7]:
# Persist the fitted LightGBM model to disk as a pickle file.
filename = "../Model/lgbm_model_full.pkl"
with open(filename, mode='wb') as model_file:
    pickle.dump(lgbm_model_full, model_file)

# Linear Model

We will train a linear model on the entire training data, using only the features selected by ElasticNetCV.

In [8]:
# Fit a linear regression on the combined training + validation rows, using
# only the columns listed in `lr_features`.
lr_features = [
    'target_lag_1',
    'target_lag_2',
    'target_lag_3',
    'target_lag_4',
    'target_lag_5',
    'target_lag_6',
    'shop_mean',
    'item_mean',
    'shop_item_mean',
    'item_category_mean',
]
lr_model_full = LinearRegression(n_jobs=-1, normalize=True)
# Kept as the last expression so the cell displays the fitted estimator.
lr_model_full.fit(X=train_valid_df[lr_features], y=Y_train_valid)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=True)

In [9]:
# Persist the fitted linear regression model to disk as a pickle file.
filename = "../Model/lr_model_full.pkl"
with open(filename, mode='wb') as model_file:
    pickle.dump(lr_model_full, model_file)