In [1]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from itertools import product
from lightgbm import LGBMRegressor
from sklearn.preprocessing import StandardScaler
import warnings
import json
warnings.filterwarnings("ignore")

def RMSE(series_true, series_predicted):
    """Return the root-mean-squared error between targets and predictions.

    Both inputs are converted to NumPy float arrays before subtracting, so
    values are compared positionally. Subtracting two pandas Series directly
    aligns them on index labels, which yields NaN when the labels differ
    (for example a filtered target versus a freshly indexed prediction).

    Parameters
    ----------
    series_true : array-like
        Observed target values (Series, ndarray, list, ...).
    series_predicted : array-like
        Predicted values, broadcastable against ``series_true``.

    Returns
    -------
    float
        ``sqrt(mean((true - predicted) ** 2))``; NaN when there are no
        samples, because the mean error of nothing is undefined.
    """
    y_true = np.asarray(series_true, dtype=float)
    y_pred = np.asarray(series_predicted, dtype=float)
    squared_errors = (y_true - y_pred) ** 2
    if squared_errors.size == 0:
        return float("nan")
    # Vectorised mean instead of the builtin sum(), which iterates in Python.
    return float(np.sqrt(squared_errors.mean()))

In [2]:
# Read the feature table, the matching target column and the test features.
X_lags = pd.read_csv('data/X_without_target_encoding.csv')
# Only the `item_cnt` column of the target file is kept.
y_lags = pd.read_csv('data/y_without_target_encoding.csv')['item_cnt']
# The test file's `ID` column becomes the index rather than a feature.
X_test_lags = pd.read_csv(
    'data/test_without_target_encoding.csv',
    index_col='ID',
)

In [3]:
# Column -> dtype map used to downcast both feature tables.
# Every listed column is cast to int8 except `item_id`, which is int16.
types = dict.fromkeys(
    [
        'date_block_num',
        'shop_id',
        'item_id',
        'item_category_id',
        'super_category',
        'category',
        'city',
        'shop_type',
        'shop_name',
    ],
    'int8',
)
types['item_id'] = 'int16'
# Lag columns item_cnt_lag1 .. item_cnt_lag12 are cast to int8 as well.
types.update(dict.fromkeys((f'item_cnt_lag{lag}' for lag in range(1, 13)), 'int8'))

# Apply the same casts to the training and the test features.
X_lags = X_lags.astype(types)
X_test_lags = X_test_lags.astype(types)

In [4]:
# Identifier columns removed from both feature tables; listed once so the
# train and test frames cannot drift apart.
ID_COLUMNS = ['shop_id', 'item_id', 'item_category_id']

# Non-mutating drop with errors='ignore': re-running this cell after the
# columns are already gone is a no-op instead of a KeyError.
X_lags = X_lags.drop(columns=ID_COLUMNS, errors='ignore')
X_test_lags = X_test_lags.drop(columns=ID_COLUMNS, errors='ignore')

In [5]:
# Split on date_block_num: blocks below 33 are used for training,
# block 33 is held out for validation.
is_train_block = X_lags['date_block_num'] < 33
is_val_block = X_lags['date_block_num'] == 33

X_train, y_train = X_lags[is_train_block], y_lags[is_train_block]
X_val, y_val = X_lags[is_val_block], y_lags[is_val_block]

In [6]:
# The first six training columns are passed to LightGBM as categorical.
# NOTE(review): this is positional, so it depends on the CSV column order
# after the id columns were dropped — confirm the intended six names.
cat_features = list(X_train.columns[:6])

In [7]:
# Result stores for the grid search, keyed by hyper-parameter tuples.
train_scores, val_scores = dict(), dict()

In [9]:
# Grid search over learning rate x early-stopping patience at a fixed
# max_depth of 12. Each fit is early-stopped on the validation block and
# scored by RMSE on predictions clipped to [0, 20].
# NOTE(review): passing early_stopping_rounds / verbose to fit() depends on
# the installed LightGBM version — confirm before upgrading the library.
for learning_rate, early_stopping_rounds in product(
        [0.01, 0.05, 0.1, 0.5, 1], range(10, 91, 20)):
    lgbm = LGBMRegressor(n_estimators=1000, learning_rate=learning_rate, max_depth=12)
    fit_options = dict(
        eval_set=[(X_val, y_val)],
        eval_metric='l2',
        early_stopping_rounds=early_stopping_rounds,
        verbose=-1,
        feature_name=X_train.columns.to_list(),
        categorical_feature=cat_features,
    )
    lgbm.fit(X_train, y_train, **fit_options)

    # Same key in both dicts so the two score tables line up row by row.
    score_key = (12, learning_rate, early_stopping_rounds)
    train_scores[score_key] = RMSE(y_train, lgbm.predict(X_train).clip(0, 20))
    val_scores[score_key] = RMSE(y_val, lgbm.predict(X_val).clip(0, 20))

In [16]:
# Turn the {(max_depth, learning_rate, early_stopping_rounds): score} dict
# into a flat table: the tuple keys become columns via transpose + reset_index.
val_scores_df = (
    pd.DataFrame(val_scores, index=[1])
    .T
    .reset_index()
    .rename(
        columns={
            'level_0': 'max_depth',
            'level_1': 'learning_rate',
            'level_2': 'early_stopping_rounds',
            1: 'val_score',
        }
    )
    # max_depth is the same for every row of this grid, so it is dropped.
    .drop(columns='max_depth')
)

# Persist the validation scores.
val_scores_df.to_csv('data/val_scores.csv', index=None)

In [20]:
# Turn the {(max_depth, learning_rate, early_stopping_rounds): score} dict
# into a flat table: the tuple keys become columns via transpose + reset_index.
train_scores_df = (
    pd.DataFrame(train_scores, index=[1])
    .T
    .reset_index()
    .rename(
        columns={
            'level_0': 'max_depth',
            'level_1': 'learning_rate',
            'level_2': 'early_stopping_rounds',
            1: 'train_score',
        }
    )
    # max_depth is the same for every row of this grid, so it is dropped.
    .drop(columns='max_depth')
)

# Persist the training scores.
train_scores_df.to_csv('data/train_scores.csv', index=None)

In [21]:
# Show every configuration that ties for the lowest validation RMSE.
val_scores_df.loc[val_scores_df['val_score'].eq(val_scores_df['val_score'].min())]

Unnamed: 0,learning_rate,early_stopping_rounds,val_score
12,0.1,50,0.919165
13,0.1,70,0.919165
14,0.1,90,0.919165


In [22]:
# Show every configuration that ties for the lowest training RMSE.
train_scores_df.loc[train_scores_df['train_score'].eq(train_scores_df['train_score'].min())]

Unnamed: 0,learning_rate,early_stopping_rounds,train_score
12,0.1,50,0.829572
13,0.1,70,0.829572
14,0.1,90,0.829572
