In [2]:
import os, sys
import joblib
import importlib

import pandas as pd
import polars as pl
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import dproc, sgml, sgutil, sgpp

import lightgbm as lgb
import catboost as cb
import xgboost as xgb

print(sys.version)
for i in [pd, pl, mpl, sns, np, lgb, cb, xgb]:
    try:
        print(i.__name__, i.__version__)
    except:
        pass

3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]
pandas 2.2.3
polars 1.12.0
matplotlib 3.8.4
seaborn 0.13.2
numpy 1.26.4
catboost 1.2.5
xgboost 2.1.2


In [4]:
df_train = pd.read_parquet(os.path.join('data', 'train.parquet'))
df_test = pd.read_parquet(os.path.join('data', 'test.parquet'))

# Config

In [158]:
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, StratifiedKFold, GroupKFold
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression

target = 'num_sold'
def get_validation_splitter(validation_fraction):
    return lambda x: (x.loc[x['year'] != x['year'].max()], x.loc[x['year'] == x['year'].max()])

config = {
    'predict_func': lambda m, df, X: pd.Series(m.predict(df[X]), index = df.index),
    'score_func': lambda df, prds: mean_absolute_percentage_error(df[target].sort_index(), prds.sort_index()),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(), 
    'return_train_scores': True,
    'sp_y': 'md',
    'y': 'num_sold',
    'target_func': lambda a, b: np.log(b),
    'target_invfunc': lambda b: np.exp(b)
}

config2 = config.copy()
config2['groups'] = 'year'

ss = StratifiedShuffleSplit(n_splits = 1, train_size = 0.7, random_state = 123)
skf = StratifiedKFold(n_splits = 5, random_state = 123, shuffle = True)
gkf = GroupKFold(n_splits = 4)

cb_adapter = sgml.CBAdapter(cb.CatBoostRegressor)
lr_adapter = sgml.SklearnAdapter(LinearRegression)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMRegressor)
xgb_adapter = sgml.XGBAdapter(xgb.XGBRegressor)

In [11]:
cb1 = sgml.CVModel('model', 'cb1', skf, config, cb_adapter).load_if_exists()

In [31]:
hparams = {
    'model_params': {'n_estimators': 10000, 'max_depth': 5, 'learning_rate': 0.2},
    'X_num': ['c1', 's1', 'c2', 's2'],
    'X_cat': ['md', 'weekday', 'country', 'y_8', 'store', 'product'],
    #'validation_fraction': 0.1
}
#result = cb1.adhoc(df_train, ss, hparams, task_type = 'GPU')
result = cb1.cv(df_train, hparams, task_type = 'GPU')
result['valid_scores'], result['train_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/10000 [00:00<?, ?it/s]

Round:   0%|          | 0/10000 [00:00<?, ?it/s]

Round:   0%|          | 0/10000 [00:00<?, ?it/s]

Round:   0%|          | 0/10000 [00:00<?, ?it/s]

Round:   0%|          | 0/10000 [00:00<?, ?it/s]

([0.046593402960826175,
  0.04660733434506866,
  0.046515691123249873,
  0.04622979327412978,
  0.04648041662296091],
 [0.044391870955446444,
  0.044488168765938495,
  0.04441082117504539,
  0.044449248434999554,
  0.04458178182615379])

In [32]:
lgb1 = sgml.CVModel('model', 'lgb1', skf, config, lgb_adapter).load_if_exists()

In [38]:
hparams = {
    'model_params': {'n_estimators': 5000, 'num_leaves': 31, 'learning_rate': 0.1},
    'X_num': ['c1', 's1', 'c2', 's2'],
    'X_cat': ['md', 'weekday', 'country', 'y_8', 'store', 'product'],
    #'validation_fraction': 0.1
}
#result = lgb1.adhoc(df_train, ss, hparams)
result = lgb1.cv(df_train, hparams)
result['valid_scores'], result['train_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/5000 [00:00<?, ?it/s]

Round:   0%|          | 0/5000 [00:00<?, ?it/s]

Round:   0%|          | 0/5000 [00:00<?, ?it/s]

Round:   0%|          | 0/5000 [00:00<?, ?it/s]

Round:   0%|          | 0/5000 [00:00<?, ?it/s]

([0.04478281981112351,
  0.04451517845214932,
  0.04469590453764895,
  0.04446544273786029,
  0.04465349156512319],
 [0.03355373506821657,
  0.03339781633358599,
  0.0334232239172728,
  0.03362009033976975,
  0.033840076928441744])

In [39]:
xgb1 = sgml.CVModel('model', 'xgb1', skf, config, xgb_adapter).load_if_exists()

In [60]:
hparams = {
    'model_params': {'n_estimators': 10000, 'max_depth': 7, 'learning_rate': 0.06},
    'X_num': ['c1', 's1', 'c2', 's2'],
    'X_ohe': ['md', 'weekday', 'country', 'y_8', 'store', 'product'], 'ohe': {'drop': 'if_binary'}, 
    'validation_fraction': 0.1
}
#result = xgb1.adhoc(df_train, ss, hparams, device = 'cuda')
result = xgb1.cv(df_train, hparams, device = 'cuda')
result['valid_scores'], result['train_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/10000 [00:00<?, ?it/s]

Round:   0%|          | 0/10000 [00:00<?, ?it/s]

Round:   0%|          | 0/10000 [00:00<?, ?it/s]

Round:   0%|          | 0/10000 [00:00<?, ?it/s]

Round:   0%|          | 0/10000 [00:00<?, ?it/s]

([0.047309395, 0.04695019, 0.047162827, 0.0470137, 0.04714548],
 [0.030449303, 0.030454403, 0.030381193, 0.030523246, 0.030497976])

In [63]:
models = [cb1, lgb1, xgb1]
df_stk = sgml.stack_cv(models, df_train[target])
df_stk.corr()

Unnamed: 0,cb1,lgb1,xgb1,num_sold
cb1,1.0,0.999164,0.998997,0.996558
lgb1,0.999164,1.0,0.999464,0.996793
xgb1,0.998997,0.999464,1.0,0.996508
num_sold,0.996558,0.996793,0.996508,1.0


In [75]:
mean_absolute_percentage_error(
    df_stk.iloc[:, :-1].dot([0.3, 0.6, 0.1]), df_stk[target]
)

0.043748826690744605

In [76]:
for i in models:
    if i.name.startswith('cb'):
        i.train(df_train, task_type = 'GPU')
    elif i.name.startswith('xgb'):
        i.train(df_train, device = 'cuda')
    else:
        i.train(df_train)

Round:   0%|          | 0/5000 [00:00<?, ?it/s]

Round:   0%|          | 0/10000 [00:00<?, ?it/s]

In [79]:
df_stk_prd = sgml.stack_prd(models, df_test, config)

In [83]:
np.exp(df_stk_prd.dot([0.3, 0.6, 0.1])).rename('num_sold').to_frame().to_csv(
    os.path.join('result', 'submission6.csv')
)

In [None]:
#!kaggle competitions submit -c playground-series-s5e1 -f result/submission6.csv -m "6"

In [88]:
cb2 = sgml.CVModel('model', 'cb2', gkf, config, cb_adapter).load_if_exists()

In [None]:
hparams = {
    'model_params': {'n_estimators': 10000, 'max_depth': 5, 'learning_rate': 0.2},
    'X_num': ['c1', 's1', 'c2', 's2'],
    'X_cat': ['md', 'weekday', 'country', 'y_8', 'store', 'product'],
    'validation_fraction': 0.1
}
result = cb1.adhoc(df_train, ss, hparams, task_type = 'GPU')
#result = cb1.cv(df_train, hparams, task_type = 'GPU')
result['valid_scores'], result['train_scores']

In [166]:
lgb2 = sgml.CVModel('model', 'lgb2', gkf, config2, lgb_adapter).load_if_exists()

In [169]:
hparams = {
    'model_params': {'n_estimators': 1000, 'num_leaves': 15, 'learning_rate': 0.01},
    'X_num': ['c1', 's1', 'c2', 's2'],
    'X_cat': ['md', 'weekday', 'country', 'y_8', 'store', 'product'],
}
result = lgb2.cv(df_train, hparams)
result['valid_scores'], result['train_scores'], np.mean(result['valid_scores'])

Fold:   0%|          | 0/4 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

([0.11726733098874209,
  0.0759907578038341,
  0.10495365385047431,
  0.08494575860302575],
 [0.06089597985639476,
  0.07499552392011985,
  0.07333114574578933,
  0.07212217862458374],
 0.09578937531151907)

In [178]:
lgb3 = sgml.CVModel('model', 'lgb3', gkf, config2, lgb_adapter).load_if_exists()

In [179]:
hparams = {
    'model_params': {'n_estimators': 1000, 'num_leaves': 31, 'learning_rate': 0.01},
    'X_num': ['c1', 's1', 'c2', 's2'],
    'X_cat': ['md', 'weekday', 'country', 'y_8', 'store', 'product'],
}
result = lgb3.cv(df_train, hparams)
result['valid_scores'], result['train_scores'], np.mean(result['valid_scores'])

Fold:   0%|          | 0/4 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

([0.11721972521785907,
  0.07379711340448902,
  0.10952981732481652,
  0.08393177973318464],
 [0.05105150392618727,
  0.06295748221564847,
  0.06312702675871551,
  0.06117127754788103],
 0.09611960892008731)

In [182]:
models = [lgb2, lgb3]
df_stk = sgml.stack_cv(models, df_train[target])
df_stk.corr()

Unnamed: 0,lgb2,lgb3,num_sold
lgb2,1.0,0.997257,0.987763
lgb3,0.997257,1.0,0.990815
num_sold,0.987763,0.990815,1.0


In [190]:
mean_absolute_percentage_error(
    df_stk[target], df_stk.iloc[:, :-1].mean(axis=1)
)

0.09959427359451899