In [1]:
import os, sys
import joblib
import importlib

import pandas as pd
import polars as pl
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import dproc, sgml, sgutil, sgpp, custpp

import lightgbm as lgb
import catboost as cb
import xgboost as xgb

print(sys.version)
for i in [pd, pl, mpl, sns, np, lgb, cb, xgb]:
    try:
        print(i.__name__, i.__version__)
    except:
        pass

3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]
pandas 2.2.3
polars 1.12.0
matplotlib 3.8.4
seaborn 0.13.2
numpy 1.26.4
catboost 1.2.5
xgboost 2.1.2


In [2]:
df_train = pd.read_parquet(os.path.join('data', 'train.parquet'))
df_test = pd.read_parquet(os.path.join('data', 'test.parquet'))
target = 'num_sold'

In [3]:
df_train.loc[df_train['product'] == 'Kerneler Dark Mode', target] = df_train.loc[df_train['product'] == 'Kerneler Dark Mode', target].fillna(5)
df_train.loc[(df_train['product'] == 'Kerneler'), target] = \
    df_train.loc[(df_train['product'] == 'Kerneler')].groupby(['country', 'store'], observed = False)[target].transform(
        lambda x: x.interpolate()
    )
df_ratio_product = df_train.loc[~df_train['country'].isin(['Canada', 'Kenya'])].pipe(
    lambda x: x.pivot_table(index = x['days'], columns = x['product'], values=target, aggfunc = 'sum', observed = True)
).pipe(
    lambda x: x.divide(x.sum(axis=1), axis = 0)
)
df_ratio_country = df_train.loc[df_train['product'] != 'Holographic Goose'].pipe(
    lambda x: x.pivot_table(index = x['days'], columns = x['country'], values=target, aggfunc = 'sum', observed = True)
).pipe(
    lambda x: x.divide(x.sum(axis=1), axis = 0)
)
df_ratio_store = df_train.loc[(df_train['product'] != 'Holographic Goose') & ~df_train['country'].isin(['Canada', 'Kenya'])].pipe(
    lambda x: x.pivot_table(index = x['days'], columns = x['store'], values=target, aggfunc = 'sum', observed = True)
).pipe(
    lambda x: x.divide(x.sum(axis=1), axis = 0)
)
X_key = ['country', 'store', 'product']
df_ratio = pd.DataFrame(
    0,
    columns = pd.MultiIndex.from_product(df_train[X_key].apply(lambda x: x.unique().tolist())),
    index = df_train['days'].unique()
).apply(
    lambda x: df_ratio_country.loc[:, x.name[0]] * df_ratio_store.loc[:, x.name[1]] * df_ratio_product.loc[:, x.name[2]]
)
df_ratio_notna = pd.DataFrame(
    df_ratio.values * df_train.assign(target_notna = lambda x: x[target].notna()).pivot(
        index = 'days', columns = X_key, values = 'target_notna'
    ).values, index = df_ratio.index, columns = df_ratio.columns
)
s_est_daily_sum = df_train.groupby('days')[target].sum() * 1 / df_ratio_notna.sum(axis = 1)

df_train.loc[df_train[target].isna(), target] = df_train.loc[df_train[target].isna()].apply(
    lambda x: df_ratio.loc[x['days'], tuple(x[X_key])] * s_est_daily_sum.loc[x['days']], axis = 1
)

# Config

In [54]:
from sklearn.model_selection import GroupShuffleSplit, train_test_split, GroupKFold
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression

def split(x, splitter, groups):
    for train_idx, valid_idx in splitter.split(x, x[target], groups = groups):
        return x.iloc[train_idx], x.iloc[valid_idx]

def get_validation_splitter(validation_fraction):
    gss = GroupShuffleSplit(n_splits = 1, train_size = 0.7, random_state = 123)
    return lambda x: split(x, gss, x['md'])

config = {
    'predict_func': lambda m, df, X: pd.Series(m.predict(df[X]), index = df.index),
    'score_func': lambda df, prds: -mean_absolute_percentage_error(df[target].sort_index(), prds.sort_index()),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(), 
    'return_train_scores': True,
    'groups': 'year',
    'y': 'num_sold',
    'target_func': lambda a, b: np.log(b),
    'target_invfunc': lambda b: np.exp(b)
}

config2 = config.copy()
config2['groups'] = 'year'

gss = GroupShuffleSplit(n_splits = 1, random_state = 123)
gkf = GroupKFold(n_splits = 4)

cb_adapter = sgml.CBAdapter(cb.CatBoostRegressor)
lr_adapter = sgml.SklearnAdapter(LinearRegression)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMRegressor)
xgb_adapter = sgml.XGBAdapter(xgb.XGBRegressor)

In [98]:
cb1 = sgml.CVModel('model', 'cb1', gkf, config, cb_adapter).load_if_exists()
hparams = {
    'model_params': {'n_estimators': 3500, 'learning_rate': 0.1},
    'X_num': ['c1', 's1', 'c2', 's2', 'c3', 's3', 'c4', 's4', 'gdp'],
    'X_cat': ['weekday', 'country', 'store', 'product', 'holiday', 'month'],
    #'validation_fraction': 0.2
}
#result = cb1.adhoc(df_train, gss, hparams, task_type = 'GPU')
result = cb1.cv(df_train, hparams, task_type = 'GPU')
result['valid_scores'], np.mean(result['valid_scores']), result['train_scores']

([-0.07707171667778806,
  -0.07639860363226426,
  -0.12270632691844975,
  -0.061658447882939595],
 -0.08445877377786042,
 [-0.03954196856812238,
  -0.03976512165652241,
  -0.03949761084862665,
  -0.03950239157302807])

In [56]:
lgb1 = sgml.CVModel('model', 'lgb1', gkf, config, lgb_adapter).load_if_exists()
hparams = {
    'model_params': {'n_estimators': 2000, 'num_leaves': 31, 'learning_rate': 0.1},
    'X_num': ['c1', 's1', 'c2', 's2', 'c3', 's3', 'c4', 's4', 'gdp'],
    'X_cat': ['weekday', 'country', 'store', 'product', 'holiday', 'month'],
    #'validation_fraction': 0.2
}
#result = lgb1.adhoc(df_train, gss, hparams)
result = lgb1.cv(df_train, hparams)
result['valid_scores'], np.mean(result['valid_scores']), result['train_scores']

Fold:   0%|          | 0/4 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

([-0.06547458969931848,
  -0.05885628804165214,
  -0.12174519927913535,
  -0.061384096873470205],
 -0.07686504347339404,
 [-0.036971618145682394,
  -0.03761374668350337,
  -0.037131498773354615,
  -0.03712605005298313])

In [57]:
xgb1 = sgml.CVModel('model', 'xgb1', gkf, config, xgb_adapter).load_if_exists()
hparams = {
    'model_params': {'n_estimators': 1500, 'max_depth': 7, 'learning_rate': 0.06},
    'X_num': ['c1', 's1', 'c2', 's2', 'c3', 's3', 'c4', 's4', 'gdp'],
    'X_ohe': ['weekday', 'country', 'store', 'product', 'holiday', 'month'], 'ohe': {'drop': 'if_binary'}, 
    #'validation_fraction': 0.1
}
#result = xgb1.adhoc(df_train, gss, hparams, device = 'cuda')
result = xgb1.cv(df_train, hparams, device = 'cuda')
result['valid_scores'], np.mean(result['valid_scores']), result['train_scores']

Fold:   0%|          | 0/4 [00:00<?, ?it/s]

Round:   0%|          | 0/1500 [00:00<?, ?it/s]

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Round:   0%|          | 0/1500 [00:00<?, ?it/s]

Round:   0%|          | 0/1500 [00:00<?, ?it/s]

Round:   0%|          | 0/1500 [00:00<?, ?it/s]

([-0.080923505, -0.07056465, -0.073136695, -0.08051474],
 -0.07628489,
 [-0.03594484, -0.036640495, -0.035900187, -0.035837043])

In [58]:
lr1 = sgml.CVModel('model', 'lr1', gkf, config, lr_adapter).load_if_exists()
hparams = {
    'model_params': {},
    'X_mm': ['gdp'],
    'X_ohe': ['weekday', 'country', 'store', 'product', 'holiday'], 'ohe': {'drop': 'first'},
    'X_pre': ['year', 'md'], 'X_pre_out': ['product_ratio'],
    'preprocessor': custpp.RatioEncoder()
}
result = lr1.cv(df_train, hparams)
result['valid_scores'], np.mean(result['valid_scores']), result['train_scores']

Fold:   0%|          | 0/4 [00:00<?, ?it/s]

([-0.11776300898143265,
  -0.0706493914643913,
  -0.08317018225569826,
  -0.08782677201028768],
 -0.08985233867795248,
 [-0.07490570191990986,
  -0.08828802832709043,
  -0.08739964870898849,
  -0.08599619494530683])

In [59]:
lgb2 = sgml.CVModel('model', 'lgb2', gkf, config, lgb_adapter).load_if_exists()
hparams = {
    'model_params': {'n_estimators': 4000, 'num_leaves': 15, 'learning_rate': 0.1},
    'X_num': ['c1', 's1', 'c2', 's2', 'c3', 's3', 'c4', 's4', 'gdp'],
    'X_cat': ['weekday', 'country', 'store', 'product', 'holiday', 'month'],
    #'validation_fraction': 0.2
}
#result = lgb1.adhoc(df_train, gss, hparams)
result = lgb2.cv(df_train, hparams)
result['valid_scores'], np.mean(result['valid_scores']), result['train_scores']

Fold:   0%|          | 0/4 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

([-0.06507281551682419,
  -0.05790313008619813,
  -0.11475737963923671,
  -0.060818140302467114],
 -0.07463786638618154,
 [-0.03749063873065157,
  -0.037962676726079586,
  -0.03763584301590932,
  -0.0375189931039882])

In [75]:
xgb2 = sgml.CVModel('model', 'xgb2', gkf, config, xgb_adapter).load_if_exists()
hparams = {
    'model_params': {'n_estimators': 1500, 'max_depth': 6, 'learning_rate': 0.06},
    'X_num': ['c1', 's1', 'c2', 's2', 'c3', 's3', 'c4', 's4', 'gdp'],
    'X_ohe': ['weekday', 'country', 'store', 'product', 'holiday', 'month'], 'ohe': {'drop': 'if_binary'}, 
    #'validation_fraction': 0.2
}
#result = xgb2.adhoc(df_train, gss, hparams, device = 'cuda')
result = xgb2.cv(df_train, hparams, device = 'cuda')
result['valid_scores'], np.mean(result['valid_scores']), result['train_scores']

([-0.080314144, -0.07050299, -0.0734187, -0.08072272],
 -0.07623964,
 [-0.03822202, -0.03868175, -0.03817244, -0.038157128])

In [109]:
cb2 = sgml.CVModel('model', 'cb2', gkf, config, cb_adapter).load_if_exists()
hparams = {
    'model_params': {'n_estimators': 3000, 'learning_rate': 0.1, 'max_depth': 7},
    'X_num': ['c1', 's1', 'c2', 's2', 'c3', 's3', 'c4', 's4', 'gdp'],
    'X_cat': ['weekday', 'country', 'store', 'product', 'holiday', 'month'],
    #'validation_fraction': 0.2
}
#result = cb2.adhoc(df_train, gss, hparams, task_type = 'GPU')
result = cb2.cv(df_train, hparams, task_type = 'GPU')
result['valid_scores'], np.mean(result['valid_scores']), result['train_scores']

Fold:   0%|          | 0/1 [00:00<?, ?it/s]

([-0.10038598925657231], -0.10038598925657231, [-0.03727272458091318])

In [130]:
lgb3 = sgml.CVModel('model', 'lgb3', gkf, config, lgb_adapter).load_if_exists()
hparams = {
    'model_params': {'n_estimators': 5000, 'num_leaves': 7, 'learning_rate': 0.1},
    'X_num': ['c1', 's1', 'c2', 's2', 'c3', 's3', 'c4', 's4', 'gdp'],
    'X_cat': ['weekday', 'country', 'store', 'product', 'holiday', 'month'],
    #'validation_fraction': 0.2
}
#result = lgb3.adhoc(df_train, gss, hparams)
result = lgb3.cv(df_train, hparams)
result['valid_scores'], np.mean(result['valid_scores']), result['train_scores']

Fold:   0%|          | 0/4 [00:00<?, ?it/s]

Round:   0%|          | 0/5000 [00:00<?, ?it/s]

Round:   0%|          | 0/5000 [00:00<?, ?it/s]

Round:   0%|          | 0/5000 [00:00<?, ?it/s]

Round:   0%|          | 0/5000 [00:00<?, ?it/s]

([-0.06496329504069424,
  -0.055323346483212736,
  -0.10645387787346779,
  -0.059629028996697094],
 -0.07159238709851797,
 [-0.039431401929232106,
  -0.03984736449031836,
  -0.03952141978897335,
  -0.03949263236835982])

In [147]:
lgb4 = sgml.CVModel('model', 'lgb4', gkf, config, lgb_adapter).load_if_exists()
hparams = {
    'model_params': {'n_estimators': 2000, 'num_leaves': 63, 'learning_rate': 0.01},
    'X_num': ['c1', 's1', 'c2', 's2', 'c3', 's3', 'c4', 's4', 'gdp'],
    'X_cat': ['weekday', 'country', 'store', 'product', 'holiday', 'month'],
    #'validation_fraction': 0.2
}
#result = lgb4.adhoc(df_train, gss, hparams)
result = lgb4.cv(df_train, hparams)
result['valid_scores'], np.mean(result['valid_scores']), result['train_scores']

Fold:   0%|          | 0/4 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

([-0.06618888932111208,
  -0.05853069985198123,
  -0.12583228208486702,
  -0.06288189177930822],
 -0.07835844075931714,
 [-0.040794537883216735,
  -0.041283370501131955,
  -0.041219040615845304,
  -0.04097670508350978])

In [160]:
xgb3 = sgml.CVModel('model', 'xgb3', gkf, config, xgb_adapter).load_if_exists()
hparams = {
    'model_params': {'n_estimators': 3000, 'max_depth': 5, 'learning_rate': 0.06},
    'X_num': ['c1', 's1', 'c2', 's2', 'c3', 's3', 'c4', 's4', 'gdp'],
    'X_ohe': ['weekday', 'country', 'store', 'product', 'holiday', 'month'], 'ohe': {'drop': 'if_binary'}, 
    #'validation_fraction': 0.2
}
#result = xgb3.adhoc(df_train, gss, hparams, device = 'cuda')
result = xgb3.cv(df_train, hparams, device = 'cuda')
result['valid_scores'], np.mean(result['valid_scores']), result['train_scores']

Fold:   0%|          | 0/4 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

([-0.08033634, -0.06972629, -0.073387295, -0.08019728],
 -0.0759118,
 [-0.038571253, -0.038944796, -0.038486548, -0.038461655])

In [185]:
xgb4 = sgml.CVModel('model', 'xgb4', gkf, config, xgb_adapter).load_if_exists()
hparams = {
    'model_params': {'n_estimators': 4000, 'max_depth': 4, 'learning_rate': 0.08},
    'X_num': ['c1', 's1', 'c2', 's2', 'c3', 's3', 'c4', 's4', 'gdp'],
    'X_ohe': ['weekday', 'country', 'store', 'product', 'holiday', 'month'], 'ohe': {'drop': 'if_binary'}, 
    #'validation_fraction': 0.2
}
#result = xgb4.adhoc(df_train, gss, hparams, device = 'cuda')
result = xgb4.cv(df_train, hparams, device = 'cuda')
result['valid_scores'], np.mean(result['valid_scores']), result['train_scores']

Fold:   0%|          | 0/4 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

([-0.07786964, -0.071193196, -0.06879352, -0.08099143],
 -0.07471195,
 [-0.03937829, -0.03971551, -0.039292816, -0.039254356])

In [201]:
xgb5 = sgml.CVModel('model', 'xgb5', gkf, config, xgb_adapter).load_if_exists()
hparams = {
    'model_params': {'n_estimators': 5000, 'max_depth': 3, 'learning_rate': 0.08},
    'X_num': ['c1', 's1', 'c2', 's2', 'c3', 's3', 'c4', 's4', 'gdp'],
    'X_ohe': ['weekday', 'country', 'store', 'product', 'holiday', 'month'], 'ohe': {'drop': 'if_binary'}, 
    'validation_fraction': 0.2
}
#result = xgb5.adhoc(df_train, gss, hparams, device = 'cuda')
result = xgb5.cv(df_train, hparams, device = 'cuda')
result['valid_scores'], np.mean(result['valid_scores']), result['train_scores']

Fold:   0%|          | 0/4 [00:00<?, ?it/s]

Round:   0%|          | 0/5000 [00:00<?, ?it/s]

Round:   0%|          | 0/5000 [00:00<?, ?it/s]

Round:   0%|          | 0/5000 [00:00<?, ?it/s]

Round:   0%|          | 0/5000 [00:00<?, ?it/s]

([-0.08061035, -0.0739915, -0.06982216, -0.08018472],
 -0.07615218,
 [-0.041126084, -0.04179851, -0.04102442, -0.04103463])

In [188]:
models = [cb1, lgb1, xgb5, lr1, lgb2, xgb2, lgb3, xgb3, xgb4]
df_stk = sgml.stack_cv(models, df_train[target])
df_stk.corr()

Unnamed: 0,cb1,lgb1,xgb1,lr1,lgb2,xgb2,lgb3,xgb3,xgb4,num_sold
cb1,1.0,0.998186,0.992317,0.98031,0.998061,0.992128,0.998168,0.99246,0.992518,0.988179
lgb1,0.998186,1.0,0.992722,0.983245,0.999457,0.992517,0.999093,0.9925,0.992763,0.989052
xgb1,0.992317,0.992722,1.0,0.988075,0.993197,0.999676,0.99395,0.999601,0.999499,0.989678
lr1,0.98031,0.983245,0.988075,1.0,0.983855,0.987874,0.984686,0.987587,0.988425,0.986529
lgb2,0.998061,0.999457,0.993197,0.983855,1.0,0.993152,0.999465,0.993109,0.993381,0.989551
xgb2,0.992128,0.992517,0.999676,0.987874,0.993152,1.0,0.993856,0.99967,0.999528,0.989653
lgb3,0.998168,0.999093,0.99395,0.984686,0.999465,0.993856,1.0,0.993964,0.994252,0.990752
xgb3,0.99246,0.9925,0.999601,0.987587,0.993109,0.99967,0.993964,1.0,0.999609,0.989626
xgb4,0.992518,0.992763,0.999499,0.988425,0.993381,0.999528,0.994252,0.999609,1.0,0.989946
num_sold,0.988179,0.989052,0.989678,0.986529,0.989551,0.989653,0.990752,0.989626,0.989946,1.0


In [209]:
weights = [0.1, 0.05, 0.05, 0.2, 0.1, 0.05, 0.3, 0.05, 0.1]
mean_absolute_percentage_error(
    df_stk.iloc[:, :-1].dot(weights), df_stk[target]
)

0.06506546575400698

In [206]:
for i in models:
    if i.name.startswith('cb'):
        i.train(df_train, task_type = 'GPU')
    elif i.name.startswith('xgb'):
        i.train(df_train, device = 'cuda')
    else:
        i.train(df_train)

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/1500 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/1500 [00:00<?, ?it/s]

Round:   0%|          | 0/5000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

In [207]:
df_stk_prd = sgml.stack_prd(models, df_test, config)

In [210]:
np.exp(df_stk_prd.dot(weights)).rename('num_sold').to_frame().to_csv(
    os.path.join('result', 'submission9.csv')
)