In [71]:
import os, sys
import joblib
import importlib

import pandas as pd
import polars as pl
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import dproc, sgml, sgutil, sgpp, custpp

import lightgbm as lgb
import catboost as cb
import xgboost as xgb

print(sys.version)
for i in [pd, pl, mpl, sns, np, lgb, cb, xgb]:
    try:
        print(i.__name__, i.__version__)
    except:
        pass

3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]
pandas 2.2.3
polars 1.12.0
matplotlib 3.8.4
seaborn 0.13.2
numpy 1.26.4
catboost 1.2.5
xgboost 2.1.2


In [5]:
df_train = pd.read_parquet(os.path.join('data', 'train.parquet'))
df_test = pd.read_parquet(os.path.join('data', 'test.parquet'))
target = 'num_sold'

In [6]:
df_train.loc[df_train['product'] == 'Kerneler Dark Mode', target] = df_train.loc[df_train['product'] == 'Kerneler Dark Mode', target].fillna(5)
df_train.loc[(df_train['product'] == 'Kerneler'), target] = \
    df_train.loc[(df_train['product'] == 'Kerneler')].groupby(['country', 'store'], observed = False)[target].transform(
        lambda x: x.interpolate()
    )
df_ratio_product = df_train.loc[~df_train['country'].isin(['Canada', 'Kenya'])].pipe(
    lambda x: x.pivot_table(index = x['days'], columns = x['product'], values=target, aggfunc = 'sum', observed = True)
).pipe(
    lambda x: x.divide(x.sum(axis=1), axis = 0)
)
df_ratio_country = df_train.loc[df_train['product'] != 'Holographic Goose'].pipe(
    lambda x: x.pivot_table(index = x['days'], columns = x['country'], values=target, aggfunc = 'sum', observed = True)
).pipe(
    lambda x: x.divide(x.sum(axis=1), axis = 0)
)
df_ratio_store = df_train.loc[(df_train['product'] != 'Holographic Goose') & ~df_train['country'].isin(['Canada', 'Kenya'])].pipe(
    lambda x: x.pivot_table(index = x['days'], columns = x['store'], values=target, aggfunc = 'sum', observed = True)
).pipe(
    lambda x: x.divide(x.sum(axis=1), axis = 0)
)
X_key = ['country', 'store', 'product']
df_ratio = pd.DataFrame(
    0,
    columns = pd.MultiIndex.from_product(df_train[X_key].apply(lambda x: x.unique().tolist())),
    index = df_train['days'].unique()
).apply(
    lambda x: df_ratio_country.loc[:, x.name[0]] * df_ratio_store.loc[:, x.name[1]] * df_ratio_product.loc[:, x.name[2]]
)
df_ratio_notna = pd.DataFrame(
    df_ratio.values * df_train.assign(target_notna = lambda x: x[target].notna()).pivot(
        index = 'days', columns = X_key, values = 'target_notna'
    ).values, index = df_ratio.index, columns = df_ratio.columns
)
s_est_daily_sum = df_train.groupby('days')[target].sum() * 1 / df_ratio_notna.sum(axis = 1)

df_train.loc[df_train[target].isna(), target] = df_train.loc[df_train[target].isna()].apply(
    lambda x: df_ratio.loc[x['days'], tuple(x[X_key])] * s_est_daily_sum.loc[x['days']], axis = 1
)

# Config

In [17]:
from sklearn.model_selection import GroupShuffleSplit, train_test_split, GroupKFold
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression

def split(x, splitter, groups):
    for train_idx, valid_idx in splitter.split(x, x[target], groups = groups):
        return x.iloc[train_idx], x.iloc[valid_idx]

def get_validation_splitter(validation_fraction):
    gss = GroupShuffleSplit(n_splits = 1, train_size = 0.7, random_state = 123)
    return lambda x: split(x, gss, x['md'])

config = {
    'predict_func': lambda m, df, X: pd.Series(m.predict(df[X]), index = df.index),
    'score_func': lambda df, prds: mean_absolute_percentage_error(df[target].sort_index(), prds.sort_index()),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(), 
    'return_train_scores': True,
    'groups': 'year',
    'y': 'num_sold',
    'target_func': lambda a, b: np.log(b),
    'target_invfunc': lambda b: np.exp(b)
}

config2 = config.copy()
config2['groups'] = 'year'

gss = GroupShuffleSplit(n_splits = 1, random_state = 123)
gkf = GroupKFold(n_splits = 4)

cb_adapter = sgml.CBAdapter(cb.CatBoostRegressor)
lr_adapter = sgml.SklearnAdapter(LinearRegression)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMRegressor)
xgb_adapter = sgml.XGBAdapter(xgb.XGBRegressor)

In [43]:
cb1 = sgml.CVModel('model', 'cb1', gkf, config, cb_adapter).load_if_exists()

In [28]:
hparams = {
    'model_params': {'n_estimators': 3500, 'learning_rate': 0.1},
    'X_num': ['c1', 's1', 'c2', 's2', 'c3', 's3', 'c4', 's4', 'gdp'],
    'X_cat': ['weekday', 'country', 'store', 'product', 'holiday', 'month'],
    #'validation_fraction': 0.2
}
#result = cb1.adhoc(df_train, gss, hparams, task_type = 'GPU')
result = cb1.cv(df_train, hparams, task_type = 'GPU')
result['valid_scores'], result['train_scores']

Fold:   0%|          | 0/4 [00:00<?, ?it/s]

([0.07775177826579524,
  0.075777966231794,
  0.12316303598759105,
  0.06138229127961395],
 [0.039552158538448845,
  0.039768135473713134,
  0.039486226256197694,
  0.039511857241174904])

In [46]:
lgb1 = sgml.CVModel('model', 'lgb1', gkf, config, lgb_adapter).load_if_exists()

In [47]:
hparams = {
    'model_params': {'n_estimators': 2000, 'num_leaves': 31, 'learning_rate': 0.1},
    'X_num': ['c1', 's1', 'c2', 's2', 'c3', 's3', 'c4', 's4', 'gdp'],
    'X_cat': ['weekday', 'country', 'store', 'product', 'holiday', 'month'],
    #'validation_fraction': 0.2
}
#result = lgb1.adhoc(df_train, gss, hparams)
result = lgb1.cv(df_train, hparams)
result['valid_scores'], result['train_scores']

Fold:   0%|          | 0/4 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

([0.06547458969931848,
  0.05885628804165214,
  0.12174519927913535,
  0.061384096873470205],
 [0.036971618145682394,
  0.03761374668350337,
  0.037131498773354615,
  0.03712605005298313])

In [55]:
xgb1 = sgml.CVModel('model', 'xgb1', gkf, config, xgb_adapter).load_if_exists()

In [56]:
hparams = {
    'model_params': {'n_estimators': 1500, 'max_depth': 7, 'learning_rate': 0.06},
    'X_num': ['c1', 's1', 'c2', 's2', 'c3', 's3', 'c4', 's4', 'gdp'],
    'X_ohe': ['weekday', 'country', 'store', 'product', 'holiday', 'month'], 'ohe': {'drop': 'if_binary'}, 
    #'validation_fraction': 0.1
}
#result = xgb1.adhoc(df_train, gss, hparams, device = 'cuda')
result = xgb1.cv(df_train, hparams, device = 'cuda')
result['valid_scores'], result['train_scores']

Fold:   0%|          | 0/4 [00:00<?, ?it/s]

Round:   0%|          | 0/1500 [00:00<?, ?it/s]

Round:   0%|          | 0/1500 [00:00<?, ?it/s]

Round:   0%|          | 0/1500 [00:00<?, ?it/s]

Round:   0%|          | 0/1500 [00:00<?, ?it/s]

([0.080923505, 0.07056465, 0.073136695, 0.08051474],
 [0.03594484, 0.036640495, 0.035900187, 0.035837043])

In [69]:
lr1 = sgml.CVModel('model', 'lr1', gkf, config, lr_adapter).load_if_exists()

In [73]:
hparams = {
    'model_params': {},
    'X_mm': ['gdp'],
    'X_ohe': ['weekday', 'country', 'store', 'product', 'holiday'], 'ohe': {'drop': 'first'},
    'X_pre': ['year', 'md'], 'X_pre_out': ['product_ratio'],
    'preprocessor': custpp.RatioEncoder()
}
result = lr1.cv(df_train, hparams)
result['valid_scores'], result['train_scores']

Fold:   0%|          | 0/4 [00:00<?, ?it/s]

([0.11776300898143265,
  0.0706493914643913,
  0.08317018225569826,
  0.08782677201028768],
 [0.07490570191990986,
  0.08828802832709043,
  0.08739964870898849,
  0.08599619494530683])

In [74]:
models = [cb1, lgb1, xgb1, lr1]
df_stk = sgml.stack_cv(models, df_train[target])
df_stk.corr()

Unnamed: 0,cb1,lgb1,xgb1,lr1,num_sold
cb1,1.0,0.997818,0.99161,0.977971,0.987031
lgb1,0.997818,1.0,0.992722,0.983245,0.989052
xgb1,0.99161,0.992722,1.0,0.988075,0.989678
lr1,0.977971,0.983245,0.988075,1.0,0.986529
num_sold,0.987031,0.989052,0.989678,0.986529,1.0


In [85]:
mean_absolute_percentage_error(
    df_stk.iloc[:, :-1].dot([0.1, 0.3, 0.3, 0.3]), df_stk[target]
)

0.0669141730830498

In [86]:
for i in models:
    if i.name.startswith('cb'):
        i.train(df_train, task_type = 'GPU')
    elif i.name.startswith('xgb'):
        i.train(df_train, device = 'cuda')
    else:
        i.train(df_train)

In [87]:
df_stk_prd = sgml.stack_prd(models, df_test, config)

In [88]:
np.exp(df_stk_prd.dot([0.1, 0.3, 0.3, 0.3])).rename('num_sold').to_frame().to_csv(
    os.path.join('result', 'submission8.csv')
)