# Preparation

In [1]:
import os, sys
import joblib

import pandas as pd
import numpy as np

import sklearn
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

import matplotlib as mpl
# `plt` must be the pyplot interface; `import matplotlib as plt` only re-bound
# the top-level package under a second alias.
import matplotlib.pyplot as plt
import seaborn as sns

import dproc, sgml, sgpp, sgnn

# Record the interpreter and library versions next to the results.
print(sys.version)

for module in [pd, np, sklearn, xgb, lgb, cb, mpl, sns]:
    try:
        print(module.__name__, module.__version__)
    except Exception as err:
        # Stay best-effort (a missing version must not stop the notebook), but
        # report the problem instead of hiding it behind a bare `except: pass`.
        print(f'{module.__name__}: version unavailable ({err!r})')

2025-01-16 08:28:06.432672: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737016086.444389   71835 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737016086.447978   71835 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-16 08:28:06.460171: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]
pandas 2.2.3
numpy 1.26.4
sklearn 1.5.2
xgboost 2.1.2
catboost 1.2.5
matplotlib 3.8.4
seaborn 0.13.2


In [2]:
def get_data_path(name):
    """Return the path of the file `name` inside the relative 'data' directory."""
    data_dir = 'data'
    return os.path.join(data_dir, name)
# Name of the label column used throughout the notebook.
target = 'class'

# Pre-fitted transformer stored with joblib.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code;
# only load artefacts produced by this project.
at = joblib.load(get_data_path('at.joblib'))

# Read each split, index it by 'id', and run it through the transformer.
df_train = at.transform(pd.read_parquet(get_data_path('train.parquet')).set_index('id'))
df_test = at.transform(pd.read_parquet(get_data_path('test.parquet')).set_index('id'))

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression

def get_validation_splitter(validation_fraction, random_state=123):
    """Build a callable that splits a frame into train and validation parts.

    Parameters
    ----------
    validation_fraction
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state
        Seed forwarded to ``train_test_split`` so the split is reproducible
        between runs. The default of 123 is the seed the notebook uses for its
        other splitters.

    Returns
    -------
    callable
        Takes a frame ``x`` and returns the result of ``train_test_split``,
        stratified on ``x[target]``.
    """
    def split(x):
        return train_test_split(
            x,
            test_size=validation_fraction,
            stratify=x[target],
            random_state=random_state,
        )

    return split

def _predict_positive_proba(m, df, X):
    """Return column 1 of m.predict_proba(df[X]) as a Series named 'p' on df's index."""
    proba = m.predict_proba(df[X])
    return pd.Series(proba[:, 1], index = df.index, name = 'p')


def _mcc_at_half(df, prds):
    """Matthews correlation of df[target] vs. (prds >= 0.5), both sorted by index."""
    y_true = df[target].sort_index()
    y_pred = (prds >= 0.5).sort_index()
    return matthews_corrcoef(y_true, y_pred)


# Settings shared by every sgml.CVModel built in this notebook.
config = {
    'predict_func': _predict_positive_proba,
    'score_func': _mcc_at_half,
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(),
    'return_train_scores': False,
    'y': target,
}

# Cross-validation scheme for all models: 5 shuffled stratified folds, seed 123.
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 123)
# Single stratified 60 % split; only referenced by the commented-out adhoc runs.
ss = StratifiedShuffleSplit(n_splits = 1, train_size = 0.6, random_state = 123)

# One adapter per estimator family, each wrapping the estimator class.
cb_adapter = sgml.CBAdapter(cb.CatBoostClassifier)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMClassifier)
xgb_adapter = sgml.XGBAdapter(xgb.XGBClassifier)
nn_adapter = sgnn.NNAdapter(sgnn.NNClassifier)
lr_adapter = sgml.SklearnAdapter(LogisticRegression)

# CatBoost (CB)

In [4]:
# CatBoost CV model named 'cb1' under 'model'.
# NOTE(review): load_if_exists() presumably restores a saved state — confirm in sgml.
cb1 = sgml.CVModel(
    'model', 'cb1', skf, config, cb_adapter
).load_if_exists()

In [5]:
# Hyper-parameters for the CatBoost run.
hparams = {
    'model_params': {
        'n_estimators': 2500,
        'random_state': 123,
    },
    'X_num': at.get_vars('num'),
    'X_cat': at.get_vars('cat') + at.get_vars('ct2'),
    # NOTE(review): these two keys sit beside 'model_params', not inside it, and
    # CatBoost spells its parameter 'combinations_ctr' — confirm sgml uses them.
    'combination_ctr' : 'CtrBorderCount=7',
    'simple_ctr': 'CtrBorderCount=7',
    # 'validation_fraction': 0.1,  (disabled)
}
result = cb1.cv(df_train, hparams, task_type = 'GPU')

# Show the per-fold validation scores together with their mean.
cv_scores = result['valid_scores']
cv_scores, np.mean(cv_scores)

([0.9835747271473884,
  0.9831506292378257,
  0.9833330405378868,
  0.9834215069284793,
  0.9836217626106343],
 0.9834203332924429)

# LightGBM (LGB)

In [6]:
# LightGBM CV model named 'lgb1' under 'model'.
# NOTE(review): load_if_exists() presumably restores a saved state — confirm in sgml.
lgb1 = sgml.CVModel(
    'model', 'lgb1', skf, config, lgb_adapter
).load_if_exists()

In [7]:
# Hyper-parameters for the LightGBM run; 'validation_fraction' is enabled here.
hparams = {
    'model_params': {
        'n_estimators': 3000,
        'learning_rate': 0.02,
        'random_state': 123,
    },
    'X_num': at.get_vars('num'),
    'X_cat': at.get_vars('cat'),
    'validation_fraction': 0.1,
}
# Ad-hoc single-split alternative, kept for reference:
# lgb1.adhoc(df_train, ss, hparams)
result = lgb1.cv(df_train, hparams)

# Show the per-fold validation scores together with their mean.
cv_scores = result['valid_scores']
cv_scores, np.mean(cv_scores)

([0.9841130575447955,
  0.984096406252419,
  0.9840590117258392,
  0.9841664078148387,
  0.9844671258720696],
 0.9841804018419925)

# XGBoost (XGB)

In [8]:
# XGBoost CV model named 'xgb1' under 'model'.
# NOTE(review): load_if_exists() presumably restores a saved state — confirm in sgml.
xgb1 = sgml.CVModel(
    'model', 'xgb1', skf, config, xgb_adapter
).load_if_exists()

In [9]:
# Hyper-parameters for the XGBoost run.
hparams = {
    'model_params': {
        'n_estimators': 5000,
        'learning_rate': 0.05,
        'random_state': 123,
    },
    'X_num': at.get_vars('num'),
    'X_cat': at.get_vars('cat') + at.get_vars('ct2'),
    # NOTE(review): 'ohe' options are given but no 'X_ohe' column list is set
    # (the LR cell sets both) — confirm sgml actually applies these options.
    'ohe': {'min_frequency': 10, 'handle_unknown': 'ignore'},
    # 'validation_fraction': 0.1,  (disabled)
}
# Ad-hoc single-split alternative, kept for reference:
# xgb1.adhoc(df_train, ss, hparams, device = 'cuda')
result = xgb1.cv(df_train, hparams, device = 'cuda')

# Show the per-fold validation scores together with their mean.
cv_scores = result['valid_scores']
cv_scores, np.mean(cv_scores)

([0.9841453432612222,
  0.9838573785699578,
  0.9839689063922084,
  0.9841148101477533,
  0.9842436450434174],
 0.9840660166829117)

# Neural network (NN)

In [10]:
# Neural-network CV model named 'nn1' under 'model'.
# NOTE(review): load_if_exists() presumably restores a saved state — confirm in sgml.
nn1 = sgml.CVModel(
    'model', 'nn1', skf, config, nn_adapter
).load_if_exists()

In [11]:
# Categorical columns fed to the network; the order fixes both the embedding
# list and 'X_cat' below.
caf_features = [
    'caf__cap-color', 'caf__cap-shape', 'caf__cap-surface',
    'caf__gill-attachment', 'caf__gill-color', 'caf__gill-spacing',
    'caf__stem-color', 'caf__stem-root', 'caf__stem-surface',
    'caf__does-bruise-or-bleed', 'caf__habitat', 'caf__has-ring',
    'caf__ring-type', 'caf__season',
    'caf__spore-print-color', 'caf__veil-color', 'caf__veil-type',
]
cc_features = ['cc__cap', 'cc__gill', 'cc__stem']

# Column -> integer setting: 3 for the 'caf__' columns, 4 for the 'cc__' ones.
# NOTE(review): presumably the embedding width — confirm in sgnn.
emb_config = {**dict.fromkeys(caf_features, 3), **dict.fromkeys(cc_features, 4)}

# One tuple per column; the second field is the number of distinct values that
# occur at least 10 times in df_train.
# NOTE(review): meaning of the other tuple fields is defined by sgnn — confirm.
embedding_spec = [
    (1, df_train[col].value_counts().ge(10).sum(), width, 0, 0)
    for col, width in emb_config.items()
]

# Three dense layers (32, 32, 16 units), each with ReLU and batch norm.
dense_layers = [
    {'unit': 32, 'activation': 'relu', 'batch_norm': True},
    {'unit': 32, 'activation': 'relu', 'batch_norm': True},
    {'unit': 16, 'activation': 'relu', 'batch_norm': True},
]

hparams = {
    'model_params': {
        'model_params': {
            'config': dense_layers,
            'embedding': embedding_spec,
        },
        'batch_size': 1024,
        'shuffle_size': 204800,
        'epochs': 25,
        'optimizer': ('Adam', {'learning_rate': 0.0003}),
    },
    # 'validation_fraction': 0.1,  (disabled)
    'X_std': at.get_vars('num'),
    'X_cat': list(emb_config),
    'cat': {'handle_unknown': 'use_encoded_value', 'unknown_value': -1},
}
# Ad-hoc single-split alternative, kept for reference:
# result = nn1.adhoc(df_train, ss, hparams)
result = nn1.cv(df_train, hparams)

# Show the per-fold validation scores together with their mean.
cv_scores = result['valid_scores']
cv_scores, np.mean(cv_scores)

([0.9829610986627615,
  0.9825517171770671,
  0.982920143528975,
  0.9829729121207089,
  0.9831135049507305],
 0.9829038752880486)

# Logistic regression (LR)

In [12]:
# Logistic-regression CV model named 'lr' under 'model'.
# NOTE(review): load_if_exists() presumably restores a saved state — confirm in sgml.
lr = sgml.CVModel(
    'model', 'lr', skf, config, lr_adapter
).load_if_exists()

In [13]:
# Logistic regression with default estimator parameters.
hparams = {
    'model_params': {},
    'X_std': at.get_vars('num'),
    'X_ohe': at.get_vars('cat') + at.get_vars('ct2'),
    'ohe': {
        'drop': 'first',
        'handle_unknown': 'ignore',
        'min_frequency': 10,
    },
}
result = lr.cv(df_train, hparams)

# Show the per-fold validation scores together with their mean.
cv_scores = result['valid_scores']
cv_scores, np.mean(cv_scores)

([0.9568319787303333,
  0.9570208346141938,
  0.9571256673906168,
  0.9572044829870568,
  0.9565537605839487],
 0.95694734486123)

# Ensemble

In [14]:
# All base models, in the order used for stacking and final training.
models = [
    cb1,
    lgb1,
    xgb1,
    nn1,
    lr,
]

In [15]:
# Combine the models' CV results with the training labels.
# NOTE(review): presumably one out-of-fold prediction column per model plus
# the target — confirm in sgml.stack_cv.
y_train = df_train[target]
df_cv = sgml.stack_cv(models, y_train)

In [16]:
# MCC of the plain average of the four non-linear models, thresholded at 0.5.
blend_cols = ['cb1', 'lgb1', 'xgb1', 'nn1']
blend_mean = df_cv.loc[:, blend_cols].mean(axis = 1)
matthews_corrcoef(df_cv[target], blend_mean >= 0.5)

0.9845637959012646

In [17]:
# Logistic-regression stacker named 'lr_stk' under 'model'.
# NOTE(review): load_if_exists() presumably restores a saved state — confirm in sgml.
lr_stk = sgml.CVModel(
    'model', 'lr_stk', skf, config, lr_adapter
).load_if_exists()

In [18]:
# Stack all five base-model columns of df_cv with a default logistic regression.
hparams = {
    'model_params': {},
    'X_num': ['cb1', 'lgb1', 'xgb1', 'nn1', 'lr'],
}
result = lr_stk.cv(df_cv, hparams)

# Show the per-fold validation scores together with their mean.
cv_scores = result['valid_scores']
cv_scores, np.mean(cv_scores)

([0.9842827371353705,
  0.984561363842233,
  0.9844696490296913,
  0.9846958973206668,
  0.9846387570648915],
 0.9845296808785704)

In [19]:
# Extra train() arguments by model-name prefix; the first matching prefix wins.
# Models matching neither prefix are trained with no extra arguments.
gpu_kwargs_by_prefix = (
    ('cb', {'task_type': 'GPU'}),
    ('xgb', {'device': 'cuda'}),
)

for model in models:
    extra_kwargs = next(
        (kwargs for prefix, kwargs in gpu_kwargs_by_prefix if model.name.startswith(prefix)),
        {},
    )
    model.train(df_train, **extra_kwargs)

In [20]:
# Test-set predictions of the four blended models (the 'lr' model is left out).
blend_models = [cb1, lgb1, xgb1, nn1]
df_stk_test = sgml.stack_prd(blend_models, df_test, config)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


I0000 00:00:1737016212.596900   71835 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4762 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2060, pci bus id: 0000:01:00.0, compute capability: 7.5
I0000 00:00:1737016218.761168   72230 service.cc:148] XLA service 0x7f1e0000e8d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1737016218.761185   72230 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 2060, Compute Capability 7.5
2025-01-16 08:30:18.786678: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1737016218.813003   72230 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1737016219.157058   72230 devic

In [21]:
# Average the models' test predictions, threshold at 0.5, and map the 0/1
# result back through the label transformer's inverse_transform.
blend_is_positive = df_stk_test.mean(axis=1) >= 0.5
s_prd = pd.Series(
    at.body_transformer.label_transformer.inverse_transform(blend_is_positive.astype('int')),
    index = df_stk_test.index
)

# Create the output directory first; otherwise to_csv fails on a fresh
# checkout where 'result' does not exist yet.
submission_dir = 'result'
os.makedirs(submission_dir, exist_ok=True)
s_prd.rename('class').to_frame().to_csv(os.path.join(submission_dir, 'submission1.csv'))

In [None]:
#!kaggle competitions submit -c playground-series-s4e8 -f result/submission1.csv -m '1'