# Preparation

In [1]:
import os, sys
import joblib

import pandas as pd
import numpy as np

import sklearn
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

import matplotlib as mpl
# `plt` must be the pyplot submodule; the top-level package is already bound to `mpl`.
import matplotlib.pyplot as plt
import seaborn as sns

import dproc, sgml, sgpp, sgnn

# Record the interpreter and library versions alongside the notebook output.
print(sys.version)

for lib in [pd, np, sklearn, xgb, lgb, cb, mpl, sns]:
    # A module is not guaranteed to expose __version__; say so explicitly
    # instead of silently dropping the library from the report.
    print(lib.__name__, getattr(lib, '__version__', '(version unavailable)'))

2025-01-15 06:56:40.315454: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736924200.327074   53344 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736924200.330583   53344 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-15 06:56:40.342334: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]
pandas 2.2.3
numpy 1.26.4
sklearn 1.5.2
xgboost 2.1.2
catboost 1.2.5
matplotlib 3.8.4
seaborn 0.13.2


In [2]:
def get_data_path(name, data_dir='data'):
    """Return the path of a data file inside the data directory.

    Parameters
    ----------
    name : str
        File name (or relative path) of the data file.
    data_dir : str, default 'data'
        Directory the file lives in. The default keeps the original
        behaviour of resolving everything under ``'data'``.

    Returns
    -------
    str
        ``data_dir`` and ``name`` combined with ``os.path.join``.
    """
    return os.path.join(data_dir, name)
# Name of the label column used throughout the notebook.
target = 'class'

# Read the train and test splits, each indexed by its 'id' column.
raw_frames = [
    pd.read_parquet(get_data_path(file_name)).set_index('id')
    for file_name in ('train.parquet', 'test.parquet')
]

# NOTE(review): joblib.load unpickles the file, so 'at.joblib' must come
# from a trusted source.
at = joblib.load(get_data_path('at.joblib'))

# Apply the loaded transformer to the train frame first, then the test frame.
df_train, df_test = (at.transform(frame) for frame in raw_frames)
del raw_frames

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit

def get_validation_splitter(validation_fraction, random_state=123):
    """Build a function that carves a stratified validation part out of a frame.

    Parameters
    ----------
    validation_fraction : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int or None, default 123
        Seed forwarded to ``train_test_split``. The default matches the seed
        used for the other splitters in this notebook so repeated runs hold
        out the same rows; pass ``None`` for an unseeded split.

    Returns
    -------
    callable
        ``f(x)`` returning whatever ``train_test_split`` returns for ``x``,
        stratified on ``x[target]`` (``target`` is the module-level label name).
    """
    def split(x):
        return train_test_split(
            x,
            test_size=validation_fraction,
            stratify=x[target],
            random_state=random_state,
        )

    return split

def _predict_second_column(m, df, X):
    """Return column 1 of ``m.predict_proba(df[X])`` as a Series named 'p'.

    The Series reuses ``df.index`` so predictions stay aligned with the rows.
    """
    proba = m.predict_proba(df[X])
    return pd.Series(proba[:, 1], index = df.index, name = 'p')


def _mcc_at_half(df, prds):
    """Matthews correlation between ``df[target]`` and ``prds >= 0.5``.

    Both sides are sorted by index before being compared.
    """
    y_true = df[target].sort_index()
    y_pred = (prds >= 0.5).sort_index()
    return matthews_corrcoef(y_true, y_pred)


# Shared settings handed to every sgml.CVModel created in this notebook.
config = dict(
    predict_func = _predict_second_column,
    score_func = _mcc_at_half,
    validation_splitter = get_validation_splitter,
    progress_callback = sgml.ProgressCallBack(),
    return_train_scores = False,
    y = target,
)

# Splitters, both seeded with 123: a shuffled 5-fold stratified CV and a
# single stratified shuffle split that keeps 60% of the rows for training.
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 123)
ss = StratifiedShuffleSplit(train_size = 0.6, n_splits = 1, random_state = 123)

# One adapter per model family, each wrapping the matching classifier class.
cb_adapter, lgb_adapter, xgb_adapter, nn_adapter = (
    sgml.CBAdapter(cb.CatBoostClassifier),
    sgml.LGBMAdapter(lgb.LGBMClassifier),
    sgml.XGBAdapter(xgb.XGBClassifier),
    sgnn.NNAdapter(sgnn.NNClassifier),
)

# CatBoost (CB)

In [5]:
# CatBoost CV model named 'cb1', using the shared 5-fold splitter and config.
# NOTE(review): load_if_exists() presumably restores a previously saved run
# from the 'model' location — confirm in sgml.
cb1 = (
    sgml.CVModel('model', 'cb1', skf, config, cb_adapter)
    .load_if_exists()
)

In [6]:
# Hyper-parameters for cb1: 2500 estimators, seed 123, the 'num' variables as
# numeric inputs and the 'cat' plus 'ct2' variables as categorical inputs.
# No 'validation_fraction' entry: the 0.1 setting is commented out for this model.
hparams = dict(
    model_params = dict(n_estimators = 2500, random_state = 123),
    X_num = at.get_vars('num'),
    X_cat = at.get_vars('cat') + at.get_vars('ct2'),
    combination_ctr = 'CtrBorderCount=7',
    simple_ctr = 'CtrBorderCount=7',
)

# Run the cross-validation with task_type='GPU'; the result is the cell output.
cb1.cv(df_train, hparams, task_type = 'GPU')

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

{'valid_scores': [0.9835747271473884,
  0.9831506292378257,
  0.9833330405378868,
  0.9834215069284793,
  0.9836217626106343],
 'model_result': [{'valid_result': metric   Logloss
   set        learn
   0       0.671709
   1       0.651331
   2       0.631051
   3       0.611632
   4       0.593199
   ...          ...
   2495    0.038008
   2496    0.038007
   2497    0.038007
   2498    0.038007
   2499    0.038006
   
   [2500 rows x 1 columns],
   'feature_importance': cat__caf__veil-type                0.026602
   cat__caf__spore-print-color        0.173658
   cat__caf__veil-color               0.178551
   pt__simp__stem-height              0.687867
   cat__caf__has-ring                 0.770576
   cat__caf__stem-root                0.794305
   cat__caf__habitat                  0.962259
   pt__simp__cap-diameter             1.036636
   cat__caf__gill-color               1.228838
   cat__caf__stem-surface             1.393870
   cat__caf__cap-shape                1.396602
   cat__ca

# LightGBM (LGB)

In [25]:
# LightGBM CV model named 'lgb1', using the shared 5-fold splitter and config.
# NOTE(review): load_if_exists() presumably restores a previously saved run
# from the 'model' location — confirm in sgml.
lgb1 = (
    sgml.CVModel('model', 'lgb1', skf, config, lgb_adapter)
    .load_if_exists()
)

In [30]:
# Hyper-parameters for lgb1: 3000 estimators at learning rate 0.02, seed 123,
# the 'num' variables as numeric inputs, the 'cat' variables as categorical
# inputs, and a validation fraction of 0.1.
hparams = dict(
    model_params = dict(n_estimators = 3000, learning_rate = 0.02, random_state = 123),
    X_num = at.get_vars('num'),
    X_cat = at.get_vars('cat'),
    validation_fraction = 0.1,
)

# Single-split alternative kept for reference: lgb1.adhoc(df_train, ss, hparams)
lgb1.cv(df_train, hparams)

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

{'valid_scores': [0.9841130575447955,
  0.984096406252419,
  0.9840590117258392,
  0.9841664078148387,
  0.9844671258720696],
 'model_result': [{'valid_result': metric binary_logloss          
   set          training   valid_1
   0            0.675039  0.674933
   1            0.661723  0.661640
   2            0.648923  0.648854
   3            0.636770  0.636707
   4            0.625257  0.625220
   ...               ...       ...
   2995         0.032713  0.037918
   2996         0.032712  0.037917
   2997         0.032710  0.037917
   2998         0.032709  0.037916
   2999         0.032706  0.037917
   
   [3000 rows x 2 columns],
   'feature_importance': cat__caf__veil-type                 376
   cat__caf__spore-print-color         623
   cat__caf__veil-color                678
   cat__caf__season                    905
   cat__caf__has-ring                 1074
   cat__caf__stem-root                1306
   cat__caf__ring-type                1402
   cat__caf__habitat            

# XGBoost (XGB)

In [31]:
# XGBoost CV model named 'xgb1', using the shared 5-fold splitter and config.
# NOTE(review): load_if_exists() presumably restores a previously saved run
# from the 'model' location — confirm in sgml.
xgb1 = (
    sgml.CVModel('model', 'xgb1', skf, config, xgb_adapter)
    .load_if_exists()
)

In [41]:
# Hyper-parameters for xgb1: 5000 estimators at learning rate 0.05, seed 123,
# the 'num' variables as numeric inputs, the 'cat' plus 'ct2' variables as
# categorical inputs, and the 'ohe' options shown below.
# No 'validation_fraction' entry: the 0.1 setting is commented out for this model.
hparams = dict(
    model_params = dict(n_estimators = 5000, learning_rate = 0.05, random_state = 123),
    X_num = at.get_vars('num'),
    X_cat = at.get_vars('cat') + at.get_vars('ct2'),
    ohe = dict(min_frequency = 10, handle_unknown = 'ignore'),
)

# Single-split alternative kept for reference:
# xgb1.adhoc(df_train, ss, hparams, device = 'cuda')
xgb1.cv(df_train, hparams, device = 'cuda')

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/5000 [00:00<?, ?it/s]

Round:   0%|          | 0/5000 [00:00<?, ?it/s]

Round:   0%|          | 0/5000 [00:00<?, ?it/s]

Round:   0%|          | 0/5000 [00:00<?, ?it/s]

Round:   0%|          | 0/5000 [00:00<?, ?it/s]

{'valid_scores': [0.9841453432612222,
  0.9838573785699578,
  0.9839689063922084,
  0.9841148101477533,
  0.9842436450434174],
 'model_result': [{'valid_result': None,
   'feature_importance': cat__caf__cap-color_Unk     0.000000
   cat__cc__stem_igr           0.000000
   cat__cc__stem_ieUnk         0.000000
   cat__cc__stem_iUnkUnk       0.000000
   cat__cc__cap_hxu            0.000000
                                 ...   
   cat__caf__stem-surface_g    0.021341
   cat__caf__cap-surface_l     0.022191
   cat__caf__ring-type_z       0.022464
   cat__caf__gill-color_f      0.035571
   cat__cc__cap_gcn            0.055902
   Length: 1944, dtype: float32,
   'variables': array(['cat__caf__cap-color_Unk', 'cat__caf__cap-color_b',
          'cat__caf__cap-color_e', ..., 'pt__pt__stem-width',
          'pt__simp__cap-diameter', 'pt__simp__stem-height'], dtype=object),
   'train_shape': (2493556, 1944),
   'target': 'class',
   'target_func': None,
   'preprocessor': ColumnTransformer(trans

# Neural network (NN)

In [None]:
# Neural-network CV model named 'nn1', using the shared 5-fold splitter and config.
# NOTE(review): load_if_exists() presumably restores a previously saved run
# from the 'model' location — confirm in sgml.
nn1 = (
    sgml.CVModel('model', 'nn1', skf, config, nn_adapter)
    .load_if_exists()
)

# Ensemble

In [45]:
# Models that take part in the ensemble (nn1 is not included).
models = [
    cb1,   # CatBoost
    lgb1,  # LightGBM
    xgb1,  # XGBoost
]

In [47]:
# NOTE(review): stack_cv presumably returns one column of CV predictions per
# model plus the target column — confirm in sgml.
df_cv = sgml.stack_cv(
    models,
    df_train[target],
)

In [50]:
# Average every column except the last one row-wise, threshold the mean at
# 0.5, and score the result against the target column.
# NOTE(review): assumes the target is the last column of df_cv — confirm.
mean_prediction = df_cv.iloc[:, :-1].mean(axis = 1)
matthews_corrcoef(df_cv[target], mean_prediction >= 0.5)

0.9845124466503058

In [None]:
from sklearn.linear_model import LogisticRegression


In [53]:
# Extra keyword arguments for train(), keyed by model-name prefix. Models
# whose name matches no prefix are trained with train(df_train) only.
train_kwargs_by_prefix = {
    'cb': {'task_type': 'GPU'},
    'xgb': {'device': 'cuda'},
}

for model in models:
    # The first matching prefix wins, in the same order as the table above.
    extra_kwargs = next(
        (
            kwargs
            for prefix, kwargs in train_kwargs_by_prefix.items()
            if model.name.startswith(prefix)
        ),
        {},
    )
    model.train(df_train, **extra_kwargs)

KeyboardInterrupt: 