In [25]:
import sys, os, re

import joblib
import pandas as pd
import numpy as np

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import dproc, sgml, sgutil

# Print the version of every core library so the results below can be tied to a
# concrete environment.
for lib in [pd, np, lgb, xgb, cb]:
    # A module without a __version__ attribute is skipped explicitly; a bare
    # `except: pass` would also hide unrelated errors (and KeyboardInterrupt).
    lib_version = getattr(lib, '__version__', None)
    if lib_version is not None:
        print(lib.__name__, lib_version)

pandas 2.2.2
numpy 1.26.4
lightgbm 4.3.0
xgboost 2.1.2
catboost 1.2.5


In [89]:
data_path = 'data'

# Logical name -> path; every file sits directly inside data_path.
files = {
    key: os.path.join(data_path, fname)
    for key, fname in (
        ('train', 'train.csv'),
        ('test', 'test.csv'),
        ('org', 'data.csv'),
        ('train_parquet', 'train.parquet'),
        ('org_parquet', 'org.parquet'),
        ('test_parquet', 'test.parquet'),
        ('var_pkl', 'var.pkl'),
    )
}

# Raw frames; only the test frame is re-indexed by 'id' here.
df_train = pd.read_parquet(files['train_parquet'])
df_test = pd.read_parquet(files['test_parquet']).set_index('id')
df_org = pd.read_parquet(files['org_parquet'])

target = 'Target'
sc = sgutil.SGCache('img', 'result')

# NOTE: joblib.load unpickles the file, so only load artifacts you produced yourself.
at = joblib.load(os.path.join(data_path, 'at.joblib'))

# Run every frame through the stored transformer (train, test, org - in that order).
df_train = at.transform(df_train)
df_test = at.transform(df_test)
# This column name carries a trailing tab in the original dataset; strip it
# before transforming.
df_org = at.transform(
    df_org.rename(columns={"Daytime/evening attendance\t": "Daytime/evening attendance"})
)

# Config

In [62]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
import sgml, sgnn

# Splitters shared by every model below; the fixed seed keeps the folds reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
# Single stratified 80/20 split (referenced by the commented-out adhoc() calls).
ss = StratifiedShuffleSplit(train_size=0.8, n_splits=1, random_state=123)
# Class labels as stored on the fitted label transformer.
target_values = at.body_transformer.label_transformer.classes_

def get_validation_splitter(validation_frac):
    """Return a one-argument function that splits a frame for validation.

    The returned callable takes a DataFrame ``x`` and forwards it to
    ``train_test_split`` with ``test_size=validation_frac``, stratified on
    ``x[target]`` and seeded with ``random_state=123``; it returns whatever
    ``train_test_split`` returns.
    """
    def split(x):
        return train_test_split(
            x,
            test_size=validation_frac,
            stratify=x[target],
            random_state=123,
        )

    return split

def _make_config(train_data_proc=None):
    """Build a fresh configuration dict for ``sgml.CVModel``.

    ``config`` and ``config2`` used to be two copy-pasted literals that differed
    only in the ``'train_data_proc'`` entry. Building both through this factory
    keeps them in sync. Every call creates its own lambdas and its own
    ``sgml.ProgressCallBack`` instance, exactly as the two literals did.

    Parameters
    ----------
    train_data_proc : callable or None
        Optional function stored under ``'train_data_proc'``. When ``None`` the
        key is omitted altogether.

    Returns
    -------
    dict
        Keys in the same order as the original literals:
        predict_func, score_func, progress_callback, validation_splitter,
        return_train_scores, [train_data_proc], y.
    """
    cfg = {
        # Class probabilities as a frame aligned to the index of the input frame.
        'predict_func': lambda m, df, X: pd.DataFrame(m.predict_proba(df[X]), index=df.index),
        # Accuracy of the most probable class against the true label.
        'score_func': lambda df, prd: accuracy_score(df[target], prd.idxmax(axis=1)),
        'progress_callback': sgml.ProgressCallBack(),
        'validation_splitter': get_validation_splitter,
        'return_train_scores': False,
    }
    if train_data_proc is not None:
        cfg['train_data_proc'] = train_data_proc
    # 'y' stays the last key, as in the original literals.
    cfg['y'] = target
    return cfg


# Configuration for the base models: train_data_proc concatenates the original
# dataset (df_org) to the frame it receives.
config = _make_config(train_data_proc=lambda x: pd.concat([x, df_org], axis=0))

# Same settings without the df_org augmentation.
config2 = _make_config()

# One adapter per estimator family; each is later passed as the last argument
# of sgml.CVModel together with the shared splitter and config.
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMClassifier)
xgb_adapter = sgml.XGBAdapter(xgb.XGBClassifier)
cb_adapter = sgml.CBAdapter(cb.CatBoostClassifier)
nn_adapter = sgnn.NNAdapter(sgnn.NNClassifier)
lr_adapter = sgml.SklearnAdapter(LogisticRegression)

In [28]:
# Show the variables registered under the 'num' and 'ev' groups as one list.
at.get_vars('num') + at.get_vars('ev')

['Admission grade',
 'Age at enrollment',
 'Curricular units 1st sem (approved)',
 'Curricular units 1st sem (credited)',
 'Curricular units 1st sem (enrolled)',
 'Curricular units 1st sem (evaluations)',
 'Curricular units 1st sem (grade)',
 'Curricular units 1st sem (without evaluations)',
 'Curricular units 2nd sem (approved)',
 'Curricular units 2nd sem (credited)',
 'Curricular units 2nd sem (enrolled)',
 'Curricular units 2nd sem (evaluations)',
 'Curricular units 2nd sem (grade)',
 'Curricular units 2nd sem (without evaluations)',
 'GDP',
 'Previous qualification (grade)',
 'Unemployment rate',
 'not_approved_1st',
 'not_approved_2nd',
 'with_eval_1st',
 'with_eval_2nd']

In [29]:
# Largest value of 'np_Age at enrollment' in the transformed training frame.
df_train['np_Age at enrollment'].max()

30

# LR

In [30]:
# CV wrapper for logistic regression (5-fold skf, df_org-augmented config).
# NOTE(review): load_if_exists() presumably restores a previously saved run - confirm in sgml.
lr = sgml.CVModel('model', 'lr', skf, config, lr_adapter).load_if_exists()

In [31]:
# Continuous features; passed to the model under the 'X_mm' key below.
X_cont_sel = [
    'np_Age at enrollment',
    'Admission grade',
    'Previous qualification (grade)',
    'np_Curricular units 2nd sem (grade)', 'np_Curricular units 1st sem (grade)',
    'Curricular units 2nd sem (credited)', 'Curricular units 1st sem (credited)',
    'Curricular units 2nd sem (without evaluations)', 'Curricular units 1st sem (without evaluations)',
]

# Features treated as categorical; they form the first part of 'X_ohe' below.
X_cat_sel = [
    'nom__Course', 'bool__Tuition fees up to date',
    'nom__Application mode', 'bool__Scholarship holder', 'bool__Gender',
    'nom__Previous qualification', 'bool__Debtor', "nom__Mother's qualification",
    "nom__Father's qualification", "nom__Mother's occupation", "nom__Father's occupation",
    'ord__Application order', 'bool__Displaced', 
    'np_Curricular units 2nd sem (approved)', 'np_Curricular units 1st sem (approved)',
    'np_Curricular units 2nd sem (enrolled)', 'np_Curricular units 1st sem (enrolled)',
    'np_Curricular units 2nd sem (evaluations)', 'np_Curricular units 1st sem (evaluations)'
]
hparams = {
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
    # revisit when upgrading.
    'model_params': {'C': 1, 'max_iter': 500, 'multi_class': 'multinomial'}, 
    'X_tgt': ['1st_eval_grade', '2nd_eval_grade'], 'tgt': {'random_state': 123 },
    # NOTE(review): X_cat_sel already lists 'ord__Application order'; if
    # at.get_vars(('cat', 'ord')) returns it too, the column appears twice here - TODO confirm.
    'X_ohe': X_cat_sel + at.get_vars(('cat', 'ord')), 'X_mm': X_cont_sel, 'X_num': [], 'ohe': {'drop': 'first', 'handle_unknown': 'ignore'}
}
# Cross-validate on the training frame and show the mean validation score.
result = lr.cv(df_train, hparams)
np.mean(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

0.8281580679782252

# CB

## cb1

In [32]:
# CV wrapper for the CatBoost model (same splitter and config as the other base models).
# NOTE(review): load_if_exists() presumably restores a previously saved run - confirm in sgml.
cb1 = sgml.CVModel('model', 'cb1', skf, config, cb_adapter).load_if_exists()

In [33]:
# CatBoost hyper-parameters and feature roles.
# ('validation_fraction': 0.1 was tried and is currently switched off.)
hparams = dict(
    model_params=dict(n_estimators=2500, learning_rate=0.04, random_state=123),
    X_num=at.get_vars('num') + at.get_vars(('cat', 'pt')),
    # nominal + boolean + ordinal variables, in that order
    X_cat=[var for kind in ('nom', 'bool', 'ord') for var in at.get_vars(('cat', kind))],
)
# Quick single-split alternative: cb1.adhoc(df_train, ss, hparams)
result = cb1.cv(df_train, hparams, task_type='GPU')

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

In [34]:
# Mean validation accuracy over the folds of the CatBoost run.
np.mean(result['valid_scores'])

0.8339605981136096

# LGB

## lgb1

In [35]:
# CV wrapper for the first LightGBM model.
# NOTE(review): load_if_exists() presumably restores a previously saved run - confirm in sgml.
lgb1 = sgml.CVModel('model', 'lgb1', skf, config, lgb_adapter).load_if_exists()

In [36]:
# LightGBM #1: many trees with a small learning rate and strong column subsampling.
# ('validation_fraction': 0.1 was tried and is currently switched off.)
hparams = dict(
    model_params=dict(n_estimators=4000, learning_rate=0.007, colsample_bytree=0.25),
    X_num=at.get_vars('num') + at.get_vars(('cat', 'pt')),
    # nominal + boolean + ordinal variables, in that order
    X_cat=[var for kind in ('nom', 'bool', 'ord') for var in at.get_vars(('cat', kind))],
)
# Quick single-split alternative: lgb1.adhoc(df_train, ss, hparams)
result = lgb1.cv(df_train, hparams)

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

In [37]:
# Mean validation accuracy over the folds of the lgb1 run.
np.mean(result['valid_scores'])

0.8344179991425342

## lgb2

In [38]:
# CV wrapper for the second LightGBM model.
# NOTE(review): load_if_exists() presumably restores a previously saved run - confirm in sgml.
lgb2 = sgml.CVModel('model', 'lgb2', skf, config, lgb_adapter).load_if_exists()

In [39]:
# LightGBM #2: smaller trees (num_leaves=15) with a larger learning rate than lgb1.
# ('validation_fraction': 0.1 was tried and is currently switched off.)
hparams = dict(
    model_params=dict(n_estimators=4000, num_leaves=15, learning_rate=0.02, colsample_bytree=0.25),
    X_num=at.get_vars('num') + at.get_vars(('cat', 'pt')),
    # nominal + boolean + ordinal variables, in that order
    X_cat=[var for kind in ('nom', 'bool', 'ord') for var in at.get_vars(('cat', kind))],
)
# Quick single-split alternative: lgb2.adhoc(df_train, ss, hparams)
result = lgb2.cv(df_train, hparams)

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

In [40]:
# Mean validation accuracy over the folds of the lgb2 run.
np.mean(result['valid_scores'])

0.8341435636490685

# XGB

## xgb1

In [41]:
# CV wrapper for the XGBoost model.
# NOTE(review): load_if_exists() presumably restores a previously saved run - confirm in sgml.
xgb1 = sgml.CVModel('model', 'xgb1', skf, config, xgb_adapter).load_if_exists()

In [42]:
# XGBoost: same feature roles as the LightGBM models, trained on the GPU.
# ('validation_fraction': 0.1 was tried and is currently switched off.)
hparams = dict(
    model_params=dict(n_estimators=4500, learning_rate=0.007, colsample_bytree=0.25),
    X_num=at.get_vars('num') + at.get_vars(('cat', 'pt')),
    # nominal + boolean + ordinal variables, in that order
    X_cat=[var for kind in ('nom', 'bool', 'ord') for var in at.get_vars(('cat', kind))],
)
# Quick single-split alternative: xgb1.adhoc(df_train, ss, hparams, device='cuda')
result = xgb1.cv(df_train, hparams, device='cuda')

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/4500 [00:00<?, ?it/s]

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Round:   0%|          | 0/4500 [00:00<?, ?it/s]

Round:   0%|          | 0/4500 [00:00<?, ?it/s]

Round:   0%|          | 0/4500 [00:00<?, ?it/s]

Round:   0%|          | 0/4500 [00:00<?, ?it/s]

In [43]:
# Mean validation accuracy over the folds of the xgb1 run.
np.mean(result['valid_scores'])

0.8346793815288379

# NN

## NN1

In [44]:
# CV wrapper for the neural-network model.
# NOTE(review): load_if_exists() presumably restores a previously saved run - confirm in sgml.
nn1 = sgml.CVModel('model', 'nn1', skf, config, nn_adapter).load_if_exists()

In [45]:
# (categorical column, embedding width) pairs for the features that go through
# an embedding.
emb_config = [
    ('nom__Course', 3),
    ('nom__Application mode', 2),
    ('nom__Previous qualification', 2),
    ("nom__Mother's qualification", 2),
    ("nom__Father's qualification", 2),
    ("nom__Mother's occupation", 2),
    ("nom__Father's occupation", 2),
    ('nom__Marital status', 2),
]
X_cat = [col for col, _ in emb_config]
# One 5-tuple per embedded column: the second entry is the number of categories
# of that column, the third is the embedding width from emb_config.
# NOTE(review): the meaning of the constant 1 / 0 / 0 slots is defined by sgnn - confirm there.
embedding = [
    (1, len(df_train[col].cat.categories), width, 0, 0)
    for col, width in emb_config
]
hparams = {
    'model_params': {
        'model_params': {
            'config': [
                {'unit': 32, 'activation': 'relu', 'batch_norm': True},
            ],
            'embedding': embedding,
        },
        'batch_size': 1024,
        'shuffle_size': 204800,
        'epochs': 50,
        'optimizer': ('Adam', {'learning_rate': 0.0002}),
    },
    'validation_fraction': 0.1,
    'X_std': at.get_vars('np'),
    'X_mm': ['ord__Application order'],
    'X_num': at.get_vars(('cat', 'bool')),
    # A separate list with the same columns as X_cat (kept distinct from the global).
    'X_cat': list(X_cat),
}
result = nn1.cv(df_train, hparams)
# Per-fold validation scores together with their mean.
result['valid_scores'], np.mean(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

2025-01-16 22:32:15.441550: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-01-16 22:32:15.546379: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

Step:   0%|          | 0/58 [00:00<?, ?it/s]

Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

Step:   0%|          | 0/58 [00:00<?, ?it/s]

Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

Step:   0%|          | 0/58 [00:00<?, ?it/s]

Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

Step:   0%|          | 0/58 [00:00<?, ?it/s]

Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

Step:   0%|          | 0/58 [00:00<?, ?it/s]

([0.8267773131207528,
  0.825209095661265,
  0.825013068478829,
  0.8234333137293341,
  0.8233026204012285],
 0.824747082278282)

# Stacking

In [47]:
# Base models that feed the stacking / blending steps below.
models = [lr, cb1, lgb1, xgb1, nn1, lgb2]
# One-row table: mean CV score of each model, columns named after the models.
pd.Series(
    [np.mean(mdl.cv_best_['score']) for mdl in models],
    index=[mdl.name for mdl in models],
    name='accuracy',
).to_frame().T

Unnamed: 0,lr,cb1,lgb1,xgb1,nn1,lgb2
accuracy,0.828158,0.833961,0.834418,0.834679,0.824747,0.834144


In [48]:
# Collect the models' CV predictions into one frame; judging by the output below
# it holds one '<model>_<class>' column per model and class plus the target column.
df_stk = sgml.stack_cv(models, df_train[target])
df_stk.head()

Unnamed: 0,lr_0,lr_1,lr_2,cb1_0,cb1_1,cb1_2,lgb1_0,lgb1_1,lgb1_2,xgb1_0,xgb1_1,xgb1_2,nn1_0,nn1_1,nn1_2,lgb2_0,lgb2_1,lgb2_2,Target
1,0.858329,0.127828,0.013842,0.926196,0.070178,0.003626,0.884549,0.096841,0.01861,0.900966,0.086165,0.012868,0.889123,0.108955,0.001921,0.924632,0.068568,0.0068,0
9,0.031521,0.062546,0.905933,0.025408,0.045505,0.929087,0.029732,0.055176,0.915092,0.02143,0.04626,0.93231,0.040308,0.05039,0.909301,0.026422,0.048343,0.925235,2
18,0.080162,0.364092,0.555746,0.062638,0.346,0.591362,0.082621,0.356373,0.561006,0.068766,0.352113,0.579121,0.058689,0.242998,0.698314,0.073793,0.372273,0.553934,2
20,0.912444,0.086933,0.000623,0.924855,0.074237,0.000908,0.947882,0.051346,0.000772,0.948736,0.049214,0.00205,0.9351,0.064233,0.000668,0.941298,0.058385,0.000317,0
22,0.028377,0.02989,0.941733,0.033389,0.017086,0.949525,0.044236,0.033052,0.922712,0.044807,0.022925,0.932268,0.026536,0.026836,0.946628,0.047522,0.036331,0.916147,2


In [55]:
# Soft-voting baseline: average the class probabilities of the tree models and
# score the most probable class.
X_stk = [i for i in df_stk.columns if i.split('_')[0] in ['cb1', 'xgb1', 'lgb1', 'lgb2'] and i.split('_')[1] in ['0', '1', '2']]
# Class suffix of every selected column, in the same order as X_stk. Deriving the
# group keys from X_stk itself (as the test-time blend does) replaces the former
# `df_stk.columns[1:-1]` slice, which only worked through index alignment.
X_stk_grp = [i.split('_')[1] for i in X_stk]
s_vote = df_stk[X_stk].T.groupby(X_stk_grp).mean().T.idxmax(axis=1).astype('int')
# Score against the target column carried by df_stk: it is row-aligned with the
# predictions, so the result no longer depends on df_train being sorted by index.
accuracy_score(df_stk[target], s_vote)

0.834914660602734

In [63]:
# Stacking meta-model: logistic regression configured with config2 (no df_org
# augmentation). Loading a stored run is deliberately disabled here.
lr_stk = sgml.CVModel('model', 'lr_stk', skf, config2, lr_adapter)#.load_if_exists()

In [67]:
# Meta-features: class-0 and class-1 probabilities of the tree models.
# NOTE(review): class 2 is presumably left out because the three probabilities
# sum to one - confirm.
X_stk = [
    col for col in df_stk.columns
    if col.split('_')[0] in ('cb1', 'xgb1', 'lgb1', 'lgb2') and col.split('_')[1] in ('0', '1')
]
result = lr_stk.cv(df_stk, dict(model_params={}, X_num=X_stk))
# `result` is rebound to (best CV record, mean validation score).
result = (lr_stk.cv_best_, np.mean(result['valid_scores']))
result[0]['score']

0.8345095664544318

In [68]:
# Fit every base model on the training frame; CatBoost and XGBoost get their
# GPU switch, every other model is trained with default arguments.
for mdl in models:
    if mdl.name.startswith('cb'):
        extra_kwargs = {'task_type': 'GPU'}
    elif mdl.name.startswith('xgb'):
        extra_kwargs = {'device': 'cuda'}
    else:
        extra_kwargs = {}
    mdl.train(df_train, **extra_kwargs)

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4500 [00:00<?, ?it/s]

Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

Step:   0%|          | 0/68 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

In [90]:
# Test-set predictions of every model, cached as result/<model>_test.parquet.
# An existing file is reused as-is, so delete it after retraining a model.
l = []
for mdl in models:
    cache_file = os.path.join('result', '{}_test.parquet'.format(mdl.name))
    if os.path.exists(cache_file):
        df_prd = pd.read_parquet(cache_file)
    else:
        # Prefix the prediction columns with the model name, e.g. 'cb1_0'.
        df_prd = mdl.get_predictor()(df_test).rename(
            columns=lambda c: '{}_{}'.format(mdl.name, c)
        )
        df_prd.to_parquet(cache_file)
    l.append(df_prd)

In [91]:
# Put the per-model test predictions side by side (one column block per model).
df_stk_test = pd.concat(l, axis=1)
# Leftover draft of the class-averaging step, kept for reference only:
#.T.groupby(df_stk.columns[1:-1].to_series().str.split('_', expand=True)[1]).mean().T.idxmax(axis=1).sort_index()

In [92]:
# Soft voting on the test set: average the tree models' probabilities per class
# and keep the most probable class.
X_mean = [col for col in df_stk_test.columns if col.split('_')[0] in ('cb1', 'xgb1', 'lgb1', 'lgb2')]
# X_mean only holds tree-model columns already, so no second filter is needed.
X_grp = [col.split('_')[1] for col in X_mean]
s_prd = (
    df_stk_test[X_mean].T
    .groupby(X_grp).mean().T
    .idxmax(axis=1)
    .sort_index()
    .rename(target)
)

In [93]:
# Map the predicted class codes back to the original labels and write the
# submission file with an 'id' index and a 'Target' column.
pd.Series(
    at.body_transformer.label_transformer.inverse_transform(s_prd.astype('int')),
    index=s_prd.index.rename('id'),
    name='Target',
).to_frame().to_csv('result/submission4.csv')

In [94]:
# Uploads the submission file to Kaggle; re-running this cell submits again.
!kaggle competitions submit -c playground-series-s4e6 -f result/submission4.csv -m '4'

100%|█████████████████████████████████████████| 759k/759k [00:01<00:00, 403kB/s]
Successfully submitted to Classification with an Academic Success Dataset