In [1]:
import sys, os, re

import joblib
import pandas as pd
import numpy as np

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import dproc, sgml, sgutil

# Record the version of each core library next to the results.
for lib in [pd, np, lgb, xgb, cb]:
    # Use a visible placeholder instead of a bare `except: pass`, which hid every
    # kind of error and silently dropped the module from the report.
    print(lib.__name__, getattr(lib, '__version__', '(version unknown)'))

pandas 2.2.2
numpy 1.26.4
lightgbm 4.3.0
xgboost 2.1.2
catboost 1.2.5


In [2]:
# Every data artefact lives under data_path; `files` maps a short key to its path.
data_path = 'data'
files = {
    key: os.path.join(data_path, name)
    for key, name in {
        'train': 'train.csv',
        'test': 'test.csv',
        'org': 'data.csv',
        'train_parquet': 'train.parquet',
        'org_parquet': 'org.parquet',
        'test_parquet': 'test.parquet',
        'var_pkl': 'var.pkl',
    }.items()
}

# Read the three parquet files (train, test, original dataset) in that order.
df_train, df_test, df_org = (
    pd.read_parquet(files[key])
    for key in ('train_parquet', 'test_parquet', 'org_parquet')
)

target = 'Target'
sc = sgutil.SGCache('img', 'result')

# Transformer object stored with joblib; applied identically to all three frames.
at = joblib.load(os.path.join(data_path, 'at.joblib'))
df_train = at.transform(df_train)
df_test = at.transform(df_test)
# One column name in the original dataset ends with a tab character; rename it
# to the tab-free name before transforming.
df_org = at.transform(
    df_org.rename(columns={"Daytime/evening attendance\t": "Daytime/evening attendance"})
)

# Config

In [3]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
import sgml, sgnn

# 5-fold stratified, shuffled CV with a fixed seed; passed to every CVModel below.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
# Single stratified 80/20 split; only referenced by the commented-out adhoc() calls.
ss = StratifiedShuffleSplit(train_size=0.8, n_splits=1, random_state=123)
# Class labels taken from the transformer's label encoder; used as prediction column names.
target_values = at.body_transformer.label_transformer.classes_

def get_validation_splitter(validation_frac):
    """Build a callable that holds out ``validation_frac`` of a frame.

    The returned callable takes a frame ``x`` and passes it to
    ``train_test_split`` with ``test_size=validation_frac``, stratified on
    ``x[target]`` and with ``random_state=123``. It returns whatever
    ``train_test_split`` returns.
    """
    def split(x):
        return train_test_split(
            x,
            test_size=validation_frac,
            stratify=x[target],
            random_state=123,
        )

    return split

# Settings shared by every base-model sgml.CVModel created below.
config = {
    # Class probabilities as a DataFrame indexed like the input frame.
    'predict_func': lambda m, df, X: pd.DataFrame(m.predict_proba(df[X]), index=df.index),
    # Accuracy of the highest-probability column against the target column.
    'score_func': lambda df, prd: accuracy_score(df[target], prd.idxmax(axis=1)),
    'progress_callback': sgml.ProgressCallBack(),
    'validation_splitter': get_validation_splitter, 'return_train_scores': False,
    # Appends the original dataset (df_org) to the frame it receives.
    # NOTE(review): presumably applied to the training part of each fold only -- confirm in sgml.
    'train_data_proc': lambda x: pd.concat([x, df_org], axis=0),
    'y': target
}

# Settings for the stacking model: same scoring, but the prediction columns are
# labelled with target_values and no df_org rows are appended.
config2 = {
    'predict_func': lambda m, df, X: pd.DataFrame(m.predict_proba(df[X]), index=df.index, columns=target_values),
    'score_func': lambda df, prd: accuracy_score(df[target], prd.idxmax(axis=1)),
    'progress_callback': sgml.ProgressCallBack(),
    'validation_splitter': get_validation_splitter, 'return_train_scores': False,
    'y': target
}

# One adapter per estimator family; each wraps the estimator class and is
# passed to sgml.CVModel together with a config.
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMClassifier)
xgb_adapter = sgml.XGBAdapter(xgb.XGBClassifier)
cb_adapter = sgml.CBAdapter(cb.CatBoostClassifier)
nn_adapter = sgnn.NNAdapter(sgnn.NNClassifier)
lr_adapter = sgml.SklearnAdapter(LogisticRegression)

2025-01-13 22:58:34.180153: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-13 22:58:34.200578: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# List the variable names the transformer returns for the 'num' and 'ev' keys.
at.get_vars('num') + at.get_vars('ev')

['Admission grade',
 'Age at enrollment',
 'Curricular units 1st sem (approved)',
 'Curricular units 1st sem (credited)',
 'Curricular units 1st sem (enrolled)',
 'Curricular units 1st sem (evaluations)',
 'Curricular units 1st sem (grade)',
 'Curricular units 1st sem (without evaluations)',
 'Curricular units 2nd sem (approved)',
 'Curricular units 2nd sem (credited)',
 'Curricular units 2nd sem (enrolled)',
 'Curricular units 2nd sem (evaluations)',
 'Curricular units 2nd sem (grade)',
 'Curricular units 2nd sem (without evaluations)',
 'GDP',
 'Previous qualification (grade)',
 'Unemployment rate',
 'not_approved_1st',
 'not_approved_2nd',
 'with_eval_1st',
 'with_eval_2nd']

In [5]:
# Check the upper bound of the `np_Age at enrollment` column.
df_train['np_Age at enrollment'].max()

30

# LR

In [6]:
# Logistic-regression CV model on the shared folds (skf) and config.
# NOTE(review): load_if_exists() presumably restores a saved run from 'model' -- confirm in sgml.
lr = sgml.CVModel('model', 'lr', skf, config, lr_adapter).load_if_exists()

In [7]:
# Continuous features passed to the model under 'X_mm'.
X_cont_sel = [
    'np_Age at enrollment',
    'Admission grade',
    'Previous qualification (grade)',
    'np_Curricular units 2nd sem (grade)',
    'np_Curricular units 1st sem (grade)',
    'Curricular units 2nd sem (credited)',
    'Curricular units 1st sem (credited)',
    'Curricular units 2nd sem (without evaluations)',
    'Curricular units 1st sem (without evaluations)',
]

# Features passed under 'X_ohe' (together with the ('cat', 'ord') variables).
X_cat_sel = [
    'nom__Course',
    'bool__Tuition fees up to date',
    'nom__Application mode',
    'bool__Scholarship holder',
    'bool__Gender',
    'nom__Previous qualification',
    'bool__Debtor',
    "nom__Mother's qualification",
    "nom__Father's qualification",
    "nom__Mother's occupation",
    "nom__Father's occupation",
    'ord__Application order',
    'bool__Displaced',
    'np_Curricular units 2nd sem (approved)',
    'np_Curricular units 1st sem (approved)',
    'np_Curricular units 2nd sem (enrolled)',
    'np_Curricular units 1st sem (enrolled)',
    'np_Curricular units 2nd sem (evaluations)',
    'np_Curricular units 1st sem (evaluations)',
]

# Key order is kept exactly as before: the hparams dict is stored alongside the CV result.
hparams = {
    'model_params': {'C': 1, 'max_iter': 500, 'multi_class': 'multinomial'},
    'X_tgt': ['1st_eval_grade', '2nd_eval_grade'],
    'tgt': {'random_state': 123},
    'X_ohe': X_cat_sel + at.get_vars(('cat', 'ord')),
    'X_mm': X_cont_sel,
    'X_num': [],
    'ohe': {'drop': 'first', 'handle_unknown': 'ignore'},
}
result = lr.cv(df_train, hparams)
# Mean validation accuracy over the folds.
np.mean(result['valid_scores'])

0.8281580679782252

# CB

## cb1

In [8]:
# CatBoost CV model on the shared folds (skf) and config.
cb1 = sgml.CVModel('model', 'cb1', skf, config, cb_adapter).load_if_exists()

In [9]:
# CatBoost hyper-parameters. Numeric inputs are the 'num' and ('cat', 'pt')
# variables; categorical inputs are the nom / bool / ord variables, in that order.
hparams = {
    'model_params': {'n_estimators': 2500, 'learning_rate': 0.04, 'random_state': 123},
    'X_num': at.get_vars('num') + at.get_vars(('cat', 'pt')),
    'X_cat': [
        name
        for kind in ('nom', 'bool', 'ord')
        for name in at.get_vars(('cat', kind))
    ],
    # Optional, currently disabled: 'validation_fraction': 0.1
}
# Single hold-out alternative, currently disabled: cb1.adhoc(df_train, ss, hparams)
result = cb1.cv(df_train, hparams, task_type='GPU')

In [10]:
# Mean validation accuracy of cb1 over the CV folds.
np.mean(result['valid_scores'])

0.8337514887886407

# LGB

## lgb1

In [11]:
# LightGBM CV model (slow learning rate variant) on the shared folds and config.
lgb1 = sgml.CVModel('model', 'lgb1', skf, config, lgb_adapter).load_if_exists()

In [12]:
# LightGBM hyper-parameters: many trees with a small learning rate and strong
# column subsampling. Feature lists are built exactly as for the other GBDT models.
hparams = {
    'model_params': {'n_estimators': 4000, 'learning_rate': 0.007, 'colsample_bytree': 0.25},
    # Optional, currently disabled: 'validation_fraction': 0.1
    'X_num': at.get_vars('num') + at.get_vars(('cat', 'pt')),
    'X_cat': [
        name
        for kind in ('nom', 'bool', 'ord')
        for name in at.get_vars(('cat', kind))
    ],
}
# Single hold-out alternative, currently disabled: lgb1.adhoc(df_train, ss, hparams)
result = lgb1.cv(df_train, hparams)

In [13]:
# Mean validation accuracy of lgb1 over the CV folds.
np.mean(result['valid_scores'])

0.8344179991425342

## lgb2

In [14]:
# Second LightGBM CV model (smaller trees, faster learning rate) on the shared folds and config.
lgb2 = sgml.CVModel('model', 'lgb2', skf, config, lgb_adapter).load_if_exists()

In [16]:
# Same features as lgb1, but with num_leaves capped at 15 and a larger learning rate.
hparams = {
    'model_params': {
        'n_estimators': 4000,
        'num_leaves': 15,
        'learning_rate': 0.02,
        'colsample_bytree': 0.25,
    },
    # Optional, currently disabled: 'validation_fraction': 0.1
    'X_num': at.get_vars('num') + at.get_vars(('cat', 'pt')),
    'X_cat': [
        name
        for kind in ('nom', 'bool', 'ord')
        for name in at.get_vars(('cat', kind))
    ],
}
# Single hold-out alternative, currently disabled: lgb2.adhoc(df_train, ss, hparams)
result = lgb2.cv(df_train, hparams)

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

In [220]:
# Mean validation accuracy of lgb2 over the CV folds.
np.mean(result['valid_scores'])

0.8339083489637567

# XGB

## xgb1

In [17]:
# XGBoost CV model on the shared folds (skf) and config.
xgb1 = sgml.CVModel('model', 'xgb1', skf, config, xgb_adapter).load_if_exists()

In [18]:
# XGBoost hyper-parameters; feature lists are built exactly as for the other GBDT models.
hparams = {
    'model_params': {'n_estimators': 4500, 'learning_rate': 0.007, 'colsample_bytree': 0.25},
    'X_num': at.get_vars('num') + at.get_vars(('cat', 'pt')),
    'X_cat': [
        name
        for kind in ('nom', 'bool', 'ord')
        for name in at.get_vars(('cat', kind))
    ],
    # Optional, currently disabled: 'validation_fraction': 0.1
}
# Single hold-out alternative, currently disabled: xgb1.adhoc(df_train, ss, hparams, device='cuda')
result = xgb1.cv(df_train, hparams, device='cuda')

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/4500 [00:00<?, ?it/s]

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Round:   0%|          | 0/4500 [00:00<?, ?it/s]

Round:   0%|          | 0/4500 [00:00<?, ?it/s]

Round:   0%|          | 0/4500 [00:00<?, ?it/s]

Round:   0%|          | 0/4500 [00:00<?, ?it/s]

In [19]:
# Mean validation accuracy of xgb1 over the CV folds.
np.mean(result['valid_scores'])

0.8346793815288379

# NN

## NN1

In [20]:
# Neural-network CV model on the shared folds (skf) and config.
nn1 = sgml.CVModel('model', 'nn1', skf, config, nn_adapter).load_if_exists()

In [21]:
# (categorical column, embedding size) pairs for the columns that get an embedding.
emb_config = [
    ('nom__Course', 3),
    ('nom__Application mode', 2),
    ('nom__Previous qualification', 2),
    ("nom__Mother's qualification", 2),
    ("nom__Father's qualification", 2),
    ("nom__Mother's occupation", 2),
    ("nom__Father's occupation", 2),
    ('nom__Marital status', 2),
]
X_cat = [col for col, _size in emb_config]
# One 5-tuple per embedded column: the 2nd item is the number of categories in
# df_train, the 3rd is the embedding size.
# NOTE(review): the meaning of the constant 1, 0, 0 entries is defined by sgnn -- confirm there.
embedding = [
    (1, len(df_train[col].cat.categories), size, 0, 0)
    for col, size in emb_config
]

hparams = {
    'model_params': {
        'model_params': {
            # A single hidden layer of 32 units with ReLU and batch norm.
            'config': [
                {'unit': 32, 'activation': 'relu', 'batch_norm': True},
            ],
            'embedding': embedding,
        },
        'batch_size': 1024,
        'shuffle_size': 204800,
        'epochs': 50,
        'optimizer': ('Adam', {'learning_rate': 0.0002}),
    },
    'validation_fraction': 0.1,
    'X_std': at.get_vars('np'),
    'X_mm': ['ord__Application order'],
    'X_num': at.get_vars(('cat', 'bool')),
    'X_cat': [col for col, _size in emb_config],
}
result = nn1.cv(df_train, hparams)
# Per-fold validation accuracies and their mean.
result['valid_scores'], np.mean(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

2025-01-13 23:04:25.303689: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-01-13 23:04:25.400844: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

Step:   0%|          | 0/58 [00:00<?, ?it/s]

Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

Step:   0%|          | 0/58 [00:00<?, ?it/s]

Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

Step:   0%|          | 0/58 [00:00<?, ?it/s]

Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

Step:   0%|          | 0/58 [00:00<?, ?it/s]

Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

Step:   0%|          | 0/58 [00:00<?, ?it/s]

([0.8269079979090433,
  0.8260585467851542,
  0.8218112911657083,
  0.8233026204012285,
  0.8215382604718029],
 0.8239237433465874)

# Stacking

In [22]:
# All base models, in the order their predictions are stacked.
models = [lr, cb1, lgb1, xgb1, nn1, lgb2]
# One-row table: mean CV accuracy of the best run of each model, keyed by model name.
pd.Series(
    {m.name: np.mean(m.cv_best_['score']) for m in models},
    name='accuracy',
).to_frame().T

Unnamed: 0,lr,cb1,lgb1,xgb1,nn1,lgb2
accuracy,0.828158,0.833751,0.834418,0.834679,0.823924,0.834144


In [222]:
# Gather every model's CV predictions into one frame (columns `<model>_<class>`)
# together with the target column.
df_stk = sgml.stack_cv(models, df_train[target])
df_stk.head()

Unnamed: 0,lr_Dropout,lr_Enrolled,lr_Graduate,cb1_Dropout,cb1_Enrolled,cb1_Graduate,lgb1_Dropout,lgb1_Enrolled,lgb1_Graduate,xgb1_Dropout,xgb1_Enrolled,xgb1_Graduate,nn1_Dropout,nn1_Enrolled,nn1_Graduate,lgb2_Dropout,lgb2_Enrolled,lgb2_Graduate,Target
1,0.852738,0.131012,0.01625,0.920081,0.076,0.003919,0.914037,0.080037,0.005927,0.905539,0.083846,0.010615,0.878178,0.109647,0.012175,0.942122,0.054044,0.003835,Dropout
9,0.031004,0.061109,0.907887,0.02169,0.045688,0.932622,0.017696,0.046013,0.936291,0.024608,0.051748,0.923644,0.02289,0.04161,0.935501,0.023254,0.05346,0.923286,Graduate
18,0.080756,0.364454,0.55479,0.056511,0.368815,0.574674,0.070207,0.360766,0.569026,0.071212,0.371091,0.557697,0.085262,0.398376,0.516362,0.084074,0.335101,0.580825,Graduate
20,0.911161,0.08808,0.00076,0.929785,0.069459,0.000757,0.945325,0.053189,0.001487,0.952722,0.045227,0.002051,0.936895,0.062015,0.00109,0.933445,0.065531,0.001024,Dropout
22,0.029186,0.029244,0.941571,0.04306,0.017175,0.939765,0.040251,0.020538,0.939211,0.048757,0.023055,0.928187,0.028856,0.026004,0.945141,0.041246,0.019703,0.93905,Graduate


In [226]:
# Simple blend: average the class probabilities of the four GBDT models and
# score the arg-max class.
X_stk = [
    i for i in df_stk.columns
    if i.split('_')[0] in ['cb1', 'xgb1', 'lgb1', 'lgb2']
    and i.split('_')[1] in ['Dropout', 'Enrolled', 'Graduate']
]
# Group key = class suffix of each selected column, derived from X_stk itself.
# The previous key was built from df_stk.columns[1:-1], an unrelated slice that
# only worked through index alignment and would break if the column layout changed.
X_stk_grp = [i.split('_')[1] for i in X_stk]
s_blend = df_stk[X_stk].T.groupby(X_stk_grp).mean().T.idxmax(axis=1)
# Score against the target column of the same frame so truth and prediction are
# row-aligned by construction. accuracy_score compares positionally, so the old
# df_train[target] vs. sort_index() pairing silently relied on df_train's row order.
accuracy_score(df_stk[target], s_blend)

0.8348754541415092

In [227]:
# Logistic-regression meta-model over the stacked predictions (uses config2).
# Created the same way as every other CVModel in this notebook -- constructor
# followed by load_if_exists() -- instead of the one-off CVModel.load_or_create()
# call left over from an earlier kernel session.
lr_stk = sgml.CVModel('model', 'lr_stk', skf, config2, lr_adapter).load_if_exists()

In [228]:
# Confirm the selected stacking features contain no missing values.
df_stk[X_stk].isna().sum()

cb1_Dropout      0
cb1_Enrolled     0
cb1_Graduate     0
lgb1_Dropout     0
lgb1_Enrolled    0
lgb1_Graduate    0
xgb1_Dropout     0
xgb1_Enrolled    0
xgb1_Graduate    0
lgb2_Dropout     0
lgb2_Enrolled    0
lgb2_Graduate    0
dtype: int64

In [229]:
# Stacking features: the Dropout and Enrolled probabilities of the four GBDT
# models. The Graduate columns are left out of this selection.
X_stk = []
for col in df_stk.columns:
    parts = col.split('_')
    # Short-circuit keeps columns without an underscore (e.g. the target) safe.
    if parts[0] in ['cb1', 'xgb1', 'lgb1', 'lgb2'] and parts[1] in ['Dropout', 'Enrolled']:
        X_stk.append(col)

result = lr_stk.cv(df_stk, {'model_params': {}, 'X_num': X_stk})
# Best CV record of the stacker and its mean validation accuracy.
lr_stk.cv_best_, np.mean(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

({'score': 0.834235117297262,
  'hparams': {'model_params': {},
   'X_num': ['cb1_Dropout',
    'cb1_Enrolled',
    'lgb1_Dropout',
    'lgb1_Enrolled',
    'xgb1_Dropout',
    'xgb1_Enrolled',
    'lgb2_Dropout',
    'lgb2_Enrolled']},
  'prd':         Dropout  Enrolled  Graduate
  1      0.939793  0.046796  0.013411
  18     0.083592  0.310807  0.605600
  45     0.961008  0.029506  0.009486
  56     0.022009  0.044470  0.933521
  116    0.101482  0.564244  0.334274
  ...         ...       ...       ...
  76463  0.935114  0.051563  0.013323
  76471  0.110596  0.623268  0.266137
  76481  0.017602  0.043357  0.939041
  76507  0.134046  0.817625  0.048329
  76515  0.050234  0.169951  0.779815
  
  [76518 rows x 3 columns],
  'k': "{'model_params': {}, 'X_num': ['cb1_Dropout', 'cb1_Enrolled', 'lgb1_Dropout', 'lgb1_Enrolled', 'xgb1_Dropout', 'xgb1_Enrolled', 'lgb2_Dropout', 'lgb2_Enrolled']}"},
 0.834235117297262)

In [230]:
# Fit the stacking model on the full stacked frame.
# NOTE(review): train() presumably reuses the hparams of the best CV run -- confirm in sgml.
lr_stk.train(df_stk)

{'variables': ['lgb2_Enrolled',
  'lgb1_Enrolled',
  'cb1_Enrolled',
  'xgb1_Enrolled',
  'xgb1_Dropout',
  'cb1_Dropout',
  'lgb1_Dropout',
  'lgb2_Dropout'],
 'train_shape': (76518, 8),
 'target': 'Target',
 'target_func': None}

In [231]:
# Extra keyword arguments for train(), selected by model-name prefix (first match
# wins); models matching no prefix are trained with no extra arguments.
train_kwargs_by_prefix = [
    ('cb', {'task_type': 'GPU'}),
    ('xgb', {'device': 'cuda'}),
]
for model in models:
    extra_kwargs = next(
        (kwargs for prefix, kwargs in train_kwargs_by_prefix if model.name.startswith(prefix)),
        {},
    )
    model.train(df_train, **extra_kwargs)

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

In [232]:
# Rebuild the test features with the same transformer (`at`) that produced
# df_train, so the trained models receive the columns they were fitted on.
# The previous version called `pd_vars.procs_all(...)`, but `pd_vars` is never
# defined or imported in this notebook, so the cell failed with NameError on a
# fresh kernel (Restart & Run All).
df_test = at.transform(pd.read_parquet(files['test_parquet']))
# NOTE(review): the submission written later uses this frame's index as the id
# column -- confirm at.transform() leaves the frame indexed by `id`.

In [233]:
# Per-model test predictions, cached as parquet under result/. An existing cache
# file is reused as-is; otherwise the model predicts and the result is written.
l = []
for model in models:
    cache_path = os.path.join('result', '{}_test.parquet'.format(model.name))
    if os.path.exists(cache_path):
        l.append(pd.read_parquet(cache_path))
        continue
    # Prefix every prediction column with the model name, e.g. `cb1_Dropout`.
    prd = model.get_predictor()(df_test).rename(columns=lambda c: model.name + '_' + c)
    l.append(prd)
    prd.to_parquet(cache_path)

In [234]:
# Place all per-model test predictions side by side (one column block per model).
df_stk_test = pd.concat(l, axis=1)
# Leftover from an earlier draft (per-class averaging is done in the next cell):
#.T.groupby(df_stk.columns[1:-1].to_series().str.split('_', expand=True)[1]).mean().T.idxmax(axis=1).sort_index()

In [235]:
# Blend on the test set: average the class probabilities of the four GBDT
# models and take the most probable class per row.
X_mean, X_grp = [], []
for col in df_stk_test.columns:
    parts = col.split('_')
    if parts[0] in ['cb1', 'xgb1', 'lgb1', 'lgb2']:
        X_mean.append(col)      # prediction column to average
        X_grp.append(parts[1])  # its class suffix, used as the group key

class_mean = df_stk_test[X_mean].T.groupby(X_grp).mean().T
s_prd = class_mean.idxmax(axis=1).sort_index().rename(target)

In [236]:
# Write the blended predictions (index plus the target column) as the submission file.
s_prd.to_frame().to_csv('result/submission3.csv')

In [237]:
#!kaggle competitions submit -c playground-series-s4e6 -f result/submission3.csv -m '3'

100%|█████████████████████████████████████████| 760k/760k [00:01<00:00, 400kB/s]
Successfully submitted to Classification with an Academic Success Dataset