In [1]:
import sys, os, re

import pandas as pd
import numpy as np

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import dproc, sgml, sgutil

# Print the version of each core library so the environment of this run is
# recorded next to its results.
for lib in (pd, np, lgb, xgb, cb):
    # Best-effort: a module without ``__version__`` is skipped silently, but
    # unlike a bare ``except`` nothing else (e.g. KeyboardInterrupt) is hidden.
    lib_version = getattr(lib, '__version__', None)
    if lib_version is not None:
        print(lib.__name__, lib_version)

pandas 2.2.2
numpy 1.26.4
lightgbm 4.3.0
xgboost 2.1.2
catboost 1.2.5


In [2]:
data_path = 'data'

# Logical name -> file name; every file sits directly under ``data_path``.
_file_names = {
    'train': 'train.csv',
    'test': 'test.csv',
    'org': 'data.csv',
    'train_parquet': 'train.parquet',
    'org_parquet': 'org.parquet',
    'test_parquet': 'test.parquet',
    'var_pkl': 'var.pkl',
}
files = {key: os.path.join(data_path, fname) for key, fname in _file_names.items()}

df_train = pd.read_parquet(files['train_parquet'])
df_test = pd.read_parquet(files['test_parquet'])

target = 'Target'
sc = sgutil.SGCache('img', 'result')

# Variable definitions / derived-feature processors saved under data/vars.
pd_vars = dproc.PD_Vars.load(os.path.join(data_path, 'vars'))

# procs_all returns a pair; the second element is kept as ``failed``.
# NOTE(review): df_train is replaced by the first element directly, whereas
# df_org below goes through dproc.join_and_assign -- confirm this is intended.
df_train, failed = pd_vars.procs_all(df_train)

# Original dataset: same processors, then joined back onto the raw frame.
df_org = pd.read_parquet(files['org_parquet'])
org_processed = pd_vars.procs_all(df_org)[0]
df_org = dproc.join_and_assign(df_org, org_processed)
df_org['Application order_C'] = df_org['Application order_C'].astype('int')

[]

# Config

In [196]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
import sgml, sgnn

# One seed shared by both splitters.
_cv_seed = 123

# 5-fold stratified, shuffled cross-validation.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=_cv_seed)

# Single stratified split with 80% of the rows in the training part.
ss = StratifiedShuffleSplit(n_splits=1, train_size=0.8, random_state=_cv_seed)

# Sorted distinct target labels; used as column names of predicted probabilities.
_observed_labels = df_train[target].unique()
target_values = np.sort(_observed_labels)

def get_validation_splitter(validation_frac):
    """Return a function that splits a frame into train / validation parts.

    Args:
        validation_frac: passed to ``train_test_split`` as ``test_size``.

    Returns:
        A one-argument callable. Given a DataFrame it returns the result of
        ``train_test_split`` on that frame, stratified by the ``target``
        column and seeded with ``random_state=123``.
    """
    def _split(frame):
        return train_test_split(
            frame,
            test_size=validation_frac,
            stratify=frame[target],
            random_state=123,
        )

    return _split

def _make_config(train_data_proc=None):
    """Build one CVModel configuration dict.

    Args:
        train_data_proc: optional callable stored under 'train_data_proc';
            the key is omitted entirely when this is None.

    Returns:
        A fresh dict (with its own ProgressCallBack instance) holding the
        prediction function, the accuracy-based score function, the
        validation splitter factory and the target column name.
    """
    cfg = {
        # Class probabilities as a frame: rows follow df, columns are target_values.
        'predict_func': lambda m, df, X: pd.DataFrame(
            m.predict_proba(df[X]), index=df.index, columns=target_values
        ),
        # Accuracy of the most probable class against the true target.
        'score_func': lambda df, prd: accuracy_score(df[target], prd.idxmax(axis=1)),
        'progress_callback': sgml.ProgressCallBack(),
        'validation_splitter': get_validation_splitter,
        'return_train_scores': False,
    }
    if train_data_proc is not None:
        cfg['train_data_proc'] = train_data_proc
    cfg['y'] = target
    return cfg


# Base-model configuration: the original dataset (df_org) is appended to the
# training data handed to 'train_data_proc'.
config = _make_config(train_data_proc=lambda x: pd.concat([x, df_org], axis=0))

# Same configuration without 'train_data_proc'.
config2 = _make_config()

def _encode_target(_, labels):
    """Map each target label to its position in ``target_values``."""
    label_to_code = dict(zip(target_values, range(len(target_values))))
    return labels.map(label_to_code)


# One adapter per model family; only the XGBoost adapter is given a target
# encoder (label -> integer code).
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMClassifier)
xgb_adapter = sgml.XGBAdapter(xgb.XGBClassifier, _encode_target)
cb_adapter = sgml.CBAdapter(cb.CatBoostClassifier)
nn_adapter = sgnn.NNAdapter(sgnn.NNClassifier)
lr_adapter = sgml.SklearnAdapter(LogisticRegression)

In [4]:
# Per-semester curricular-unit measures; the same six exist for both semesters.
_sem_measures = [
    'approved', 'credited', 'enrolled', 'evaluations', 'grade', 'without evaluations',
]

# Numeric features: three applicant fields, the 1st- then 2nd-semester measures,
# and the remaining numeric columns.
X_num = (
    ['Admission grade', 'Age at enrollment', 'Application order']
    + [
        'Curricular units {} sem ({})'.format(sem, measure)
        for sem in ('1st', '2nd')
        for measure in _sem_measures
    ]
    + [
        'GDP',
        'Inflation rate',
        'Previous qualification (grade)',
        'Unemployment rate',
        'not_approved_1st',
        'not_approved_2nd',
    ]
)

# Nominal (unordered categorical) features.
X_nom = [
    'Application mode',
    'Course',
    "Father's occupation",
    "Father's qualification",
    "Mother's occupation",
    "Mother's qualification",
    'Marital status',
    'Nacionality',
    'Previous qualification',
]

# Binary features.
X_bool = [
    'Daytime/evening attendance',
    'Debtor',
    'Displaced',
    'Educational special needs',
    'Gender',
    'International',
    'Scholarship holder',
    'Tuition fees up to date',
]

In [5]:
# Names of the variables whose ``src`` is 'cat_procs', in sorted order.
_cat_proc_vars = pd_vars.df_var.query("src == 'cat_procs'")
X_cat_r = _cat_proc_vars.index.sort_values()
display(X_cat_r)
# Downstream cells concatenate this with plain lists, so keep it as a list.
X_cat_r = X_cat_r.tolist()

Index(['Application mode_R', 'Application order_C', 'Course_R',
       'Father's occupation_R', 'Father's qualification_R', 'Marital status_R',
       'Mother's occupation_R', 'Mother's qualification_R', 'Nacionality_R',
       'Previous qualification_R'],
      dtype='object')

# LR

In [152]:
# CV wrapper for logistic regression, registered as 'lr' under 'model'
# (presumably restored from disk when it already exists -- confirm in sgml).
lr = sgml.CVModel.load_or_create('model', 'lr', skf, config, lr_adapter)

In [153]:
# Continuous inputs of the logistic regression (passed as 'X_mm').
X_cont_sel = [
    'Age at enrollment_C',
    'Admission grade',
    'Previous qualification (grade)',
    'Curricular units 2nd sem (grade)_C',
    'Curricular units 1st sem (grade)_C',
    'Curricular units 2nd sem (credited)',
    'Curricular units 1st sem (credited)',
    'Curricular units 2nd sem (without evaluations)',
    'Curricular units 1st sem (without evaluations)',
]

# Categorical inputs of the logistic regression (passed as 'X_ohe').
X_cat_sel = [
    'Course_R',
    'Tuition fees up to date',
    'Application mode_R',
    'Scholarship holder',
    'Gender',
    'Previous qualification_R',
    'Debtor',
    "Mother's qualification_R",
    "Father's qualification_R",
    "Mother's occupation_R",
    "Father's occupation_R",
    'Application order_C',
    'Displaced',
    'Curricular units 2nd sem (approved)_C',
    'Curricular units 1st sem (approved)_C',
    'Curricular units 2nd sem (enrolled)_C',
    'Curricular units 1st sem (enrolled)_C',
    'Curricular units 2nd sem (evaluations)_C',
    'Curricular units 1st sem (evaluations)_C',
]

# Key order is kept exactly as before: the string form of the hyper-parameter
# dict is stored with the CV result (see the 'k' entry of cv_best_).
_lr_params = dict(C=1, max_iter=500, multi_class='multinomial')
hparams = dict(
    model_params=_lr_params,
    X_tgt=['2nd_eval_grade_C', '1st_eval_grade_C'],
    tgt=dict(random_state=123),
    X_ohe=X_cat_sel,
    X_mm=X_cont_sel,
    X_num=[],
    ohe=dict(drop='first', handle_unknown='ignore'),
)
X_num_sel = []

# NOTE(review): this is the only cv() call that passes the adapter as a third
# positional argument -- confirm against the CVModel.cv signature.
result = lr.cv(df_train, hparams, lr_adapter)

# Mean validation accuracy over the folds.
np.mean(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

0.828144994375507

# CB

## cb1

In [155]:
# CV wrapper for CatBoost, registered as 'cb1' under 'model'
# (presumably restored from disk when it already exists -- confirm in sgml).
cb1 = sgml.CVModel.load_or_create('model', 'cb1', skf, config, cb_adapter)

In [166]:
# CatBoost settings; categorical inputs are all binary flags except the first
# one in X_bool, plus the processed categorical columns.
_cb1_params = dict(n_estimators=2500, learning_rate=0.04, random_state=123)
hparams = {
    'model_params': _cb1_params,
    'X_num': X_num,
    'X_cat': X_bool[1:] + X_cat_r,
    # optional: 'validation_fraction': 0.1,
}
# Quick single-split alternative: cb1.adhoc(df_train, ss, hparams)
result = cb1.cv(df_train, hparams, task_type='GPU')

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

In [167]:
# Mean validation score of the run stored in ``result`` by the previous cell.
np.mean(result['valid_scores'])

0.8334901243359483

# LGB

## lgb1

In [169]:
# CV wrapper for LightGBM, registered as 'lgb1' under 'model'
# (presumably restored from disk when it already exists -- confirm in sgml).
lgb1 = sgml.CVModel.load_or_create('model', 'lgb1', skf, config, lgb_adapter)

In [171]:
# LightGBM with a small learning rate and many rounds; each tree sees 25% of
# the columns.
_lgb1_params = dict(n_estimators=4000, learning_rate=0.007, colsample_bytree=0.25)
hparams = {
    'model_params': _lgb1_params,
    # optional: 'validation_fraction': 0.1,
    'X_num': X_num,
    'X_cat': X_bool[1:] + X_cat_r,
}
# Quick single-split alternative: lgb1.adhoc(df_train, ss, hparams)
result = lgb1.cv(df_train, hparams)

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

In [172]:
# Mean validation score of the run stored in ``result`` by the previous cell.
np.mean(result['valid_scores'])

0.8343396241367828

## lgb2

In [208]:
# Second LightGBM CV wrapper, registered as 'lgb2' under 'model'
# (presumably restored from disk when it already exists -- confirm in sgml).
lgb2 = sgml.CVModel.load_or_create('model', 'lgb2', skf, config, lgb_adapter)

In [219]:
# Variant of lgb1: explicit num_leaves=15 and a larger learning rate.
_lgb2_params = dict(
    n_estimators=4000,
    num_leaves=15,
    learning_rate=0.02,
    colsample_bytree=0.25,
)
hparams = {
    'model_params': _lgb2_params,
    # optional: 'validation_fraction': 0.1,
    'X_num': X_num,
    'X_cat': X_bool[1:] + X_cat_r,
}
# Quick single-split alternative: lgb2.adhoc(df_train, ss, hparams)
result = lgb2.cv(df_train, hparams)

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

In [220]:
# Mean validation score of the run stored in ``result`` by the previous cell.
np.mean(result['valid_scores'])

0.8339083489637567

# XGB

## xgb1

In [173]:
# CV wrapper for XGBoost, registered as 'xgb1' under 'model'
# (presumably restored from disk when it already exists -- confirm in sgml).
xgb1 = sgml.CVModel.load_or_create('model', 'xgb1', skf, config, xgb_adapter)

In [174]:
# XGBoost settings.
# NOTE(review): this model uses the full X_bool list, while cb1/lgb1/lgb2 use
# X_bool[1:] -- confirm the difference is deliberate.
_xgb1_params = dict(n_estimators=4500, learning_rate=0.007, colsample_bytree=0.25)
hparams = {
    'model_params': _xgb1_params,
    'X_num': X_num,
    'X_cat': X_bool + X_cat_r,
    # optional: 'validation_fraction': 0.1,
}
# Quick single-split alternative: xgb1.adhoc(df_train, ss, hparams, device='cuda')
result = xgb1.cv(df_train, hparams, device='cuda')

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/4500 [00:00<?, ?it/s]

Round:   0%|          | 0/4500 [00:00<?, ?it/s]

Round:   0%|          | 0/4500 [00:00<?, ?it/s]

Round:   0%|          | 0/4500 [00:00<?, ?it/s]

Round:   0%|          | 0/4500 [00:00<?, ?it/s]

In [175]:
# Mean validation score of the run stored in ``result`` by the previous cell.
np.mean(result['valid_scores'])

0.8339998155058377

# NN

## NN1

In [179]:
# CV wrapper for the neural network, registered as 'nn1' under 'model'
# (presumably restored from disk when it already exists -- confirm in sgml).
nn1 = sgml.CVModel.load_or_create('model', 'nn1', skf, config, nn_adapter)

In [185]:
# (categorical column, embedding width) pairs.
emb_config = [
    ('Course_R', 3),
    ('Application mode_R', 2),
    ('Previous qualification_R', 2),
    ("Mother's qualification_R", 2),
    ("Father's qualification_R", 2),
    ("Mother's occupation_R", 2),
    ("Father's occupation_R", 2),
    ('Marital status_R', 2),
]

# '_C' columns that are fed to the network through 'X_std'.
X_clip = [
    'Curricular units 2nd sem (approved)_C',
    'Curricular units 1st sem (approved)_C',
    'Curricular units 2nd sem (enrolled)_C',
    'Curricular units 1st sem (enrolled)_C',
    'Curricular units 2nd sem (evaluations)_C',
    'Curricular units 1st sem (evaluations)_C',
    'Curricular units 2nd sem (grade)_C',
    'Curricular units 1st sem (grade)_C',
    'Age at enrollment_C',
    'Curricular units 1st sem (without evaluations)_C',
    'Curricular units 2nd sem (without evaluations)_C',
    'Curricular units 2nd sem (credited)_C',
    'Curricular units 1st sem (credited)_C',
]

X_cat = [col for col, _ in emb_config]

# One tuple per embedded column: the category count is read from the column's
# categorical dtype, the width comes from emb_config.
embedding = [
    (1, len(df_train[col].cat.categories), width, 0, 0)
    for col, width in emb_config
]

# Network definition: a single 32-unit ReLU layer on top of the embeddings.
_nn_arch = {
    'config': [
        {'unit': 32, 'activation': 'relu', 'batch_norm': False},
    ],
    'embedding': embedding,
}
_nn_fit = {
    'model_params': _nn_arch,
    'batch_size': 2048,
    'shuffle_size': 204800,
    'epochs': 80,
    'optimizer': ('Adam', {'learning_rate': 0.0005}),
}

# The approved / enrolled / evaluations counts (first six X_clip entries) are
# additionally one-hot encoded together with 'Application order_C'.
_X_ohe = ['Application order_C'] + X_clip[:6]

hparams = {
    'model_params': _nn_fit,
    'X_tgt': ['2nd_eval_grade_C', '1st_eval_grade_C'],
    'tgt': {'random_state': 123},
    # optional: 'validation_fraction': 0.1,
    'X_std': X_clip + ['Previous qualification (grade)', 'Admission grade', 'Unemployment rate', 'GDP'],
    'X_mm': ['Application order_C'],
    'X_num': X_bool[1:],
    'X_ohe': _X_ohe,
    'X_cat': X_cat,
}
result = nn1.cv(df_train, hparams)

# Per-fold validation scores and their mean.
result['valid_scores'], np.mean(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch:   0%|          | 0/80 [00:00<?, ?it/s]

Step:   0%|          | 0/33 [00:00<?, ?it/s]

Epoch:   0%|          | 0/80 [00:00<?, ?it/s]

Step:   0%|          | 0/33 [00:00<?, ?it/s]

Epoch:   0%|          | 0/80 [00:00<?, ?it/s]

Step:   0%|          | 0/33 [00:00<?, ?it/s]

Epoch:   0%|          | 0/80 [00:00<?, ?it/s]

Step:   0%|          | 0/33 [00:00<?, ?it/s]

Epoch:   0%|          | 0/80 [00:00<?, ?it/s]

Step:   0%|          | 0/33 [00:00<?, ?it/s]

([0.8305671719811814,
  0.8297177208572922,
  0.8289336121275483,
  0.8243481670260733,
  0.8300986734627197],
 0.8287330690909631)

# Stacking

In [221]:
# Base models taking part in the stacking / blending experiments.
models = [lr, cb1, lgb1, xgb1, nn1, lgb2]

# One-row table: best CV accuracy of every base model, keyed by model name.
_best_scores = {mdl.name: np.mean(mdl.cv_best_['score']) for mdl in models}
pd.Series(_best_scores, name='accuracy').to_frame().T

Unnamed: 0,lr,cb1,lgb1,xgb1,nn1,lgb2
accuracy,0.828145,0.83349,0.83434,0.834,0.828733,0.833908


In [222]:
# Gather the models' cross-validation predictions into one frame; as the
# displayed head shows, columns are named '<model>_<class>' and the last
# column is the target.
df_stk = sgml.stack_cv(models, df_train[target])
df_stk.head()

Unnamed: 0,lr_Dropout,lr_Enrolled,lr_Graduate,cb1_Dropout,cb1_Enrolled,cb1_Graduate,lgb1_Dropout,lgb1_Enrolled,lgb1_Graduate,xgb1_Dropout,xgb1_Enrolled,xgb1_Graduate,nn1_Dropout,nn1_Enrolled,nn1_Graduate,lgb2_Dropout,lgb2_Enrolled,lgb2_Graduate,Target
1,0.852738,0.131012,0.01625,0.920081,0.076,0.003919,0.914037,0.080037,0.005927,0.905539,0.083846,0.010615,0.878178,0.109647,0.012175,0.942122,0.054044,0.003835,Dropout
9,0.031004,0.061109,0.907887,0.02169,0.045688,0.932622,0.017696,0.046013,0.936291,0.024608,0.051748,0.923644,0.02289,0.04161,0.935501,0.023254,0.05346,0.923286,Graduate
18,0.080756,0.364454,0.55479,0.056511,0.368815,0.574674,0.070207,0.360766,0.569026,0.071212,0.371091,0.557697,0.085262,0.398376,0.516362,0.084074,0.335101,0.580825,Graduate
20,0.911161,0.08808,0.00076,0.929785,0.069459,0.000757,0.945325,0.053189,0.001487,0.952722,0.045227,0.002051,0.936895,0.062015,0.00109,0.933445,0.065531,0.001024,Dropout
22,0.029186,0.029244,0.941571,0.04306,0.017175,0.939765,0.040251,0.020538,0.939211,0.048757,0.023055,0.928187,0.028856,0.026004,0.945141,0.041246,0.019703,0.93905,Graduate


In [226]:
# Accuracy of a plain average of the tree models' out-of-fold probabilities.
_blend_models = ['cb1', 'xgb1', 'lgb1', 'lgb2']
_classes = ['Dropout', 'Enrolled', 'Graduate']

# '<model>_<class>' probability columns of the selected models. The first
# test short-circuits for 'Target', so the [1] lookup is never reached there.
X_stk = [
    col for col in df_stk.columns
    if col.split('_')[0] in _blend_models and col.split('_')[1] in _classes
]

# Class label of every selected column, in the same order as X_stk. Deriving
# the group key from X_stk itself (instead of slicing df_stk.columns[1:-1])
# keeps it correct whatever the column layout of df_stk is, and matches how
# the test-set blend is computed further down.
_stk_class = [col.split('_')[1] for col in X_stk]
_blend_prd = df_stk[X_stk].T.groupby(_stk_class).mean().T.idxmax(axis=1)

# Score against the target stored in df_stk: prediction and label then come
# from the same rows, so no assumption about df_train's row order is needed.
accuracy_score(df_stk[target], _blend_prd)

0.8348754541415092

In [227]:
# Stacking meta-model (logistic regression). It uses config2, i.e. the
# configuration without the 'train_data_proc' entry.
lr_stk = sgml.CVModel.load_or_create('model', 'lr_stk', skf, config2, lr_adapter)

In [228]:
# Sanity check: number of missing values in each selected stacking column.
df_stk[X_stk].isna().sum()

cb1_Dropout      0
cb1_Enrolled     0
cb1_Graduate     0
lgb1_Dropout     0
lgb1_Enrolled    0
lgb1_Graduate    0
xgb1_Dropout     0
xgb1_Enrolled    0
xgb1_Graduate    0
lgb2_Dropout     0
lgb2_Enrolled    0
lgb2_Graduate    0
dtype: int64

In [229]:
def _is_stack_input(col):
    """True for the 'Dropout' / 'Enrolled' columns of the tree models."""
    parts = col.split('_')
    return parts[0] in ('cb1', 'xgb1', 'lgb1', 'lgb2') and parts[1] in ('Dropout', 'Enrolled')


# Meta-model inputs: only two of the three class probabilities per model.
X_stk = [col for col in df_stk.columns if _is_stack_input(col)]

_stk_hparams = {
    'model_params': {}, 'X_num': X_stk
}
result = lr_stk.cv(df_stk, _stk_hparams)

# Best stored CV entry and the mean validation accuracy of this run.
lr_stk.cv_best_, np.mean(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

({'score': 0.834235117297262,
  'hparams': {'model_params': {},
   'X_num': ['cb1_Dropout',
    'cb1_Enrolled',
    'lgb1_Dropout',
    'lgb1_Enrolled',
    'xgb1_Dropout',
    'xgb1_Enrolled',
    'lgb2_Dropout',
    'lgb2_Enrolled']},
  'prd':         Dropout  Enrolled  Graduate
  1      0.939793  0.046796  0.013411
  18     0.083592  0.310807  0.605600
  45     0.961008  0.029506  0.009486
  56     0.022009  0.044470  0.933521
  116    0.101482  0.564244  0.334274
  ...         ...       ...       ...
  76463  0.935114  0.051563  0.013323
  76471  0.110596  0.623268  0.266137
  76481  0.017602  0.043357  0.939041
  76507  0.134046  0.817625  0.048329
  76515  0.050234  0.169951  0.779815
  
  [76518 rows x 3 columns],
  'k': "{'model_params': {}, 'X_num': ['cb1_Dropout', 'cb1_Enrolled', 'lgb1_Dropout', 'lgb1_Enrolled', 'xgb1_Dropout', 'xgb1_Enrolled', 'lgb2_Dropout', 'lgb2_Enrolled']}"},
 0.834235117297262)

In [230]:
# Fit the stacking model on the whole of df_stk; the returned summary lists
# the variables used, the training shape and the target.
lr_stk.train(df_stk)

{'variables': ['lgb2_Enrolled',
  'lgb1_Enrolled',
  'cb1_Enrolled',
  'xgb1_Enrolled',
  'xgb1_Dropout',
  'cb1_Dropout',
  'lgb1_Dropout',
  'lgb2_Dropout'],
 'train_shape': (76518, 8),
 'target': 'Target',
 'target_func': None}

In [231]:
# Extra train() keyword arguments by model-name prefix: CatBoost and XGBoost
# are trained on the GPU, every other model with the defaults.
_train_kwargs_by_prefix = {
    'cb': {'task_type': 'GPU'},
    'xgb': {'device': 'cuda'},
}

for mdl in models:
    extra = next(
        (kw for prefix, kw in _train_kwargs_by_prefix.items() if mdl.name.startswith(prefix)),
        {},
    )
    mdl.train(df_train, **extra)

Round:   0%|          | 0/4000 [00:00<?, ?it/s]

In [232]:
# Reload the test set indexed by 'id' and apply the same processing as for
# the original dataset: run the processors, join the result, cast to int.
_test_raw = pd.read_parquet(files['test_parquet']).set_index('id')
_test_processed = pd_vars.procs_all(_test_raw)[0]
df_test = dproc.join_and_assign(_test_raw, _test_processed)
df_test['Application order_C'] = df_test['Application order_C'].astype('int')

In [233]:
# Test-set predictions of every base model, cached as parquet under 'result'.
# An existing file is reused as-is; otherwise the model predicts, its columns
# are prefixed with '<model name>_' and the frame is written to the cache.
l = []
for mdl in models:
    cache_path = os.path.join('result', '{}_test.parquet'.format(mdl.name))
    if os.path.exists(cache_path):
        prd_test = pd.read_parquet(cache_path)
    else:
        prd_test = mdl.get_predictor()(df_test).rename(columns=lambda c: mdl.name + '_' + c)
        prd_test.to_parquet(cache_path)
    l.append(prd_test)

In [234]:
# Put the per-model test predictions side by side (one column block per model).
df_stk_test = pd.concat(l, axis=1)
#.T.groupby(df_stk.columns[1:-1].to_series().str.split('_', expand=True)[1]).mean().T.idxmax(axis=1).sort_index()

In [235]:
# Blend for submission: average the tree models' probabilities per class and
# take the most probable class for every test row.
_blend_members = ['cb1', 'xgb1', 'lgb1', 'lgb2']
X_mean = [col for col in df_stk_test.columns if col.split('_')[0] in _blend_members]

# Class label of each selected column, aligned with X_mean.
X_grp = [col.split('_')[1] for col in X_mean]

_class_mean = df_stk_test[X_mean].T.groupby(X_grp).mean().T
s_prd = _class_mean.idxmax(axis=1).sort_index().rename(target)

In [236]:
# Write the predicted labels (with their index) as the submission file.
s_prd.to_frame().to_csv('result/submission3.csv')

In [237]:
#!kaggle competitions submit -c playground-series-s4e6 -f result/submission3.csv -m '3'

100%|█████████████████████████████████████████| 760k/760k [00:01<00:00, 400kB/s]
Successfully submitted to Classification with an Academic Success Dataset