In [1]:
import os
import joblib
import numpy as np
import pandas as pd
import polars as pl

import pandas.api.types
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import lifelines
from lifelines.utils import concordance_index

import sys

# Record the interpreter and library versions this notebook was executed with.
print(sys.version)
for module in [np, pd, pl, mpl, sns, lifelines, sklearn, lgb, xgb, cb]:
    try:
        print(module.__name__, module.__version__)
    except Exception:
        # Best effort: a package without a usable __version__ is reported by
        # name only. `Exception` (rather than a bare `except:`) keeps
        # KeyboardInterrupt / SystemExit from being silently swallowed.
        print(module.__name__)

3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]
numpy 1.26.4
pandas 2.2.3
polars 1.12.0
matplotlib 3.8.4
seaborn 0.13.2
lifelines 0.30.0
sklearn 1.5.2
lightgbm
xgboost 2.1.2
catboost 1.2.5


In [3]:
from functools import partial
import dproc, sgutil, sgpp, sgml, custpp
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, KFold, ShuffleSplit, train_test_split
from sklearn.impute import SimpleImputer
from lifelines import NelsonAalenFitter

In [4]:
# Directories holding the raw CSV files and the persisted artefacts.
data_path, model_path = 'data', 'model'

# Restore the object stored in model/p2.joblib (presumably a fitted
# preprocessing pipeline -- confirm) and run its transform over the training CSV.
# NOTE(review): joblib.load unpickles arbitrary objects; only load trusted files.
p2 = joblib.load(
    os.path.join(model_path, 'p2.joblib')
)
df_train = p2.transform(
    [os.path.join(data_path, 'train.csv')]
)

In [5]:
def score(df, prds):
    """Stratified concordance index of ``prds`` across race groups.

    A concordance index is computed separately for every ``race_group`` in
    ``df`` (using ``efs_time`` as the event time, ``efs`` as the event
    indicator and the negated predictions as the score), and the result is
    the mean of those per-group values minus their population standard
    deviation (``ddof=0``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``race_group``, ``efs_time`` and ``efs``.
    prds : pandas.Series
        Predictions indexed compatibly with ``df`` (looked up via ``.loc``).

    Returns
    -------
    float
    """
    def _group_cindex(grp):
        # Predictions are negated before being passed as the predicted score.
        return concordance_index(grp['efs_time'], -prds.loc[grp.index], grp['efs'])

    per_group = df.groupby('race_group', observed=True).apply(
        _group_cindex, include_groups=False
    )
    return float(per_group.mean() - per_group.std(ddof=0))

In [6]:
# Feature-name groups used to build the model inputs further down.
# NOTE(review): the group semantics below are inferred from the variable names
# only -- confirm against the preprocessing pipeline that produces df_train.

# Presumably two-valued columns.
X_bool = ['graft_type', 'prod_type']
# Presumably three-state columns (e.g. yes / no / unknown) -- TODO confirm.
X_tri = [
    'arrhythmia', 'cardiac', 'diabetes', 'hepatic_mild', 'hepatic_severe',
    'in_vivo_tcd', 'melphalan_dose', 'mrd_hct', 'obesity', 'peptic_ulcer',
    'prior_tumor', 'psych_disturb', 'pulm_moderate', 'pulm_severe', 'renal_issue',
    'rheum_issue', 'rituximab', 'vent_hist'
]
# Columns passed to the models as categorical features ('X_cat' in hparams).
X_nom = [
    'cmv_status', 'conditioning_intensity', 'cyto_score', 'cyto_score_detail', 'donor_related',
    'dri_score', 'ethnicity', 'gvhd_proph', 'prim_disease_hct', 'race_group', 'sex_match',
    'tbi_status', 'tce_div_match', 'tce_imm_match', 'tce_match'
]
# Presumably missing-value indicator columns (note the '_na' suffix) -- TODO confirm.
X_na = [
    'arrhythmia_na', 'cardiac_na', 'diabetes_na', 'hepatic_mild_na', 'hepatic_severe_na',
    'obesity_na', 'peptic_ulcer_na', 'prior_tumor_na', 'psych_disturb_na', 'pulm_moderate_na',
    'pulm_severe_na', 'renal_issue_na', 'rheum_issue_na'
]
# Presumably continuous-valued columns.
X_cont = ['age_at_hct', 'donor_age']
# Presumably integer-valued columns (scores, HLA match counts, year).
X_int = [
    'comorbidity_score', 'hla_high_res_10', 'hla_high_res_6', 'hla_high_res_8', 'hla_low_res_10',
    'hla_low_res_6', 'hla_low_res_8', 'hla_match_a_high', 'hla_match_a_low', 'hla_match_b_high',
    'hla_match_b_low', 'hla_match_drb1_low', 'hla_match_c_high', 'hla_match_c_low', 'hla_match_dqb1_high', 'hla_match_dqb1_low',
    'hla_match_drb1_high', 'hla_nmdp_6', 'karnofsky_score', 'year_hct'
]
# Every feature name, concatenated in a fixed group order.
X_all = X_tri + X_cont + X_int + X_na + X_bool + X_nom

In [10]:
from sklearn.model_selection import StratifiedKFold
def score(df, prds):
    """Stratified concordance index of ``prds`` across race groups.

    A concordance index is computed separately for every ``race_group`` in
    ``df`` (using ``efs_time`` as the event time, ``efs`` as the event
    indicator and the negated predictions as the score), and the result is
    the mean of those per-group values minus their population standard
    deviation (``ddof=0``).

    NOTE(review): this notebook defines ``score`` twice with the same body;
    the later definition is the one in effect after a full run.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``race_group``, ``efs_time`` and ``efs``.
    prds : pandas.Series
        Predictions indexed compatibly with ``df`` (looked up via ``.loc``).

    Returns
    -------
    float
    """
    def _group_cindex(grp):
        # Predictions are negated before being passed as the predicted score.
        return concordance_index(grp['efs_time'], -prds.loc[grp.index], grp['efs'])

    per_group = df.groupby('race_group', observed=True).apply(
        _group_cindex, include_groups=False
    )
    return float(per_group.mean() - per_group.std(ddof=0))
def get_validation_splitter(validation_fraction, random_state=None):
    """Build a one-argument splitter that holds out part of the data.

    Parameters
    ----------
    validation_fraction : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``. The default keeps the previous
        unseeded behaviour (a different split on every call); pass an int to
        make the hold-out split reproducible.

    Returns
    -------
    callable
        ``splitter(x)`` returning the result of
        ``train_test_split(x, test_size=validation_fraction, ...)``, i.e. the
        train part followed by the validation part.
    """
    def splitter(x):
        return train_test_split(
            x, test_size=validation_fraction, random_state=random_state
        )

    return splitter

def _positive_class_proba(m, df, X):
    """Return column 1 of ``m.predict_proba(df[X])`` as a Series indexed like ``df``."""
    proba = m.predict_proba(df[X])
    return pd.Series(proba[:, 1], index=df.index)


# Shared configuration handed to every sgml.CVModel below.
# NOTE(review): how sgml interprets each key is not visible in this notebook;
# the key names are kept exactly as they were.
config2 = dict(
    predict_func=_positive_class_proba,
    score_func=score,
    validation_splitter=get_validation_splitter,
    progress_callback=sgml.ProgressCallBack(),
    return_train_scores=True,
    train_data_proc=custpp.filter_censor_data,
    y='efs_b',
)

# Seeded, shuffled 5-fold stratified splitter reused by all models.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

# One adapter per boosting library, each wrapping that library's classifier class.
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMClassifier)
xgb_adapter = sgml.XGBAdapter(xgb.XGBClassifier)
cb_adapter = sgml.CBAdapter(cb.CatBoostClassifier)

In [13]:
# Four LightGBM classifiers, one per time threshold (see the loop below).
lgb4_1, lgb4_2, lgb4_3, lgb4_4 = [
    sgml.CVModel('model', f'lgb4_{k}', skf, config2, lgb_adapter).load_if_exists()
    for k in range(1, 5)
]

# Hyper-parameters shared by all four models; the per-model threshold is added
# to a copy inside the loop so this dict is never mutated.
hparams = {
    'model_params': {'num_leaves':  15, 'n_estimators':  2000, 'colsample_bytree': 0.25, 'learning_rate': 0.02},
    'X_num': X_tri + X_cont + X_int + X_na + X_bool,
    'X_cat': X_nom, 'cat': {'handle_unknown': 'use_encoded_value', 'unknown_value': -1},
    #'validation_fraction': 0.1
}

for model, q in zip([lgb4_1, lgb4_2, lgb4_3, lgb4_4], [0.25, 0.5, 0.75, 0.95]):
    # Threshold = q-quantile of efs_time among rows with efs == 1.0.
    t = df_train.loc[df_train['efs'] == 1.0, 'efs_time'].quantile(q)
    # Binary target for this model: did efs_time fall below the threshold?
    df_train = df_train.assign(efs_b=df_train['efs_time'] < t)
    # Pass a fresh dict per model. Previously the single shared `hparams` was
    # mutated in place and handed to every model, so any model keeping a
    # reference to it would end up seeing the last threshold.
    # 'train_data_proc_param' is presumably forwarded to
    # custpp.filter_censor_data -- confirm in sgml.
    model.cv(df_train, {**hparams, 'train_data_proc_param': {'t': t}})

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

In [14]:
# Out-of-fold predictions of the four LightGBM models, one column per model,
# combined with fixed weights and scored with the stratified C-index.
lgb_oof = pd.concat(
    [m.cv_best_['prd'].sort_index() for m in (lgb4_1, lgb4_2, lgb4_3, lgb4_4)],
    axis=1,
)
score(df_train, lgb_oof.dot([0.2, 0.2, 0.3, 0.3]))

0.681486151592398

In [15]:
from sklearn.model_selection import StratifiedKFold
# Four XGBoost classifiers, one per time threshold (see the loop below).
xgb4_1, xgb4_2, xgb4_3, xgb4_4 = [
    sgml.CVModel('model', f'xgb4_{k}', skf, config2, xgb_adapter).load_if_exists()
    for k in range(1, 5)
]

# Hyper-parameters shared by all four models; the per-model threshold is added
# to a copy inside the loop so this dict is never mutated.
hparams = {
    'model_params': {'max_depth':  3, 'n_estimators':  3000, 'colsample_bytree': 0.25, 'learning_rate': 0.02},
    'X_num': X_tri + X_cont + X_int + X_na + X_bool,
    'X_cat': X_nom, 'cat': {'handle_unknown': 'ignore'},
    #'validation_fraction': 0.1
}

# NOTE(review): the recorded output of this cell contains XGBoost's
# "Set the device for booster before call to inplace_predict" hint, which
# suggests prediction input and booster live on different devices -- check
# sgml.XGBAdapter if GPU prediction speed matters.
for model, q in zip([xgb4_1, xgb4_2, xgb4_3, xgb4_4], [0.25, 0.5, 0.75, 0.95]):
    # Threshold = q-quantile of efs_time among rows with efs == 1.0.
    t = df_train.loc[df_train['efs'] == 1.0, 'efs_time'].quantile(q)
    # Binary target for this model: did efs_time fall below the threshold?
    df_train = df_train.assign(efs_b=df_train['efs_time'] < t)
    # Pass a fresh dict per model. Previously the single shared `hparams` was
    # mutated in place and handed to every model, so any model keeping a
    # reference to it would end up seeing the last threshold.
    # 'train_data_proc_param' is presumably forwarded to
    # custpp.filter_censor_data -- confirm in sgml.
    model.cv(df_train, {**hparams, 'train_data_proc_param': {'t': t}}, device='cuda')

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

In [30]:
# Out-of-fold predictions of the four XGBoost models, one column per model,
# combined with fixed weights and scored with the stratified C-index.
xgb_oof = pd.concat(
    [m.cv_best_['prd'].sort_index() for m in (xgb4_1, xgb4_2, xgb4_3, xgb4_4)],
    axis=1,
)
score(df_train, xgb_oof.dot([0.2, 0.2, 0.3, 0.3]))

0.6798441491030642

In [29]:
# Weighted blend of each library's four out-of-fold predictions ...
lgb_blend = pd.concat(
    [m.cv_best_['prd'].sort_index() for m in (lgb4_1, lgb4_2, lgb4_3, lgb4_4)],
    axis=1,
).dot([0.2, 0.2, 0.3, 0.3])
xgb_blend = pd.concat(
    [m.cv_best_['prd'].sort_index() for m in (xgb4_1, xgb4_2, xgb4_3, xgb4_4)],
    axis=1,
).dot([0.2, 0.2, 0.3, 0.3])

# ... then an 80 / 20 LightGBM / XGBoost mix, scored with the stratified C-index.
score(df_train, lgb_blend * 0.8 + xgb_blend * 0.2)

0.6815356387039385

In [22]:
# Train every model with its own threshold-specific binary target.
# LightGBM models go first with default arguments, then the XGBoost models
# with device='cuda' -- same order and calls as two separate loops would give.
for models, train_kwargs in (
    ([lgb4_1, lgb4_2, lgb4_3, lgb4_4], {}),
    ([xgb4_1, xgb4_2, xgb4_3, xgb4_4], {'device': 'cuda'}),
):
    for model, q in zip(models, [0.25, 0.5, 0.75, 0.95]):
        # Threshold = q-quantile of efs_time among rows with efs == 1.0.
        t = df_train.loc[df_train['efs'] == 1.0, 'efs_time'].quantile(q)
        # Rebuild the binary target for this threshold before training.
        df_train = df_train.assign(efs_b=df_train['efs_time'] < t)
        model.train(df_train, **train_kwargs)

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/2000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]