In [1]:
!pip install --upgrade lifelines


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os
import joblib
import numpy as np
import pandas as pd
import polars as pl

import pandas.api.types
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import lifelines
from lifelines.utils import concordance_index

import sys

# Report the interpreter version followed by the version of every main library,
# so that the recorded results can be tied to a concrete environment.
print(sys.version)
for lib in [np, pd, pl, mpl, sns, lifelines, sklearn, lgb, xgb, cb]:
    try:
        print(lib.__name__, lib.__version__)
    except Exception:
        # A package without a readable __version__ is still listed by name.
        # `Exception` (rather than a bare `except:`) keeps KeyboardInterrupt and
        # SystemExit from being silently swallowed here.
        print(lib.__name__)

3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]
numpy 1.26.4
pandas 2.2.3
polars 1.12.0
matplotlib 3.8.4
seaborn 0.13.2
lifelines 0.30.0
sklearn 1.5.2
lightgbm
xgboost 2.1.2
catboost 1.2.5


In [3]:
import dproc, sgutil, sgpp, sgml, custpp
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, KFold, ShuffleSplit, train_test_split
from sklearn.impute import SimpleImputer
from lifelines import KaplanMeierFitter

In [4]:
# Project directories, given relative to the notebook's working directory.
data_path = 'data'
img_path = 'img'
result_path = 'result'
model_path = 'model'

# SGCache instance built from the image and result directories; it is used
# further down through sc.cache_result(...).
sc = sgutil.SGCache(img_path, result_path)

# Column groups consumed by the preprocessing pipeline `p2` defined right after
# this block.

# Passed as the second column group to custpp.CIBMTTransformer.
X_4 = [
    'psych_disturb', 'diabetes', 'arrhythmia', 'renal_issue',
    'pulm_severe', 'obesity', 'hepatic_severe', 'prior_tumor',
    'peptic_ulcer', 'rheum_issue', 'hepatic_mild', 'cardiac',
    'pulm_moderate',
]

# Columns imputed with the most frequent value (output columns get the '_fm' postfix).
X_int_fm = [
    'comorbidity_score',
    'hla_high_res_10', 'hla_high_res_6', 'hla_high_res_8',
    'hla_low_res_10', 'hla_low_res_6', 'hla_low_res_8',
    'hla_match_a_high', 'hla_match_a_low',
    'hla_match_b_high', 'hla_match_b_low',
    'hla_match_drb1_low',
    'hla_match_c_high', 'hla_match_c_low',
    'hla_match_dqb1_high', 'hla_match_dqb1_low',
    'hla_match_drb1_high',
    'hla_nmdp_6',
    'karnofsky_score',
]

# (column name, {raw value: numeric code}) pairs; first column group of CIBMTTransformer.
X_2 = [
    ('graft_type', {'peripheral blood': 0, 'bone marrow': 1}),
    ('prod_type', {'pb': 0, 'bm': 1}),
    ('vent_hist', {'no': -1, 'yes': 1}),
    ('rituximab', {'no': -1, 'yes': 1}),
    ('mrd_hct', {'negative': -1, 'positive': 1}),
    ('in_vivo_tcd', {'no': -1, 'yes': 1}),
    ('melphalan_dose', {'n/a, mel not given': -1, 'mel': 1}),
]

# Third column group of CIBMTTransformer. Note that the name X_na is rebound to a
# different list (the '*_na' indicator columns) in a later cell.
X_na = [*X_4, *X_int_fm, 'donor_age']

# Nominal columns handled by sgpp.CatArrangerFreq(1, 'na', 'na').
X_nom_na = [
    'cmv_status', 'conditioning_intensity', 'cyto_score', 'cyto_score_detail',
    'donor_related', 'dri_score', 'ethnicity', 'gvhd_proph',
    'sex_match', 'tce_div_match', 'tce_imm_match', 'tce_match',
]

# Nominal columns handled by sgpp.CatOOVFilter().
X_nom_nna = ['prim_disease_hct', 'race_group', 'tbi_status']
# Preprocessing pipeline. Steps run in the order listed:
#   1. sgpp.PolarsProcessor  - configured with 'ID' as pl.Int64 (presumably reads the input files with polars - confirm in sgpp)
#   2. sgpp.PandasCoverter   - configured with 'ID' as the index column
#   3. custpp.CIBMTTransformer - receives the X_2 / X_4 / X_na / nominal column groups
#   4. most-frequent imputation of X_int_fm (postfix '_fm')
#   5. mean imputation of 'donor_age' (postfix '_fm')
#   6. sgpp.CatArrangerFreq(1, 'na', 'na') on X_nom_na
#   7. sgpp.CatOOVFilter() on X_nom_nna
p2 = make_pipeline(
    sgpp.PolarsProcessor({'ID': pl.Int64}),
    sgpp.PandasCoverter(index_col = 'ID'),
    custpp.CIBMTTransformer(X_2, X_4, X_na, X_nom_na + X_nom_nna),
    sgpp.ApplyWrapper(SimpleImputer(strategy='most_frequent').set_output(transform='pandas'), X_int_fm, postfix = '_fm'),
    sgpp.ApplyWrapper(SimpleImputer(strategy='mean').set_output(transform='pandas'), ['donor_age'], postfix = '_fm'),
    sgpp.ApplyWrapper(sgpp.CatArrangerFreq(1, 'na', 'na'), X_nom_na),
    sgpp.ApplyWrapper(sgpp.CatOOVFilter(), X_nom_nna),
)

# Fit on the training file and keep the transformed frame. The path is built from
# data_path so that changing the constant actually relocates the input.
df_train = p2.fit_transform([os.path.join(data_path, 'train.csv')])

# Persist the fitted pipeline under model_path; create the directory first so
# the dump does not fail on a fresh checkout.
os.makedirs(model_path, exist_ok=True)
joblib.dump(p2, os.path.join(model_path, 'p2.joblib'))

['model/p2.joblib']

In [5]:
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    """Evaluate a Kaplan-Meier survival function at every row's own time value.

    A ``KaplanMeierFitter`` is fitted with ``df[time_col]`` as durations and
    ``df[event_col]`` as the observed-event indicator. The fitted survival
    function is then evaluated at each value of ``df[time_col]``.

    Parameters
    ----------
    df : DataFrame containing ``time_col`` and ``event_col``.
    time_col : name of the duration column.
    event_col : name of the event indicator column.

    Returns
    -------
    The ``.values`` array of the evaluated survival function, one entry per
    row of ``df`` in row order.
    """
    durations = df[time_col]
    km_fitter = KaplanMeierFitter()
    km_fitter.fit(durations, event_observed=df[event_col])
    return km_fitter.survival_function_at_times(durations).values


# Store the Kaplan-Meier value as column 'kmf'; the training config uses 'kmf' as its 'y'.
df_train['kmf'] = transform_survival_probability(
    df_train,
    time_col='efs_time',
    event_col='efs',
)

In [6]:
# Feature groups used by the model hyper-parameter dictionaries below
# ('X_num' is the concatenation of the numeric groups, 'X_cat' is X_nom).

X_bool = ['graft_type', 'prod_type']

X_tri = [
    'arrhythmia', 'cardiac', 'diabetes',
    'hepatic_mild', 'hepatic_severe', 'in_vivo_tcd',
    'melphalan_dose', 'mrd_hct', 'obesity',
    'peptic_ulcer', 'prior_tumor', 'psych_disturb',
    'pulm_moderate', 'pulm_severe', 'renal_issue',
    'rheum_issue', 'rituximab', 'vent_hist',
]

X_nom = [
    'cmv_status', 'conditioning_intensity', 'cyto_score',
    'cyto_score_detail', 'donor_related', 'dri_score',
    'ethnicity', 'gvhd_proph', 'prim_disease_hct',
    'race_group', 'sex_match', 'tbi_status',
    'tce_div_match', 'tce_imm_match', 'tce_match',
]

# '<column>_na' names. This rebinds X_na, which held a different list
# (X_4 + X_int_fm + ['donor_age']) while the preprocessing pipeline was built.
X_na = [
    f'{base}_na'
    for base in (
        'arrhythmia', 'cardiac', 'diabetes', 'hepatic_mild', 'hepatic_severe',
        'obesity', 'peptic_ulcer', 'prior_tumor', 'psych_disturb', 'pulm_moderate',
        'pulm_severe', 'renal_issue', 'rheum_issue',
    )
]

X_cont = ['age_at_hct', 'donor_age']

X_int = [
    'comorbidity_score',
    'hla_high_res_10', 'hla_high_res_6', 'hla_high_res_8',
    'hla_low_res_10', 'hla_low_res_6', 'hla_low_res_8',
    'hla_match_a_high', 'hla_match_a_low',
    'hla_match_b_high', 'hla_match_b_low',
    'hla_match_drb1_low',
    'hla_match_c_high', 'hla_match_c_low',
    'hla_match_dqb1_high', 'hla_match_dqb1_low',
    'hla_match_drb1_high',
    'hla_nmdp_6',
    'karnofsky_score',
    'year_hct',
]


In [7]:
def score(df, prds):
    """Mean minus population standard deviation of the per-race-group concordance index.

    For every ``race_group`` in ``df`` (observed groups only) the concordance
    index is computed from the group's ``efs_time``, the *negated* predictions
    taken from ``prds`` at the group's index labels, and the group's ``efs``.

    Parameters
    ----------
    df : DataFrame with columns ``race_group``, ``efs_time`` and ``efs``.
    prds : Series of predictions that can be looked up by ``df``'s index labels.

    Returns
    -------
    float : ``mean(c_index) - std(c_index, ddof=0)`` across the groups.
    """
    def _group_cindex(grp):
        return concordance_index(grp['efs_time'], -prds.loc[grp.index], grp['efs'])

    per_group = df.groupby('race_group', observed=True).apply(
        _group_cindex, include_groups=False
    )
    return float(per_group.mean() - per_group.std(ddof=0))
def get_validation_splitter(validation_fraction, random_state=None):
    """Build a one-argument function that splits its input into train and validation parts.

    Parameters
    ----------
    validation_fraction : forwarded to ``train_test_split`` as ``test_size``.
    random_state : optional seed forwarded to ``train_test_split``. The default
        ``None`` keeps the original unseeded behaviour; pass an int to make the
        validation split reproducible between runs.

    Returns
    -------
    callable : ``split(x)`` returning whatever ``train_test_split(x, ...)`` returns.
    """
    def split(x):
        return train_test_split(
            x, test_size=validation_fraction, random_state=random_state
        )

    return split

# Shared configuration handed to every sgml.CVModel created below.
config = {
    # Calls m.predict on the feature columns X and wraps the result in a Series indexed like df.
    'predict_func': lambda m, df, X: pd.Series(m.predict(df[X]), index = df.index),
    # Evaluation metric: the score() function defined earlier in this cell.
    'score_func': score,
    # The factory itself is passed (not called here).
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(), 
    'return_train_scores': True,
    # Presumably the target column name; 'kmf' is the column added to df_train earlier - confirm in sgml.
    'y': 'kmf',
}

# 10-fold shuffled CV with a fixed seed; passed to each CVModel below.
kf = KFold(n_splits=10, shuffle=True, random_state = 123)
# Single shuffled split with a fixed seed; only referenced by the commented-out adhoc(...) calls below.
ss = ShuffleSplit(n_splits=1, random_state = 123)
# sgml adapters, each constructed with the regressor class of the corresponding library.
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMRegressor)
xgb_adapter = sgml.XGBAdapter(xgb.XGBRegressor)
cb_adapter = sgml.CBAdapter(cb.CatBoostRegressor)

# LGB1

In [8]:
# CV wrapper for the LightGBM model, using the shared KFold splitter and config.
# The directory comes from model_path (instead of a repeated 'model' literal) so
# all model artefacts follow the constant; load_if_exists() presumably restores
# previously saved state - confirm in sgml.
lgb1 = sgml.CVModel(model_path, 'lgb1', kf, config, lgb_adapter).load_if_exists()

In [44]:
# LightGBM hyper-parameters and feature roles for the cross-validation run.
hparams = {
    'model_params': {
        'num_leaves': 15,
        'n_estimators': 3000,
        'colsample_bytree': 0.25,
        'learning_rate': 0.02,
    },
    'X_num': X_tri + X_cont + X_int + X_na + X_bool,
    'X_cat': X_nom,
    'cat': {'handle_unknown': 'use_encoded_value', 'unknown_value': -1},
    # 'validation_fraction': 0.1,
}
# Single-split alternative kept for reference: result = lgb1.adhoc(df_train, ss, hparams)
result = lgb1.cv(df_train, hparams)

In [45]:
# Mean of the 'valid_scores' entry of the most recent `result` (here the lgb1.cv(...) call in the preceding cell).
np.mean(result['valid_scores'])

0.666239317894867

In [10]:
# Score lgb1.cv_best_['prd'] (presumably the out-of-fold predictions - confirm in sgml), sorted by index, against df_train.
score(df_train, lgb1.cv_best_['prd'].sort_index())

0.6734179596864021

# XGB1

In [11]:
# CV wrapper for the XGBoost model, using the shared KFold splitter and config.
# The directory comes from model_path (instead of a repeated 'model' literal) so
# all model artefacts follow the constant; load_if_exists() presumably restores
# previously saved state - confirm in sgml.
xgb1 = sgml.CVModel(model_path, 'xgb1', kf, config, xgb_adapter).load_if_exists()

In [42]:
# XGBoost hyper-parameters and feature roles for the cross-validation run.
hparams = {
    'model_params': {
        'max_depth': 3,
        'colsample_bytree': 0.5,
        'subsample': 0.8,
        'n_estimators': 2000,
        'learning_rate': 0.02,
        'min_child_weight': 80,
    },
    'X_num': X_tri + X_cont + X_int + X_na + X_bool,
    'X_cat': X_nom,
    'cat': {'handle_unknown': 'ignore'},
    # 'validation_fraction': 0.1,
}
# Single-split alternative kept for reference: result = xgb1.adhoc(df_train, ss, hparams, device = 'cuda')
result = xgb1.cv(df_train, hparams, device='cuda')

In [43]:
# Mean of the 'valid_scores' entry of the most recent `result` (here the xgb1.cv(...) call in the preceding cell).
np.mean(result['valid_scores'])

0.665205354683022

In [13]:
# Score xgb1.cv_best_['prd'] (presumably the out-of-fold predictions - confirm in sgml), sorted by index, against df_train.
score(df_train, xgb1.cv_best_['prd'].sort_index())

0.6724051710595899

# CB1

In [17]:
# CV wrapper for the CatBoost model, using the shared KFold splitter and config.
# The directory comes from model_path (instead of a repeated 'model' literal) so
# all model artefacts follow the constant; load_if_exists() presumably restores
# previously saved state - confirm in sgml.
cb1 = sgml.CVModel(model_path, 'cb1', kf, config, cb_adapter).load_if_exists()

In [38]:
# CatBoost hyper-parameters and feature roles for the cross-validation run.
hparams = {
    'model_params': {
        'max_depth': 6,
        'n_estimators': 3500,
        'learning_rate': 0.03,
    },
    'X_num': X_tri + X_cont + X_int + X_na + X_bool,
    'X_cat': X_nom,
    # 'validation_fraction': 0.1,
}
# Single-split alternative kept for reference: result = cb1.adhoc(df_train, ss, hparams, task_type='GPU')
result = cb1.cv(df_train, hparams, task_type='GPU')

In [39]:
# Mean of the 'valid_scores' entry of the most recent `result` (here the cb1.cv(...) call in the preceding cell).
np.mean(result['valid_scores'])#, result['model_result'][0]['valid_result'].idxmin()

0.6642239819854309

In [40]:
# Score cb1.cv_best_['prd'] (presumably the out-of-fold predictions - confirm in sgml), sorted by index, against df_train.
score(df_train, cb1.cv_best_['prd'].sort_index())

0.672219746515295

In [20]:
# The three CV models whose results are combined by sgml.stack_cv.
models = [lgb1, xgb1, cb1]
# NOTE(review): the stacked frame goes through sc.cache_result under the key 'phase2_stk'.
# Presumably an existing cached value is returned without calling the lambda - confirm in sgutil,
# and invalidate the cache after re-running any of the CV cells above, otherwise df_stk may be stale.
df_stk = sc.cache_result(
    'phase2_stk',
    lambda : sgml.stack_cv(models)
)

In [23]:
# Weighted blend of the three stacked prediction columns, scored against df_train.
blend_weights = {'lgb1': 0.4, 'xgb1': 0.3, 'cb1': 0.3}
blended_prd = df_stk.sort_index()[list(blend_weights)].dot(list(blend_weights.values()))
score(df_train, blended_prd)

0.6749017855832007

In [24]:
# Extra keyword arguments for train(), selected by model-name prefix.
# Prefixes are tried in this order; a model matching none is trained without extras.
train_kwargs_by_prefix = {
    'cb': {'task_type': 'GPU'},
    'xgb': {'device': 'cuda'},
}

for cv_model in models:
    extra_kwargs = next(
        (
            kwargs
            for prefix, kwargs in train_kwargs_by_prefix.items()
            if cv_model.name.startswith(prefix)
        ),
        {},
    )
    cv_model.train(df_train, **extra_kwargs)

In [None]:
# NOTE(review): this unexecuted cell repeats the CatBoost CV cell from the "CB1" section
# (same hparams, same cb1.cv call). Because it sits after the final training loop, a
# "Run All" would re-run the CatBoost cross-validation on GPU after the models were trained.
# Confirm whether it is a leftover and should be removed.
hparams = {
    'model_params': {
        'max_depth': 6, 'n_estimators': 3500, 'learning_rate': 0.03
    },
    'X_num':  X_tri + X_cont + X_int + X_na + X_bool, 'X_cat': X_nom
    #'validation_fraction': 0.1,
}
#result = cb1.adhoc(df_train, ss, hparams, task_type='GPU')
result = cb1.cv(df_train, hparams, task_type = 'GPU')