In [1]:
!pip install --upgrade lifelines

Collecting lifelines
  Downloading lifelines-0.30.0-py3-none-any.whl.metadata (3.2 kB)
Collecting autograd-gamma>=0.3 (from lifelines)
  Downloading autograd-gamma-0.5.0.tar.gz (4.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting formulaic>=0.2.2 (from lifelines)
  Downloading formulaic-1.1.1-py3-none-any.whl.metadata (6.9 kB)
Collecting interface-meta>=1.2.0 (from formulaic>=0.2.2->lifelines)
  Downloading interface_meta-1.3.0-py3-none-any.whl.metadata (6.7 kB)
Downloading lifelines-0.30.0-py3-none-any.whl (349 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m349.3/349.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hDownloading formulaic-1.1.1-py3-none-any.whl (115 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.7/115.7 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading interface_meta-1.3.0-py3-none-any.whl (14 kB)
Building wheels for collected packages: autograd-gamma
  Building wheel for

In [2]:
import os
import joblib
import numpy as np
import pandas as pd
import polars as pl

import pandas.api.types
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import lifelines
from lifelines.utils import concordance_index

import sys

# Record the interpreter and library versions this notebook was executed with.
print(sys.version)
for module in [np, pd, pl, mpl, sns, lifelines, sklearn, lgb, xgb, cb]:
    # Only a missing __version__ attribute is an expected condition here; checking
    # for it explicitly avoids a bare `except:` that would hide unrelated errors.
    if hasattr(module, '__version__'):
        print(module.__name__, module.__version__)
    else:
        print(module.__name__)

3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]
numpy 1.26.4
pandas 2.2.3
polars 1.9.0
matplotlib 3.7.5
seaborn 0.12.2
lifelines 0.30.0
sklearn 1.2.2
lightgbm 4.5.0
xgboost 2.0.3
catboost 1.2.7


In [3]:
import dproc, sgutil, sgpp, sgml, custpp
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, KFold, ShuffleSplit, train_test_split
import tensorflow as tf

In [4]:
# Directories holding the raw CSV files and the persisted artifacts, respectively.
data_path, model_path = 'data', 'model'

# `p3` is a joblib-persisted object exposing `.transform` (presumably a fitted
# preprocessing pipeline - confirm where it is produced).
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source.
p3 = joblib.load(os.path.join(model_path, 'p3.joblib'))

# `transform` is handed a list of CSV paths; the result is used below as a pandas
# DataFrame whose X_nom columns have categorical dtype.
df_train = p3.transform(
    [os.path.join(data_path, 'train.csv')]
)

In [5]:
# Feature-name groups. The order inside each list matters: later cells slice X_int
# positionally (its last entry is scaled differently from the rest) and feed the
# columns to the model by position.

# Passed through the ColumnTransformer unchanged (two-valued columns, judging by the name).
X_bool = ['graft_type', 'prod_type']

# Passed through unchanged as well (presumably three-state flags - confirm against p3).
X_tri = [
    'arrhythmia', 'cardiac', 'diabetes', 'hepatic_mild', 'hepatic_severe', 'in_vivo_tcd',
    'melphalan_dose', 'mrd_hct', 'obesity', 'peptic_ulcer', 'prior_tumor', 'psych_disturb',
    'pulm_moderate', 'pulm_severe', 'renal_issue', 'rheum_issue', 'rituximab', 'vent_hist',
]

# Categorical columns; each one is integer-coded and given its own embedding.
X_nom = [
    'cmv_status', 'conditioning_intensity', 'cyto_score', 'cyto_score_detail',
    'donor_related', 'dri_score', 'ethnicity', 'gvhd_proph',
    'prim_disease_hct', 'race_group', 'sex_match', 'tbi_status',
    'tce_div_match', 'tce_imm_match', 'tce_match',
]

# Companion "<feature>_na" columns (presumably missing-value indicators - confirm).
X_na = [
    'arrhythmia_na', 'cardiac_na', 'diabetes_na', 'hepatic_mild_na',
    'hepatic_severe_na', 'obesity_na', 'peptic_ulcer_na', 'prior_tumor_na',
    'psych_disturb_na', 'pulm_moderate_na', 'pulm_severe_na', 'renal_issue_na',
    'rheum_issue_na',
]

# Continuous numeric columns.
X_cont = ['age_at_hct', 'donor_age']

# Integer-valued numeric columns; 'year_hct' must stay last (see note above).
X_int = [
    'comorbidity_score', 'hla_high_res_10', 'hla_high_res_6', 'hla_high_res_8',
    'hla_low_res_10', 'hla_low_res_6', 'hla_low_res_8', 'hla_match_a_high',
    'hla_match_a_low', 'hla_match_b_high', 'hla_match_b_low', 'hla_match_drb1_low',
    'hla_match_c_high', 'hla_match_c_low', 'hla_match_dqb1_high', 'hla_match_dqb1_low',
    'hla_match_drb1_high', 'hla_nmdp_6', 'karnofsky_score', 'year_hct',
]

# Every feature column, group by group.
X_all = [*X_tri, *X_cont, *X_int, *X_na, *X_bool, *X_nom]

In [105]:
def score(df, prds):
    """Return mean minus population std of the per-`race_group` concordance index.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'race_group', 'efs_time' and 'efs'.
    prds : pandas.Series
        Predictions indexed by the same labels as `df`. They are negated before
        being handed to `concordance_index`, i.e. a larger value in `prds` is
        treated as a shorter expected 'efs_time'.

    Returns
    -------
    float
        mean(c_index per group) - std(c_index per group, ddof=0).
    """
    def _group_c_index(group):
        # Look the predictions up by index label so `prds` may hold more rows than `group`.
        return concordance_index(group['efs_time'], -prds.loc[group.index], group['efs'])

    per_group = df.groupby('race_group', observed=True).apply(
        _group_c_index, include_groups=False
    )
    return float(per_group.mean() - per_group.std(ddof=0))

In [234]:
class RiskModel(tf.keras.Model):
    """Risk-scoring network over embedded categorical inputs plus a dense feature block.

    Every categorical input goes through its own Embedding layer; the embeddings
    are concatenated with the dense block and fed through a stack of Dense layers.
    Training uses the pairwise hinge ranking loss implemented in `compute_loss`.

    Parameters
    ----------
    emb_cols : iterable of (name, input_dim, output_dim)
        One entry per categorical input. `name` is both the key looked up in the
        input dict and the name given to the Embedding layer.
    cont_var : str
        Key of the dense feature block in the input dict.
    d_layers : list of dict
        Keyword arguments for each `tf.keras.layers.Dense`, in stacking order.
    **argv
        Forwarded to `tf.keras.Model.__init__` (for example `name`).
    """

    def __init__(self, emb_cols, cont_var, d_layers , **argv):
        # Forward the extra keyword arguments instead of silently discarding them.
        super().__init__(**argv)
        self.emb_layers = {
            v: tf.keras.layers.Embedding(c, s, name=v) for v, c, s in emb_cols
        }
        self.cc = tf.keras.layers.Concatenate(axis=-1)
        self.d_model = tf.keras.models.Sequential([
            tf.keras.layers.Dense(**params) for params in d_layers
        ])
        self.cont_var = cont_var
        # Added to denominators in the loss to avoid division by zero.
        self.eps = 1e-7

    def __call__(self, X):
        """Map a dict of input tensors to the output of the Dense stack.

        `X` must hold one entry per embedding name plus `X[self.cont_var]`. The
        embedding output is squeezed on axis -2, so each categorical entry is
        expected to carry a trailing axis of length 1.

        NOTE(review): Keras documents `call` as the method to override in a
        subclassed model; overriding `__call__` skips the base-class call
        wrapper. Left as-is to keep behaviour unchanged - confirm before changing.
        """
        cc_list = [tf.squeeze(v(X[k]), axis=-2) for k, v in self.emb_layers.items()] + [X[self.cont_var]]
        X_cc = self.cc(cc_list)
        return self.d_model(X_cc)

    def compute_loss(self, X, y, y_pred, sample_weight):
        """Pairwise hinge ranking loss over the comparable pairs of a batch.

        `y[:, 0]` is the ordering key and `y[:, 1]` a per-row 0/1 weight (the
        training cell feeds 'efs_time' and 'efs' in these positions). A pair
        (i, j) contributes when ``y[i, 0] < y[j, 0]`` and ``y[i, 1]`` is non-zero;
        its penalty is ``relu(1 - (pred_i - pred_j))``, so row i is pushed to score
        at least 1 above row j. Penalties are averaged per row over that row's
        contributing pairs, summed over rows and divided by ``sum(y[:, 1])``.

        `X` and `sample_weight` are accepted for Keras compatibility and ignored.
        """
        times = y[:, 0]
        # Cast explicitly so float64/integer labels do not clash with the float32 mask.
        events = tf.cast(y[:, 1], tf.float32)
        scores = tf.cast(tf.reshape(y_pred, (-1,)), tf.float32)

        # Broadcasting builds the (batch, batch) pair matrices directly:
        # row index i is the "earlier" candidate, column index j the "later" one.
        earlier = tf.cast(times[:, tf.newaxis] < times[tf.newaxis, :], tf.float32)
        mask = earlier * events[:, tf.newaxis]
        hinge_loss = tf.keras.ops.relu(1 - (scores[:, tf.newaxis] - scores[tf.newaxis, :]))

        per_row = tf.reduce_sum(mask * hinge_loss, axis=-1) / (tf.reduce_sum(mask, axis=-1) + self.eps)
        return tf.reduce_sum(per_row) / (tf.reduce_sum(events) + self.eps)

In [235]:
import tensorflow as tf
import sgnn
from functools import partial


def to_tf_dataset(X, Y=None, sample_weights=None, cat=[], cont='Cont'):
    """Build a `tf.data.Dataset` of feature dicts (optionally with targets and weights).

    Parameters
    ----------
    X : pandas.DataFrame
        Columns are consumed by POSITION: the first ``len(cat)`` columns are the
        categorical inputs (in the order given by `cat`), every remaining column
        belongs to the dense block.
    Y : pandas.Series, pandas.DataFrame, array-like or None
        Targets. When None the dataset yields feature dicts only and
        `sample_weights` is ignored.
    sample_weights : pandas.Series, pandas.DataFrame, array-like or None
        Optional per-row weights, emitted as a third tuple element.
    cat : sequence of str
        Dict keys for the categorical inputs; each is emitted with a trailing
        axis of length 1. The default list is never mutated.
    cont : str or None
        Dict key for the dense block (cast to float32); None omits the block.

    Returns
    -------
    tf.data.Dataset
        Elements are ``features``, ``(features, y)`` or ``(features, y, weight)``.
    """
    def _as_array(obj):
        # Unwrap pandas containers (including subclasses); leave everything else untouched.
        return obj.values if isinstance(obj, (pd.Series, pd.DataFrame)) else obj

    features = {
        name: np.expand_dims(X.iloc[:, pos], axis=-1) for pos, name in enumerate(cat)
    }
    if cont is not None:
        features[cont] = X.iloc[:, len(cat):].astype('float32')

    if Y is None:
        return tf.data.Dataset.from_tensor_slices(features)
    if sample_weights is None:
        return tf.data.Dataset.from_tensor_slices((features, _as_array(Y)))
    # Targets and weights are unwrapped independently, so a pandas `Y` combined
    # with ndarray weights (or the reverse) no longer fails.
    return tf.data.Dataset.from_tensor_slices(
        (features, _as_array(Y), _as_array(sample_weights))
    )


In [236]:
# One (column name, number of categories, embedding width) tuple per categorical
# column. The category count is read from the column's categorical dtype; the
# embedding widths are hand-picked constants.
s_emb = (
    df_train[X_nom]
    .apply(lambda col: len(col.cat.categories))
    .rename('cardinality')
    .to_frame()
    .join(
        pd.Series(
            dict(
                cmv_status=3, conditioning_intensity=3, cyto_score=3, cyto_score_detail=3,
                donor_related=2, dri_score=4, ethnicity=2, gvhd_proph=4,
                prim_disease_hct=4, race_group=3, sex_match=3, tbi_status=3,
                tce_div_match=3, tce_imm_match=3, tce_match=3,
            ),
            name='emb_size',
        )
    )
    .reset_index()          # moves the column names out of the index into the first field
    .apply(tuple, axis=1)   # one tuple per row
)
s_emb

0                 (cmv_status, 5, 3)
1     (conditioning_intensity, 7, 3)
2                 (cyto_score, 8, 3)
3          (cyto_score_detail, 6, 3)
4              (donor_related, 4, 2)
5                 (dri_score, 11, 4)
6                  (ethnicity, 4, 2)
7                (gvhd_proph, 17, 4)
8          (prim_disease_hct, 18, 4)
9                 (race_group, 6, 3)
10                 (sex_match, 5, 3)
11                (tbi_status, 8, 3)
12             (tce_div_match, 5, 3)
13             (tce_imm_match, 9, 3)
14                 (tce_match, 5, 3)
dtype: object

In [237]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder

# Preprocessing for the dense input block. Output columns come out in the order
# std -> mm -> pt, as a pandas DataFrame.
ct = ColumnTransformer(
    transformers=[
        # Impute (SimpleImputer defaults) then standardise every numeric column except the last X_int entry.
        ('std', make_pipeline(SimpleImputer(), StandardScaler()), [*X_cont, *X_int[:-1]]),
        # The last X_int entry ('year_hct') is imputed and min-max scaled instead.
        ('mm', make_pipeline(SimpleImputer(), MinMaxScaler()), [X_int[-1]]),
        # Flag-like columns are forwarded untouched.
        ('pt', 'passthrough', [*X_bool, *X_tri, *X_na]),
    ]
)
ct.set_output(transform='pandas')

In [238]:
from sklearn.model_selection import ShuffleSplit
# A single random hold-out split with a fixed seed, so the same rows are held out on
# every run. No test_size/train_size is given, so scikit-learn's defaults apply.
ss = ShuffleSplit(random_state=123, n_splits=1)

In [241]:
def _assemble_inputs(frame, transform):
    """Return the model input frame: integer category codes first, then the transformed dense block.

    `transform` is the bound ColumnTransformer method to apply (`ct.fit_transform`
    for the training fold, `ct.transform` for held-out rows).
    """
    codes = frame[X_nom].apply(lambda col: col.cat.codes)
    dense = transform(frame[X_cont + X_int + X_bool + X_tri + X_na])
    return pd.concat([codes, dense], axis=1)


sp = ss
for train_idx, valid_idx in sp.split(df_train[X_all], df_train['efs']):
    # Two hidden ReLU layers followed by a single linear output unit.
    dense_specs = [
        {'units': 128, 'activation': 'relu', 'kernel_initializer': 'he_uniform'},
        {'units': 64, 'activation': 'relu', 'kernel_initializer': 'he_uniform'},
        {'units': 1, 'kernel_initializer': 'he_uniform'},
    ]
    r_model = RiskModel(s_emb.tolist(), 'Continuous', dense_specs)
    r_model.compile(tf.keras.optimizers.Adam(1e-4))

    # Training fold: the column transformer is fitted on these rows only.
    df_cv_train = df_train.iloc[train_idx]
    ds_train = to_tf_dataset(
        _assemble_inputs(df_cv_train, ct.fit_transform),
        df_cv_train[['efs_time', 'efs']],
        cat=X_nom,
        cont='Continuous',
    )
    r_model.fit(ds_train.shuffle(1024000).batch(2048), epochs=300, verbose=0)

    # Held-out fold: reuse the already fitted transformer.
    df_valid = df_train.iloc[valid_idx]
    ds_valid = to_tf_dataset(
        _assemble_inputs(df_valid, ct.transform),
        cat=X_nom,
        cont='Continuous',
    )

    # Validation score first, then training score. `ds_train` is unshuffled here,
    # so its predictions line up with `df_cv_train.index`.
    valid_score = score(
        df_valid,
        pd.Series(np.squeeze(r_model.predict(ds_valid.batch(1024))), index=df_valid.index),
    )
    train_score = score(
        df_cv_train,
        pd.Series(np.squeeze(r_model.predict(ds_train.batch(1024))), index=df_cv_train.index),
    )
    print(valid_score, train_score)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 134ms/step
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step
0.6303702070881629 0.7085270405771744
