In [1]:
import os
import joblib
import numpy as np
import pandas as pd
import polars as pl

import pandas.api.types
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import lifelines
from lifelines.utils import concordance_index

import sys

# Record the interpreter and library versions this notebook was run with.
print(sys.version)
for i in [np, pd, pl, mpl, sns, lifelines, sklearn, lgb, xgb, cb]:
    # Some packages may not expose `__version__`; fall back to the bare name.
    # Only AttributeError is handled so that Ctrl-C / SystemExit are not
    # silently swallowed the way a bare `except:` would.
    try:
        print(i.__name__, i.__version__)
    except AttributeError:
        print(i.__name__)

3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]
numpy 1.26.4
pandas 2.2.3
polars 1.12.0
matplotlib 3.8.4
seaborn 0.13.2
lifelines 0.30.0
sklearn 1.5.2
lightgbm
xgboost 2.1.2
catboost 1.2.5


In [2]:
import dproc, sgutil, sgpp, sgml, custpp
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, KFold, ShuffleSplit, train_test_split
import tensorflow as tf

2025-02-24 09:06:31.604052: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740387991.739618   70200 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740387991.790089   70200 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-24 09:06:32.131204: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Relative directories holding the raw CSV files and the persisted artefacts.
data_path = 'data'
model_path = 'model'

# Load the persisted `p3` object and run its `transform` on the training CSV.
# presumably `p3` is a fitted preprocessing pipeline -- verify against the code that saved it.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files from a trusted source.
p3 =joblib.load(os.path.join(model_path, 'p3.joblib'))
# `transform` receives a list of file paths; later cells use the result as a pandas DataFrame.
df_train = p3.transform([os.path.join(data_path, 'train.csv')])

In [5]:
# Feature-name groups used to select columns of `df_train` in later cells.

# Passed through the ColumnTransformer unchanged (together with X_tri and X_na).
# presumably two-valued columns, judging by the name -- TODO confirm.
X_bool = ['graft_type', 'prod_type']
# presumably three-state flags (e.g. yes / no / unknown) -- TODO confirm against the p3 pipeline.
X_tri = [
    'arrhythmia', 'cardiac', 'diabetes', 'hepatic_mild', 'hepatic_severe',
    'in_vivo_tcd', 'melphalan_dose', 'mrd_hct', 'obesity', 'peptic_ulcer',
    'prior_tumor', 'psych_disturb', 'pulm_moderate', 'pulm_severe', 'renal_issue',
    'rheum_issue', 'rituximab', 'vent_hist'
]
# Categorical columns: accessed through `.cat` later and fed to embedding layers.
X_nom = [
    'cmv_status', 'conditioning_intensity', 'cyto_score', 'cyto_score_detail', 'donor_related',
    'dri_score', 'ethnicity', 'gvhd_proph', 'prim_disease_hct', 'race_group', 'sex_match',
    'tbi_status', 'tce_div_match', 'tce_imm_match', 'tce_match'
]
# `<feature>_na` companion columns; presumably missing-value indicators -- TODO confirm.
X_na = [
    'arrhythmia_na', 'cardiac_na', 'diabetes_na', 'hepatic_mild_na', 'hepatic_severe_na',
    'obesity_na', 'peptic_ulcer_na', 'prior_tumor_na', 'psych_disturb_na', 'pulm_moderate_na',
    'pulm_severe_na', 'renal_issue_na', 'rheum_issue_na'
]
# Columns that are imputed and standard-scaled by the ColumnTransformer.
X_cont = ['age_at_hct', 'donor_age']
# Order matters: the ColumnTransformer cell slices this list, standard-scaling
# X_int[:-1] and min-max scaling the last element ('year_hct') via X_int[-1:].
X_int = [
    'comorbidity_score', 'hla_high_res_10', 'hla_high_res_6', 'hla_high_res_8', 'hla_low_res_10',
    'hla_low_res_6', 'hla_low_res_8', 'hla_match_a_high', 'hla_match_a_low', 'hla_match_b_high',
    'hla_match_b_low', 'hla_match_drb1_low', 'hla_match_c_high', 'hla_match_c_low', 'hla_match_dqb1_high', 'hla_match_dqb1_low',
    'hla_match_drb1_high', 'hla_nmdp_6', 'karnofsky_score', 'year_hct'
]
# Every feature name, concatenated group by group.
X_all = X_tri + X_cont + X_int + X_na + X_bool + X_nom

In [141]:
import tensorflow as tf
import sgnn
from functools import partial

class RiskModel(tf.keras.Model):
    """Embedding + dense network over mixed categorical / continuous inputs.

    Each categorical feature gets its own embedding layer; the embedded
    vectors are concatenated with the continuous block, passed through a
    stack of Dense layers and finally through a sigmoid.

    Parameters
    ----------
    emb_cols : iterable of (name, cardinality, emb_size)
        One triple per categorical feature. `name` is both the input key and
        the embedding layer name, `cardinality` is the embedding `input_dim`
        and `emb_size` its `output_dim`.
    cont_var : hashable
        Key under which the continuous feature block is found in the input.
    d_layers : iterable of dict
        Keyword arguments for each `tf.keras.layers.Dense` layer, in order.
    **argv
        Accepted for call-site compatibility and ignored.
        NOTE(review): consider forwarding these to `super().__init__` if
        callers are expected to pass `name=` / `dtype=`.
    """

    def __init__(self, emb_cols, cont_var, d_layers , **argv):
        super().__init__()
        # One embedding per categorical feature, keyed by feature name.
        self.emb_layers = {
            v: tf.keras.layers.Embedding(c, s, name=v) for v, c, s in emb_cols
        }
        self.cc = tf.keras.layers.Concatenate(axis=-1)
        self.d_model = tf.keras.models.Sequential([
            tf.keras.layers.Dense(**params) for params in d_layers
        ])
        self.cont_var = cont_var
        self.sigmoid = tf.keras.layers.Activation('sigmoid')

    def call(self, X):
        """Forward pass.

        Implemented as `call` (not `__call__`) so that invoking the model goes
        through Keras's own `Layer.__call__`, which handles building and the
        rest of the framework bookkeeping before delegating here.

        Parameters
        ----------
        X : mapping
            Maps every embedding name to its index tensor and `cont_var` to
            the continuous feature tensor.

        Returns
        -------
        Tensor
            Sigmoid of the dense stack applied to the concatenated features.
        """
        # Each embedding output has an extra axis at position -2 which is
        # squeezed away (assumes index tensors carry a trailing length-1 axis,
        # as produced by `np.expand_dims(..., axis=-1)` -- TODO confirm).
        embedded = [
            tf.squeeze(layer(X[name]), axis=-2)
            for name, layer in self.emb_layers.items()
        ]
        X_cc = self.cc(embedded + [X[self.cont_var]])
        return self.sigmoid(self.d_model(X_cc))

def to_tf_dataset(X, Y=None, sample_weights=None, cat=[], cont='Continuous'):
    """Build a `tf.data.Dataset` from a feature frame and optional targets / weights.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. The columns named in `cat` are looked up by name; the
        continuous block is taken positionally as every column after the first
        `len(cat)` columns, so the categorical columns must come first.
    Y : pandas.Series, pandas.DataFrame, array-like or None
        Targets. When None the dataset yields the feature dict only.
    sample_weights : pandas.Series, pandas.DataFrame, array-like or None
        Optional per-sample weights; only used when `Y` is given.
    cat : sequence of str
        Names of the categorical columns. The default list is never mutated.
    cont : hashable or None
        Dict key under which the continuous block is stored; None skips it.

    Returns
    -------
    tf.data.Dataset
        Elements are `features`, `(features, y)` or `(features, y, weights)`,
        where `features` is a dict keyed by the `cat` names plus `cont`.
    """
    def _as_array(obj):
        # pandas containers (including subclasses) -> underlying ndarray;
        # anything else is handed to TensorFlow unchanged.
        return obj.values if isinstance(obj, (pd.Series, pd.DataFrame)) else obj

    # Each categorical column gets a trailing length-1 axis.
    features = {name: np.expand_dims(X[name], axis=-1) for name in cat}
    if cont is not None:
        features[cont] = X.iloc[:, len(cat):].astype('float32')

    if Y is None:
        return tf.data.Dataset.from_tensor_slices(features)
    if sample_weights is None:
        return tf.data.Dataset.from_tensor_slices((features, _as_array(Y)))
    # Y and sample_weights are normalised independently, so a pandas Y may be
    # combined with ndarray weights and vice versa.
    return tf.data.Dataset.from_tensor_slices(
        (features, _as_array(Y), _as_array(sample_weights))
    )

def nn_cat_param(df, name, size):
    """Return the `(name, cardinality, size)` triple for a categorical column.

    `cardinality` is the number of categories declared on `df[name]`
    (not the number of distinct values present); `size` is passed through.
    """
    n_categories = len(df[name].cat.categories)
    return (name, n_categories, size)

# Adapter that wires `to_tf_dataset` into the sgnn classifier wrapper.
# `cont` must be a single hashable dict key (a list would raise
# "unhashable type: 'list'" inside `to_tf_dataset`), and the categorical
# columns must be the ones of this dataset, i.e. `X_nom`.
# NOTE(review): `sgnn.NNAdapter` is project code not visible here; confirm it
# only forwards `to_tf_dataset` as a callable.
nn_adapter = sgnn.NNAdapter(
    sgnn.NNClassifier,
    to_tf_dataset=partial(to_tf_dataset, cat=X_nom, cont='Continuous')
)

In [142]:
# Build one (name, cardinality, emb_size) tuple per nominal feature:
# cardinality comes from the categorical dtype, emb_size is hand-picked.
s_emb = (
    df_train[X_nom]
    .apply(lambda col: len(col.cat.categories))
    .rename('cardinality')
    .to_frame()
    .join(
        pd.Series({
            'cmv_status': 3, 'conditioning_intensity': 3, 'cyto_score':3, 'cyto_score_detail': 3, 'donor_related': 2,
            'dri_score': 4, 'ethnicity': 2, 'gvhd_proph': 4, 'prim_disease_hct': 4, 'race_group': 3, 'sex_match': 3,
            'tbi_status': 3, 'tce_div_match': 3, 'tce_imm_match': 3, 'tce_match': 3
        }, name = 'emb_size')
    )
    .reset_index()        # feature name becomes the first column
    .apply(tuple, axis=1)  # one tuple per row
)
s_emb

0                 (cmv_status, 5, 3)
1     (conditioning_intensity, 7, 3)
2                 (cyto_score, 8, 3)
3          (cyto_score_detail, 6, 3)
4              (donor_related, 4, 2)
5                 (dri_score, 11, 4)
6                  (ethnicity, 4, 2)
7                (gvhd_proph, 17, 4)
8          (prim_disease_hct, 18, 4)
9                 (race_group, 6, 3)
10                 (sex_match, 5, 3)
11                (tbi_status, 8, 3)
12             (tce_div_match, 5, 3)
13             (tce_imm_match, 9, 3)
14                 (tce_match, 5, 3)
dtype: object

In [143]:
# Instantiate the network: one embedding per nominal feature and a single
# 32-unit ReLU dense layer on top of the concatenated features.
r_model = RiskModel(
    emb_cols=s_emb.tolist(),
    cont_var='Continuous',
    d_layers=[dict(units=32, activation='relu', kernel_initializer='he_uniform')],
)

In [144]:
# Inspect the per-feature embedding layers (they stay unbuilt until the model is first called).
r_model.emb_layers

{'cmv_status': <Embedding name=cmv_status, built=False>,
 'conditioning_intensity': <Embedding name=conditioning_intensity, built=False>,
 'cyto_score': <Embedding name=cyto_score, built=False>,
 'cyto_score_detail': <Embedding name=cyto_score_detail, built=False>,
 'donor_related': <Embedding name=donor_related, built=False>,
 'dri_score': <Embedding name=dri_score, built=False>,
 'ethnicity': <Embedding name=ethnicity, built=False>,
 'gvhd_proph': <Embedding name=gvhd_proph, built=False>,
 'prim_disease_hct': <Embedding name=prim_disease_hct, built=False>,
 'race_group': <Embedding name=race_group, built=False>,
 'sex_match': <Embedding name=sex_match, built=False>,
 'tbi_status': <Embedding name=tbi_status, built=False>,
 'tce_div_match': <Embedding name=tce_div_match, built=False>,
 'tce_imm_match': <Embedding name=tce_imm_match, built=False>,
 'tce_match': <Embedding name=tce_match, built=False>}

In [121]:
from sklearn.impute import SimpleImputer

In [122]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder

# Numeric preprocessing:
#   std -> mean-impute + standardise the continuous and integer features
#          (everything in X_int except its last element),
#   mm  -> mean-impute + min-max scale the last element of X_int,
#   pt  -> pass the boolean / tri-state / *_na columns through unchanged.
ct = ColumnTransformer(
    transformers=[
        ('std', make_pipeline(SimpleImputer(), StandardScaler()), X_cont + X_int[:-1]),
        ('mm', make_pipeline(SimpleImputer(), MinMaxScaler()), X_int[-1:]),
        ('pt', 'passthrough', X_bool + X_tri + X_na),
    ]
)
# Emit pandas DataFrames instead of numpy arrays.
ct = ct.set_output(transform='pandas')

In [125]:
# Training dataset: integer category codes first (to_tf_dataset takes the
# continuous block positionally after the categorical columns), followed by
# the transformed numeric columns; targets are (efs_time, efs).
ds_train = to_tf_dataset(
    pd.concat(
        [
            df_train[X_nom].apply(lambda col: col.cat.codes),
            ct.fit_transform(df_train[X_cont + X_int + X_bool + X_tri + X_na]),
        ],
        axis=1,
    ),
    df_train[['efs_time', 'efs']],
    cat=X_nom,
    cont='Continuous',
)

In [162]:
# Smoke-test the model on a single batch and print, for each target column,
# the two square grids used for pairwise comparisons:
#   A[r, c] = value of sample r,  B[r, c] = value of sample c.
for i in ds_train.batch(32).take(1):
    features, targets = i
    r_model(features)  # forward pass only; the output is not used here
    # Use the actual batch size rather than a hard-coded 32 so the grids stay
    # square (and A/B stay shape-compatible) if the batch is shorter.
    n = tf.shape(targets)[0]
    for col in range(2):  # column 0 = efs_time, column 1 = efs
        A = tf.reshape(tf.repeat(targets[:, col], n), (-1, n))
        B = tf.transpose(A)
        print(A, B)

tf.Tensor(
[[42.356 42.356 42.356 ... 42.356 42.356 42.356]
 [ 4.672  4.672  4.672 ...  4.672  4.672  4.672]
 [19.793 19.793 19.793 ... 19.793 19.793 19.793]
 ...
 [ 8.246  8.246  8.246 ...  8.246  8.246  8.246]
 [18.303 18.303 18.303 ... 18.303 18.303 18.303]
 [ 3.807  3.807  3.807 ...  3.807  3.807  3.807]], shape=(32, 32), dtype=float32) tf.Tensor(
[[42.356  4.672 19.793 ...  8.246 18.303  3.807]
 [42.356  4.672 19.793 ...  8.246 18.303  3.807]
 [42.356  4.672 19.793 ...  8.246 18.303  3.807]
 ...
 [42.356  4.672 19.793 ...  8.246 18.303  3.807]
 [42.356  4.672 19.793 ...  8.246 18.303  3.807]
 [42.356  4.672 19.793 ...  8.246 18.303  3.807]], shape=(32, 32), dtype=float32)
tf.Tensor(
[[0. 0. 0. ... 0. 0. 0.]
 [1. 1. 1. ... 1. 1. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 1. 1. ... 1. 1. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 1. 1. ... 1. 1. 1.]], shape=(32, 32), dtype=float32) tf.Tensor(
[[0. 1. 0. ... 1. 0. 1.]
 [0. 1. 0. ... 1. 0. 1.]
 [0. 1. 0. ... 1. 0. 1.]
 ...
 [0. 1. 0. ... 1. 0. 1.]

TypeError: '_BatchDataset' object is not subscriptable

In [101]:
# Debug view: the frame handed to `to_tf_dataset` (category codes followed by
# the transformed numeric columns).
pd.concat(
    [
        df_train[X_nom].apply(lambda col: col.cat.codes),
        ct.fit_transform(df_train[X_cont + X_int + X_bool + X_tri + X_na]),
    ],
    axis=1,
)

ValueError: A given column is not a column of the dataframe

In [104]:
# Debug check: run the ColumnTransformer on its own to isolate column-selection errors.
# NOTE(review): the recorded ValueError presumably comes from an earlier definition of `ct` -- re-run to confirm.
ct.fit_transform(df_train[X_cont + X_int + X_bool + X_tri + X_na])

ValueError: A given column is not a column of the dataframe

In [107]:
# Debug check: confirm that every requested numeric / flag column exists in df_train.
df_train[X_cont + X_int + X_bool + X_tri + X_na]

Unnamed: 0_level_0,age_at_hct,donor_age,comorbidity_score,hla_high_res_10,hla_high_res_6,hla_high_res_8,hla_low_res_10,hla_low_res_6,hla_low_res_8,hla_match_a_high,...,hepatic_mild_na,hepatic_severe_na,obesity_na,peptic_ulcer_na,prior_tumor_na,psych_disturb_na,pulm_moderate_na,pulm_severe_na,renal_issue_na,rheum_issue_na
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,9.942000,,0.0,,6.0,,10.0,6.0,8.0,2.0,...,False,False,False,False,False,False,False,False,False,False
1,43.705002,72.290001,3.0,10.0,6.0,8.0,10.0,6.0,8.0,2.0,...,False,False,False,False,False,False,False,False,False,False
2,33.997002,,0.0,10.0,6.0,8.0,10.0,6.0,8.0,2.0,...,False,False,False,False,False,False,False,False,False,False
3,43.244999,29.230000,0.0,10.0,6.0,8.0,10.0,6.0,8.0,2.0,...,False,False,False,False,False,False,False,False,False,False
4,29.740000,56.810001,1.0,10.0,6.0,8.0,10.0,6.0,8.0,2.0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28795,51.136002,24.212000,0.0,10.0,6.0,8.0,10.0,6.0,8.0,2.0,...,True,False,False,False,False,True,False,True,False,True
28796,18.075001,30.770000,3.0,6.0,3.0,4.0,8.0,5.0,6.0,1.0,...,False,False,False,False,False,False,False,False,False,False
28797,51.005001,22.627001,5.0,10.0,6.0,8.0,10.0,6.0,8.0,2.0,...,True,False,False,True,False,True,False,True,True,True
28798,0.044000,58.074001,1.0,5.0,3.0,4.0,5.0,3.0,4.0,1.0,...,False,False,True,True,False,False,False,True,True,False


In [106]:
# Debug check: the full list of column names passed to the ColumnTransformer.
X_cont + X_int + X_bool + X_tri + X_na

['age_at_hct',
 'donor_age',
 'comorbidity_score',
 'hla_high_res_10',
 'hla_high_res_6',
 'hla_high_res_8',
 'hla_low_res_10',
 'hla_low_res_6',
 'hla_low_res_8',
 'hla_match_a_high',
 'hla_match_a_low',
 'hla_match_b_high',
 'hla_match_b_low',
 'hla_match_drb1_low',
 'hla_match_c_high',
 'hla_match_c_low',
 'hla_match_dqb1_high',
 'hla_match_dqb1_low',
 'hla_match_drb1_high',
 'hla_nmdp_6',
 'karnofsky_score',
 'year_hct',
 'graft_type',
 'prod_type',
 'arrhythmia',
 'cardiac',
 'diabetes',
 'hepatic_mild',
 'hepatic_severe',
 'in_vivo_tcd',
 'melphalan_dose',
 'mrd_hct',
 'obesity',
 'peptic_ulcer',
 'prior_tumor',
 'psych_disturb',
 'pulm_moderate',
 'pulm_severe',
 'renal_issue',
 'rheum_issue',
 'rituximab',
 'vent_hist',
 'arrhythmia_na',
 'cardiac_na',
 'diabetes_na',
 'hepatic_mild_na',
 'hepatic_severe_na',
 'obesity_na',
 'peptic_ulcer_na',
 'prior_tumor_na',
 'psych_disturb_na',
 'pulm_moderate_na',
 'pulm_severe_na',
 'renal_issue_na',
 'rheum_issue_na']