In [1]:
import numpy as np 
import sys
sys.path.append("../")
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputClassifier

In [2]:
from pyhealth.datasets import MIMIC3BaseDataset, MIMIC4BaseDataset, eICUBaseDataset, OMOPBaseDataset
base_dataset = MIMIC3BaseDataset(root="../srv/local/data/physionet.org/files/mimiciii/1.4")
# base_dataset = eICUBaseDataset(root="/srv/local/data/physionet.org/files/eicu-crd/2.0")
# base_dataset = MIMIC4BaseDataset(root="/srv/local/data/physionet.org/files/mimiciv/2.0/hosp")
# base_dataset = OMOPBaseDataset(root="/srv/local/data/zw12/pyhealth/raw_data/synpuf1k_omop_cdm_5.2.2")
base_dataset.info()

  from .autonotebook import tqdm as notebook_tqdm



        ----- Output Data Structure -----
        Dataset.patients: [
            {
                patient_id: patient_id, 
                visits: [
                    {
                        visit_id: visit_id, 
                        patient_id: patient_id, 
                        conditions: [List], 
                        procedures: [List],
                        drugs: [List],
                        visit_info: <dict>
                    }
                    ...
                ]                    
            } 
            ...
        ]
        


In [3]:
from pyhealth.tasks import DrugRecDataset
drug_rec_dataset = DrugRecDataset(base_dataset)
drug_rec_dataset.info()


        ----- Output Data Structure -----
        >> drug_rec_dataloader[0]
        >> {
            "conditions": List[tensor],
            "procedures": List[tensor],
            "drugs": List[tensor]
        }
        


In [5]:
voc_size = drug_rec_dataset.voc_size
params = drug_rec_dataset.params

In [6]:
import torch.nn as nn

condition_embedding = nn.Sequential(
    nn.Embedding(voc_size[0], 64, padding_idx=0),
    nn.Dropout(0.5)
)
condition_embedding

Sequential(
  (0): Embedding(4493, 64, padding_idx=0)
  (1): Dropout(p=0.5, inplace=False)
)

In [7]:
procedure_embedding = nn.Sequential(
    nn.Embedding(voc_size[1], 64, padding_idx=0),
    nn.Dropout(0.5)
)
procedure_embedding

Sequential(
  (0): Embedding(1414, 64, padding_idx=0)
  (1): Dropout(p=0.5, inplace=False)
)

In [8]:
from pyhealth.data import split

drug_rec_trainset, drug_rec_valset, drug_rec_testset = split.random_split(drug_rec_dataset, [0.8, 0.1, 0.1])

In [9]:
drug_rec_trainset[4358]['conditions']

tensor([[ 729,  749, 1729, 2184, 4271, 1355, 1396, 2020,  330,    0,    0,    0,
            0,    0,    0,    0,    0],
        [4060,  727,  729, 3211, 2355, 2893, 1356, 2277, 1573, 2633, 4187, 3270,
         2184, 1355, 1251, 4271, 1863]])

In [10]:
visit_embs = []

for i in range(len(drug_rec_dataset)):
    # visit embedding
    condition_emb = condition_embedding(drug_rec_dataset[i]['conditions']).sum(dim=1).data
    procedure_emb = condition_embedding(drug_rec_dataset[i]['procedures']).sum(dim=1).data
    visit_embs.append(condition_emb + procedure_emb)
    
visit_embs

[tensor([[ -3.7773,  -0.2611,   4.0339,  -3.2936,   7.4869,  -4.4535,  -3.6538,
           -0.7238,   2.7240, -10.0644,  -8.8666,  -5.3387,   1.2326,  -5.3586,
            0.5824,   5.9301,   6.2032,   0.8641,   6.7291,   2.9548,   8.8134,
           -4.3814,   1.2816,  -2.5387,   2.3212,  -4.6174,   4.2718,  -7.1542,
            1.6010,  -3.1620,   4.0536,  -2.4989,  11.2558,  -5.9065,   8.9879,
           -5.1521,  -7.3495,   6.5376,  -3.1233,   7.3611,   2.7586,  -2.9682,
           -0.5153,  -5.3166,  -5.0828,  -9.8227,   6.6545,  -0.4509,  -4.9117,
           10.9712,  -5.4510,   2.3474,  -0.6204,   0.6451,  10.0312,   1.3155,
           -8.1573,   1.0863, -10.4755,  -8.6337,   2.8043,   4.9470,   8.8106,
            1.1031],
         [ -0.9105,  -0.6484,  -6.6904, -10.6134,  -1.7157,   0.4591,   6.8852,
            2.0961,  -6.5080,  -3.1632,  -2.1362, -19.1507,  -0.9890,  -7.5807,
          -12.4302,  -9.0467,  -0.9683,  -4.9285,  -0.7644,   4.1795,  -9.0441,
           -0.8248,

In [11]:
import torch

x_emb = []
y_emb = []
for patient in range(len(visit_embs)):
    for visit in range(len(visit_embs[patient])):
        x_emb.append(visit_embs[patient][visit].numpy())
        
        #drug multi-hot
        drugs_index = drug_rec_dataset[patient]['drugs'][visit]
        drugs_multihot = torch.zeros(1, voc_size[2])
        drugs_multihot[0][drugs_index] = 1
        y_emb.append(drugs_multihot[0].numpy())
        
X = np.array(x_emb, dtype=float)
y = np.array(y_emb, dtype=int)

X, y

(array([[ -3.77727795,  -0.26114416,   4.03388071, ...,   4.94697857,
           8.81057072,   1.10313153],
        [ -0.9105463 ,  -0.64835763,  -6.69037533, ...,  -2.22526574,
          -4.71081781,   3.87059426],
        [  1.11152196, -10.91492462, -11.82688427, ...,  -2.18885994,
           5.74045515,  10.59307384],
        ...,
        [ -0.94089699,  -5.83208179, -10.8504858 , ...,  -4.45184755,
          -0.96997786,   5.41622829],
        [  1.37252712,  -4.41866255,   2.5829215 , ...,   3.87589455,
         -12.08145142,  -0.78835368],
        [  7.42126179,  -4.10752869,  -1.52292609, ...,   3.26835394,
          -0.51680517,   7.99957561]]),
 array([[1, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [1, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0]]))

In [12]:
idx = (int)(len(X) * 0.9)
X_train, X_test = X[:idx], X[idx:]
y_train, y_test = y[:idx], y[idx:]


In [13]:
from sklearn.metrics import log_loss

xgb_estimator = XGBClassifier(objective='binary:logistic',tree_method='gpu_hist')
predictor = MultiOutputClassifier(xgb_estimator)

In [15]:
oof_preds = np.zeros(y_train.shape)
test_preds = np.zeros((X_test.shape[0], y_test.shape[1]))
oof_losses = []
kf = KFold(n_splits=5)
for fn, (trn_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    print('Starting fold: ', fn)
    X_train_, X_val = X_train[trn_idx], X_train[val_idx]
    y_train_, y_val = y_train[trn_idx], y_train[val_idx]
    
    predictor.fit(X_train_, y_train_)
    val_preds = predictor.predict_proba(X_val) # list of preds per class
    val_preds = np.array(val_preds)[:,:,1].T # take the positive class
    oof_preds[val_idx] = val_preds
    
    loss = log_loss(np.ravel(y_val), np.ravel(val_preds))
    oof_losses.append(loss)
    preds = predictor.predict_proba(X_test)
    preds = np.array(preds)[:,:,1].T # take the positive class
    test_preds += preds / 5
    
print(oof_losses)
print('Mean OOF loss across folds', np.mean(oof_losses))
print('STD OOF loss across folds', np.std(oof_losses))

Starting fold:  0
Starting fold:  1
Starting fold:  2
Starting fold:  3
Starting fold:  4
[0.05961695796360882, 0.05936902598285493, 0.056277131193901174, 0.05809330447707838, 0.057003239936498896]
Mean OOF loss across folds 0.058071931910788445
STD OOF loss across folds 0.0012987245644495602


In [19]:
predict_result = predictor.predict(X_test)
predict_result

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [20]:
log_loss(predict_result, y_test)

199.16497665307494