In [1]:
import numpy as np 
import sys
sys.path.append("../")
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputClassifier

In [2]:
# Build the base dataset object from the raw MIMIC-III 1.4 files and print
# the structure description that `info()` emits.
# The commented-out lines are drop-in alternatives for other data sources
# (eICU, MIMIC-IV, OMOP); enable exactly one `base_dataset = ...` line.
# NOTE(review): the active root is a *relative* path ("../srv/...") while the
# commented alternatives all use the absolute "/srv/..." prefix — confirm the
# relative form is intended for this working directory.
from pyhealth.datasets import MIMIC3BaseDataset, MIMIC4BaseDataset, eICUBaseDataset, OMOPBaseDataset
base_dataset = MIMIC3BaseDataset(root="../srv/local/data/physionet.org/files/mimiciii/1.4")
# base_dataset = eICUBaseDataset(root="/srv/local/data/physionet.org/files/eicu-crd/2.0")
# base_dataset = MIMIC4BaseDataset(root="/srv/local/data/physionet.org/files/mimiciv/2.0/hosp")
# base_dataset = OMOPBaseDataset(root="/srv/local/data/zw12/pyhealth/raw_data/synpuf1k_omop_cdm_5.2.2")
base_dataset.info()

  from .autonotebook import tqdm as notebook_tqdm



        ----- Output Data Structure -----
        Dataset.patients: [
            {
                patient_id: patient_id, 
                visits: [
                    {
                        visit_id: visit_id, 
                        patient_id: patient_id, 
                        conditions: [List], 
                        procedures: [List],
                        drugs: [List],
                        visit_info: <dict>
                    }
                    ...
                ]                    
            } 
            ...
        ]
        


In [3]:
# Wrap the base dataset in the drug-recommendation task dataset and print
# the per-sample structure that `info()` reports.
# Later cells index samples by the keys 'conditions', 'procedures' and 'drugs'.
from pyhealth.tasks import DrugRecDataset
drug_rec_dataset = DrugRecDataset(base_dataset)
drug_rec_dataset.info()


        ----- Output Data Structure -----
        >> drug_rec_dataloader[0]
        >> {
            "conditions": List[tensor],
            "procedures": List[tensor],
            "drugs": List[tensor]
        }
        


In [4]:
# Instantiate the pyhealth model wrapper for the drug-recommendation dataset,
# selecting the backend by the string 'XGBoost'.
# NOTE(review): the variable is named `xgboost`, the same as the third-party
# package that `XGBClassifier` is imported from; a later `import xgboost`
# would rebind this name — consider a more specific name.
from pyhealth.models import BaseModel

xgboost = BaseModel(drug_rec_dataset, 'XGBoost')

In [None]:
# Train the wrapped model.
# NOTE(review): 0.9 is presumably the fraction of data used for training —
# confirm against BaseModel.train's signature.
xgboost.train(0.9)

Starting fold:  0


In [None]:
# Run prediction with the trained wrapper; as the last expression of the cell
# the return value is displayed but not stored in a variable.
xgboost.predict()

In [None]:
# Randomly partition the task dataset into three subsets using the ratios
# [0.8, 0.1, 0.1], bound to train / validation / test names respectively.
# NOTE(review): no seed is passed here, so the partition may differ between
# runs — confirm whether `random_split` accepts or relies on a seed.
# NOTE(review): the manual pipeline in the following cells builds X / y from
# `drug_rec_dataset` directly and does not use these three subsets.
from pyhealth.data import split

drug_rec_trainset, drug_rec_valset, drug_rec_testset = split.random_split(drug_rec_dataset, [0.8, 0.1, 0.1])

In [None]:
# Spot-check: display the 'conditions' field of one training sample.
# NOTE(review): 4358 is a hard-coded index; it raises if the training subset
# has fewer samples than that.
drug_rec_trainset[4358]['conditions']

In [None]:
# Build one embedding tensor per dataset sample: the embedded condition codes
# reduced with a sum over dim=1, plus the same reduction applied to the
# embedded procedure codes.
# NOTE(review): `condition_embedding` is not created in this cell; it has to
# exist in the kernel already — confirm where it is defined.
# NOTE(review): the procedure codes are also passed through
# `condition_embedding` — confirm this is intended rather than a dedicated
# procedure embedding.
visit_embs = [
    condition_embedding(drug_rec_dataset[row]['conditions']).sum(dim=1).data
    + condition_embedding(drug_rec_dataset[row]['procedures']).sum(dim=1).data
    for row in range(len(drug_rec_dataset))
]

visit_embs

In [None]:
import torch

# Flatten the per-sample embeddings into one row per visit (features) and
# build the matching multi-hot drug vector for each visit (targets).
# NOTE(review): `voc_size` is not created in this cell; `voc_size[2]` is used
# as the width of the drug multi-hot vector — confirm where it is defined.
x_emb, y_emb = [], []
for patient_idx, patient_visits in enumerate(visit_embs):
    for visit_idx, visit_vec in enumerate(patient_visits):
        x_emb.append(visit_vec.numpy())

        # Multi-hot target: 1 at every drug index recorded for this visit.
        prescribed = drug_rec_dataset[patient_idx]['drugs'][visit_idx]
        target = torch.zeros(voc_size[2])
        target[prescribed] = 1
        y_emb.append(target.numpy())

# Stack into dense arrays: float features, integer 0/1 labels.
X = np.array(x_emb, dtype=float)
y = np.array(y_emb, dtype=int)

X, y

In [None]:
# Sequential hold-out split: the first 90% of rows (in their existing order)
# are used for training, the remaining rows for testing. No shuffling is done.
idx = int(len(X) * 0.9)
X_train, y_train = X[:idx], y[:idx]
X_test, y_test = X[idx:], y[idx:]


In [None]:
from sklearn.metrics import log_loss

# Base learner: a binary-logistic XGBoost classifier using the GPU histogram
# tree method (requires a CUDA-enabled XGBoost build).
# NOTE(review): XGBoost 2.0 deprecates tree_method='gpu_hist' in favour of
# tree_method='hist' together with device='cuda' — confirm the installed
# version before changing.
xgb_estimator = XGBClassifier(
    objective='binary:logistic',
    tree_method='gpu_hist',
)
# Multi-label wrapper around the base learner, one target per column of y.
predictor = MultiOutputClassifier(estimator=xgb_estimator)

In [None]:
# K-fold cross-validation of the multi-output classifier.
#
# For each fold the predictor is refit on the training part, scored on the
# held-out part (out-of-fold log loss), and used to predict the test set.
# Test-set probabilities are averaged over the folds.

# Single source of truth for the fold count: it configures KFold *and* is the
# divisor for the test-prediction average, so the two can never drift apart.
N_SPLITS = 5

oof_preds = np.zeros(y_train.shape)  # out-of-fold probabilities, same shape as y_train
test_preds = np.zeros((X_test.shape[0], y_test.shape[1]))  # fold-averaged test probabilities
oof_losses = []
kf = KFold(n_splits=N_SPLITS)
for fn, (trn_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    print('Starting fold: ', fn)
    X_train_, X_val = X_train[trn_idx], X_train[val_idx]
    y_train_, y_val = y_train[trn_idx], y_train[val_idx]

    # fit() retrains the same `predictor` object from scratch on this fold.
    predictor.fit(X_train_, y_train_)
    val_preds = predictor.predict_proba(X_val)  # list with one probability array per output
    val_preds = np.array(val_preds)[:, :, 1].T  # keep the positive-class column, one column per output
    oof_preds[val_idx] = val_preds

    # Pass labels explicitly so log_loss does not raise when the flattened
    # validation targets happen to contain only one of the two classes.
    loss = log_loss(np.ravel(y_val), np.ravel(val_preds), labels=[0, 1])
    oof_losses.append(loss)
    preds = predictor.predict_proba(X_test)
    preds = np.array(preds)[:, :, 1].T  # keep the positive-class column, one column per output
    test_preds += preds / N_SPLITS  # running mean over folds

print(oof_losses)
print('Mean OOF loss across folds', np.mean(oof_losses))
print('STD OOF loss across folds', np.std(oof_losses))

In [None]:
# Hard label predictions for the held-out test rows.
# NOTE(review): `predictor` is refit inside the cross-validation loop, so at
# this point it holds only the fit from the last fold; these predictions are
# therefore not the fold-averaged probabilities accumulated in `test_preds`.
predict_result = predictor.predict(X_test)
