In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import os, sys
import itertools
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
import statsmodels.stats.api as sms

In [2]:
# Set CUDA_VISIBLE_DEVICES to an empty string in this kernel's environment.
# NOTE(review): presumably meant to keep the TensorFlow backend off the GPUs
# while this notebook only prepares job files and reads results — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
# Absolute path ending in Data/NRD/ (presumably the NRD data directory — confirm).
# NOTE(review): no cell visible in this part of the notebook reads `path`;
# check whether it is still needed.
path = '/nfs/turbo/umms-awaljee/wsliu/Data/NRD/'

In [3]:
# Put the project checkout and its NRD sub-directory on the import path.
module_path = '/home/wsliu/Codes/DLproj'
for import_dir in (module_path, module_path+'/NRD'):
    # Append only when missing, so re-running the cell never adds duplicates.
    if import_dir not in sys.path:
        sys.path.append(import_dir)

In [4]:
from ccs_tools import core_dtypes_pd
from utils import preprocess

Using TensorFlow backend.


## Writing the PBS Job Files

In [14]:
!rm batch_gpu*.pbs

In [38]:
# Settings shared by every generated PBS file. They were previously anonymous
# positional arguments to str.format; naming them keeps each value next to its
# meaning and makes them tunable in one place.
N_PBS_JOBS = 12        # one batch_gpu<i>.pbs per hypertune<i>.sh script
GPUS_PER_JOB = 1
MEM_GB = 8
WALLTIME_HOURS = 48

# Body of each PBS file. Named fields tie every substituted value to the
# directive it fills; the rendered text is unchanged from the positional form.
PBS_TEMPLATE = """#!/bin/sh

#### PBS preamble

#PBS -N NRD_hypertune{job_index}

#PBS -M wsliu@umich.edu
#PBS -m abe

#PBS -A awaljee_fluxg

#PBS -q fluxg

#PBS -V
#PBS -l nodes=1:gpus={gpus},mem={mem_gb}gb,walltime={walltime_hours}:00:00
#PBS -j oe

#### End PBS preamble

#  Show list of CPUs you ran on, if you're running under PBS
if [ -n "$PBS_NODEFILE" ]; then cat $PBS_NODEFILE; fi

#  Change to the directory you submitted from
if [ -n "$PBS_O_WORKDIR" ]; then cd $PBS_O_WORKDIR; fi

#  Put your job commands here:
sh hypertune{job_index}.sh"""

# Write batch_gpu0.pbs ... batch_gpu<N_PBS_JOBS-1>.pbs; 'w' mode overwrites any
# existing file of the same name.
for job_index in range(N_PBS_JOBS):
    with open('batch_gpu{}.pbs'.format(job_index), 'w') as f:
        f.write(PBS_TEMPLATE.format(job_index=job_index, gpus=GPUS_PER_JOB,
                                    mem_gb=MEM_GB, walltime_hours=WALLTIME_HOURS))

## Prepare hyper-parameters and generate the .sh files

For embedding+NN with a subset of codes:

In [35]:
!rm hypertune*.sh

In [36]:
# Hyper-parameter grid whose result files are tagged ht_result1224.
# Each name holds the list of candidate values for one command-line option.

# Axes with more than one candidate (these are what the grid actually sweeps).
model_names = ['setsum_nn', 'setsum_deep2']
fc_widths = [256, 512, 1024]
batchsizes = [256, 512, 1024]
count_caps = [20, 100, 200]
DX_rarecutpoints = [10, 90, 180, 270]
PR_rarecutpoints = [10, 90, 180, 270]

# Axes held at a single value.
DX1_dims = [30]
DX_dims = [200]
PR_dims = [100]
lr1s = [2e-4]
lr2s = [2e-5]
dropouts = [0.3]
embed_files = ['pretrain']
tst_seeds = [0]
cohorts = ['ami']
val_folds = [7]
other_preds = [0]
ndxprs = [0]

# The '{}' is passed through unchanged on the command line.
# NOTE(review): presumably the training script fills it with the job index — confirm.
result_files = ['output/ht_result1224_{}.csv']

# Not part of this grid (disabled): md_widths, penalties, penalty_metrics.

In [26]:
# Hyper-parameter grid for the 'setsum_lr' model; result files are tagged
# ht_result1220. Each name holds the candidate values for one command-line option.
model_names = ['setsum_lr']
DX1_dims = [30, 50]
DX_dims = [200]
PR_dims = [100]
fc_widths = [0]
#md_widths = [128]
lr1s = [2e-4]
lr2s = [2e-5]
dropouts = [0.]
batchsizes = [512]
embed_files = ['pretrain']
#penalties = [0]
#penalty_metrics = ['cosine']
count_caps = [1, 20]
tst_seeds = [0]
cohorts = ['ami']
DX_rarecutpoints = [10]
# Half of each DX cut-point. Floor division keeps the values integers: true
# division would give 5.0, which the script-writing cells would then emit as
# `--pr_rarecutpoint 5.0`, whereas the result tables list integer cut-points.
PR_rarecutpoints = [drp // 2 for drp in DX_rarecutpoints]
val_folds = [7]
other_preds = [0]
ndxprs = [0, 1]
# The '{}' is passed through unchanged on the command line.
result_files = ['output/ht_result1220_{}.csv']

In [33]:
# Hyper-parameter grid whose result files are tagged ht_result1223.
# Each name holds the list of candidate values for one command-line option.

# Axes with more than one candidate (these are what the grid actually sweeps).
model_names = ['setsum_deep3', 'setsum_deep2']
fc_widths = [64, 128, 256, 512]
batchsizes = [256, 512]
count_caps = [20, 100, 200]
DX_rarecutpoints = [180, 360]
# Same list object as the DX cut-points (an alias, not a copy).
PR_rarecutpoints = DX_rarecutpoints

# Axes held at a single value.
DX1_dims = [20]
DX_dims = [100]
PR_dims = [50]
lr1s = [2e-4]
lr2s = [2e-5]
dropouts = [0.3]
embed_files = ['pretrain']
tst_seeds = [0]
cohorts = ['ami']
val_folds = [7]
other_preds = [0]
ndxprs = [0]

# The '{}' is passed through unchanged on the command line.
result_files = ['output/ht_result1223_{}.csv']

# Not part of this grid (disabled): md_widths, penalties, penalty_metrics.

In [37]:
# Cartesian product over every axis except PR_rarecutpoints: in this variant
# the PR cut-point is not swept on its own but copied from the DX cut-point.
para_itr = itertools.product(
    model_names, DX1_dims, DX_dims, PR_dims, fc_widths, lr1s, lr2s, dropouts, batchsizes, embed_files,
    count_caps, tst_seeds, cohorts, DX_rarecutpoints, other_preds, ndxprs, val_folds,
    result_files,
)
# Index of the DX cut-point inside each product tuple (14th axis above).
dxr_pos = 13
# Repeat the DX cut-point right after itself so every tuple has 19 fields,
# in the order the command-line template expects (dx cut-point, pr cut-point, ...).
para_lst = [combo[:dxr_pos + 1] + combo[dxr_pos:] for combo in para_itr]

In [16]:
# Full Cartesian grid: here DX and PR cut-points vary independently.
# Axis order matches the option order of the command-line template.
grid_axes = (
    model_names, DX1_dims, DX_dims, PR_dims, fc_widths, lr1s, lr2s, dropouts, batchsizes, embed_files,
    count_caps, tst_seeds, cohorts, DX_rarecutpoints, PR_rarecutpoints, other_preds, ndxprs, val_folds,
    result_files,
)
para_itr = itertools.product(*grid_axes)
# Materialise the iterator; this rebinds para_lst, replacing any list built earlier.
para_lst = [*para_itr]

In [38]:
# Number of parameter combinations currently held in para_lst.
len(para_lst)

216

In [39]:
# Deal the parameter tuples out round-robin: the i-th tuple is appended to
# hypertune<i % n_jobs>.sh as one command line.
n_jobs = 12
cmd_template = 'python template_sub_multispace1218.py --model_name {} --DX1_dim {} --DX_dim {} --PR_dim {} --fc_width {} --lr1 {} --lr2 {} --dropout {} --batchsize {} --embed_file {} --count_cap {} --tst_seed {} --cohort {} --dx_rarecutpoint {} --pr_rarecutpoint {} --other_pred {} --ndxpr {} --val_fold {} --result_file {} --job_index {}\n'
for position, para in enumerate(para_lst):
    job_ind = position % n_jobs
    # Append mode: each script accumulates the commands assigned to its job,
    # so files from an earlier run must be removed beforehand.
    with open('hypertune'+str(job_ind)+'.sh', 'a') as f:
        f.write(cmd_template.format(*para, job_ind))

In [16]:
# One-off: append only the first parameter tuple to the script of a single job.
job_ind = 10
for para in para_lst[0:1]:
    with open('hypertune'+str(job_ind)+'.sh', 'a') as f:
        # The option list has to match the 19-field tuples in para_lst one-for-one
        # (plus job_ind). str.format silently ignores surplus positional arguments,
        # so without `--ndxpr {}` the ndxpr value would land under --val_fold,
        # val_fold under --result_file, result_file under --job_index, and
        # job_ind would be dropped — all without raising.
        f.write('python template_sub_multispace1218.py --model_name {} --DX1_dim {} --DX_dim {} --PR_dim {} --fc_width {} --lr1 {} --lr2 {} --dropout {} --batchsize {} --embed_file {} --count_cap {} --tst_seed {} --cohort {} --dx_rarecutpoint {} --pr_rarecutpoint {} --other_pred {} --ndxpr {} --val_fold {} --result_file {} --job_index {}\n'.format(*para, job_ind))

Random search:

In [36]:
# Number of values drawn per hyper-parameter in the random search below,
# i.e. the number of sampled configurations.
n_sample = 40

In [37]:
# Random search: draw n_sample values per hyper-parameter; position i across
# all arrays forms the i-th configuration.
#
# A private, seeded generator is used instead of the unseeded global
# np.random state, so the sampled configurations can be regenerated exactly.
# Change SEARCH_SEED to draw a different set.
SEARCH_SEED = 0
rng = np.random.RandomState(SEARCH_SEED)

model_names = rng.choice(['setsum_nn'], n_sample)
DX1_dims = rng.choice([35], n_sample)
DX_dims = rng.choice([200], n_sample)
PR_dims = rng.choice([80], n_sample)
fc_widths = rng.choice([450], n_sample)
#md_widths = rng.choice([128, 256], n_sample)
lr1s = rng.choice([2e-4], n_sample)
lr2s = rng.choice([2e-5], n_sample)
dropouts = rng.choice([0.3], n_sample)
batchsizes = rng.choice([512], n_sample)
embed_files = rng.choice(['pretrain'], n_sample)
#penalties = rng.choice([0, 0.5, 1.], n_sample)
#penalty_metrics = rng.choice(['cosine'], n_sample)
count_caps = rng.choice([120], n_sample)
tst_seeds = rng.choice([0], n_sample)
cohorts = rng.choice(['ami'], n_sample)
DX_rarecutpoints = rng.choice([10], n_sample)
# Same array as the DX cut-points, so each configuration uses one cut-point for both.
PR_rarecutpoints = DX_rarecutpoints
val_folds = rng.choice([5], n_sample)
other_preds = rng.choice([0], n_sample)
ndxprs = rng.choice([0], n_sample)
# Not sampled: every configuration gets the same result-file pattern.
result_files = ['output/ht_result0101_{}.csv']*n_sample

In [38]:
# Pair up the i-th sample of every hyper-parameter into one 19-field tuple.
# Stored as a list rather than a bare zip iterator: an iterator is consumed by
# its first use, so re-running a later cell that reads `zips` would silently
# see no configurations at all.
zips = list(zip(model_names, DX1_dims, DX_dims, PR_dims, fc_widths, lr1s, lr2s, dropouts, batchsizes, embed_files,
                count_caps, tst_seeds, cohorts, DX_rarecutpoints, PR_rarecutpoints, other_preds, ndxprs, val_folds, result_files))

In [39]:
# Collect the zipped samples into a list of per-run parameter tuples
# (rebinds para_lst, replacing any grid built earlier).
para_lst = list(zips)

In [40]:
# Number of sampled configurations currently held in para_lst.
len(para_lst)

40

In [41]:
# Spread the sampled configurations over n_jobs scripts, one command line each,
# cycling through job indices 0 .. n_jobs-1.
# NOTE(review): the PBS cell earlier writes 12 batch files while only 8 scripts
# are filled here — confirm which batch files are meant to be submitted.
n_jobs = 8
command_line = 'python template_sub_multispace1218.py --model_name {} --DX1_dim {} --DX_dim {} --PR_dim {} --fc_width {} --lr1 {} --lr2 {} --dropout {} --batchsize {} --embed_file {} --count_cap {} --tst_seed {} --cohort {} --dx_rarecutpoint {} --pr_rarecutpoint {} --other_pred {} --ndxpr {} --val_fold {} --result_file {} --job_index {}\n'
job_cycle = itertools.cycle(range(n_jobs))
for para in para_lst:
    job_ind = next(job_cycle)
    # Append mode: every script accumulates the commands assigned to its job.
    with open('hypertune'+str(job_ind)+'.sh', 'a') as f:
        f.write(command_line.format(*para, job_ind))

## Result Analysis

### Embedding + NN with a subset of codes

In [42]:
# Reset `res` to an empty DataFrame before the per-job result files are loaded.
res = pd.DataFrame()

In [43]:
# Column names supplied for the per-job result files, in file order
# (passing `names` means the files are read without a header row).
result_columns = ['model_name', 'DX1_dim', 'DX_dim', 'PR_dim', 'hosp_embed_dim', 'fc_width', 'lr1', 'lr2', 'dropout',
                  'batchsize', 'embed_file', 'cohort', 'tst_seed', 'n_fold', 'count_cap',
                  'DX_rarecutpoint', 'PR_rarecutpoint', 'other_pred', 'ndxpr', 'auc_mean', 'auc_avg', 'auc_freeze', 'y_pred_file']

# Read every job's file first and concatenate once. Growing `res` with
# pd.concat inside the loop re-copies all earlier rows on each pass and, because
# it appends to whatever `res` already holds, duplicates rows when the cell is
# re-run. Building `res` from the files alone makes the cell idempotent.
job_frames = [
    pd.read_csv('output/ht_result1231_'+str(job_ind)+'.csv', names=result_columns, index_col=None)
    for job_ind in range(12)
]
# Each frame keeps its own 0-based row index, as before.
res = pd.concat(job_frames)

In [4]:
# Preview the first rows of the loaded results.
res.head()

Unnamed: 0,model_name,DX1_dim,DX_dim,PR_dim,hosp_embed_dim,fc_width,lr1,lr2,dropout,batchsize,...,n_fold,count_cap,DX_rarecutpoint,PR_rarecutpoint,other_pred,ndxpr,auc_mean,auc_avg,auc_freeze,y_pred_file
0,setsum_nn,30,200,80,1,256,0.0002,2e-05,0.3,512,...,5,200,10,10,0,0,0.71322,0.71661,0.71294,output/y_pred_mat19_01_01_12_09_09.npy
1,setsum_nn,35,200,80,1,512,0.0002,2e-05,0.3,400,...,5,200,180,180,0,0,0.71357,0.71611,0.71263,output/y_pred_mat19_01_01_12_37_00.npy
2,setsum_nn,35,200,80,1,256,0.0002,2e-05,0.3,400,...,5,200,10,10,0,0,0.71341,0.71604,0.7126,output/y_pred_mat19_01_01_01_15_47.npy
3,setsum_nn,30,200,80,1,512,0.0002,2e-05,0.3,512,...,5,120,10,10,0,0,0.71352,0.71651,0.71263,output/y_pred_mat19_01_01_01_51_13.npy
4,setsum_nn,35,200,80,1,256,0.0002,2e-05,0.3,400,...,5,120,10,10,0,0,0.7134,0.71643,0.71255,output/y_pred_mat19_01_01_02_29_38.npy


In [27]:
# (rows, columns) of the loaded results.
res.shape

(600, 23)

In [44]:
# Keep only the runs whose auc_freeze exceeds 0.7135, then group them by
# hyper-parameter configuration.
group_keys = ['model_name', 'DX1_dim', 'DX_dim', 'PR_dim', 'fc_width', 'count_cap',
              'batchsize', 'DX_rarecutpoint']
above_cut = res.auc_freeze>0.7135
res_grouped = res.loc[above_cut].groupby(group_keys)

In [30]:
# Group every run (no AUC filter) by hyper-parameter configuration.
# Rebinds res_grouped, replacing any grouping built earlier.
res_grouped = res.groupby(by=[
    'model_name', 'DX1_dim', 'DX_dim', 'PR_dim',
    'fc_width', 'count_cap', 'batchsize', 'DX_rarecutpoint',
])

In [45]:
# Mean and number of runs for each of the three AUC columns, per configuration group.
res_grouped[['auc_freeze', 'auc_mean', 'auc_avg']].agg(['mean', 'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,auc_freeze,auc_freeze,auc_mean,auc_mean,auc_avg,auc_avg
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,mean,count,mean,count,mean,count
model_name,DX1_dim,DX_dim,PR_dim,fc_width,count_cap,batchsize,DX_rarecutpoint,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
setsum_nn,30,200,80,256,120,400,30,0.71355,1,0.71424,1,0.71721,1
setsum_nn,30,200,80,256,200,512,30,0.714043,3,0.714643,3,0.71746,3
setsum_nn,30,200,80,512,120,512,180,0.71353,1,0.71399,1,0.71683,1
setsum_nn,30,200,80,512,200,512,30,0.71358,2,0.71421,2,0.71735,2
setsum_nn,30,200,80,512,200,512,180,0.71364,1,0.71419,1,0.71689,1
setsum_nn,35,200,80,256,120,400,180,0.71394,1,0.71416,1,0.71674,1
setsum_nn,35,200,80,256,120,512,30,0.713785,2,0.714535,2,0.717605,2
setsum_nn,35,200,80,256,200,400,180,0.71366,1,0.71435,1,0.71713,1
setsum_nn,35,200,80,512,120,400,30,0.71366,2,0.71413,2,0.71711,2
setsum_nn,35,200,80,512,200,400,30,0.71415,1,0.7146,1,0.71796,1


In [31]:
# Shape of the aggregate table: (number of groups, 3 AUC columns x 2 statistics).
res_grouped[['auc_freeze', 'auc_mean', 'auc_avg']].agg(['mean', 'count']).shape

(600, 6)

In [46]:
# Write `res` to a single CSV file, leaving out the row index.
res.to_csv('output/ht_result1231.csv', index=False)

In [34]:
# Look up the runs of one specific configuration; every condition must hold.
row_filter = (
    res.DX1_dim.eq(35)
    & res.DX_dim.eq(200)
    & res.PR_dim.eq(80)
    & res.count_cap.eq(120)
    & res.DX_rarecutpoint.eq(10)
    & res.batchsize.eq(512)
    & res.fc_width.eq(450)
)
res.loc[row_filter]

Unnamed: 0,model_name,DX1_dim,DX_dim,PR_dim,hosp_embed_dim,fc_width,lr1,lr2,dropout,batchsize,...,n_fold,count_cap,DX_rarecutpoint,PR_rarecutpoint,other_pred,ndxpr,auc_mean,auc_avg,auc_freeze,y_pred_file


In [20]:
# Rebind `res` to the results stored in output/ht_result1227.csv; the frame
# previously held in `res` is no longer reachable under this name.
res = pd.read_csv('output/ht_result1227.csv')

In [19]:
# Load the results stored in output/ht_result1219.csv into a separate frame.
res2 = pd.read_csv('output/ht_result1219.csv')

In [22]:
# Rows of res2 whose auc_freeze exceeds 0.7135.
res2.loc[res2.auc_freeze>0.7135]

Unnamed: 0,model_name,DX1_dim,DX_dim,PR_dim,hosp_embed_dim,fc_width,lr1,lr2,dropout,batchsize,...,tst_seed,n_fold,count_cap,DX_rarecutpoint,PR_rarecutpoint,other_pred,auc_mean,auc_avg,auc_freeze,y_pred_file
106,setsum_nn,20,200,100,1,256,0.0002,2e-05,0.3,512,...,0,7,1,10,5,0,0.7139,0.717,0.71365,output/y_pred_mat18_12_20_11_23_06.npy
165,setsum_nn,20,200,100,1,512,0.0002,2e-05,0.3,512,...,0,7,100,10,5,0,0.71388,0.71674,0.71368,output/y_pred_mat18_12_19_11_12_36.npy
167,setsum_nn,40,200,100,1,256,0.0002,2e-05,0.3,512,...,0,7,5,10,5,0,0.71465,0.71788,0.71376,output/y_pred_mat18_12_20_12_58_32.npy
170,setsum_nn,20,200,100,1,512,0.0002,2e-05,0.3,512,...,0,7,100,10,5,0,0.71478,0.71784,0.71381,output/y_pred_mat18_12_20_10_29_21.npy
209,setsum_nn,20,200,200,1,256,0.0002,2e-05,0.3,512,...,0,7,100,10,5,0,0.71434,0.71707,0.71374,output/y_pred_mat18_12_20_11_37_02.npy
217,setsum_nn,40,200,100,1,512,0.0002,2e-05,0.3,512,...,0,7,100,10,5,0,0.71418,0.71714,0.71358,output/y_pred_mat18_12_20_01_09_19.npy
