In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import os, sys
import itertools
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
import statsmodels.stats.api as sms

In [2]:
# Set CUDA_VISIBLE_DEVICES to the empty string before the project modules
# (which report "Using TensorFlow backend") are imported.
# NOTE(review): presumably this keeps the notebook kernel itself off the
# GPUs so they stay free for the submitted jobs — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
# Absolute, machine-specific location of the NRD data directory.
path = '/nfs/turbo/umms-awaljee/wsliu/Data/NRD/'

In [3]:
# Make the project checkout and its NRD sub-package importable. Each
# directory is appended only when missing, so re-running the cell does not
# add duplicate entries to sys.path.
module_path = '/home/wsliu/Codes/DLproj'
for extra_dir in (module_path, module_path+'/NRD'):
    if extra_dir not in sys.path:
        sys.path.append(extra_dir)

In [4]:
from ccs_tools import core_dtypes_pd
from utils import preprocess

Using TensorFlow backend.


## Editing PBS files

In [14]:
!rm batch_gpu*.pbs

In [19]:
# Text of one PBS submission script. Its five positional fields are filled,
# in order, with: the job-name suffix, the gpus= value, the mem= value (gb),
# the walltime hours, and the index of the hypertune<N>.sh script to run.
pbs_template = """#!/bin/sh

#### PBS preamble

#PBS -N NRD_hypertune{}

#PBS -M wsliu@umich.edu
#PBS -m abe

#PBS -A awaljee_fluxg

#PBS -q fluxg

#PBS -V
#PBS -l nodes=1:gpus={},mem={}gb,walltime={}:00:00
#PBS -j oe

#### End PBS preamble

#  Show list of CPUs you ran on, if you're running under PBS
if [ -n "$PBS_NODEFILE" ]; then cat $PBS_NODEFILE; fi

#  Change to the directory you submitted from
if [ -n "$PBS_O_WORKDIR" ]; then cd $PBS_O_WORKDIR; fi

#  Put your job commands here:
sh hypertune{}.sh"""

# Write batch_gpu0.pbs .. batch_gpu10.pbs (overwriting any existing file),
# each requesting 1 GPU, 8 gb of memory and 48 hours of walltime.
for job_index in range(11):
    pbs_name = 'batch_gpu{}.pbs'.format(job_index)
    pbs_text = pbs_template.format(job_index, '1', 8, 48, job_index)
    with open(pbs_name, 'w') as f:
        f.write(pbs_text)

## Prepare hyper-parameters and generate the .sh files

For embedding+NN with a subset of codes:

In [2]:
!rm hypertune*.sh

In [3]:
# Grid-search settings. Every list is one axis of the hyper-parameter grid;
# the grid-building cell takes the Cartesian product of these axes.
model_names = ['setpool_nn']
DX1_dims = [20, 40]
DX_dims = [200, 300]
PR_dims = [100, 200]
fc_widths = [256, 512]
#md_widths = [128]
lr1s = [2e-4]
lr2s = [2e-5]
dropouts = [0.3]
batchsizes = [512]
embed_files = ['pretrain']
#penalties = [0]
#penalty_metrics = ['cosine']
count_caps = [1, 5, 100]
tst_seeds = [0]
cohorts = ['ami']
DX_rarecutpoints = [10]
# PR cut point is half of the DX cut point. Truncate to int so the value is
# an integer (5, not 5.0), matching how the grid-building and random-search
# cells derive it with int(.../2).
PR_rarecutpoints = [int(drp/2) for drp in DX_rarecutpoints]
val_folds = [7]
other_preds = [0]
# The '{}' is left in place here; it is passed through to the command line
# unchanged.
result_files = ['output/ht_result1219_{}.csv']

In [8]:
# Alternative settings for the embed_sum / embed_pool models.
# NOTE(review): this cell rebinds model_names, fc_widths, lr1s, lr2s,
# dropouts, batchsizes, count_caps, tst_seeds, cohorts, DX_rarecutpoints,
# PR_rarecutpoints, val_folds and result_files, all of which the preceding
# settings cell also defines, while code_embed_dims, md_widths, embed_mats,
# penalties and penalty_metrics are not read by the grid-building cell that
# follows. It looks like a configuration for a different training template;
# running it in sequence changes the generated grid — confirm whether it
# should still be executed.
model_names = ['embed_sum', 'embed_pool']
code_embed_dims = [100]
fc_widths = [512]
md_widths = [128]
lr1s = [2e-4]
lr2s = [2e-5]
dropouts = [0.3]
batchsizes = [256]
embed_mats = ['random']
penalties = [0]
penalty_metrics = ['cosine']
count_caps = [5]
tst_seeds = range(10)
cohorts = ['ami']
DX_rarecutpoints = [20]
# PR cut point is half of the DX cut point, truncated to int so the value is
# an integer (10, not 10.0) like the int(.../2) used elsewhere in the notebook.
PR_rarecutpoints = [int(drp/2) for drp in DX_rarecutpoints]
val_folds = [5]
result_files = ['output/ht_result1001_{}.csv']

In [4]:
# Full grid: Cartesian product over every hyper-parameter axis.
para_itr = itertools.product(
    model_names, DX1_dims, DX_dims, PR_dims, fc_widths, lr1s, lr2s, dropouts,
    batchsizes, embed_files, count_caps, tst_seeds, cohorts, DX_rarecutpoints,
    other_preds, val_folds, result_files,
)
# Each combination is a 17-tuple with the DX rare cut point at index 13.
# The PR rare cut point (half of it, truncated to int) is inserted directly
# after it, giving the 18 values the command template expects before the
# job index.
para_lst = [combo[:14] + (int(combo[13]/2),) + combo[14:] for combo in para_itr]

In [5]:
len(para_lst)

48

In [6]:
n_jobs = 10
# Command line for one training run: the 18 fields of a parameter tuple
# followed by the job index.
cmd_template = 'python template_sub_multispace1218.py --model_name {} --DX1_dim {} --DX_dim {} --PR_dim {} --fc_width {} --lr1 {} --lr2 {} --dropout {} --batchsize {} --embed_file {} --count_cap {} --tst_seed {} --cohort {} --dx_rarecutpoint {} --pr_rarecutpoint {} --other_pred {} --val_fold {} --result_file {} --job_index {}\n'
# Deal the parameter tuples out round-robin over hypertune0.sh ..
# hypertune<n_jobs-1>.sh, appending one command per tuple.
for position, para in enumerate(para_lst):
    job_ind = position % n_jobs
    with open('hypertune{}.sh'.format(job_ind), 'a') as f:
        f.write(cmd_template.format(*para, job_ind))

In [16]:
job_ind = 10
# Same command line as the round-robin cell: 18 parameter fields, then the
# job index.
cmd_template = 'python template_sub_multispace1218.py --model_name {} --DX1_dim {} --DX_dim {} --PR_dim {} --fc_width {} --lr1 {} --lr2 {} --dropout {} --batchsize {} --embed_file {} --count_cap {} --tst_seed {} --cohort {} --dx_rarecutpoint {} --pr_rarecutpoint {} --other_pred {} --val_fold {} --result_file {} --job_index {}\n'
# Append only the first grid configuration to hypertune10.sh.
for para in para_lst[:1]:
    with open('hypertune{}.sh'.format(job_ind), 'a') as f:
        f.write(cmd_template.format(*para, job_ind))

Random search:

In [4]:
# Number of random-search configurations; used as the sample size of every
# np.random.choice draw in the next cell.
n_sample = 33

In [5]:
# Random search: draw n_sample values independently (with replacement) for
# every hyper-parameter; position i across the arrays forms configuration i.
# A dedicated, seeded generator makes the sampled configurations repeatable
# across kernel restarts without touching numpy's global random state. The
# seed value itself is arbitrary.
rng = np.random.RandomState(0)

model_names = rng.choice(['setsum_nn'], n_sample)
code_embed_dims = rng.choice([200, 300], n_sample)
fc_widths = rng.choice([512, 1024], n_sample)
md_widths = rng.choice([128, 256], n_sample)
lr1s = rng.choice([2e-4], n_sample)
lr2s = rng.choice([2e-5], n_sample)
dropouts = rng.choice([0.3], n_sample)
batchsizes = rng.choice([256, 512], n_sample)
embed_mats = rng.choice(['pretrain'], n_sample)
penalties = rng.choice([0, 0.5, 1.], n_sample)
penalty_metrics = rng.choice(['cosine'], n_sample)
count_caps = rng.choice([0, 5, 20], n_sample)
cohorts = rng.choice(['ami'], n_sample)
DX_rarecutpoints = rng.choice([20], n_sample)
# PR cut point is half of the DX cut point, truncated to int.
PR_rarecutpoints = [int(drp/2) for drp in DX_rarecutpoints]
val_folds = rng.choice([5], n_sample)
result_files = ['output/ht_result1001_{}.csv']*n_sample

# One tuple per sampled configuration. Materialised as a list because a bare
# zip object can be consumed only once: re-running the cell that builds the
# product would otherwise silently yield an empty parameter list.
zips = list(zip(model_names, code_embed_dims, fc_widths, md_widths, lr1s, lr2s, dropouts, batchsizes, embed_mats,
                penalties, penalty_metrics, count_caps, cohorts, DX_rarecutpoints, PR_rarecutpoints,
                val_folds, result_files))
# Every sampled configuration is later paired with each of these test seeds.
tst_seeds = range(10)

In [6]:
# Pair every sampled configuration with every test seed.
para_itr = itertools.product(zips, tst_seeds)

# Flatten each (configuration, seed) pair into one tuple, with the seed
# appended as the last field.
para_lst = [config + (seed,) for config, seed in para_itr]

In [7]:
len(para_lst)

330

In [8]:
n_jobs = 4
# Command line for one training run: fields 0-17 come from the parameter
# tuple (test seed last), field 18 is the job index.
cmd_template = 'python train_template_sub0922.py --model_name {0} --code_embed_dim {1} --fc_width {2} --md_width {3} --lr1 {4} --lr2 {5} --dropout {6} --batchsize {7} --embed_file {8} --penalty {9} --penalty_metric {10} --count_cap {11} --cohort {12} --dx_rarecutpoint {13} --pr_rarecutpoint {14} --val_fold {15} --result_file {16} --tst_seed {17} --job_index {18}\n'
# Deal the parameter tuples out round-robin over hypertune0.sh ..
# hypertune<n_jobs-1>.sh, appending one command per tuple.
for position, para in enumerate(para_lst):
    job_ind = position % n_jobs
    with open('hypertune{}.sh'.format(job_ind), 'a') as f:
        f.write(cmd_template.format(*para, job_ind))

## Result Analysis

### Embedding + NN with a subset of codes

In [36]:
# Bind `res` to an empty frame before the per-job result files are loaded.
res = pd.DataFrame()

In [37]:
# Load the result file written by each of the 10 tuning jobs and stack them
# into `res`. Column names are supplied explicitly via `names`.
#
# The frames are collected in a list and concatenated once, rather than
# growing `res` inside the loop: this avoids re-copying the accumulated frame
# on every iteration, and makes the cell idempotent — re-running it rebuilds
# `res` from the files instead of appending the same rows a second time
# (which would inflate the per-group counts computed further down).
# Each file keeps its own row index, so index values repeat across jobs.
result_columns = ['model_name', 'DX1_dim', 'DX_dim', 'PR_dim', 'hosp_embed_dim', 'fc_width', 'lr1', 'lr2', 'dropout',
                  'batchsize', 'embed_file', 'cohort', 'tst_seed', 'n_fold', 'count_cap',
                  'DX_rarecutpoint', 'PR_rarecutpoint', 'other_pred', 'auc_mean', 'auc_avg', 'auc_freeze', 'y_pred_file']
job_frames = []
for job_ind in range(10):
    df = pd.read_csv('output/ht_result1219_'+str(job_ind)+'.csv',
                     names=result_columns, index_col=None)
    job_frames.append(df)
res = pd.concat(job_frames)

In [4]:
# Keep only the rows for the 'embed_sum' model (rebinds `res`).
# NOTE(review): the rows displayed just below all have model_name
# 'setsum_nn', so applying this filter to that data would leave no rows;
# its execution count is also out of sequence with the surrounding cells.
# Looks like a leftover from analysing a different result set — confirm
# before running top to bottom.
res = res.loc[res.model_name=='embed_sum']

In [14]:
res

Unnamed: 0,model_name,DX1_dim,DX_dim,PR_dim,hosp_embed_dim,fc_width,lr1,lr2,dropout,batchsize,...,tst_seed,n_fold,count_cap,DX_rarecutpoint,PR_rarecutpoint,other_pred,auc_mean,auc_avg,auc_freeze,y_pred_file
0,setsum_nn,20,200,100,1,256,0.0002,2e-05,0.3,512,...,0,7,1,10,5,0,0.71357,0.71694,0.71291,output/y_pred_mat18_12_19_11_06_00.npy
1,setsum_nn,20,200,200,1,512,0.0002,2e-05,0.3,512,...,0,7,5,10,5,0,0.71381,0.71734,0.71301,output/y_pred_mat18_12_19_11_48_57.npy
2,setsum_nn,20,300,200,1,256,0.0002,2e-05,0.3,512,...,0,7,100,10,5,0,0.71329,0.71613,0.71154,output/y_pred_mat18_12_20_12_37_23.npy
3,setsum_nn,40,200,200,1,256,0.0002,2e-05,0.3,512,...,0,7,1,10,5,0,0.71376,0.71647,0.71208,output/y_pred_mat18_12_20_01_19_56.npy
4,setsum_nn,40,300,100,1,512,0.0002,2e-05,0.3,512,...,0,7,5,10,5,0,0.7127,0.71554,0.71185,output/y_pred_mat18_12_20_02_07_41.npy
0,setsum_nn,20,200,100,1,256,0.0002,2e-05,0.3,512,...,0,7,5,10,5,0,0.71354,0.71698,0.71316,output/y_pred_mat18_12_19_11_06_33.npy
1,setsum_nn,20,200,200,1,512,0.0002,2e-05,0.3,512,...,0,7,100,10,5,0,0.71381,0.71678,0.71303,output/y_pred_mat18_12_19_11_53_22.npy
2,setsum_nn,20,300,200,1,512,0.0002,2e-05,0.3,512,...,0,7,1,10,5,0,0.71379,0.71665,0.71203,output/y_pred_mat18_12_20_12_47_59.npy
3,setsum_nn,40,200,200,1,256,0.0002,2e-05,0.3,512,...,0,7,5,10,5,0,0.71375,0.7167,0.71276,output/y_pred_mat18_12_20_01_31_32.npy
4,setsum_nn,40,300,100,1,512,0.0002,2e-05,0.3,512,...,0,7,100,10,5,0,0.7132,0.71629,0.71237,output/y_pred_mat18_12_20_02_20_56.npy


In [33]:
# Group the runs whose auc_freeze exceeds 0.713 by their hyper-parameter
# settings.
res_grouped = (
    res[res['auc_freeze'] > 0.713]
    .groupby(['model_name', 'DX1_dim', 'DX_dim', 'PR_dim', 'fc_width', 'batchsize', 'count_cap'])
)

In [38]:
# Group all runs by their hyper-parameter settings (no AUC threshold).
res_grouped = res.groupby(
    by=['model_name', 'DX1_dim', 'DX_dim', 'PR_dim', 'fc_width', 'batchsize', 'count_cap'],
)

In [39]:
# Mean of each AUC column per hyper-parameter setting, alongside the number
# of runs that contributed to it.
res_grouped[['auc_freeze', 'auc_mean', 'auc_avg']].aggregate(['mean', 'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,auc_freeze,auc_freeze,auc_mean,auc_mean,auc_avg,auc_avg
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,mean,count,mean,count,mean,count
model_name,DX1_dim,DX_dim,PR_dim,fc_width,batchsize,count_cap,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
setsum_nn,20,200,100,256,512,1,0.712977,3,0.713643,3,0.716837,3
setsum_nn,20,200,100,256,512,5,0.712783,3,0.713457,3,0.716873,3
setsum_nn,20,200,100,256,512,100,0.71237,3,0.713347,3,0.716373,3
setsum_nn,20,200,100,512,512,1,0.712467,3,0.71356,3,0.71672,3
setsum_nn,20,200,100,512,512,5,0.7122,3,0.713237,3,0.71634,3
setsum_nn,20,200,100,512,512,100,0.7133,3,0.714177,3,0.717183,3
setsum_nn,20,200,200,256,512,1,0.712067,3,0.713087,3,0.716147,3
setsum_nn,20,200,200,256,512,5,0.712103,3,0.71298,3,0.716267,3
setsum_nn,20,200,200,256,512,100,0.712703,3,0.71342,3,0.716203,3
setsum_nn,20,200,200,512,512,1,0.711957,3,0.71322,3,0.716173,3


In [11]:
# Save the combined results without the row index.
# NOTE(review): the per-job files read above are named ht_result1219_*,
# while this writes ht_result1218.csv — confirm the date tag is intended.
res.to_csv('output/ht_result1218.csv', index=False)

In [5]:
# Rebind `res` to a previously saved result table (a different file from the
# one written in the cell above), replacing whatever `res` held before.
res = pd.read_csv('output/ht_result1003embed_nn_sub.csv')

In [9]:
# Keep only the runs with a penalty of exactly 0 (rebinds `res`).
res = res[res['penalty'] == 0]