In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import os, sys
import itertools
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
import statsmodels.stats.api as sms

In [2]:
# Root directory of the data files. Later cells build file names by plain string
# concatenation (path + folder + name), so the trailing slash is required.
# NOTE(review): hard-coded absolute path — only resolves on the original machine.
path = '/nfs/turbo/umms-awaljee/wsliu/Data/NRD/'

In [3]:
module_path = '/home/wsliu/Codes/DLproj'
# Make the project root and its NRD sub-directory importable; entries that are
# already on sys.path are left alone so re-running the cell adds nothing.
for extra_dir in (module_path, module_path+'/NRD'):
    if extra_dir not in sys.path:
        sys.path.append(extra_dir)

In [4]:
from DL_utils import parallel_coordinates

In [5]:
from ccs_tools import core_dtypes_pd

In [8]:
folder = 'multi_space_glove/'

def _read_freq(code_col):
    """Read `<code_col>_freq.csv` from path+folder, with the code column as str and 'frequency' as int."""
    return pd.read_csv(path+folder+code_col+'_freq.csv', dtype={code_col: str, 'frequency': int})

# One frequency table per code type.
DX1_freq = _read_freq('DX1')
DX_freq = _read_freq('DX')
PR_freq = _read_freq('PR')

In [14]:
from ccs_tools import dx_multi, pr_multi, core_dtypes_pd
# Collect the ICD9CM_CODE values of every row whose CCS_LVL1 is '18'; the DX1/DX
# vocabularies built later exclude the codes in this set.
in_ccs_lvl1_18 = dx_multi['CCS_LVL1'] == '18'
unclassified = set(dx_multi.loc[in_ccs_lvl1_18, 'ICD9CM_CODE'])

In [16]:
# A code is kept only when its frequency is strictly greater than its cut-point;
# all three code types currently share the same threshold.
DX1_rarecutpoint = DX_rarecutpoint = PR_rarecutpoint = 10

In [17]:
# Discard codes whose recorded frequency is zero.
DX1_freq = DX1_freq[DX1_freq['frequency'] > 0]
DX_freq = DX_freq[DX_freq['frequency'] > 0]
PR_freq = PR_freq[PR_freq['frequency'] > 0]

# Keep codes seen more often than the rare cut-point; DX1/DX codes listed in
# `unclassified` are dropped as well (no such exclusion is applied to PR).
dx1_keep = DX1_freq['frequency'].gt(DX1_rarecutpoint) & ~DX1_freq['DX1'].isin(unclassified)
dx_keep = DX_freq['frequency'].gt(DX_rarecutpoint) & ~DX_freq['DX'].isin(unclassified)
pr_keep = PR_freq['frequency'].gt(PR_rarecutpoint)

# Sorted vocabularies and their sizes.
DX1_cat = sorted(DX1_freq.loc[dx1_keep, 'DX1'])
DX_cat = sorted(DX_freq.loc[dx_keep, 'DX'])
PR_cat = sorted(PR_freq.loc[pr_keep, 'PR'])
n_DX1_cat, n_DX_cat, n_PR_cat = len(DX1_cat), len(DX_cat), len(PR_cat)

In [18]:
# Number of distinct codes appearing in either the DX1 or the DX vocabulary.
len(set(DX1_cat) | set(DX_cat))

9778

In [19]:
# Size of the PR vocabulary kept after the rare-code filter.
len(PR_cat)

3183

## Generating PBS Job Files

In [2]:
# Remove GPU batch scripts left in the working directory before they are regenerated.
!rm batch_gpu*.pbs

In [3]:
# Resources requested on the `#PBS -l` line of every GPU job.
n_gpus, mem_gb, walltime_hours = '1', 7, 72

# Submission script template; the slots are, in order: job name suffix, GPU count,
# memory (GB), wall time (hours) and the index of the hypertune script to run.
gpu_pbs_template = """#!/bin/sh

#### PBS preamble

#PBS -N NRD_hypertune{}

#PBS -M wsliu@umich.edu
#PBS -m abe

#PBS -A awaljee_fluxg

#PBS -q fluxg

#PBS -V
#PBS -l nodes=1:gpus={},mem={}gb,walltime={}:00:00
#PBS -j oe

#### End PBS preamble

#  Show list of CPUs you ran on, if you're running under PBS
if [ -n "$PBS_NODEFILE" ]; then cat $PBS_NODEFILE; fi

#  Change to the directory you submitted from
if [ -n "$PBS_O_WORKDIR" ]; then cd $PBS_O_WORKDIR; fi

#  Put your job commands here:
sh hypertune{}.sh"""

# Write batch_gpu0.pbs ... batch_gpu11.pbs; script k runs hypertune<k>.sh.
for job_index in range(12):
    with open('batch_gpu{}.pbs'.format(job_index), 'w') as pbs_file:
        pbs_file.write(gpu_pbs_template.format(job_index, n_gpus, mem_gb, walltime_hours, job_index))

In [10]:
# Remove existing batch scripts from the working directory before regenerating them.
# NOTE(review): the glob `batch*.pbs` also matches any `batch_gpu*.pbs` files — confirm that is intended.
!rm batch*.pbs

In [11]:
# Resources requested on the `#PBS -l` line of every job: processors per node,
# per-process memory (GB) and wall time (hours).
procs_per_node, pmem_gb, walltime_hours = '2', 23, 48

# Submission script template; the slots are, in order: job name suffix, ppn,
# pmem, wall time and the index of the hypertune script to run.
cpu_pbs_template = """#!/bin/sh

#### PBS preamble

#PBS -N NRD_hypertune{}

#PBS -M wsliu@umich.edu
#PBS -m abe

#PBS -A awaljee_fluxod

#PBS -q fluxod

#PBS -V
#PBS -l nodes=1:ppn={},pmem={}gb,walltime={}:00:00
#PBS -j oe

#### End PBS preamble

#  Show list of CPUs you ran on, if you're running under PBS
if [ -n "$PBS_NODEFILE" ]; then cat $PBS_NODEFILE; fi

#  Change to the directory you submitted from
if [ -n "$PBS_O_WORKDIR" ]; then cd $PBS_O_WORKDIR; fi

#  Put your job commands here:
sh hypertune{}.sh"""

# Write batch0.pbs ... batch19.pbs; script k runs hypertune<k>.sh.
for job_index in range(20):
    with open('batch{}.pbs'.format(job_index), 'w') as pbs_file:
        pbs_file.write(cpu_pbs_template.format(job_index, procs_per_node, pmem_gb, walltime_hours, job_index))

In [11]:
# Resources requested on the `#PBS -l` line of every large-memory job.
mem_gb, walltime_hours = 63, 48

# Submission script template; the slots are, in order: job name suffix,
# memory (GB), wall time (hours) and the index of the hypertune script to run.
largemem_pbs_template = """#!/bin/sh

#### PBS preamble

#PBS -N NRD_hypertune{}

#PBS -M wsliu@umich.edu
#PBS -m abe

#PBS -A awaljee_fluxod

#PBS -q fluxod

#PBS -V
#PBS -l nodes=1:largemem,mem={}gb,walltime={}:00:00
#PBS -j oe

#### End PBS preamble

#  Show list of CPUs you ran on, if you're running under PBS
if [ -n "$PBS_NODEFILE" ]; then cat $PBS_NODEFILE; fi

#  Change to the directory you submitted from
if [ -n "$PBS_O_WORKDIR" ]; then cd $PBS_O_WORKDIR; fi

#  Put your job commands here:
sh hypertune{}.sh"""

# Write batch0.pbs ... batch2.pbs (opened with 'w', so same-named files are replaced).
for job_index in range(3):
    with open('batch{}.pbs'.format(job_index), 'w') as pbs_file:
        pbs_file.write(largemem_pbs_template.format(job_index, mem_gb, walltime_hours, job_index))

## Prepare hyper-parameters and generate the .sh files

In [12]:
# Remove old command scripts first: the cells below open hypertune*.sh in append
# mode, so leftovers would otherwise accumulate duplicate commands.
!rm hypertune*.sh

In [13]:
# Test-split seeds to sweep over. Stored as a list rather than a lazy `range`
# so individual seeds can be dropped (a Python 3 `range` has no `.remove`);
# uncomment the line below to skip seed 7.
ts = list(range(10))
#ts.remove(7)

In [15]:
# Hyper-parameter grid: the lists below are combined with itertools.product and
# each resulting combination becomes one training command.
lrs = [2e-5]  # passed to the training script as --lr
tst_seeds = ts  # passed as --tst_seed
cohorts = ['ami', 'chf', 'pna']  # passed as --cohort
folders = ['multi_space_glove/']  # passed as --folder
DX_rarecutpoints = [10]
PR_rarecutpoints = [10]
# The '{}' is written into the command line unchanged;
# presumably the training script fills it in with the job index — verify.
result_files = ['output/ht_result0524_{}.csv']

In [16]:
# DX and PR rare cut-points are swept together: the i-th DX cut-point is paired
# with the i-th PR cut-point. Previously PR_rarecutpoints was never read and the
# DX value was silently reused for --pr_rarecutpoint. A length mismatch is
# rejected instead of letting zip() drop the unmatched tail.
if len(DX_rarecutpoints) != len(PR_rarecutpoints):
    raise ValueError('DX_rarecutpoints and PR_rarecutpoints must have the same length, got {} and {}'.format(
        len(DX_rarecutpoints), len(PR_rarecutpoints)))
rarecutpoint_pairs = list(zip(DX_rarecutpoints, PR_rarecutpoints))

# Full grid; each tuple is (lr, tst_seed, cohort, folder, dx_rarecutpoint,
# pr_rarecutpoint, result_file), i.e. the order of the command-line flags.
para_itr = itertools.product(lrs, tst_seeds, cohorts, folders, rarecutpoint_pairs, result_files)
para_lst = [(lr, seed, ch, fl, dxr, prr, rf) for (lr, seed, ch, fl, (dxr, prr), rf) in para_itr]

In [17]:
# Number of hyper-parameter combinations, i.e. training commands to distribute over the jobs.
len(para_lst)

30

In [18]:
job_lst = range(3)
# Command template: the seven grid values followed by the index of the job running the command.
xgb_cmd = 'python train_template_xgboost0425.py --lr {} --tst_seed {} --cohort {} --folder {} --dx_rarecutpoint {} --pr_rarecutpoint {} --result_file {} --job_index {}\n'

# Deal the parameter tuples out to the jobs round-robin, collecting each job's commands in order.
cmds_by_job = {}
for para, job_ind in zip(para_lst, itertools.cycle(job_lst)):
    cmds_by_job.setdefault(job_ind, []).append(xgb_cmd.format(*para, job_ind))

# Append each job's commands to its hypertune<k>.sh script.
for job_ind, job_cmds in cmds_by_job.items():
    with open('hypertune{}.sh'.format(job_ind), 'a') as sh_file:
        sh_file.writelines(job_cmds)

In [12]:
job_ind = 2
# Command template for template_all_multispace0102.py: one slot per hyper-parameter
# plus a trailing slot for the job index.
dl_cmd = 'python template_all_multispace0102.py --model_name {} --DX1_dim {} --DX_dim {} --PR_dim {} --fc_width {} --lr1 {} --lr2 {} --dropout {} --batchsize {} --embed_file {} --penalty {} --penalty_metric {} --count_cap {} --tst_seed {} --cohort {} --dx1_rarecutpoint {} --dx_rarecutpoint {} --pr_rarecutpoint {} --other_pred {} --ndxpr {} --val_fold {} --result_file {} --job_index {}\n'
n_para_expected = dl_cmd.count('{}') - 1

# Validate every tuple before touching the script. str.format raises IndexError
# on too few values and silently ignores surplus ones, so a para_lst built for a
# different training script would either crash mid-way or emit wrong commands.
dl_cmds = []
for para in para_lst:
    if len(para) != n_para_expected:
        raise ValueError('template_all_multispace0102.py needs {} hyper-parameters per run, got {}: {!r}'.format(
            n_para_expected, len(para), para))
    dl_cmds.append(dl_cmd.format(*para, job_ind))

# Append all commands to this job's script (nothing is created for an empty grid).
if dl_cmds:
    with open('hypertune'+str(job_ind)+'.sh', 'a') as f:
        f.writelines(dl_cmds)

## Result Analysis

In [2]:
# Start from an empty results frame.
res = pd.DataFrame()

In [3]:
# Column names for the per-job result files; because `names=` is supplied,
# pandas reads the files as having no header row.
result_cols = ['lr', 'cohort', 'tst_seed', 'DX_rarecutpoint', 'PR_rarecutpoint', 'n_code_cat', 'n_X',
               'roc_auc']

# Read every job's CSV, then concatenate once. Building `res` from the collected
# frames (instead of growing a pre-existing `res` inside the loop) makes the cell
# idempotent on re-run and avoids concatenating onto an empty DataFrame.
# Row labels are kept per file, so index values repeat across jobs.
# NOTE(review): only jobs 0 and 1 of the 0523 run are read — confirm against the job count used.
job_frames = []
for job_ind in range(2):
    df = pd.read_csv('output/ht_result0523_'+str(job_ind)+'.csv', names=result_cols, index_col=None)
    job_frames.append(df)
res = pd.concat(job_frames)

In [4]:
# Preview the first rows of the combined results.
res.head()

Unnamed: 0,lr,cohort,tst_seed,DX_rarecutpoint,PR_rarecutpoint,n_code_cat,n_X,roc_auc
0,2e-05,ami,0,10,10,3105,4177,0.66473
1,2e-05,pna,0,10,10,3873,5727,0.63206
2,2e-05,chf,1,10,10,3652,5319,0.60367
3,2e-05,ami,2,10,10,3105,4177,0.6643
4,2e-05,pna,2,10,10,3873,5727,0.63377


In [5]:
# (rows, columns) of the combined results — one row per completed run.
res.shape

(30, 8)

In [25]:
# Group the runs by cohort and DX rare cut-point.
res_grouped = res.groupby(['cohort', 'DX_rarecutpoint'])

In [26]:
# Mean AUC and number of runs in each group.
res_grouped[['roc_auc']].agg(['mean', 'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,roc_auc,roc_auc
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
cohort,DX_rarecutpoint,Unnamed: 2_level_2,Unnamed: 3_level_2
ami,10,0.701448,9
ami,20,0.701448,9
ami,40,0.701493,10
ami,60,0.701512,10
ami,80,0.701514,10
ami,100,0.701509,10
chf,10,0.615313,6
chf,20,0.614633,9
chf,40,0.614292,10
chf,60,0.614292,10


In [13]:
# Individual 'pna' runs at DX rare cut-point 40, ordered by seed, to inspect
# per-seed AUCs and spot missing seeds.
res_cut40 = res[res['DX_rarecutpoint'] == 40]
pna_cut40 = res_cut40.groupby('cohort').get_group('pna')
pna_cut40.sort_values(['DX_rarecutpoint', 'tst_seed'])

Unnamed: 0,lr,cohort,tst_seed,DX_rarecutpoint,PR_rarecutpoint,n_code_cat,n_X,roc_auc
11,2e-05,pna,0,40,40,3042,4963,0.65587
9,2e-05,pna,1,40,40,3042,4963,0.65582
10,2e-05,pna,2,40,40,3042,4963,0.65465
12,2e-05,pna,3,40,40,3042,4963,0.65412
7,2e-05,pna,4,40,40,3042,4963,0.65597
9,2e-05,pna,5,40,40,3042,4963,0.65273
7,0.0,pna,6,40,40,3042,4963,0.65969
7,2e-05,pna,7,40,40,3042,4963,0.65005
6,2e-05,pna,9,40,40,3042,4963,0.65351


In [9]:
# Persist the combined results; the row index is not written.
res.to_csv('output/ht_result0523xgboost_elder.csv', index=False)

In [29]:
# Keep only the runs with DX rare cut-point 40. This overwrites `res`, so the
# unfiltered frame must be reloaded to undo it.
# NOTE(review): the preview of `res` above and the caption below refer to cut-point 10;
# on such data this filter would leave `res` empty — confirm this cell is still wanted.
res = res.loc[res.DX_rarecutpoint==40]

In [6]:
# Group the runs by cohort. The key is passed as a scalar, not a one-element
# list: with a list, recent pandas versions yield 1-tuples such as ('ami',)
# as group names when the groupby is iterated, which would leak into any
# label printed from the group name.
res_grouped = res.groupby('cohort')

In [7]:
# Mean AUC and number of runs in each group.
res_grouped[['roc_auc']].agg(['mean', 'count'])

Unnamed: 0_level_0,roc_auc,roc_auc
Unnamed: 0_level_1,mean,count
cohort,Unnamed: 1_level_2,Unnamed: 2_level_2
ami,0.666148,10
chf,0.601764,10
pna,0.634731,10


Mean AUC per cohort with its confidence interval (rarecutpoint = 10, folder = 'elder/'):

In [8]:
# Per group: mean AUC followed by the bounds of the z-based confidence interval
# of the mean (zconfint_mean() called with its default settings).
# NOTE(review): with only a handful of runs per group a t-based interval may be
# more appropriate — confirm the choice of zconfint_mean.
for n, g in res_grouped:
    ci_low, ci_high = sms.DescrStatsW(g.roc_auc).zconfint_mean()
    auc_mean = g.roc_auc.mean()
    print(n, 'auc: {0:.3f} ({1:.3f}, {2:.3f})'.format(auc_mean, ci_low, ci_high))

ami auc: 0.666 (0.664, 0.668)
chf auc: 0.602 (0.599, 0.605)
pna auc: 0.635 (0.632, 0.638)
