In [None]:
# first we define relevant directories
import sys
import os
import pickle
# project directory: resolved as the parent of the current working directory,
# so the result depends on where the kernel was started
project_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report, f1_score, roc_auc_score, roc_curve, \
                            precision_recall_curve, average_precision_score, confusion_matrix
%matplotlib inline

In [None]:
def eval_perf_df(gdf):
    """Build a text performance report for one table of decoded predictions.

    Args:
        gdf: table-like object (the notebook passes a pandas DataFrame) that can be
            indexed with 'ref_target' (passed to the metrics as the true labels),
            'pred_target' (passed as the predicted labels) and 'prob_target1'
            (passed to ``roc_auc_score`` as the score).

    Returns:
        tuple ``(micro_f1, auc_score, report)``:
            micro_f1  -- micro-averaged F1 score,
            auc_score -- ROC AUC, or ``0`` when ``roc_auc_score`` raised,
            report    -- multi-line string with the classification report, the
                         binary/micro/macro/weighted F1 scores, the confusion
                         matrix, the AUC and the average precision.
    """
    ref_target_levent = gdf['ref_target']
    pred_target_levent = gdf['pred_target']
    prob_score_levent = gdf['prob_target1']
    lsep = "\n"
    lines = ["Classification report on last event:",
             str(classification_report(ref_target_levent, pred_target_levent))]
    micro_f1 = None
    for method in ('binary', 'micro', 'macro', 'weighted'):
        f_score = f1_score(ref_target_levent, pred_target_levent, average=method)
        lines += ["{} f1:".format(method), str(f_score)]
        if method == 'micro':
            micro_f1 = f_score
    lines.append(str(confusion_matrix(ref_target_levent, pred_target_levent)))
    lines.append("auc:")
    # The AUC is best-effort: when it cannot be computed the report still gets
    # produced with a score of 0. The fallback lives in a plain try/except (no
    # `return` inside `finally`) so that interrupts and errors raised elsewhere
    # are not silently discarded.
    try:
        auc_score = roc_auc_score(ref_target_levent, prob_score_levent)
    except Exception as err:
        print("exception is raised !!", err)
        auc_score = 0
    lines.append(str(auc_score))
    method = 'micro'
    # NOTE(review): average precision is computed from the hard predictions, not
    # from 'prob_target1' -- confirm this is intended before comparing it with
    # score-based values.
    avg_precrecall = average_precision_score(ref_target_levent, pred_target_levent, average=method)
    lines += ["average precision recall, method={}:".format(method), str(avg_precrecall)]
    lines += ["-" * 30, "*" * 30]
    return (micro_f1, auc_score, lsep.join(lines) + lsep)

In [None]:
# number of folds; decoded files are named "<file prefix>_fold_<k>_none.txt"
num_folds = 5
# (folder prefix, file prefix) for every model; the decoded files of a model are
# looked up under "<folder prefix>_decoded"
model_desc = [
    ('cnn', 'CNN_Labeler'),
    ('cnnwide', 'CNNWide_Labeler'),
    ('nn', 'NN_Labeler'),
    ('LogisticRegression_l1', 'LogisticRegression_last'),
    ('LogisticRegression_l2', 'LogisticRegression_last'),
    ('crfnn', 'CRF_NN_Labeler'),
    ('crfnnpair', 'CRF_NN_Pair_Labeler'),
    ('crfonly', 'CRF_Only_Labeler'),
    ('crfonlypair', 'CRF_Only_Pair_Labeler'),
    ('rnncrfpair', 'RNNCRF_Pair_Labeler'),
    ('rnncrfunary', 'RNNCRF_Unary_Labeler'),
]
rnn_losses = ('Convex_HF_LastHF', 'LastHF', 'Uniform_HF', 'Convex_HF_NonHF')
# one entry per (rnn variant, loss mode) combination, variants in the outer position
model_desc.extend(
    ("{}_lossmode_{}".format(rnn_kind, loss_mode),
     "{}_Labeler_lossmode_{}".format(rnn_kind.upper(), loss_mode))
    for rnn_kind in ('rnn', 'rnnss')
    for loss_mode in rnn_losses
)
data_dir = os.path.join(project_dir, 'decoded_output')

In [None]:
# sanity check: report every decoded file whose 'prob_target1' column has missing values
for folder_name, model_prefix in model_desc:
    print("model name: ", model_prefix)
    decoded_dir = os.path.join(data_dir, "{}_decoded".format(folder_name))
    for fold in range(num_folds):
        print('fold num: ', fold)
        print()
        fold_file = "{}_fold_{}_none.txt".format(model_prefix, fold)
        df = pd.read_csv(os.path.join(decoded_dir, fold_file), header=0, sep="\t")
        num_missing = df['prob_target1'].isnull().sum()
        if num_missing:
            banner = "*" * 15
            print(banner)
            print("NULL found!!!")
            print("folder name: ", folder_name)
            print("fold num: ", fold)
            print(banner)
            print()

### Update each decoded output file so that all files share the same representation

In [None]:
# add a 0/1 'lastevent' column marking the final row of every 'pid' and store the
# result next to the input as "<file prefix>_fold_<k>_none_upd.txt"
for folder_name, model_prefix in model_desc:
    print("model name: ", model_prefix)
    decoded_dir = os.path.join(data_dir, "{}_decoded".format(folder_name))
    for fold in range(num_folds):
        print('fold num: ', fold)
        print()
        src_path = os.path.join(decoded_dir, "{}_fold_{}_none.txt".format(model_prefix, fold))
        dst_path = os.path.join(decoded_dir, "{}_fold_{}_none_upd.txt".format(model_prefix, fold))
        df = pd.read_csv(src_path, header=0, sep="\t")
        # keep the row index as a regular column so it can be read back after the groupby
        df['rindx'] = df.index.tolist()
        df['lastevent'] = 0
        last_rows = df.groupby('pid').nth(-1)['rindx'].tolist()
        is_last = df['rindx'].isin(last_rows)
        df.loc[is_last, 'lastevent'] = 1
        df.to_csv(dst_path, sep="\t", header=True, index=False)

In [None]:
# compute performance of each fold
cols = ['micro f1', 'AUC']
verbose = True
# set to True to additionally score the index events that are not the last event;
# the matching summary is printed only when this evaluation actually ran
eval_index_wo_levent = False
# models whose decoded files are scored on every row, without index/last-event filtering
flat_models = {'cnn', 'cnnwide', 'nn', 'LogisticRegression_l1', 'LogisticRegression_l2'}
for folder_name, model_prefix in model_desc:
    # one row per fold, one column per entry of `cols`
    res_levent = np.zeros((num_folds, len(cols)))
    res_index_wo_levent = np.zeros((num_folds, len(cols)))
    print("folder name: ", folder_name)
    print("model name: ", model_prefix)
    # becomes True once res_index_wo_levent has been filled for this model
    flag = False
    decoded_dir = os.path.join(data_dir, "{}_decoded".format(folder_name))
    for fold in range(num_folds):
        print('fold num: ', fold)
        print()
        report = ''
        if folder_name not in flat_models:
            df = pd.read_csv(os.path.join(decoded_dir, "{}_fold_{}_none_upd.txt".format(model_prefix, fold)),
                             sep="\t", header=0)
            report += "evaluating performance using only last index event: \n"
            tmp = df.loc[(df['index_event'] == 1) & (df['lastevent'] == 1)]
            micro_f1, auc, eval_rep = eval_perf_df(tmp)
            res_levent[fold, :] = np.array([micro_f1, auc])
            report += eval_rep

            if eval_index_wo_levent:
                flag = True
                report += "evaluating performance using all index events not including the last event: \n"
                tmp = df.loc[(df['index_event'] == 1) & (df['lastevent'] != 1)]
                micro_f1, auc, eval_rep = eval_perf_df(tmp)
                res_index_wo_levent[fold, :] = np.array([micro_f1, auc])
                report += eval_rep

            # optional extra reports, kept disabled:
            # report += "evaluating performance using all index events (including the last event)\n"
            # tmp = df.loc[(df['index_event']==1)]
            # _, _, eval_rep = eval_perf_df(tmp)
            # report += eval_rep

            # report += "evaluating performance using all events (including non index events)\n"
            # _, _, eval_rep = eval_perf_df(df)
            # report += eval_rep
        else:
            df = pd.read_csv(os.path.join(decoded_dir, "{}_fold_{}_none.txt".format(model_prefix, fold)),
                             sep="\t", header=0)
            report += "evaluating performance using only last index event: \n"
            micro_f1, auc, eval_rep = eval_perf_df(df)
            res_levent[fold, :] = np.array([micro_f1, auc])
            report += eval_rep
        if verbose:
            print(report)
    print("|" * 100)
    print()
    for i in range(len(cols)):
        print("average performance (across {} folds) using only last index event:".format(num_folds))
        print("{} mean:{} sd:{}".format(cols[i], np.mean(res_levent[:, i]), np.std(res_levent[:, i])))
        print()
        if flag:
            print("average performance (across {} folds) using all index events not including the last event:".format(num_folds))
            print("{} mean:{} sd:{}".format(cols[i], np.mean(res_index_wo_levent[:, i]), np.std(res_index_wo_levent[:, i])))
            print()
    print("|" * 100)
    print()
    print()

In [None]:
def generate_perf_df(gdf, i):
    """Return a one-row DataFrame with the ROC AUC of `gdf`.

    Args:
        gdf: pandas DataFrame with the columns 'ref_target' and 'prob_target1'
            (passed to ``roc_auc_score`` as true labels and scores) plus
            'fold_id' and 'model_name', which are copied from its last row.
        i: value stored in the 'seq_len' column of the result.

    Returns:
        DataFrame with a single row and the columns 'seq_len', 'auc', 'fold_id'
        and 'model_name'. 'auc' is ``0`` when ``roc_auc_score`` raised.

    Raises:
        IndexError: if `gdf` has no rows (there is no last row to read from).
    """
    ref_target_levent = gdf['ref_target']
    prob_score_levent = gdf['prob_target1']
    # Best-effort AUC with a fallback of 0; a plain try/except (no `return`
    # inside `finally`) keeps interrupts and unrelated errors visible.
    try:
        auc_score = roc_auc_score(ref_target_levent, prob_score_levent)
    except Exception as err:
        print("exception is raised !!", err)
        auc_score = 0
    last_row = gdf.iloc[-1]
    return pd.DataFrame({'seq_len': [i],
                         'auc': [auc_score],
                         'fold_id': [last_row['fold_id']],
                         'model_name': [last_row['model_name']]})

In [None]:
# upper bounds on 'seq_len' for the cumulative buckets that get scored
seqs_len = list(range(1,11)) + [20]
# models whose decoded files are used as they are, without index/last-event filtering
flat_models = {'cnn', 'cnnwide', 'nn', 'LogisticRegression_l1', 'LogisticRegression_l2'}
# per-bucket result frames, concatenated once after the loops
perf_frames = []
for folder_name, model_prefix in model_desc:
    print("folder name: ", folder_name)
    print("model name: ", model_prefix)
    for fold in range(num_folds):
        print('fold num: ', fold)
        print()
        df = pd.read_csv(os.path.join(data_dir, "{}_decoded".format(folder_name), "{}_fold_{}_none_upd.txt".format(model_prefix, fold)),
                         sep="\t", header=0)
        if folder_name not in flat_models:
            df = df.loc[(df['index_event']==1) & (df['lastevent']==1)].copy() # use only last event
        # `categ` is the consecutive position stored in the 'seq_len' column of the
        # result (1..10, then 11 for the final bucket); `max_len` is the threshold
        # that is actually applied, so the final bucket really covers seq_len <= 20.
        for categ, max_len in enumerate(seqs_len, start=1):
            tmp = df.loc[df['seq_len'] <= max_len]
            if tmp.empty:
                # nothing to score for this bucket
                continue
            perf_frames.append(generate_perf_df(tmp, categ))
final_df = pd.concat(perf_frames, axis=0, ignore_index=True) if perf_frames else pd.DataFrame()

In [None]:
# seaborn is used by the plotting cell that follows
import seaborn as sns
# switch seaborn to its 'white' style
sns.set_style('white')

In [None]:
sns.set(font_scale = 1.4)
sns.set_style('white')
# models shown in the figure, paired with the legend text used for each of them;
# the same order is passed as hue_order so the relabelling below cannot get mixed up
legend_names = [('LogisticRegression_l1', 'LASSO'),
                ('RNNCRF_Pair_Labeler', 'RNNCRF (Pairwise potential)')]
hue_order = [model_name for model_name, _ in legend_names]
new_labels = [label for _, label in legend_names]
plot_df = final_df.loc[final_df['model_name'].isin(hue_order)]
plot_kwargs = dict(x="seq_len", y='auc', hue='model_name', hue_order=hue_order,
                   data=plot_df, aspect=1)
if hasattr(sns, 'catplot'):
    # newer seaborn: factorplot became catplot (point plot must be requested
    # explicitly) and `size` became `height`
    g = sns.catplot(kind='point', height=8, **plot_kwargs)
else:
    # older seaborn releases only provide factorplot
    g = sns.factorplot(size=8, **plot_kwargs)
g.set_ylabels('AUC', fontsize = 20)
g.set_xlabels("Patients' timeline length", fontsize=20)
g.set_xticklabels(['<= {}'.format(i) for i in list(range(1,11)) + [20]], fontsize=18)

# drop the legend title and replace the raw model names with readable labels
g._legend.set_title("")
for legend_text, label in zip(g._legend.texts, new_labels):
    legend_text.set_text(label)
    legend_text.set_fontsize(18)

# pyplot is imported at the top of the notebook; seaborn no longer exposes it as sns.plt
plt.show()

In [None]:
# write the figure to the project directory as an SVG file
svg_path = os.path.join(project_dir, 'performance_vs_seqlen.svg')
g.savefig(svg_path, format='svg')

### Comparing the decoded output of both LASSO and RNNCRF Pairwise potential

In [None]:
# traj_info = pd.read_csv(os.path.join(project_dir, 'dataset', 'traj_info.txt'), header=0, sep="\t")
# model_a = ('LogisticRegression_l1', 'LogisticRegression_last')
# model_b = ('rnncrfpair', 'RNNCRF_Pair_Labeler')
# for fold_num in range(5):
#     print("Fold number: ", fold_num)
#     df_a = pd.read_csv(os.path.join(data_dir, "{}_decoded".format(model_a[0]), "{}_fold_{}_none_upd.txt".format(model_a[-1], fold_num)),
#                        sep="\t", header=0)
#     df_b = pd.read_csv(os.path.join(data_dir, "{}_decoded".format(model_b[0]), "{}_fold_{}_none_upd.txt".format(model_b[-1], fold_num)),
#                        sep="\t", header=0)
#     df_b = df_b.loc[(df_b['index_event']==1) & (df_b['lastevent']==1)].copy()
#     # correct prediction users for RNNCRF Pair model
#     setcorrect_b = set(df_b.loc[df_b['ref_target'] == df_b['pred_target'], 'pid'])
#     # correct prediction users for Logistic regression model
#     setcorrect_a = set(df_a.loc[df_a['ref_target'] == df_a['pred_target'], 'pid'])
    
#     print("Number of users correctly predicted outcomes by RNNCRF model and not by Logistic regression: \n", 
#           len(setcorrect_b - setcorrect_a))
#     print("Number of users correctly predicted outcomes by Logistic regression and not by RNNCRF model: \n", 
#           len(setcorrect_a - setcorrect_b))
#     print("Number of users correctly predicted outcomes by both models: \n", 
#           len(setcorrect_b.intersection(setcorrect_a)))
#     print()
#     b_minus_a = setcorrect_b - setcorrect_a
#     print('Characteristics of users correctly predicted by RNNCRF model and not by Logistic regression:')
#     print(traj_info.loc[traj_info['nrd_visitlink'].isin(b_minus_a), ['female', 'age', 
#                                                                      'run_num_indxevents', 'seq_len',
#                                                                      'count_allcausereadmit']].describe())
#     print()
#     print("percentage of females:")
#     print(traj_info.loc[traj_info['nrd_visitlink'].isin(b_minus_a), 'female'].value_counts(normalize=True))
#     print()
#     print("percentage of allcause readmissions:")
#     print(traj_info.loc[traj_info['nrd_visitlink'].isin(b_minus_a), 'allcause_readmit'].value_counts(normalize=True))
#     print()
#     a_minus_b = setcorrect_a - setcorrect_b
#     print('Characteristics of users correctly predicted by Logistic regression and not by RNNCRF model:')
#     print(traj_info.loc[traj_info['nrd_visitlink'].isin(a_minus_b), ['female', 'age', 
#                                                                      'run_num_indxevents', 'seq_len',
#                                                                      'count_allcausereadmit']].describe())
#     print()
#     print("percentage of females:")
#     print(traj_info.loc[traj_info['nrd_visitlink'].isin(a_minus_b), 'female'].value_counts(normalize=True))
#     print()
#     print("percentage of allcause readmissions:")
#     print(traj_info.loc[traj_info['nrd_visitlink'].isin(a_minus_b), 'allcause_readmit'].value_counts(normalize=True))
#     print()
#     print("*"*50)