In [None]:
# first we define relevant directories
import sys
import os
import pickle
# Project root: the parent of the current working directory (the notebook is
# expected to be launched from a folder that sits directly under the root).
project_dir = os.path.abspath(os.pardir)

# Locations derived from the project root.
src_dir = os.path.join(project_dir, 'src')
dataset_dir = os.path.join(project_dir, "dataset")

# Put <project>/src at the front of the import path so the project modules
# imported by the following cells can be found.
sys.path.insert(0, src_dir)

print("dataset_dir: ", dataset_dir)
print("project_dir: ", project_dir)

In [None]:
from explore_hcupdata import *
from utilities import create_directory, ReaderWriter

In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# import seaborn as sns
# sns.set_style('white')

In [None]:
# read relevant data
def _read_dataset_pickle(fname):
    """Load the object stored under ``fname`` inside ``dataset_dir``."""
    return ReaderWriter.read_data(os.path.join(dataset_dir, fname))


# Column lists: CONT_COLS is handed to the normalisation helpers and
# COL_FEATURES selects the model input columns in the functions below.
CONT_COLS = _read_dataset_pickle('continuous_features.pkl')
COL_FEATURES = _read_dataset_pickle('col_features.pkl')
# NOTE(review): feat_label / code_feat are not used in the cells visible here;
# presumably feature lookup tables -- confirm against the preprocessing code.
feat_label = _read_dataset_pickle('feat_label.pkl')
code_feat = _read_dataset_pickle('code_feat.pkl')
# datasplit and traj_info are the inputs of get_datafolds further down.
datasplit = _read_dataset_pickle('datasplit.pkl')
traj_info = _read_dataset_pickle('traj_info.pkl')
# fsample is the table that construct_data_baseline filters by 'nrd_visitlink'.
fsample = _read_dataset_pickle('fsample.pkl')

In [None]:
## Generate baseline model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, roc_auc_score, \
                            brier_score_loss, average_precision_score
from sklearn.calibration import calibration_curve

def results_report(y_ref, y_pred, pred_prob, clf_name, dump_file=None):
    """Print a text summary of classification metrics and return the ROC AUC.

    Args:
        y_ref: reference labels; must expose ``.max()`` (the largest label is
            treated as the positive class for the Brier score).
        y_pred: hard class predictions, used for the classification report
            and the F1 scores.
        pred_prob: continuous scores for the positive class, used for the
            Brier score, average precision and ROC AUC.
        clf_name: accepted for interface compatibility; not used here.
        dump_file: optional path. When truthy, the tuple
            ``(weighted_f1, auc_score)`` is written there through
            ``ReaderWriter.dump_data``.

    Returns:
        The ROC AUC computed from ``y_ref`` and ``pred_prob``.
    """
    lsep = "\n"
    weighted_f1 = f1_score(y_ref, y_pred, average='weighted')
    micro_f1 = f1_score(y_ref, y_pred, average='micro')
    brierscore = brier_score_loss(y_ref, pred_prob, pos_label=y_ref.max())
    # Average precision summarises the precision-recall curve, so it needs the
    # continuous scores. Feeding it thresholded predictions (as was done
    # before) collapses the curve to a single operating point.
    avg_precrecall = average_precision_score(y_ref, pred_prob, average='micro')
    auc_score = roc_auc_score(y_ref, pred_prob)

    # (title, value) pairs in the order they appear in the printed report
    sections = [
        ("Classification report on last event:", classification_report(y_ref, y_pred)),
        ("weighted f1:", weighted_f1),
        ("micro f1:", micro_f1),
        ("brier score:", brierscore),
        ("average precision recall, method={}:".format('micro'), avg_precrecall),
        ("auc:", auc_score),
    ]
    report = "".join(title + lsep + str(value) + lsep for title, value in sections)
    report += "-"*30 + lsep + "*"*30 + lsep
    print(report)

    if dump_file:
        ReaderWriter.dump_data((weighted_f1, auc_score), dump_file)
    return auc_score
                                       
def collapse_avgfeature(gdf, target_outcome):
    """Reduce one group of rows to a single averaged feature vector.

    Used as the per-group callback for the ``'average'`` option of
    ``construct_data_baseline``.

    Args:
        gdf: DataFrame holding the rows of one group.
        target_outcome: name of the outcome column.

    Returns:
        A Series with the column-wise mean of ``COL_FEATURES`` plus one extra
        entry, ``target_outcome``, taken from the last row of ``gdf``.
    """
    collapsed = gdf[COL_FEATURES].mean()
    # the outcome is not averaged: it is read from the final row of the group
    final_row = gdf.iloc[-1]
    collapsed[target_outcome] = final_row[target_outcome]
    return collapsed

def construct_data_baseline(target_idx, fsample, target_outcome='allcause_readmit', collapse_option='last'):
    """Select rows for the given ids and collapse them to one row per id.

    Args:
        target_idx: collection of ``nrd_visitlink`` values to keep.
        fsample: DataFrame with a ``nrd_visitlink`` column; it is not modified
            (the selection is copied first).
        target_outcome: outcome column name, forwarded to
            ``collapse_avgfeature`` for the ``'average'`` option.
        collapse_option: ``'last'`` keeps the last row of every
            ``nrd_visitlink`` group, ``'average'`` applies
            ``collapse_avgfeature`` to every group. Any other value returns
            the selected rows without collapsing.

    Returns:
        The resulting DataFrame; for the two collapsing options the index is
        reset before returning.
    """
    selected = fsample.loc[fsample['nrd_visitlink'].isin(target_idx)].copy()
    if collapse_option not in ('last', 'average'):
        # unrecognised option: hand back the selection as it is
        return selected

    grouped = selected.groupby('nrd_visitlink', group_keys=False)
    if collapse_option == 'last':
        # NOTE(review): whether nth(-1) returns the group key as index or keeps
        # the original index depends on the pandas version -- confirm the
        # resulting columns against the version in use.
        collapsed = grouped.nth(-1)
    else:
        collapsed = grouped.apply(collapse_avgfeature, target_outcome)
    return collapsed.reset_index()

def generate_baseline_normalizer(dset, norm_option, cdir):
    """Compute normalisation info for the continuous columns and save it.

    The info is written to ``<cdir>/<norm_option>_info.pkl`` through
    ``ReaderWriter.dump_data``.

    Args:
        dset: dataset passed to ``get_feature_normalizer`` together with
            ``CONT_COLS``.
        norm_option: one of ``'standardize'``, ``'meanrange'`` or
            ``'rescale'``; selects the info class that wraps the two values
            returned by ``get_feature_normalizer``.
        cdir: directory that receives the pickle file.

    Raises:
        ValueError: if ``norm_option`` is not one of the supported values
            (this includes ``'none'``, for which there is nothing to save).
    """
    print("norm_option: ", norm_option)
    print(cdir)
    if norm_option == 'standardize':
        normalizer = GaussianNormalizerInfo
    elif norm_option == 'meanrange':
        normalizer = MeanRangeNormalizerInfo
    elif norm_option == 'rescale':
        normalizer = RescaleNormalizerInfo
    else:
        # Fail before doing any work; previously an unknown option only
        # surfaced later as an unbound-local error on `normalizer`.
        raise ValueError(
            "unsupported norm_option {!r}; expected 'standardize', 'meanrange' or 'rescale'".format(norm_option)
        )
    a, b = get_feature_normalizer(dset, CONT_COLS, norm_option)
    ReaderWriter.dump_data(normalizer(a, b), os.path.join(cdir, "{}_info.pkl".format(norm_option)))


In [None]:
def train_eval_baseline_models(datafolds, norm_option, collapse_option, clf_name, reg_type,
                               dataset_dir, wrkdir, dec_dir, target_outcome = 'allcause_readmit'):
    """Fit, select and test logistic-regression baselines for every fold.

    For each fold the datasets are processed in order:

    * train      -- fit one model per (solver, C, class_weight) combination;
    * validation -- score every fitted model with ``results_report`` (which
      returns the ROC AUC), keep the best one and save it under ``wrkdir``;
    * test       -- score the selected model, save the score under ``wrkdir``
      and write the per-row predictions under ``dec_dir`` via ``dump_df``.

    Args:
        datafolds: mapping fold name -> sequence of datasets ordered as
            (train, validation, test).
        norm_option: normalisation tag; unless it is ``'none'`` the stored
            ``<norm_option>_info.pkl`` is read from the fold directory and
            passed to ``apply_normalization``.
        collapse_option: collapse tag, only used to build names.
        clf_name: classifier label used in the decoded output.
        reg_type: penalty handed to ``LogisticRegression``.
        dataset_dir: parent directory of the per-fold dataset folders.
        wrkdir: directory receiving the selected model and the test score.
        dec_dir: directory receiving the decoded predictions.
        target_outcome: name of the outcome column.

    Returns:
        The score of the last dataset processed (the test score of the last
        fold when every fold carries all three datasets).
    """
    stage_names = ('train', 'validation', 'test')
    # candidate settings in fitting order: solver-major, then C, then weighting
    grid = [(solver, C, class_weight)
            for solver in ('liblinear', 'saga')
            for C in (1e-1, 1e-2, 1e-3)
            for class_weight in ('balanced',)]

    for fold_name in datafolds:
        print("fold name: ", fold_name)
        # model signature/name
        sign = "{}_{}_{}_{}.pkl".format(fold_name, norm_option, collapse_option, clf_name)
        print("model signature: ", sign)
        # dataset directory
        cdir = create_directory("{}_{}_{}".format(fold_name, norm_option, collapse_option), dataset_dir)
        candidates = []        # fitted models, in grid order
        candidate_specs = {}   # position in `candidates` -> printable spec
        for position, dset in enumerate(datafolds[fold_name]):
            stage = stage_names[position]
            print(stage + " dataset")
            print("dset shape: ", dset.shape)
            print()
            if norm_option != 'none':
                # apply normalization for continuous features
                # (return value is ignored, so presumably dset is updated in place -- TODO confirm)
                norm_info = ReaderWriter.read_data(os.path.join(cdir, "{}_info.pkl".format(norm_option)))
                apply_normalization(dset, CONT_COLS, norm_info)

            if stage == 'train':
                for solver, C, class_weight in grid:
                    model_spec = "solver:{}, reg_type:{}, C:{}, class_weight:{}".format(solver,
                                                                                        reg_type,
                                                                                        C,
                                                                                        class_weight)
                    print("model_spec: ", model_spec)
                    clf = LogisticRegression(solver=solver, penalty=reg_type,
                                             class_weight=class_weight, C=C)
                    clf.fit(dset[COL_FEATURES], dset[target_outcome])
                    candidate_specs[len(candidates)] = model_spec
                    candidates.append(clf)
            elif stage == 'validation':
                val_scores = []
                for candidate in candidates:
                    features = dset[COL_FEATURES]
                    y_pred = candidate.predict(features)
                    y_ref = dset[target_outcome]
                    pred_prob = candidate.predict_proba(features)[:, 1]
                    val_scores.append(results_report(y_ref, y_pred, pred_prob, 'logistic regression'))
                # model selection: highest validation score wins (first one on ties)
                best_idx = np.argmax(val_scores)
                trainedmodel = candidates[best_idx]
                print("trainedmodel: ", trainedmodel)
                print("bestmodel signature: ", candidate_specs[best_idx])
                # class name extracted from the "<class 'pkg.mod.Name'>" representation
                model_name = str(trainedmodel.__class__).split('.')[-1][:-2]
                model_path = os.path.join(wrkdir, "{}_{}_{}_{}.pkl".format(model_name,
                                                                           collapse_option,
                                                                           fold_name,
                                                                           norm_option))
                ReaderWriter.dump_data(trainedmodel, model_path)
                score = val_scores[best_idx]
            elif stage == 'test':
                features = dset[COL_FEATURES]
                y_pred = trainedmodel.predict(features)
                y_ref = dset[target_outcome]
                pred_prob = trainedmodel.predict_proba(features)[:, 1]
                dump_file = os.path.join(wrkdir, "{}_{}_{}_{}_{}_score.pkl".format(model_name,
                                                                                   collapse_option,
                                                                                   fold_name,
                                                                                   norm_option,
                                                                                   stage))
                score = results_report(y_ref, y_pred, pred_prob, 'logistic regression', dump_file=dump_file)
                # dump dataframe to disk
                decoded = dset[['nrd_visitlink', 'seq_len', 'index_event', target_outcome]].copy()
                decoded['pred_target'] = y_pred
                decoded['prob_target1'] = pred_prob
                decoded['model_name'] = clf_name
                decoded['fold_id'] = "{}_{}".format(fold_name, norm_option)
                decoded = decoded.rename(index=str,
                                         columns={"nrd_visitlink": "pid", target_outcome: 'ref_target'})
                fpath = os.path.join(dec_dir, "{}_{}_{}_{}.txt".format(clf_name,
                                                                       collapse_option,
                                                                       fold_name,
                                                                       norm_option))
                dump_df(decoded, fpath, sep="\t")
    return score

def dump_df(df, fpath, sep="\t"):
    """Append ``df`` to the text file ``fpath`` as delimiter-separated rows.

    A header line with the column labels is written first, followed by the
    rows (no index column, missing values rendered as ``NaN``). The file is
    opened in append mode, so calling this again with the same path adds
    another header + rows section instead of overwriting the file.

    Args:
        df: DataFrame to write.
        fpath: destination path.
        sep: field delimiter used for both the header and the rows.
    """
    # Build the header before touching the file so that a failure here cannot
    # leave a half-written file behind; str() keeps non-string column labels
    # (e.g. integers) from breaking the join.
    header = sep.join(str(col) for col in df.columns)
    with open(fpath, 'a') as f_out:
        f_out.write(header + "\n")
    df.to_csv(fpath, mode='a', index=False, header=False, sep=sep, na_rep='NaN')
    
# we can merge this function with the :func:`train_eval_baseline_models`
def test_baseline_models(datafolds, norm_option, collapse_option, clf_name, 
                         dataset_dir, model_dir, dec_dir, target_outcome = 'allcause_readmit'):
    dsettypes = ('test',)
    for fold_name in datafolds:
        print("fold name: ", fold_name)
        sign = "{}_{}_{}_{}.pkl".format(fold_name, norm_option, collapse_option, clf_name)
        print("model signature: ", sign)
        # dataset directory
        cdir = create_directory("{}_{}_{}".format(fold_name, norm_option, collapse_option), dataset_dir)
        # datafolds[fold_name] is a tuple of (train, validation, test)
        for i, dset in enumerate(datafolds[fold_name]): 
            dsettype =dsettypes[i]
            print(dsettype + " dataset")
            print("dset shape: ", dset.shape)
            print()
            # read model
            trainedmodel = ReaderWriter.read_data(os.path.join(model_dir, "{}_{}_{}_{}.pkl".format(clf_name,
                                                                                                   collapse_option
                                                                                                   fold_name,
                                                                                                   norm_option)))
            y_pred = trainedmodel.predict(dset[COL_FEATURES])
            y_ref = dset[target_outcome]
            pred_prob = trainedmodel.predict_proba(dset[COL_FEATURES])[:, 1]
            dump_file = os.path.join(model_dir, "{}_{}_{}_{}_{}_score.pkl".format(clf_name,
                                                                                  collapse_option,
                                                                                  fold_name,
                                                                                  norm_option,
                                                                                  dsettype))
            score = results_report(y_ref, y_pred, pred_prob, 'logistic regression', dump_file=dump_file)
            # dump dataframe to disk
            df = dset[['nrd_visitlink', 'seq_len', 'index_event', target_outcome]].copy()
            df['pred_target'] = y_pred
            df['prob_target1'] = pred_prob
            df['model_name'] = clf_name
            df['fold_id'] = "{}_{}".format(fold_name, norm_option)
            df.rename(index=str, columns={"nrd_visitlink": "pid", target_outcome:'ref_target'}, inplace=True)
            fpath = os.path.join(dec_dir, "{}_{}_{}_{}.txt".format(clf_name,
                                                                   collapse_option,
                                                                   fold_name,
                                                                   norm_option))
            dump_df(df, fpath, sep="\t")
    return(score)

### Generate dataset for training/testing baseline models

In [None]:
# Root folders for the collapsed baseline datasets and the decoded predictions.
baseline_dataset_dir = create_directory('dataset_baseline', project_dir)
decoded_dir = create_directory('decoded_output', project_dir)

# Settings used to name the per-fold folders (reused by the training cell).
norm_option = 'none'
collapse_option = 'last'
target_outcome = 'allcause_readmit'
dsettypes = ['train', 'validation', 'test']

# datafolds -> fold_id:(train_idx, val_idx, test_idx)
datafolds = get_datafolds(datasplit, traj_info, 0.2)

for fold_name in datafolds:
    # one folder per (fold, normalisation, collapse) combination
    dirname = "{}_{}_{}".format(fold_name, norm_option, collapse_option)
    cdir = create_directory(dirname, baseline_dataset_dir)
    for i, target_idx in enumerate(datafolds[fold_name]):
        # position in the fold tuple decides the file name of the split
        dsettype = dsettypes[i]
        dset = construct_data_baseline(target_idx, fsample,
                                       target_outcome=target_outcome,
                                       collapse_option=collapse_option)
        # save to disk
        split_path = os.path.join(cdir, "{}.pkl".format(dsettype))
        dset.to_pickle(split_path)

In [None]:
# train models with l1 and l2 regularization
clf_name = 'LogisticRegression'
wrkdir_f = create_directory('models_baseline', project_dir)
fold_names = ["fold_{}".format(i) for i in range(5)]
# fold name -> score returned by train_eval_baseline_models; the same dict is
# reused for both penalties, so after the loop it holds the 'l2' scores
res = {}
for reg_type in ('l1', 'l2'):
    print("{} with regularization type: {}".format(clf_name, reg_type))
    # separate model / decoded-output folders per regularization type
    wrkdir = create_directory('{}_{}'.format(clf_name, reg_type), wrkdir_f)
    dec_dir = create_directory('{}_{}_decoded'.format(clf_name, reg_type), decoded_dir)
    for fold_name in fold_names:
        dirname = "{}_{}_{}".format(fold_name, norm_option, collapse_option)
        cdir = create_directory(dirname, baseline_dataset_dir)
        # load the pickled splits of this fold in (train, validation, test) order
        folds = {
            fold_name: [ReaderWriter.read_data(os.path.join(cdir, "{}.pkl".format(dsettype)))
                        for dsettype in dsettypes]
        }
        res[fold_name] = train_eval_baseline_models(folds, norm_option, collapse_option, clf_name, reg_type,
                                                    baseline_dataset_dir, wrkdir, dec_dir,
                                                    target_outcome=target_outcome)
    print("results:\n", res)
    print("-" * 35)