In [4]:
import pandas as pd
import numpy as np
import itertools
import warnings
import matplotlib.pyplot as plt
# import seaborn as sns

# NOTE: this silences every warning category for the rest of the session,
# including NumPy RuntimeWarnings such as "Mean of empty slice".
warnings.filterwarnings('ignore')

In [5]:
# Earlier single-file inspection, kept for reference only:
# model = "catboost-ve"
# dataset = "trauma_uk"
# results_file = "results/subgroup_statistics_{model}_{dataset}.csv".format(model=model, dataset=dataset)
# df = pd.read_csv(results_file, index_col="Unnamed: 0")
# display(df)

# Active model / dataset selection; `datasets` is iterated by the dataset-size
# cell. The wider selections are kept commented out below.
models = ["NN-dropout"]
datasets = ["trauma_uk"]
# models = ["catboost-ve", "NN-dropout"]
# datasets = ["diabetes", "trauma_uk", "ED_3day_readmit", "hospitalization_prediction"] #"critical_outcome",

In [15]:
def get_dataset_sizes(dataset, n_splits=10, data_dir="../data/cross_val_split"):
    """Count the rows of each cross-validation train/test feature file.

    For every fold ``i`` in ``range(n_splits)`` the files
    ``{data_dir}/{dataset}_train_X_{i}.csv`` and
    ``{data_dir}/{dataset}_test_X_{i}.csv`` are read and their row counts
    recorded.

    Parameters
    ----------
    dataset : str
        Dataset name used as the file-name prefix.
    n_splits : int, optional
        Number of folds to read (default 10, the previously hard-coded value).
    data_dir : str, optional
        Directory holding the split files (default is the previously
        hard-coded ``"../data/cross_val_split"``).

    Returns
    -------
    dict
        ``{"train_size": [...], "test_size": [...]}`` with one row count per
        fold, in fold order.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when a split file is missing.
    """
    import os

    file_template = "{0}_{1}_X_{2}.csv"
    train_sizes = []
    test_sizes = []
    for fold in range(n_splits):
        train_path = os.path.join(data_dir, file_template.format(dataset, "train", fold))
        test_path = os.path.join(data_dir, file_template.format(dataset, "test", fold))
        train_sizes.append(pd.read_csv(train_path).shape[0])
        test_sizes.append(pd.read_csv(test_path).shape[0])
    return {"train_size": train_sizes, "test_size": test_sizes}

# Collect the per-fold row counts of every configured dataset and summarise
# them (mean / std over folds) in a table with one row per dataset.
dataset_sizes = {}
size_df = {}
for dataset in datasets:
    sze = get_dataset_sizes(dataset)
    dataset_sizes[dataset] = sze

    train_counts = sze["train_size"]
    test_counts = sze["test_size"]
    size_df[dataset] = dict([
        ("train mean", np.mean(train_counts)),
        ("train std", np.std(train_counts)),
        ("test mean", np.mean(test_counts)),
        ("test std", np.std(test_counts)),
    ])

pd.DataFrame(size_df).T

Unnamed: 0,test mean,test std,train mean,train std
trauma_uk,5688.8,0.4,51199.2,0.4


In [16]:
#params

def read_results_file(model, dataset, group):
    """Load the subgroup-statistics CSV for one (group, model, dataset) triple.

    The file is expected at
    ``results/{group}/subgroup_statistics_{model}_{dataset}.csv`` relative to
    the working directory; its ``"Unnamed: 0"`` column becomes the index.
    """
    path_template = "results/{group}/subgroup_statistics_{model}_{dataset}.csv"
    csv_path = path_template.format(group=group, model=model, dataset=dataset)
    return pd.read_csv(csv_path, index_col="Unnamed: 0")

def get_sd_res(alpha_gain, n_bins, min_support, df):
    """Return the rows of ``df`` matching one hyper-parameter setting.

    A row is kept only when its ``alpha_gain``, ``n_bins`` and ``min_support``
    columns all equal the corresponding arguments.
    """
    selection = (
        df["alpha_gain"].eq(alpha_gain)
        & df["n_bins"].eq(n_bins)
        & df["min_support"].eq(min_support)
    )
    return df[selection]

def format_res(mean, std):
    """Render a mean/std pair as ``"<mean> (<std>)"`` rounded to 3 decimals."""
    rounded_mean, rounded_std = (np.around(value, 3) for value in (mean, std))
    return "{0} ({1})".format(rounded_mean, rounded_std)

# Table Generation Functions
Classification Accuracy
| Dataset | No. Learned Rules | Accuracy | Coverage |
|---------|-------------------|----------|----------|

2 Bin Accuracy
| Dataset | Bin 0 Accuracy  | Bin 1 Accuracy |
|---------|-----------------|----------------|

3 Bin Accuracy
| Dataset | Bin 0 Accuracy  | Bin 1 Accuracy | Bin 2 Accuracy |
|---------|-----------------|----------------|----------------|



In [17]:
def calculate_average_subgroup_size(model, dataset, group, alpha_gain, n_bins, min_support):
    """Mean and std of subgroup sizes, each as a fraction of its test fold.

    Loads the results file for ``(model, dataset, group)``, keeps the rows for
    the requested ``alpha_gain`` / ``n_bins`` / ``min_support`` setting and,
    for every row, divides each non-NaN ``"subgroup {g} size"`` value
    (``g`` in ``range(no_rules)``) by the test-fold size found at position
    ``iteration`` of the module-level ``dataset_sizes[dataset]["test_size"]``.

    Returns
    -------
    tuple
        ``(mean, std)`` of the collected fractions (population std, as given
        by ``np.std``). ``(nan, nan)`` when no subgroup size was collected.

    Raises
    ------
    KeyError
        If ``dataset`` is absent from ``dataset_sizes`` or an expected column
        is missing from the results file.
    """
    df = read_results_file(model, dataset, group)
    df = get_sd_res(alpha_gain, n_bins, min_support, df)
    fold_test_sizes = dataset_sizes[dataset]["test_size"]

    fractions = []
    for _, row in df.iterrows():
        n_rules = int(row["no_rules"])
        fold = int(row["iteration"])
        for rule_idx in range(n_rules):
            subgroup_size = row["subgroup {0} size".format(rule_idx)]
            # NaN entries are skipped rather than counted as zero.
            if not np.isnan(subgroup_size):
                fractions.append(subgroup_size / fold_test_sizes[fold])

    if not fractions:
        # Same result np.mean/np.std give on an empty array, without the
        # "Mean of empty slice" RuntimeWarning.
        return np.nan, np.nan

    fractions = np.asarray(fractions)
    return np.mean(fractions), np.std(fractions)



def generate_classification_accuracy_row(model, dataset, group, alpha_gain, n_bins, min_support):
    """Summarise the subgroup-discovery metrics of one model/dataset pair.

    Loads the results file, keeps the rows for the requested
    ``alpha_gain`` / ``n_bins`` / ``min_support`` setting and reports the mean
    and population std (``np.std``) of ``no_rules``,
    ``bin assignment accuracy overall`` and ``coverage`` over those rows,
    together with the relative subgroup size statistics from
    ``calculate_average_subgroup_size``.

    Returns
    -------
    dict
        One table row with keys ``dataset``, ``model``, ``no_rules``,
        ``no_rules std``, ``bin assignment accuracy``,
        ``bin assignment accuracy std``, ``coverage``, ``coverage std``,
        ``size mean`` and ``size std``.

    Raises
    ------
    KeyError
        If the results file lacks one of the summarised columns; the message
        names the missing columns and the model/dataset/group concerned.
    """
    required = ["dataset", "model", "no_rules", "bin assignment accuracy overall", "coverage"]

    df = read_results_file(model, dataset, group)
    df = get_sd_res(alpha_gain, n_bins, min_support, df)

    # Fail with an actionable message instead of pandas' bare
    # "[...] not in index" when the file was produced without these metrics.
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(
            "results for model={0!r}, dataset={1!r}, group={2!r} lack column(s) {3}".format(
                model, dataset, group, missing))
    df = df[required]

    sz_mean, sz_std = calculate_average_subgroup_size(
        model, dataset, group, alpha_gain, n_bins, min_support)

    return {
        "dataset": dataset,
        "model": model,
        "no_rules": np.mean(df["no_rules"]),
        "no_rules std": np.std(df["no_rules"]),
        "bin assignment accuracy": np.mean(df["bin assignment accuracy overall"]),
        "bin assignment accuracy std": np.std(df["bin assignment accuracy overall"]),
        "coverage": np.mean(df["coverage"]),
        "coverage std": np.std(df["coverage"]),
        "size mean": sz_mean,
        "size std": sz_std,
    }

def generate_classification_accuracy_tables(alpha_gain, n_bins, min_support, group, model, datasets):
    """Build the subgroup-discovery metrics table, one row per dataset.

    Each row comes from ``generate_classification_accuracy_row`` for the given
    model and hyper-parameter setting; rows follow the order of ``datasets``.
    """
    rows = [
        generate_classification_accuracy_row(model, dataset, group, alpha_gain, n_bins, min_support)
        for dataset in datasets
    ]
    return pd.DataFrame(rows)




In [18]:
def generate_nbin_accuracy_row(model, dataset, n_bins, group, alpha_gain, min_support):
    """Summarise per-bin classification accuracy for one model/dataset pair.

    Loads the results file, keeps the rows for the requested
    ``alpha_gain`` / ``n_bins`` / ``min_support`` setting and reports, for
    every bin ``b`` in ``range(n_bins)``, the mean and population std of the
    ``"classsification accuracy {b} by pred"`` column. For each pair of
    adjacent bins it also reports the mean and std of the row-wise accuracy
    difference (bin ``b`` minus bin ``b + 1``).

    All statistics are computed on pandas Series, so NaN entries are skipped
    consistently: a delta is averaged over the rows where both bins have a
    value instead of turning NaN as soon as a single row is incomplete.

    Returns
    -------
    dict
        Keys ``dataset``, ``model``, ``bin {b} accuracy``,
        ``bin {b} accuracy std``, ``Delta {b}-{b+1}`` and
        ``Delta {b}-{b+1} std``.
    """
    key = "pred"
    # The triple "s" is intentional: this string is used to index the columns
    # read from the results file, so it must not be "corrected" here.
    column_template = "classsification accuracy {0} by {1}"
    accuracy_cols = [column_template.format(bin_idx, key) for bin_idx in range(n_bins)]

    df = read_results_file(model, dataset, group)
    df = get_sd_res(alpha_gain, n_bins, min_support, df)
    df = df[["dataset", "model"] + accuracy_cols]

    to_return = {"dataset": dataset, "model": model}

    for bin_idx, col in enumerate(accuracy_cols):
        to_return["bin {0} accuracy".format(bin_idx)] = np.mean(df[col])
        to_return["bin {0} accuracy std".format(bin_idx)] = np.std(df[col])

    for start in range(n_bins - 1):
        end = start + 1
        # Both columns come from the same frame, so the subtraction is row-wise.
        delta = df[accuracy_cols[start]] - df[accuracy_cols[end]]
        to_return["Delta {0}-{1}".format(start, end)] = np.mean(delta)
        to_return["Delta {0}-{1} std".format(start, end)] = np.std(delta)

    return to_return

def generate_nbin_accuracy_accuracy_tables(alpha_gain, min_support, n_bins, group, model, datasets):
    """Build the per-bin accuracy table, one row per dataset.

    Each row comes from ``generate_nbin_accuracy_row`` for the given model and
    hyper-parameter setting; rows follow the order of ``datasets``.
    """
    rows = [
        generate_nbin_accuracy_row(model, dataset, n_bins, group, alpha_gain, min_support)
        for dataset in datasets
    ]
    return pd.DataFrame(rows)


In [19]:
def generate_workbook(alpha_gain, min_support, group, model, datasets, bin_counts=(2, 3)):
    """Build all summary tables for one configuration and save them to Excel.

    For every ``n`` in ``bin_counts`` two tables are produced:
    ``"{n} bin sd metrics"`` (via ``generate_classification_accuracy_tables``)
    and ``"{n} bin accuracy table"`` (via
    ``generate_nbin_accuracy_accuracy_tables``). All the "sd metrics" sheets
    come first, then all the "accuracy table" sheets.

    Parameters
    ----------
    alpha_gain, min_support
        Hyper-parameter values selecting the result rows.
    group : str
        Results sub-directory name; also part of the output file name.
    model : str
        Model name.
    datasets : iterable of str
        Datasets to include, one table row each.
    bin_counts : iterable of int, optional
        Bin counts to tabulate. Defaults to ``(2, 3)``, the values that were
        previously hard-coded.

    Returns
    -------
    dict
        Sheet name -> DataFrame, in the order the sheets are written.

    Side effects
    ------------
    Writes ``results/{group}_{model}_alpha-gain{alpha_gain}_min-support{min_support}.xlsx``.
    """
    bin_counts = list(bin_counts)
    wkbk = {}

    for n_bins in bin_counts:
        wkbk["{0} bin sd metrics".format(n_bins)] = generate_classification_accuracy_tables(
            alpha_gain=alpha_gain, n_bins=n_bins, min_support=min_support,
            group=group, model=model, datasets=datasets)

    for n_bins in bin_counts:
        wkbk["{0} bin accuracy table".format(n_bins)] = generate_nbin_accuracy_accuracy_tables(
            alpha_gain=alpha_gain, n_bins=n_bins, min_support=min_support,
            group=group, model=model, datasets=datasets)

    xlsx_fn = "results/{group}_{model}_alpha-gain{alpha_gain}_min-support{min_support}.xlsx".format(
        group=group, alpha_gain=alpha_gain, min_support=min_support, model=model)
    with pd.ExcelWriter(xlsx_fn) as writer:
        for sheet_name, table in wkbk.items():
            table.to_excel(writer, sheet_name=sheet_name)
    return wkbk


In [20]:
# code adapted from stack overflow

def gen_df_plot(df, datasets, n_bins):
    """Reshape a per-bin accuracy table into long format for plotting.

    ``datasets`` is a sequence of ``(dataset_key, display_name)`` pairs. For
    every row of ``df`` whose ``"dataset"`` equals ``dataset_key`` and every
    bin in ``range(n_bins)``, one output record is produced with the columns
    ``Dataset``, ``bin``, ``Classification Accuracy`` and ``std dev``.
    """
    records = [
        {
            "Dataset": display_name,
            "bin": "Bin {0}".format(bin_idx),
            "Classification Accuracy": row["bin {0} accuracy".format(bin_idx)],
            "std dev": row["bin {0} accuracy std".format(bin_idx)],
        }
        for dataset_key, display_name in datasets
        for _, row in df[df["dataset"] == dataset_key].iterrows()
        for bin_idx in range(n_bins)
    ]
    return pd.DataFrame(records)

def grouped_barplot(df, cat, subcat, val, err, figname, group=None, show_err=False):
    """Draw a hatched grouped bar chart and save it as a PNG.

    One cluster of bars is drawn per unique value of ``df[cat]`` and, inside
    each cluster, one bar per unique value of ``df[subcat]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table holding the ``cat``, ``subcat``, ``val`` and ``err``
        columns.
    cat, subcat, val, err : str
        Column names: cluster label, bar label, bar height and error size.
    figname : str
        File name (without extension) of the saved figure.
    group : str, optional
        Results sub-directory to save into. When omitted, the module-level
        variable ``group`` is used, which is what this function always read
        before the parameter existed.
    show_err : bool, optional
        Draw ``df[err]`` as error bars. Defaults to False, matching the
        previous behaviour where ``err`` was accepted but never drawn.

    Raises
    ------
    NameError
        If ``group`` is neither passed nor defined at module level.

    Side effects
    ------------
    Creates a new matplotlib figure and writes
    ``results/{group}/{figname}.png``.
    """
    if group is None:
        try:
            group = globals()["group"]
        except KeyError:
            raise NameError(
                "grouped_barplot: pass group=... or define a module-level `group`") from None

    plt.figure(figsize=(12.4, 4.8))
    hatches = ["/", "//", "///"]
    categories = df[cat].unique()
    x = np.arange(len(categories))
    subcategories = df[subcat].unique()
    n_sub = len(subcategories)

    # Bars of one cluster are spaced 1/(n_sub + 1) apart and centred on the
    # tick. Computing the width directly (instead of from np.diff of the
    # offsets) keeps it finite when there is only one sub-category.
    width = 1.0 / (n_sub + 1.0)
    offsets = (np.arange(n_sub) - (n_sub - 1) / 2.0) * width

    for i, sub in enumerate(subcategories):
        dfg = df[df[subcat] == sub]
        plt.bar(x + offsets[i],
                dfg[val].values,
                width=width,
                label="{}".format(sub),
                color="white",
                # Cycle through the hatch patterns so that more than three
                # sub-categories no longer raise IndexError.
                hatch=hatches[i % len(hatches)],
                edgecolor="black",
                yerr=dfg[err].values if show_err else None)
    plt.xlabel(cat)
    plt.ylabel(val)
    plt.xticks(x, categories)
    plt.legend()
    # plt.show()
    plt.savefig("results/{group}/{figname}.png".format(group=group, figname=figname))


In [21]:
# `import scipy` alone does not guarantee that the `stats` submodule is
# loaded on every SciPy version; import the submodule explicitly.
import scipy.stats


def calc_correlation_coef(x, y):
    """Return the Pearson correlation coefficient of ``x`` and ``y``.

    Fits an ordinary least-squares line with ``scipy.stats.linregress`` and
    returns ``(r_value, p_value)``. Note that ``r_value`` is r, not r squared;
    ``p_value`` is the p-value reported by ``linregress`` for the fitted
    slope.
    """
    _, _, r_value, p_value, _ = scipy.stats.linregress(x, y)
    return r_value, p_value



In [22]:
# Configuration for a single debug run.
groups = ["debug"]
alpha_gains = [0]
# Named in the plural so the loop variable `min_support` below does not
# overwrite the list it is drawn from.
min_supports = [1]
models = ["NN-dropout"]
# models = ["catboost-ve", "NN-dropout"]
datasets = ["trauma_uk"]
# datasets = ["trauma_uk", "diabetes", "critical_outcome", "ED_3day_readmit", "hospitalization_prediction"]
# datasets_plot = [("trauma_uk", "KY Trauma Triage"), 
#                 ("diabetes", "Pima Diabetes"), 
#                 ("critical_outcome", "Critical Outcome"), 
#                 ("ED_3day_readmit", "ED 3 Day Readmit"), 
#                 ("hospitalization_prediction", "Hosptialization Prediction")]
datasets_plot = [("trauma_uk", "KY Trauma Triage")]

combos = list(itertools.product(groups, alpha_gains, min_supports, models))
# The loop variables are module-level names; `group` must keep this name
# because grouped_barplot looks it up when saving figures.
for group, alpha_gain, min_support, model in combos:
    # print(group, alpha_gain, min_support, model)
    print(model)
    wkbk = generate_workbook(alpha_gain, min_support, group, model, datasets)
    # Disabled correlation / plotting steps (calc_correlation_coef returns r, not r^2):
    # r, p = calc_correlation_coef(x=wkbk["2 bin sd metrics"]["bin assignment accuracy"], y=wkbk["2 bin accuracy table"]["Delta 0-1"])
    # print("2 bin assignment accuracy vs Delta 0-1: r = {0}, p = {1}".format(r, p))
    # r, p = calc_correlation_coef(x=wkbk["3 bin sd metrics"]["bin assignment accuracy"], y=wkbk["3 bin accuracy table"]["Delta 0-1"])
    # print("3 bin assignment accuracy vs Delta 0-1: r = {0}, p = {1}".format(r, p))
    # r, p = calc_correlation_coef(x=wkbk["3 bin sd metrics"]["bin assignment accuracy"], y=wkbk["3 bin accuracy table"]["Delta 1-2"])
    # print("3 bin assignment accuracy vs Delta 1-2: r = {0}, p = {1}".format(r, p))
    # new_df = gen_df_plot(df=wkbk["2 bin accuracy table"], datasets=datasets_plot, n_bins=2)
    # grouped_barplot(df=new_df, cat = "Dataset", subcat = "bin", val = "Classification Accuracy", err="std dev", figname="{0}_nbins{1}".format(model, 2))
    # new_df = gen_df_plot(df=wkbk["3 bin accuracy table"], datasets=datasets_plot, n_bins=3)
    # grouped_barplot(df=new_df, cat = "Dataset", subcat = "bin", val = "Classification Accuracy", err="std dev", figname="{0}_nbins{1}".format(model, 3))

NN-dropout


KeyError: "['no_rules', 'bin assignment accuracy overall', 'coverage'] not in index"

In [None]:
# Configuration for the full batch run over every model and dataset.
groups = ["cut_final2"]
alpha_gains = [0]
# Named in the plural so the loop variable `min_support` below does not
# overwrite the list it is drawn from.
min_supports = [1]
models = ["catboost-ve", "NN-dropout"]
datasets = ["diabetes", "trauma_uk", "ED_3day_readmit", "hospitalization_prediction"] #"critical_outcome",

combos = list(itertools.product(groups, alpha_gains, min_supports, models))
for group, alpha_gain, min_support, model in combos:
    try:
        # print(group, alpha_gain, min_support, model)
        generate_workbook(alpha_gain, min_support, group, model, datasets)
    except Exception as exc:
        # Best effort: keep processing the remaining configurations, but
        # report what failed. Catching Exception (not a bare `except:`) lets
        # KeyboardInterrupt still stop the run.
        print("ERROR: ", group, alpha_gain, min_support, model, "->", repr(exc))

Unnamed: 0,dataset,train_size,test_size
0,trauma_uk,51199.2,5688.8
1,diabetes,691.2,76.8
2,critical_outcome,397293.3,44143.7
3,ED_3day_readmit,397293.3,44143.7
4,hospitalization_prediction,397293.3,44143.7
