In [9]:
%cd ~/dissertation_code
import pandas as pd
import numpy as np
from functools import reduce
# from sklearn import *
from sklearn.metrics import confusion_matrix
import pysubgroup as ps #test comment
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score
import sys, itertools
# from rulelist.rulelistmodel.categoricalmodel.categoricalrulelist import CategoricalRuleList
from rulelist import RuleList, SubgroupListCategorical
# from RuleList import SubgroupListCategorical
from rulelist.measures import subgroup_measures

# import categorical_rsd as crsd

/home/tntech.edu/kebrown46/dissertation_code


In [10]:
def read_data(iteration, model, dataset, n_bins, bin_split="qcut", include_probability=False):
    print("IN read_data")
    train_fn = "input/{0}/uncertainty-info_{1}-train_{2}.csv".format(iteration, model, dataset)
    test_fn = "input/{0}/uncertainty-info_{1}_{2}.csv".format(iteration, model, dataset)

    train_df = pd.read_csv(train_fn, index_col="Unnamed: 0")
    test_df = pd.read_csv(test_fn, index_col="Unnamed: 0")
    #drop truth column
    train_ycls = train_df["truth"]
    test_ycls = test_df["truth"]

    if not include_probability:
        train_df = train_df.drop(["p(positive class)", "truth", "ratio 1 predicted", "rate corrected predicted", "class uncertainty"], axis=1)
        test_df = test_df.drop(["p(positive class)", "truth", "ratio 1 predicted", "rate corrected predicted", "class uncertainty"], axis=1)
    else:
        train_df = train_df.drop(["truth", "ratio 1 predicted", "rate corrected predicted", "class uncertainty"], axis=1)
        test_df = test_df.drop(["truth", "ratio 1 predicted", "rate corrected predicted", "class uncertainty"], axis=1)

    # pysubgroup code for discrete
    train_df = train_df.sort_values(by="uncertainty", ascending=True)
    if bin_split == "cut":
        bins,cps = pd.cut(train_df['uncertainty'], duplicates="drop", bins=n_bins, retbins=True, labels=list(range(n_bins)))
        bins_,cps_ = pd.cut(train_df['uncertainty'], duplicates="drop", bins=n_bins, retbins=True)
    else:
        bins,cps = pd.qcut(train_df['uncertainty'], duplicates="drop", q=n_bins, retbins=True, labels=list(range(n_bins)))
        bins_,cps_ = pd.qcut(train_df['uncertainty'], duplicates="drop", q=n_bins, retbins=True)

    bins.rename("uncertainty group", inplace=True)
    bins_.value_counts().to_csv("RQ 3/results/bins_{dataset}_{model}_{bins}bins.csv".format(dataset=dataset, model=model, bins=n_bins))
    train_df["uncertainty group"] = bins
    #train_df = train_df.sample(frac=1)
    

    test_df["uncertainty group"] = pd.cut(test_df["uncertainty"], right=False, duplicates="drop", bins=cps, labels=False, include_lowest=True)
    # df.loc[df['A'] > 2, 'B'] = new_val

    print(cps)
    print(cps[0])
    print(test_df["uncertainty"] <= cps[0])
    test_df.loc[test_df["uncertainty"] >= cps[-1], "uncertainty group"] = len(cps)-2#.fillna(len(cps)-2, inplace=True)
    test_df.loc[test_df["uncertainty"] <= cps[0], "uncertainty group"] = 0#.fillna(0, inplace=True)


    # test_df.dropna(inplace=True)
    # test_df =  pd.concat([test_df, sub_1, sub_2], axis=0)


    print("!! UQ GROUP VALUES: ", np.unique(test_df["uncertainty group"].values))
    print(test_df.loc[np.isnan(test_df["uncertainty group"])])
    print(test_df[["uncertainty group", "uncertainty"]])
    print(np.unique(test_df["uncertainty group"], return_counts=True))

    train_df = train_df.drop(["uncertainty"], axis=1)
    test_df = test_df.drop(["uncertainty"], axis=1)

    return train_df, test_df, train_ycls, test_ycls

In [11]:
def _get_validation_curves(n_bins, iteration, model, dataset):
    train_df, test_df, train_ycls, test_ycls = read_data(iteration, 
                                                              model,
                                                              dataset, 
                                                              n_bins, 
                                                              bin_split="qcut", 
                                                              include_probability= False)
    train_df, val_df = train_test_split(train_df, test_size=0.2)
    
    print("In run_sgd")
    print("test_df.shape = ", test_df.shape)
    model_name = model
    features = list(test_df.columns)
    for x in ["uncertainty", "uncertainty group"]:
        while x in features: features.remove(x)

    trainX = train_df[features]
    valX = val_df[features]
    print(list(trainX))
    trainY_cat = train_df["uncertainty group"]
    valY_cat = val_df["uncertainty group"]
    
    testX = test_df[features]
    print(list(testX))
    testY_cat = test_df["uncertainty group"]
    
    
    results = {}
    alphas = [increment/100 for increment in range(0, 100, 5)]
    for alpha in alphas:
        args = {
                # "data": data,
                "max_rules": np.inf,
                "beam_width": 100,
                "min_support": 1,
                "beam_width": 1,
                "max_depth": 5, 
                "alpha_gain": alpha,
        }

        print(args)
        model = SubgroupListCategorical(**args)
        # SubgroupListCategorical(**args)
        model.fit(trainX, trainY_cat)
        print(model)
        pred = model.predict(testX)
        true = testY_cat.values.astype(int)
        pred = pred.astype(int)

        accuracy = accuracy_score(true, pred)
        no_rules = model.number_rules
        results[alpha] = {
            "alpha": alpha,
            "Validation Fidelity": accuracy,
            "No. Subgroups": no_rules
        }
    return results
        
def get_validation_curves(n_bins, model, dataset, res_key):
    
    for i in range(0,10):
        results = _get_validation_curves(n_bins=2, iteration=0, model=model, dataset=dataset)
        res_df = pd.DataFrame(results).transpose()
        res_df.to_csv(f"RQ 3/results/{res_key}/RSDTuning_{model}_{dataset}_{i}_bins{n_bins}.csv")
    
    

In [None]:
n_bins_list = [2,3]
model_list = ["NN-dropout", "catboost-ve"]
datasets = ["trauma_uk", "critical_outcome", "ED_3day_readmit", "hospitalization_prediction", "diabetes",]
eval_combos = itertools.product(n_bins_list, model_list, datasets)
for n, m, d in eval_combos:
    print(n,m,d)
    res = get_validation_curves(n_bins=n, model=m, dataset=d, res_key="dissertation_SG")

2 NN-dropout trauma_uk
IN read_data
[0.00074678 0.04454555 0.2501911 ]
0.00074677815
0       False
1       False
2       False
3       False
4       False
        ...  
5684    False
5685    False
5686    False
5687    False
5688    False
Name: uncertainty, Length: 5689, dtype: bool
!! UQ GROUP VALUES:  [0 1]
Empty DataFrame
Columns: [uncertainty, TRAUMATYPE, SYSBP, RR, GCS, EDMOTOR, SI, SIRANK, AGE, SEX, RTS, FlailAMPT, SCIAMPT, RRAMPT, GCSAMPT, ABDAMPT, ARMAMPT, CHESTAMPT, FACEAMPT, HEADAMPT, LEGAMPT, NECKAMPT, MultisystemAMPT, AnatomicalAMPT, PhysiologicalAMPT, LungAMPT, AMPT, Mechanism, RTSCode, AgeGroups, SBPCode, MotorCode, AMPT2, uncertainty group]
Index: []

[0 rows x 34 columns]
      uncertainty group  uncertainty
0                     0     0.014881
1                     0     0.032195
2                     0     0.021461
3                     1     0.044716
4                     0     0.011303
...                 ...          ...
5684                  1     0.052608
5685   

  for name, values in self.input_data.iteritems():
  self.categories = {colname: colvals.unique() for colname, colvals in target_values.iteritems()} #ignores NANs values
  for namecol, colvals in target_values.iteritems():


Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 10
Iteration: 11
Iteration: 12
Iteration: 13
Iteration: 14
If AMPT >= 1.0  THEN  usage = 16848 : target = uncertainty groupPr(0) = 0.13912630579297247;Pr(1) = 0.8608736942070275;
ELSE IF 0.0 <= ABDAMPT < 1.0  AND  0.0 <= HEADAMPT < 1.0  AND  0.0 <= CHESTAMPT < 1.0  AND  RTS >= 7.1082  THEN  usage = 12603 : target = uncertainty groupPr(0) = 0.9632627152265334;Pr(1) = 0.03673728477346663;
ELSE IF AGE >= 61.0  AND  Mechanism >= 1.0  AND  HEADAMPT >= 1.0  AND  0.0 <= ABDAMPT < 1.0  THEN  usage = 700 : target = uncertainty groupPr(0) = 0.018571428571428572;Pr(1) = 0.9814285714285714;
ELSE IF HEADAMPT >= 1.0  AND  0.0 <= LEGAMPT < 1.0  AND  AgeGroups < 5.0  AND  0.0 <= ARMAMPT < 1.0  AND  SI < 0.9219304079999999  THEN  usage = 3248 : target = uncertainty groupPr(0) = 0.75;Pr(1) = 0.25;
ELSE IF Mechanism >= 1.0  AND  AGE >= 36.0  AND  RTS >= 7.1082  THEN  usage = 153

  for name, values in self.input_data.iteritems():
  self.categories = {colname: colvals.unique() for colname, colvals in target_values.iteritems()} #ignores NANs values
  for namecol, colvals in target_values.iteritems():


Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 10
Iteration: 11
Iteration: 12
Iteration: 13
Iteration: 14
Iteration: 15
Iteration: 16
Iteration: 17
Iteration: 18
If AMPT >= 1.0  THEN  usage = 16848 : target = uncertainty groupPr(0) = 0.13912630579297247;Pr(1) = 0.8608736942070275;
ELSE IF 0.0 <= ABDAMPT < 1.0  AND  0.0 <= HEADAMPT < 1.0  AND  0.0 <= CHESTAMPT < 1.0  AND  RTS >= 7.1082  THEN  usage = 12603 : target = uncertainty groupPr(0) = 0.9632627152265334;Pr(1) = 0.03673728477346663;
ELSE IF AGE >= 61.0  AND  Mechanism >= 1.0  AND  HEADAMPT >= 1.0  AND  0.0 <= ABDAMPT < 1.0  THEN  usage = 700 : target = uncertainty groupPr(0) = 0.018571428571428572;Pr(1) = 0.9814285714285714;
ELSE IF HEADAMPT >= 1.0  AND  0.0 <= LEGAMPT < 1.0  AND  AgeGroups < 5.0  AND  0.0 <= Mechanism < 1.0  AND  0.0 <= SEX < 1.0  THEN  usage = 1744 : target = uncertainty groupPr(0) = 0.8245412844036697;Pr(1) = 0.17545871559633028;
E

  for name, values in self.input_data.iteritems():
  self.categories = {colname: colvals.unique() for colname, colvals in target_values.iteritems()} #ignores NANs values
  for namecol, colvals in target_values.iteritems():


Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 10
Iteration: 11
Iteration: 12
Iteration: 13
Iteration: 14
Iteration: 15
Iteration: 16
Iteration: 17
Iteration: 18
Iteration: 19
If AMPT >= 1.0  THEN  usage = 16848 : target = uncertainty groupPr(0) = 0.13912630579297247;Pr(1) = 0.8608736942070275;
ELSE IF 0.0 <= HEADAMPT < 1.0  AND  0.0 <= ABDAMPT < 1.0  AND  0.0 <= CHESTAMPT < 1.0  AND  0.0 <= RTSCode < 1.0  THEN  usage = 12523 : target = uncertainty groupPr(0) = 0.9639862652718997;Pr(1) = 0.036013734728100294;
ELSE IF AGE >= 61.0  AND  Mechanism >= 1.0  AND  HEADAMPT >= 1.0  AND  0.0 <= ABDAMPT < 1.0  THEN  usage = 700 : target = uncertainty groupPr(0) = 0.018571428571428572;Pr(1) = 0.9814285714285714;
ELSE IF HEADAMPT >= 1.0  AND  AGE < 47.0  AND  0.0 <= LEGAMPT < 1.0  AND  0.0 <= SEX < 1.0  AND  0.0 <= ARMAMPT < 1.0  THEN  usage = 2329 : target = uncertainty groupPr(0) = 0.7908973808501503;Pr(1) = 0.20910

  for name, values in self.input_data.iteritems():
  self.categories = {colname: colvals.unique() for colname, colvals in target_values.iteritems()} #ignores NANs values
  for namecol, colvals in target_values.iteritems():


Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 10
Iteration: 11
Iteration: 12
Iteration: 13
Iteration: 14
Iteration: 15
Iteration: 16
Iteration: 17
Iteration: 18
Iteration: 19
Iteration: 20
Iteration: 21
Iteration: 22
Iteration: 23
Iteration: 24
Iteration: 25
Iteration: 26
Iteration: 27
Iteration: 28
If AMPT >= 1.0  AND  0.0 <= PhysiologicalAMPT < 1.0  AND  MultisystemAMPT >= 1.0  THEN  usage = 6423 : target = uncertainty groupPr(0) = 0.027245835279464423;Pr(1) = 0.9727541647205356;
ELSE IF 0.0 <= AMPT < 1.0  AND  0.0 <= HEADAMPT < 1.0  AND  0.0 <= ABDAMPT < 1.0  AND  0.0 <= CHESTAMPT < 1.0  AND  0.0 <= RTSCode < 1.0  THEN  usage = 12523 : target = uncertainty groupPr(0) = 0.9639862652718997;Pr(1) = 0.036013734728100294;
ELSE IF AMPT2 >= 1.0  AND  0.0 <= MultisystemAMPT < 1.0  THEN  usage = 3249 : target = uncertainty groupPr(0) = 0.009541397353031703;Pr(1) = 0.9904586026469683;
ELSE IF AnatomicalAMPT >= 1

  for name, values in self.input_data.iteritems():
  self.categories = {colname: colvals.unique() for colname, colvals in target_values.iteritems()} #ignores NANs values
  for namecol, colvals in target_values.iteritems():


Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 10
Iteration: 11
Iteration: 12
Iteration: 13
Iteration: 14
Iteration: 15
Iteration: 16
Iteration: 17
Iteration: 18
Iteration: 19
Iteration: 20
Iteration: 21
Iteration: 22
Iteration: 23
Iteration: 24
Iteration: 25
Iteration: 26
If AMPT >= 1.0  AND  0.0 <= PhysiologicalAMPT < 1.0  AND  MultisystemAMPT >= 1.0  THEN  usage = 6423 : target = uncertainty groupPr(0) = 0.027245835279464423;Pr(1) = 0.9727541647205356;
ELSE IF AMPT2 >= 1.0  AND  0.0 <= MultisystemAMPT < 1.0  THEN  usage = 3249 : target = uncertainty groupPr(0) = 0.009541397353031703;Pr(1) = 0.9904586026469683;
ELSE IF 0.0 <= AMPT < 1.0  AND  0.0 <= HEADAMPT < 1.0  AND  0.0 <= ABDAMPT < 1.0  AND  0.0 <= CHESTAMPT < 1.0  AND  0.0 <= RTSCode < 1.0  THEN  usage = 12523 : target = uncertainty groupPr(0) = 0.9639862652718997;Pr(1) = 0.036013734728100294;
ELSE IF AnatomicalAMPT >= 1.0  AND  0.0 <= ABDAMPT < 1.

  for name, values in self.input_data.iteritems():
  self.categories = {colname: colvals.unique() for colname, colvals in target_values.iteritems()} #ignores NANs values
  for namecol, colvals in target_values.iteritems():


Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 10
Iteration: 11
Iteration: 12
Iteration: 13
Iteration: 14
Iteration: 15
Iteration: 16
Iteration: 17
Iteration: 18
Iteration: 19
Iteration: 20
Iteration: 21
Iteration: 22
Iteration: 23
Iteration: 24
Iteration: 25
Iteration: 26
Iteration: 27


In [None]:
print("DONE lol")