In [2]:
import pandas as pd
import numpy as np
from sklearn import *
import pysubgroup as ps
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score
import sys
sys.path.append("../../RQ 4/src/RobustSubgroupDiscovery")
from RSD.rulelist_class import MDLRuleList


In [3]:
# read in data
iteration=0
train = False
model = "catboost-ve"
model_str = model if not train else "{0}-train".format(model)
dataset = "trauma_uk" 
n_bins = 2

In [6]:

#uncertainty-info_catboost-ve_trauma_uk
#uncertainty-info_catboost-ve-train_trauma_uk
fn = "../../input/{0}/uncertainty-info_{1}_{2}.csv".format(iteration, model_str, dataset)
# fn = "../../input/VarianceMapperResults_raw_trauma_uk_catboost-ve.csv"
df = pd.read_csv(fn, index_col="Unnamed: 0")
#drop truth column
df = df.drop(["truth", "p(positive class)", "ratio 1 predicted", "rate corrected predicted", "class uncertainty",], axis=1)

features = list(df.columns)
features.remove("uncertainty")
print(features)

# pysubgroup code for discrete
df = df.sort_values(by="uncertainty", ascending=True)
bins = pd.qcut(df['uncertainty'], q=n_bins, labels=list(range(n_bins)))
bins.rename("uncertainty group", inplace=True)
display(bins)


df_new = pd.concat([df, bins], axis=1)

display(df_new)

# plt.hist(df_new["uncertainty"])

['TRAUMATYPE', 'SYSBP', 'RR', 'GCS', 'EDMOTOR', 'SI', 'SIRANK', 'AGE', 'SEX', 'RTS', 'FlailAMPT', 'SCIAMPT', 'RRAMPT', 'GCSAMPT', 'ABDAMPT', 'ARMAMPT', 'CHESTAMPT', 'FACEAMPT', 'HEADAMPT', 'LEGAMPT', 'NECKAMPT', 'MultisystemAMPT', 'AnatomicalAMPT', 'PhysiologicalAMPT', 'LungAMPT', 'AMPT', 'Mechanism', 'RTSCode', 'AgeGroups', 'SBPCode', 'MotorCode', 'AMPT2']


9501     0
1790     0
4996     0
13316    0
4673     0
        ..
11983    1
1926     1
15683    1
12157    1
2523     1
Name: uncertainty group, Length: 18774, dtype: category
Categories (2, int64): [0 < 1]

Unnamed: 0,uncertainty,TRAUMATYPE,SYSBP,RR,GCS,EDMOTOR,SI,SIRANK,AGE,SEX,...,PhysiologicalAMPT,LungAMPT,AMPT,Mechanism,RTSCode,AgeGroups,SBPCode,MotorCode,AMPT2,uncertainty group
9501,0.001973,0.0,68.0,0.0,3.0,1.0,1.000000,2.0,50.0,0.0,...,1.0,1.0,7.0,0.0,1.0,4.0,1.0,1.0,1.0,0
1790,0.003888,0.0,77.0,0.0,7.0,4.0,1.000000,2.0,23.0,0.0,...,1.0,1.0,6.0,0.0,1.0,4.0,1.0,1.0,1.0,0
4996,0.004764,0.0,107.0,0.0,3.0,1.0,0.962617,2.0,37.0,1.0,...,1.0,1.0,7.0,0.0,1.0,4.0,0.0,1.0,1.0,0
13316,0.005701,0.0,0.0,0.0,3.0,1.0,1.000000,2.0,39.0,0.0,...,1.0,1.0,6.0,0.0,1.0,4.0,1.0,1.0,1.0,0
4673,0.006005,0.0,107.0,0.0,3.0,1.0,0.831776,2.0,32.0,1.0,...,1.0,1.0,7.0,0.0,1.0,4.0,0.0,1.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11983,0.693147,0.0,162.0,18.0,15.0,6.0,0.438272,0.0,74.0,0.0,...,0.0,0.0,0.0,1.0,0.0,5.0,0.0,0.0,0.0,1
1926,0.693147,0.0,138.0,20.0,15.0,6.0,0.666667,1.0,47.0,1.0,...,0.0,0.0,1.0,0.0,0.0,4.0,0.0,0.0,0.0,1
15683,0.693147,0.0,111.0,24.0,15.0,6.0,0.909910,2.0,10.0,0.0,...,0.0,0.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,1
12157,0.693147,0.0,159.0,20.0,15.0,6.0,0.710692,2.0,63.0,0.0,...,0.0,0.0,1.0,0.0,0.0,5.0,0.0,0.0,0.0,1


In [7]:
def get_uncertainty_values(bin, df):
    sub = df[df["uncertainty group"] == bin]
    unc = sub["uncertainty"]
    return {
        "mean": np.mean(unc),
        "min": np.min(unc),
        "max": np.max(unc)
    }

for i in range(n_bins):
    res = get_uncertainty_values(i, df_new)
    print(res)

{'mean': 0.13807594604851217, 'min': 0.0019733170073992, 'max': 0.3378377713667632}
{'mean': 0.5400630730303991, 'min': 0.337919636222405, 'max': 0.6931471804592587}


In [8]:
train, test = train_test_split(df_new, train_size=0.8, shuffle=True)

trainX = train[features]
trainY_cat = train["uncertainty group"]
trainY_continuous = train["uncertainty"]

testX = test[features]
testY_cat = test["uncertainty group"]
testY_continuous = test["uncertainty"]

In [69]:
target_model = 'categorical'
task = "discovery"

# user configuration
disc_type = "static"
max_len = 5
beamsize = 100
ncutpoints = 5
max_rules = np.inf
y_label = "uncertainty group"



model = MDLRuleList(task = task, target_model = target_model, max_rules=max_rules, n_cutpoints=ncutpoints)
model.fit(trainX, trainY_cat)

print(model)

pred = model.predict(testX)

true = testY_cat.values.astype(int)
pred = pred.astype(int)
print(accuracy_score(true, pred))
print(model.number_rules)


  for name, values in self.input_data.iteritems():
  self.categories = {colname: colvals.unique() for colname, colvals in target_values.iteritems()} #ignores NANs values
  for namecol, colvals in target_values.iteritems():


Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 10
Iteration: 11
Iteration: 12
Iteration: 13
Iteration: 14
Iteration: 15
Iteration: 16
Iteration: 17
Iteration: 18
Iteration: 19
Iteration: 20
Iteration: 21
Iteration: 22
Iteration: 23
Iteration: 24
Iteration: 25
Iteration: 26
Iteration: 27
Iteration: 28
Iteration: 29
Iteration: 30
Iteration: 31
Iteration: 32
Iteration: 33
Iteration: 34
Iteration: 35
Iteration: 36
Iteration: 37
Iteration: 38
Iteration: 39
Iteration: 40
Iteration: 41
Iteration: 42
Iteration: 43
Iteration: 44
Iteration: 45
Iteration: 46
Iteration: 47
Iteration: 48
Iteration: 49
Iteration: 50
Iteration: 51
Iteration: 52
Iteration: 53
Iteration: 54
If 0.0 <= AMPT2 < 1.0  AND  MultisystemAMPT >= 1.0  AND  0.0 <= ARMAMPT < 1.0  AND  0.0 <= ABDAMPT < 1.0  THEN  usage = 403 : target = uncertainty groupPr(1) = 1.0;Pr(0) = 0.0;
ELSE IF ARMAMPT >= 1.0  AND  0.0 <= AMPT < 1.0  AND  LEGAMPT >= 1.0  THEN  u

In [60]:
print(model.number_rules)

67


In [61]:
target_model = 'gaussian'
task = "discovery"

# user configuration
disc_type = "static"
max_len = 5
beamsize = 100
ncutpoints = 5
y_label = "uncertainty"


model = MDLRuleList(task = task, target_model = target_model, n_cutpoints=ncutpoints)
model.fit(trainX, trainY_continuous)


print(model)

pred = model.predict(testX)

true = testY_cat.values.astype(float)
pred = pred.astype(float)
print(r2_score(testY_continuous, pred))
print(model.number_rules)

  for name, values in self.input_data.iteritems():


Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 10
Iteration: 11
Iteration: 12
Iteration: 13
Iteration: 14
Iteration: 15
Iteration: 16
Iteration: 17
Iteration: 18
Iteration: 19
Iteration: 20
Iteration: 21
Iteration: 22
Iteration: 23
Iteration: 24
Iteration: 25
Iteration: 26
Iteration: 27
Iteration: 28
Iteration: 29
Iteration: 30
Iteration: 31
Iteration: 32
Iteration: 33
Iteration: 34
Iteration: 35
Iteration: 36
Iteration: 37
Iteration: 38
Iteration: 39
Iteration: 40
Iteration: 41
Iteration: 42
Iteration: 43
Iteration: 44
Iteration: 45
Iteration: 46
Iteration: 47
Iteration: 48
Iteration: 49
Iteration: 50
Iteration: 51
Iteration: 52
Iteration: 53
Iteration: 54
Iteration: 55
Iteration: 56
Iteration: 57
Iteration: 58
Iteration: 59
Iteration: 60
Iteration: 61
Iteration: 62
Iteration: 63
Iteration: 64
Iteration: 65
Iteration: 66
Iteration: 67
Iteration: 68
Iteration: 69
Iteration: 70
Iteration: 71
Iteration: 72
I

NameError: name 'r2_score' is not defined

In [64]:
true = testY_cat.values.astype(float)
pred = pred.astype(float)
print(r2_score(testY_continuous, pred))
print(model.number_rules)

0.8190629824574085
92
[0.32856301 0.11190844 0.348603   ... 0.27956713 0.66559283 0.53952801]


In [11]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    "max_depth": 5,
    "beam_width": 100,
    "min_support": 1,
    "iterative_beam_width": 1,
    "n_cutpoints": 5,
    "discretization": "static",
    "max_rules": np.inf
}


gs = GridSearchCV(MDLRuleList("gaussian", "discovery"), param_grid, scoring=accuracy_score)

gs.fit(trainX, trainY_cat)



TypeError: If no scoring is specified, the estimator passed should have a 'score' method. The estimator <RSD.rulelist_class.MDLRuleList object at 0x16962c8e0> does not.