In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree._tree import TREE_LEAF
from sklearn.model_selection import GridSearchCV, train_test_split
from tqdm import tqdm

In [2]:
# Read the COMPAS two-year recidivism export into a DataFrame (the file looks fairly clean).
# Data source: https://github.com/propublica/compas-analysis/blob/master/
compas_csv_path = "compas-scores-two-years.csv"
compas = pd.read_csv(filepath_or_buffer=compas_csv_path)
compas

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,...,1,Low,2013-03-26,,,2,0,1102,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,10996,steven butler,steven,butler,2013-11-23,Male,1992-07-17,23,Less than 25,African-American,...,5,Medium,2013-11-23,2013-11-22,2013-11-24,0,1,860,0,0
7210,10997,malcolm simmons,malcolm,simmons,2014-02-01,Male,1993-03-25,23,Less than 25,African-American,...,5,Medium,2014-02-01,2014-01-31,2014-02-02,0,1,790,0,0
7211,10999,winston gregory,winston,gregory,2014-01-14,Male,1958-10-01,57,Greater than 45,Other,...,1,Low,2014-01-14,2014-01-13,2014-01-14,0,0,808,0,0
7212,11000,farrah jean,farrah,jean,2014-03-09,Female,1982-11-17,33,25 - 45,African-American,...,2,Low,2014-03-09,2014-03-08,2014-03-09,3,0,754,0,0


In [3]:
# Same pre-processing as the "Loading Data" section of:
# https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb
# Only rows whose days_b_screening_arrest lies in [-30, 30] are kept (missing values are dropped too).
within_screening_window = compas["days_b_screening_arrest"].between(-30, 30)
compas = compas[within_screening_window]

# the displayed length should match the row count in the linked notebook
compas["sex"]

0         Male
1         Male
2         Male
5         Male
6         Male
         ...  
7209      Male
7210      Male
7211      Male
7212    Female
7213    Female
Name: sex, Length: 6172, dtype: object

In [4]:
# split the frame into the target column and all remaining columns
label_column = "two_year_recid"
compas_y = compas[label_column]
compas_X = compas.drop(columns=[label_column])

In [5]:
# swap the label values so that 1 encodes the favorable prediction and 0 the unfavorable one
flipped_label = {1: 0, 0: 1}
compas_y = compas_y.map(flipped_label)

In [6]:
# keep only the columns that carry useful information (id, name etc. are left out)
kept_columns = [
    "age", "c_charge_degree", "age_cat", "score_text", "sex", "priors_count",
    "days_b_screening_arrest", "decile_score", "race", "in_custody", "out_custody",
]
compas_X = compas_X[kept_columns]


In [7]:
# convert dates to just years and numerical types
def date_to_justyear(date):
    """Reduce a date string to its year.

    Args:
        date: A date string whose first four characters are the year (e.g. "2013-08-14"),
            or any non-string value (such as NaN for a missing date).

    Returns:
        The year as an int when date is a string; any other value is returned unchanged.
    """
    # isinstance (instead of an exact type comparison) also accepts str subclasses such as numpy.str_
    if isinstance(date, str):
        return int(date[:4])

    return date

# Replace both custody date columns by their year (missing dates stay missing).
# convert_dtype is no longer passed: True was already its default value and the
# keyword is deprecated in recent pandas releases, so dropping it only removes a warning.
for column in ["in_custody", "out_custody"]:
    compas_X[column] = compas_X[column].apply(date_to_justyear)
    

In [8]:
# set the sensitive attributes aside, then remove them from the feature frame
sensitive_columns = ["sex", "race", "age", "age_cat"]
compas_sex, compas_race, compas_age, compas_age_cat = (
    compas_X[sensitive_column] for sensitive_column in sensitive_columns
)
compas_X = compas_X.drop(columns=sensitive_columns)

In [9]:
# collapse race into the privileged group ("White") and the un-privileged group ("Non_White")
non_white_groups = ["African-American", "Hispanic", "Other", "Asian", "Native American"]
race_groups = {"Caucasian": "White", **{group: "Non_White" for group in non_white_groups}}
compas_race = compas_race.map(race_groups)

In [10]:
# build the intersectional sensitive feature "<sex>-<race>" for every row
sex_and_race = [compas_sex, compas_race]
compas_sexrace = pd.concat(sex_and_race, axis="columns")
compas_sex_race = compas_sexrace[["sex", "race"]].agg("-".join, axis="columns")
compas_sex_race

0         Male-Non_White
1         Male-Non_White
2         Male-Non_White
5         Male-Non_White
6             Male-White
              ...       
7209      Male-Non_White
7210      Male-Non_White
7211      Male-Non_White
7212    Female-Non_White
7213    Female-Non_White
Length: 6172, dtype: object

In [11]:
# missing values in numerical columns are replaced by the median of their column
column_medians = compas_X.median(numeric_only=True)
compas_X = compas_X.fillna(value=column_medians)

In [12]:
# discretise both numerical features into 5 equal-width intervals each
bins = 5
for binned_column in ("priors_count", "days_b_screening_arrest"):
    compas_X[binned_column] = pd.cut(compas_X[binned_column], bins=bins)

In [13]:
# one-hot encode every remaining column of the feature frame
columns_to_encode = list(compas_X.columns)
compas_cat_X = pd.get_dummies(compas_X, columns=columns_to_encode)
compas_cat_X

Unnamed: 0,c_charge_degree_F,c_charge_degree_M,score_text_High,score_text_Low,score_text_Medium,"priors_count_(-0.038, 7.6]","priors_count_(7.6, 15.2]","priors_count_(15.2, 22.8]","priors_count_(22.8, 30.4]","priors_count_(30.4, 38.0]",...,in_custody_2009,in_custody_2013,in_custody_2014,in_custody_2015,in_custody_2016,out_custody_2013,out_custody_2014,out_custody_2015,out_custody_2016,out_custody_2020
0,1,0,0,1,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
1,1,0,0,1,0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
2,1,0,0,1,0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
5,0,1,0,1,0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
6,1,0,0,0,1,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,1,0,0,0,1,1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
7210,1,0,0,1,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
7211,1,0,0,1,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
7212,0,1,0,1,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0


In [14]:
# make a train and test split with same proportional size as Adult dataset
adult_test_size = 1/3

# A single call splits the features, every sensitive attribute and the labels together.
# All parts therefore share one row partition by construction, instead of relying on
# five separate calls being seeded identically (and rebinding the labels five times).
(compas_train_cat_X, compas_test_cat_X,
 compas_train_sex, compas_test_sex,
 compas_train_race, compas_test_race,
 compas_train_age, compas_test_age,
 compas_train_sex_race, compas_test_sex_race,
 compas_train_y, compas_test_y) = train_test_split(
    compas_cat_X, compas_sex, compas_race, compas_age, compas_sex_race, compas_y,
    test_size=adult_test_size, random_state=42)

In [15]:
# give every split a fresh index; drop=True discards the old index instead of adding it as an "index" column
(compas_train_cat_X, compas_train_y, compas_train_sex,
 compas_train_race, compas_train_sex_race, compas_train_age) = (
    train_part.reset_index(drop=True)
    for train_part in (compas_train_cat_X, compas_train_y, compas_train_sex,
                       compas_train_race, compas_train_sex_race, compas_train_age)
)

(compas_test_cat_X, compas_test_y, compas_test_sex,
 compas_test_race, compas_test_sex_race, compas_test_age) = (
    test_part.reset_index(drop=True)
    for test_part in (compas_test_cat_X, compas_test_y, compas_test_sex,
                      compas_test_race, compas_test_sex_race, compas_test_age)
)

## PAFER
This section implements the PAFER algorithm, given as Algorithm 1 in the paper: https://arxiv.org/abs/2312.08413

In [16]:
def oracle(dataset, sens_dataset, rule, s_i, mechanism=None, epsilon=0.05, delta=0.001):
    """Returns some (differentially privatised) statistics on the sensitive attribute for the specified dataframe and rule.

    Args:
        dataset: The DataFrame that the developers own, which does not contain sensitive attributes.
            Used to calculate total quantities in (root) nodes.
        sens_dataset: A Series that the developers do not own, which contains the sensitive attributes.
            Combined sensitive attributes should be encoded as a Series, e.g. Black-Female
        rule: The rule for which to estimate the sensitive attribute.
            rule must be a pandas conditional expression as a string, e.g. "(adult_test_cat_X['marital-status_Married-civ-spouse']<= 0.5)".
            It is evaluated with pd.eval, so the frame named inside the rule should be the same frame as dataset.
        s_i: The sensitive attribute, its name comes from the ith element in the set S of sensitive attributes.
            s_i should thus be in sens_dataset.
        mechanism: The privacy mechanism used on the returned counts. Can be one of "gaussian", "laplacian", "exponential", None.
        epsilon: The privacy budget. Should be larger than 0, and at most 1 for the gaussian mechanism.
        delta: The privacy margin. Only used by the gaussian mechanism, for which it should be in (0, 1].

    Returns:
        The number of times s_i occurs in sens_dataset among the rows selected by rule,
        potentially privatised via the mechanism.

    Raises:
        ValueError: If epsilon is not larger than 0, or if delta or epsilon is outside (0, 1] for the gaussian mechanism.
        KeyError: If s_i does not occur in sens_dataset.
        """

    # check epsilon and delta parameters, naming the offending parameter in the message
    if epsilon <= 0:
        raise ValueError("The value of epsilon should be larger than 0")

    if mechanism == "gaussian":
        if delta <= 0 or delta > 1:
            raise ValueError("The value of delta should be in (0,1] when using the gaussian mechanism")
        if epsilon > 1:
            raise ValueError("The value of epsilon should be in (0,1] when using the gaussian mechanism")

    if not sens_dataset.isin([s_i]).any():
        raise KeyError("The requested sensitive attribute (s_i) is not in the sensitive dataframe (sens_dataset)")

    # the answer if no privacy mechanism is applied
    try:
        # engine might differ for your version, i.e. engine="pandas"
        no_mechanism = sens_dataset.loc[dataset[pd.eval(rule, engine='python')].index].value_counts(sort=False)[s_i]

    except KeyError:
        # s_i does not occur among the selected rows
        no_mechanism = 0

    if mechanism == "laplacian":
        # this is a histogram query so the l1-sensitivity = 1 as per Dwork & Roth
        sensitivity = 1
        return no_mechanism + np.random.laplace(loc=0, scale=sensitivity / epsilon)

    if mechanism == "gaussian":
        # sensitivity value kept from the original implementation
        # NOTE(review): confirm the l2-sensitivity of this query against Dwork & Roth
        sensitivity = 2

        # np.random.normal expects the standard deviation, i.e. sigma = sqrt(2 ln(1.25 / delta)) * sensitivity / epsilon,
        # not the variance-like term 2 * sensitivity^2 * ln(1.25 / delta) / epsilon^2
        sigma = np.sqrt(2 * np.log(1.25 / delta)) * sensitivity / epsilon
        return no_mechanism + np.random.normal(loc=0, scale=sigma)

    if mechanism == "exponential":
        # this query can only change by 1 if an instance is omitted so l1-sensitivity = 1
        sensitivity = 1

        # np.arange is [start, stop) so + 1 for entire possible range
        possible_values = np.arange(0, sens_dataset.loc[dataset[pd.eval(rule, engine='python')].index].value_counts().to_numpy().sum() + 1)

        # the utility is higher when the value is closer to the actual value
        utility_scores = (no_mechanism - np.abs(no_mechanism - possible_values)) / 100
        probabilities = np.exp(epsilon * utility_scores / (2 * sensitivity))

        # normalize probabilities to sum to 1
        probabilities = probabilities / np.linalg.norm(probabilities, ord=1)
        return np.random.choice(possible_values, p=probabilities)

    # if no mechanism is given, return the unprivatised count
    return no_mechanism


In [17]:
def statistical_parity(y_pred, sens_dataset):
    """Computes the Statistical Parity Ratio from predictions and the true sensitive feature values.

    Args:
        y_pred: The predictions, should be of same size as sens_dataset.
        sens_dataset: The Series with the sensitive attributes.

    Returns:
        The lowest acceptance rate over all sensitive groups divided by the highest one.
        """

    def accept_rate(group):
        # share of the group's members that receive a positive prediction
        members = sens_dataset == group
        return np.sum(members & y_pred) / np.sum(members)

    rates = [accept_rate(group) for group in sorted(sens_dataset.unique())]
    return min(rates) / max(rates)


def estimate_sp(pos_ruleset, dataset, sens_dataset, S, mechanism, epsilon, delta=0.001):
    """Returns the estimated Statistical Parity of a tree for a privacy mechanism. The PAFER algorithm.

    Args:
        pos_ruleset: A non-empty list of rules that classify favorably in the tree. This is the representation
            of the (relevant parts of the) tree. The variable name of the dataset is parsed from the first rule.
        dataset: The DataFrame that the developers own that does not contain sensitive feature values.
        sens_dataset: The Series that contains the sensitive features, which the developers do not own.
        S: The set/list of sensitive attributes, should all be in the sens_dataset attribute.
        mechanism: The mechanism with which to privatise the query answers.
        epsilon: The privacy budget for the privacy mechanism. Should be larger than 0.
            Half of it is spent on the group totals and half on the per-rule counts.
        delta: The privacy margin. Only used by the gaussian mechanism, for which it should be in (0, 1].

    Returns:
        The statistical parity ratio for the specified pos_ruleset.
        """

    poscounts_per_si = np.zeros(len(S))

    # the variable name of the current dataset is inferred from the ruleset
    datasetname = str(pos_ruleset[0].split('[')[0])[1:]

    # the base rule is a rule that includes all individuals, i.e. the condition is a tautology
    # in this case we select all rows that have a value that is in the set of possible values of the first column
    base_rule = f"({datasetname}[{datasetname}.columns[0]].isin({datasetname}[{datasetname}.columns[0]].unique()))"

    # query the size of each sensitive attribute in the dataset
    total_per_si = [oracle(dataset, sens_dataset, base_rule, s_i, mechanism, 0.5 * epsilon, delta) for s_i in S]

    # replace each invalid value with balanced totals
    # a total of exactly 0 is invalid as well: it would be used as a denominator below
    for i, tot in enumerate(total_per_si):
        if tot <= 0 or tot > len(sens_dataset):
            total_per_si[i] = (1 / len(S)) * len(sens_dataset)

    total_per_si = np.array(total_per_si)

    for rule in pos_ruleset:
        # for each rule we find the distribution of sensitive attributes
        rule_counts = np.zeros(len(S))

        # same evaluation engine as the oracle, so both evaluate the rule identically
        rule_total = len(sens_dataset[pd.eval(rule, engine='python')])

        for i, s_i in enumerate(S):
            # because the queries are disjoint, epsilon remains equal across queries
            answer = round(oracle(dataset, sens_dataset, rule, s_i, mechanism, 0.5 * epsilon, delta))

            # if invalid answers from query: replace with balanced node value
            if answer < 0 or answer > len(sens_dataset):
                answer = (1 / len(S)) * rule_total

            rule_counts[i] += answer

        # the distribution for the current rule is added to the total
        poscounts_per_si += rule_counts

    # calculate and return sp
    accept_rates = poscounts_per_si / total_per_si
    return np.min(accept_rates) / np.max(accept_rates)


## Tree construction pipeline

In [18]:
def find_best_tree(dataset, dataset_labels, minleaf=1, ccp_alpha=0.0):
    """Grid-search a decision tree and return the candidate with the best balanced accuracy.

    dataset: The training data (without sensitive attributes!).
    dataset_labels: The true outcomes for the prediction task.
    minleaf: Forwarded to the tree as min_samples_leaf, i.e. the minimum number of instances in a leaf node.
    ccp_alpha: Forwarded to the tree as ccp_alpha.

    Returns:
        The best estimator found by the 3-fold grid search."""

    # the tree is deliberately left unseeded because we want a different tree each run
    search = GridSearchCV(
        DecisionTreeClassifier(),
        param_grid={
            "criterion": ["entropy", "gini"],
            "max_features": ["sqrt", "log2"],
            "min_samples_leaf": [minleaf],
            "ccp_alpha": [ccp_alpha],
        },
        scoring='balanced_accuracy',
        n_jobs=2,
        cv=3,
        verbose=0,
    )
    search.fit(dataset, dataset_labels)
    return search.best_estimator_

best_tree = find_best_tree(compas_train_cat_X, compas_train_y, ccp_alpha=0.2)

In [24]:
# taken from: https://stackoverflow.com/a/51398390
def is_leaf(inner_tree, index):
    """Tell whether node `index` of `inner_tree` has neither a left nor a right child."""
    has_no_left_child = inner_tree.children_left[index] == TREE_LEAF
    has_no_right_child = inner_tree.children_right[index] == TREE_LEAF
    return has_no_left_child and has_no_right_child

def prune_index(inner_tree, decisions, index=0):
    """Recursively turn node `index` into a leaf when both children are leaves that decide like the node itself.

    inner_tree: The tree structure that is modified in-place.
    decisions: The decision of every node, indexable by node id.
    index: The node at which to start pruning.
    """
    left_child = inner_tree.children_left[index]
    right_child = inner_tree.children_right[index]

    # work bottom-up: pruning top-down could miss nodes that only become leaves during pruning
    for child in (left_child, right_child):
        if not is_leaf(inner_tree, child):
            prune_index(inner_tree, decisions, child)

    children_are_leaves = is_leaf(inner_tree, left_child) and is_leaf(inner_tree, right_child)
    if children_are_leaves and decisions[index] == decisions[left_child] == decisions[right_child]:
        # "unlinking" the children makes this node a leaf
        inner_tree.children_left[index] = TREE_LEAF
        inner_tree.children_right[index] = TREE_LEAF

def prune_duplicate_leaves(mdl):
    """Prune, in-place, the leaves of `mdl` whose sibling and parent all make the same decision."""
    # index of the largest value of every node, i.e. the decision of that node, as a flat list
    node_decisions = np.argmax(mdl.tree_.value, axis=2).flatten().tolist()
    prune_index(mdl.tree_, node_decisions)

# the tree is modified in-place, so there is nothing to assign
prune_duplicate_leaves(best_tree)

In [25]:
def positive_rules (tree, rules):
    """Select the rules whose node classifies favorably.

    Arg:
        tree: The tree classification object from which the rules are extracted.
        rules: Dict that maps a node id to its rule string.

    Returns:
        A list of all the rules that classify favorably, in the order of the dict."""

    favorable_rules = []
    for node_id, rule in rules.items():
        # argmax is 0 (falsy) when class 0 holds the largest value of the node and truthy otherwise;
        # class 1 was ensured to be the favorable outcome
        if np.argmax(tree.tree_.value[node_id][0]):
            favorable_rules.append(rule)

    return favorable_rules


In [26]:
# taken from: https://stackoverflow.com/a/56427596
def extract_pos_rules(tree, dataset, datasetname):
    """Extract the rules of all leaves reached by `dataset` and return those that classify favorably.

    Args:
        tree: The fitted tree classifier from which to extract the rules.
        dataset: The DataFrame that is passed through the tree to find the reached leaves.
            Its column names are used in the rule strings.
        datasetname: The variable name that the rule strings use to refer to the DataFrame.

    Returns:
        The list of rule strings returned by positive_rules. A rule is a parenthesised
        conjunction of threshold conditions; a path that only consists of one node yields '()'.
    """
    n_nodes = tree.tree_.node_count
    children_left = tree.tree_.children_left
    children_right = tree.tree_.children_right
    feature = tree.tree_.feature
    threshold = tree.tree_.threshold

    def find_path(node_numb, path, x):
        # depth-first search for node x below node_numb; on success path holds the visited nodes
        # that lead to x, on failure node_numb is removed from path again
        path.append(node_numb)
        if node_numb == x:
            return True
        left = False
        right = False
        # -1 marks a missing child
        if (children_left[node_numb] !=-1):
            left = find_path(children_left[node_numb], path, x)
        if (children_right[node_numb] !=-1):
            right = find_path(children_right[node_numb], path, x)
        if left or right :
            return True
        path.remove(node_numb)
        return False


    def get_rule(datasetname, path, column_names):
        # build "(cond & cond & ... )" from the split of every non-leaf node on the path;
        # a tab is appended after each condition as placeholder for the "&" separator
        mask = '('
        for index, node in enumerate(path):
            # check if we are not in the leaf
            if index!=len(path)-1:
                # under or over the threshold?
                if (children_left[node] == path[index+1]):
                    mask += f"{datasetname}['{column_names[feature[node]]}']<= {threshold[node]}\t "
                else:
                    mask += f"{datasetname}['{column_names[feature[node]]}']> {threshold[node]} \t "

        # insert the & at the right places
        # every tab but the last becomes "&", the last tab is removed
        mask = mask.replace("\t", "&", mask.count("\t") - 1)
        mask = mask.replace("\t", "")
        mask += ")"
        return mask
    
    # Leaves
    leave_id = tree.apply(dataset)

    paths = {}
    for leaf in np.unique(leave_id):
        path_leaf = []
        find_path(0, path_leaf, leaf)
        # get_rule treats consecutive entries as parent and child, so sorting assumes that
        # node ids increase along a path from the root - TODO confirm for the tree implementation
        paths[leaf] = np.unique(np.sort(path_leaf))

    rules = {}
    for key in paths:
        rules[key] = get_rule(datasetname, paths[key], [name for name in dataset.columns])
        
    return positive_rules(tree, rules)
        
extract_pos_rules(best_tree, compas_train_cat_X, "compas_train_cat_X")

['()']

## Experiments

In [22]:
def bootstrap(dataset, dataset_labels, sens_dataset):
    """A bootstrapping function that helps to diversify the tree generating process.

    Args:
        dataset: The DataFrame to sample rows from.
        dataset_labels: The labels, row-aligned with dataset.
        sens_dataset: The sensitive attribute values, row-aligned with dataset.

    Returns:
        A tuple (dataset, dataset_labels, sens_dataset) that holds the same rows of all three inputs,
        drawn with replacement and as many as dataset has rows.
    """
    n_rows = len(dataset.index)

    # draw row positions rather than index labels: the rows are selected with the positional .iloc,
    # which only coincides with label-based selection for a default 0..n-1 index
    positions = np.random.choice(n_rows, size=n_rows)

    return dataset.iloc[positions], dataset_labels.iloc[positions], sens_dataset.iloc[positions]

# dataset, labels, sens_dataset = bootstrap(compas_test_cat_X, compas_test_y, compas_test_sex)

In [23]:
def experiment(trainset, sens_trainset, trainsetname, trainset_labels, testset, sens_testset, testsetname, 
               testset_labels, epsilons=[0.05, 0.1, 0.15, 0.2, 0.25], minleaf=1, ccp_alpha=0.0, runs=5, combined=False):
    """Performs an experiment as described in the paper.

    trainset: The DataFrame containing the training instances.
    sens_trainset: The Series containing the sensitive attribute values for the trainset.
    trainsetname: The variable name of the trainset. Currently not used inside this function.
    trainset_labels: The true outcomes for the prediction task for the trainset.
    testset: The DataFrame containing the test instances.
    sens_testset: The Series containing the sensitive attribute values for the testset.
    testsetname: The variable name of the testset. Required because of rule evaluation.
    testset_labels: The true outcomes for the prediction task for the testset. Currently not used inside this function.
    epsilons: The different privacy budgets to try.
        epsilons must be a list. The default list is only read, never modified.
    minleaf: The tree construction parameter denoting the minimum number of instances in a leaf node.
    ccp_alpha: The tree construction parameter that is forwarded as ccp_alpha to the tree.
    runs: The number of runs to average over. Advised to be quite high (e.g. 50) to compensate for noise.
    combined: Whether to combine all positive rules into one query.

    Returns:
        Three arrays of shape (runs, len(epsilons)): the true SP of the tree of each run, the depth of
        that tree (both repeated for every epsilon) and the SP estimated with the laplacian mechanism."""

    tree_sps = np.zeros((runs, len(epsilons)))
    tree_depths = np.zeros((runs, len(epsilons)))
    estimated_sps = np.zeros((runs, len(epsilons)))

    # the sensitive groups of the testset are the same for every run and every epsilon
    sens_groups = sorted(sens_testset.unique())

    for i in range(runs):
        ruleset = []

        # keep bootstrapping until we find a ruleset that has at least one positive rule
        while ruleset == [] or ruleset == ['()']:
            # sample with replacement
            dataset, dataset_labels, _ = bootstrap(trainset, trainset_labels, sens_trainset)

            # build the tree on the bootstrap sample; fitting on the full trainset
            # would leave the sample that was just drawn unused
            best_tree = find_best_tree(dataset, dataset_labels, minleaf, ccp_alpha)

            # extract positive rules, phrased in terms of the testset variable
            prune_duplicate_leaves(best_tree)
            ruleset = extract_pos_rules(best_tree, trainset, testsetname)

        if combined:
            ruleset = [" | ".join(rule for rule in ruleset)]

        # calculate true SP
        tree_sps[i] = statistical_parity(best_tree.predict(testset), sens_testset)
        tree_depths[i] = best_tree.tree_.max_depth

        # apply PAFER
        for j, epsilon in enumerate(epsilons):
            estimated_sps[i, j] = estimate_sp(ruleset, testset, sens_testset, sens_groups, mechanism='laplacian', epsilon=epsilon)

    return tree_sps, tree_depths, estimated_sps
        

## MINLEAF EXPERIMENTS

In [None]:
# minleaf sweep with sex as the sensitive attribute
# TODO: current datastructures are not optimal or intuitive so could be helpful to streamline (for plotting)
runs = 50
minleafs = np.linspace(start=0.2, stop=0.001, num=80)

# one list per result type; every sweep step appends one entry to each of them
tree_sps, tree_depths, estimated_sps = [], [], []
for minleaf in tqdm(minleafs):
    t_sps, t_depths, e_sps = experiment(
        trainset=compas_train_cat_X, sens_trainset=compas_train_sex, trainsetname="compas_train_cat_X",
        trainset_labels=compas_train_y, testset=compas_test_cat_X, sens_testset=compas_test_sex,
        testsetname="compas_test_cat_X", testset_labels=compas_test_y, minleaf=minleaf, runs=runs)

    for collected, step_part in zip((tree_sps, tree_depths, estimated_sps), (t_sps, t_depths, e_sps)):
        collected.append(step_part)
    

In [None]:
# stack the collected results into arrays and write each array to its own file
tree_sps, tree_depths, estimated_sps = (np.array(collected) for collected in (tree_sps, tree_depths, estimated_sps))

results_by_name = {"tree_sps": tree_sps, "tree_depths": tree_depths, "estimated_sps": estimated_sps}
for name, arr in results_by_name.items():
    with open(f"compas-sex-minleaf-{name}", "wb") as f:
        np.save(f, arr)

In [None]:
# minleaf sweep with race as the sensitive attribute
runs = 50
minleafs = np.linspace(start=0.2, stop=0.001, num=80)

# one list per result type; every sweep step appends one entry to each of them
tree_sps, tree_depths, estimated_sps = [], [], []
for minleaf in tqdm(minleafs):
    t_sps, t_depths, e_sps = experiment(
        trainset=compas_train_cat_X, sens_trainset=compas_train_race, trainsetname="compas_train_cat_X",
        trainset_labels=compas_train_y, testset=compas_test_cat_X, sens_testset=compas_test_race,
        testsetname="compas_test_cat_X", testset_labels=compas_test_y, minleaf=minleaf, runs=runs)

    for collected, step_part in zip((tree_sps, tree_depths, estimated_sps), (t_sps, t_depths, e_sps)):
        collected.append(step_part)
    

In [None]:
# stack the collected results into arrays and write each array to its own file
tree_sps, tree_depths, estimated_sps = (np.array(collected) for collected in (tree_sps, tree_depths, estimated_sps))

results_by_name = {"tree_sps": tree_sps, "tree_depths": tree_depths, "estimated_sps": estimated_sps}
for name, arr in results_by_name.items():
    with open(f"compas-race-minleaf-{name}", "wb") as f:
        np.save(f, arr)

In [None]:
# minleaf sweep with the intersectional sex-race feature as the sensitive attribute
runs = 50
minleafs = np.linspace(start=0.2, stop=0.001, num=80)

# one list per result type; every sweep step appends one entry to each of them
tree_sps, tree_depths, estimated_sps = [], [], []
for minleaf in tqdm(minleafs):
    t_sps, t_depths, e_sps = experiment(
        trainset=compas_train_cat_X, sens_trainset=compas_train_sex_race, trainsetname="compas_train_cat_X",
        trainset_labels=compas_train_y, testset=compas_test_cat_X, sens_testset=compas_test_sex_race,
        testsetname="compas_test_cat_X", testset_labels=compas_test_y, minleaf=minleaf, runs=runs)

    for collected, step_part in zip((tree_sps, tree_depths, estimated_sps), (t_sps, t_depths, e_sps)):
        collected.append(step_part)
    

In [None]:
# stack the collected results into arrays and write each array to its own file
tree_sps, tree_depths, estimated_sps = (np.array(collected) for collected in (tree_sps, tree_depths, estimated_sps))

results_by_name = {"tree_sps": tree_sps, "tree_depths": tree_depths, "estimated_sps": estimated_sps}
for name, arr in results_by_name.items():
    with open(f"compas-sex_race-minleaf-{name}", "wb") as f:
        np.save(f, arr)

## CCP_ALPHA EXPERIMENTS

In [25]:
# ccp_alpha sweep with sex as the sensitive attribute
# TODO: current datastructures are not optimal or intuitive so could be helpful to streamline (for plotting)
runs = 50
ccp_alphas = np.linspace(start=0.05, stop=0.001, num=80)

# one list per result type; every sweep step appends one entry to each of them
tree_sps, tree_depths, estimated_sps = [], [], []
for ccp_alpha in tqdm(ccp_alphas):
    t_sps, t_depths, e_sps = experiment(
        trainset=compas_train_cat_X, sens_trainset=compas_train_sex, trainsetname="compas_train_cat_X",
        trainset_labels=compas_train_y, testset=compas_test_cat_X, sens_testset=compas_test_sex,
        testsetname="compas_test_cat_X", testset_labels=compas_test_y, ccp_alpha=ccp_alpha, runs=runs)

    for collected, step_part in zip((tree_sps, tree_depths, estimated_sps), (t_sps, t_depths, e_sps)):
        collected.append(step_part)
    

100%|█████████████████████████████████████████| 10/10 [11:04<00:00, 66.49s/it]


In [29]:
# stack the collected results into arrays and write each array to its own file
tree_sps, tree_depths, estimated_sps = (np.array(collected) for collected in (tree_sps, tree_depths, estimated_sps))

results_by_name = {"tree_sps": tree_sps, "tree_depths": tree_depths, "estimated_sps": estimated_sps}
for name, arr in results_by_name.items():
    with open(f"compas-sex-ccp-{name}", "wb") as f:
        np.save(f, arr)

In [31]:
# ccp_alpha sweep with race as the sensitive attribute
runs = 50
ccp_alphas = np.linspace(start=0.05, stop=0.001, num=80)

# one list per result type; every sweep step appends one entry to each of them
tree_sps, tree_depths, estimated_sps = [], [], []
for ccp_alpha in tqdm(ccp_alphas):
    t_sps, t_depths, e_sps = experiment(
        trainset=compas_train_cat_X, sens_trainset=compas_train_race, trainsetname="compas_train_cat_X",
        trainset_labels=compas_train_y, testset=compas_test_cat_X, sens_testset=compas_test_race,
        testsetname="compas_test_cat_X", testset_labels=compas_test_y, ccp_alpha=ccp_alpha, runs=runs)

    for collected, step_part in zip((tree_sps, tree_depths, estimated_sps), (t_sps, t_depths, e_sps)):
        collected.append(step_part)
    

[0.2        0.19748101 0.19496203 0.19244304 0.18992405 0.18740506
 0.18488608 0.18236709 0.1798481  0.17732911 0.17481013 0.17229114
 0.16977215 0.16725316 0.16473418 0.16221519 0.1596962  0.15717722
 0.15465823 0.15213924 0.14962025 0.14710127 0.14458228 0.14206329
 0.1395443  0.13702532 0.13450633 0.13198734 0.12946835 0.12694937
 0.12443038 0.12191139 0.11939241 0.11687342 0.11435443 0.11183544
 0.10931646 0.10679747 0.10427848 0.10175949 0.09924051 0.09672152
 0.09420253 0.09168354 0.08916456 0.08664557 0.08412658 0.08160759
 0.07908861 0.07656962 0.07405063 0.07153165 0.06901266 0.06649367
 0.06397468 0.0614557  0.05893671 0.05641772 0.05389873 0.05137975
 0.04886076 0.04634177 0.04382278 0.0413038  0.03878481 0.03626582
 0.03374684 0.03122785 0.02870886 0.02618987 0.02367089 0.0211519
 0.01863291 0.01611392 0.01359494 0.01107595 0.00855696 0.00603797
 0.00351899 0.001     ]


100%|█████████████████████████████████████████████████████████████| 80/80 [7:30:14<00:00, 337.68s/it]


In [None]:
# stack the collected results into arrays and write each array to its own file
tree_sps, tree_depths, estimated_sps = (np.array(collected) for collected in (tree_sps, tree_depths, estimated_sps))

results_by_name = {"tree_sps": tree_sps, "tree_depths": tree_depths, "estimated_sps": estimated_sps}
for name, arr in results_by_name.items():
    with open(f"compas-race-ccp-{name}", "wb") as f:
        np.save(f, arr)

In [None]:
# ccp_alpha sweep with the intersectional sex-race feature as the sensitive attribute
runs = 50
ccp_alphas = np.linspace(start=0.05, stop=0.001, num=80)

# one list per result type; every sweep step appends one entry to each of them
tree_sps, tree_depths, estimated_sps = [], [], []
for ccp_alpha in tqdm(ccp_alphas):
    t_sps, t_depths, e_sps = experiment(
        trainset=compas_train_cat_X, sens_trainset=compas_train_sex_race, trainsetname="compas_train_cat_X",
        trainset_labels=compas_train_y, testset=compas_test_cat_X, sens_testset=compas_test_sex_race,
        testsetname="compas_test_cat_X", testset_labels=compas_test_y, ccp_alpha=ccp_alpha, runs=runs)

    for collected, step_part in zip((tree_sps, tree_depths, estimated_sps), (t_sps, t_depths, e_sps)):
        collected.append(step_part)
    

In [51]:
# stack the collected results into arrays and write each array to its own file
tree_sps, tree_depths, estimated_sps = (np.array(collected) for collected in (tree_sps, tree_depths, estimated_sps))

results_by_name = {"tree_sps": tree_sps, "tree_depths": tree_depths, "estimated_sps": estimated_sps}
for name, arr in results_by_name.items():
    with open(f"compas-sex_race-ccp-{name}", "wb") as f:
        np.save(f, arr)