In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree._tree import TREE_LEAF
from sklearn.model_selection import GridSearchCV, train_test_split
from tqdm import tqdm

In [2]:
# The Adult data source: https://archive.ics.uci.edu/ml/datasets/adult
# The separator is a regular expression (a comma followed by whitespace), hence the python engine.
# It is written as a raw string so that `\s` is not parsed as an (invalid) string escape sequence.
adult_train = pd.read_csv("adult.data", sep=r',\s+', engine='python')
adult_test = pd.read_csv("adult.test", sep=r',\s+', engine='python')

In [3]:
# display the training data as loaded from adult.data
adult_train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,outcome
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [4]:
# remove incomplete instances: a row is kept only if none of its values equals '?'
adult_test = adult_test.loc[adult_test.ne('?').all(axis=1)]
adult_train = adult_train.loc[adult_train.ne('?').all(axis=1)]

# the fnlwgt column is uninformational and might cause overfitting, so it is removed
adult_test = adult_test.drop("fnlwgt", axis=1)
adult_train = adult_train.drop("fnlwgt", axis=1)

In [5]:
# split every frame into its label column ('outcome') and the remaining columns
adult_test_y = adult_test['outcome']
adult_test_X = adult_test.drop(columns=['outcome'])
adult_train_y = adult_train['outcome']
adult_train_X = adult_train.drop(columns=['outcome'])

In [6]:
# transform labels to 0 and 1, above 50k is favorable outcome
# the test mapping uses keys with a trailing '.', presumably because adult.test writes its labels that way
# NOTE: Series.map turns every value that is not a key of the mapping into NaN
adult_test_y = adult_test_y.map({'<=50K.': 0, '>50K.': 1})
adult_train_y = adult_train_y.map({'<=50K': 0, '>50K': 1})

In [7]:
# separate sensitive attributes from model data
# each sensitive column (sex, race, native-country, age) is first kept as its own Series ...
adult_train_sex = adult_train_X["sex"]
adult_test_sex = adult_test_X["sex"]
adult_train_race = adult_train_X["race"]
adult_test_race = adult_test_X["race"]
adult_train_nc = adult_train_X["native-country"]
adult_test_nc = adult_test_X["native-country"]
adult_train_age = adult_train_X["age"]
adult_test_age = adult_test_X["age"]
# ... and then removed from the feature frames, so the models never see it
adult_train_X = adult_train_X.drop(columns=["race", "sex", "native-country", "age"])
adult_test_X = adult_test_X.drop(columns=["race", "sex", "native-country", "age"])

In [8]:
# convert race data to binary white - non-white
# every race value other than "White" that is listed here is mapped to "Non_White";
# a value missing from the dict would become NaN (Series.map behaviour)
race_map_dict = {"White": "White", "Black": "Non_White", "Asian-Pac-Islander": "Non_White", "Other": "Non_White", "Amer-Indian-Eskimo": "Non_White"}
adult_train_race = adult_train_race.map(race_map_dict)
adult_test_race = adult_test_race.map(race_map_dict)
# show the resulting group sizes of the training set
adult_train_race.value_counts()

White        25933
Non_White     4229
Name: race, dtype: int64

In [9]:
# convert the intersectional sensitive attributes into a dataframe and then into a series for the training set
# the two columns are placed side by side and joined per row with '-', e.g. "Male-White"
adult_train_sexrace = pd.concat([adult_train_sex, adult_train_race], axis=1)
adult_train_sex_race = adult_train_sexrace[['sex', 'race']].agg('-'.join, axis=1)
# show the size of every intersectional group
adult_train_sex_race.value_counts()

Male-White          18038
Female-White         7895
Male-Non_White       2342
Female-Non_White     1887
dtype: int64

In [10]:
# convert the two sensitive attributes into a dataframe and then into a series for the test set
# same construction as for the training set: sex and race are joined per row with '-'
adult_test_sexrace = pd.concat([adult_test_sex, adult_test_race], axis=1)
adult_test_sex_race = adult_test_sexrace[['sex', 'race']].agg('-'.join, axis=1)
# show the size of every intersectional group
adult_test_sex_race.value_counts()

Male-White          8982
Female-White        3988
Male-Non_White      1165
Female-Non_White     925
dtype: int64

In [11]:
# bin the numerical features into 5 equal-width bins
bins = 5

# numerical source column -> name of the binned (categorical) column that is derived from it
binned_columns = {
    "education-num": "education-cat",
    "capital-gain": "capital-gain-cat",
    "capital-loss": "capital-loss-cat",
    "hours-per-week": "hours-per-week-cat",
}

for num_col, cat_col in binned_columns.items():
    # the intervals are determined on the trainset only
    adult_train_X[cat_col] = pd.cut(adult_train_X[num_col], bins=bins)
    train_intervals = adult_train_X[cat_col].cat.categories

    # use the same bins as the trainset to ensure we can predict later, but bin the TEST values:
    # previously the binned train values themselves were (index-aligned) assigned to the test frame.
    # test values outside the range seen in the trainset are clipped so they fall in the outermost bins
    test_values = adult_test_X[num_col].clip(lower=adult_train_X[num_col].min(),
                                             upper=adult_train_X[num_col].max())
    adult_test_X[cat_col] = pd.cut(test_values, bins=train_intervals)

In [12]:
# convert both datasets to one-hot encoding
adult_train_cat_X = pd.get_dummies(adult_train_X)

# the test frame is encoded separately, so a category that occurs in only one of the two sets would
# yield different columns; aligning on the train columns (missing ones become all-zero, extra ones
# are dropped) guarantees that a tree trained on the trainset can predict on the testset
adult_test_cat_X = pd.get_dummies(adult_test_X).reindex(columns=adult_train_cat_X.columns, fill_value=0)
adult_train_cat_X

Unnamed: 0,education-num,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,...,"capital-loss-cat_(-4.356, 871.2]","capital-loss-cat_(871.2, 1742.4]","capital-loss-cat_(1742.4, 2613.6]","capital-loss-cat_(2613.6, 3484.8]","capital-loss-cat_(3484.8, 4356.0]","hours-per-week-cat_(0.902, 20.6]","hours-per-week-cat_(20.6, 40.2]","hours-per-week-cat_(40.2, 59.8]","hours-per-week-cat_(59.8, 79.4]","hours-per-week-cat_(79.4, 99.0]"
0,13,2174,0,40,0,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,0
1,13,0,0,13,0,0,0,0,1,0,...,1,0,0,0,0,1,0,0,0,0
2,9,0,0,40,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
3,7,0,0,40,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
4,13,0,0,40,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,12,0,0,38,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
32557,9,0,0,40,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
32558,9,0,0,40,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
32559,9,0,0,20,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,0


In [13]:
# reindex all datasets
# rows were dropped earlier, so every frame/series gets a fresh 0..n-1 index (the old index is discarded);
# the features, labels and sensitive attributes of one split thereby share the same index again
adult_train_cat_X = adult_train_cat_X.reset_index(drop=True)
adult_train_y = adult_train_y.reset_index(drop=True)
adult_train_sex = adult_train_sex.reset_index(drop=True)
adult_train_race = adult_train_race.reset_index(drop=True)
adult_train_age = adult_train_age.reset_index(drop=True)
adult_train_nc = adult_train_nc.reset_index(drop=True)
adult_train_sex_race = adult_train_sex_race.reset_index(drop=True)

# same for the test split
adult_test_cat_X = adult_test_cat_X.reset_index(drop=True)
adult_test_y = adult_test_y.reset_index(drop=True)
adult_test_sex = adult_test_sex.reset_index(drop=True)
adult_test_race = adult_test_race.reset_index(drop=True)
adult_test_age = adult_test_age.reset_index(drop=True)
adult_test_nc = adult_test_nc.reset_index(drop=True)
adult_test_sex_race = adult_test_sex_race.reset_index(drop=True)

In [14]:
# display the one-hot encoded training features after the index reset
adult_train_cat_X

Unnamed: 0,education-num,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,...,"capital-loss-cat_(-4.356, 871.2]","capital-loss-cat_(871.2, 1742.4]","capital-loss-cat_(1742.4, 2613.6]","capital-loss-cat_(2613.6, 3484.8]","capital-loss-cat_(3484.8, 4356.0]","hours-per-week-cat_(0.902, 20.6]","hours-per-week-cat_(20.6, 40.2]","hours-per-week-cat_(40.2, 59.8]","hours-per-week-cat_(59.8, 79.4]","hours-per-week-cat_(79.4, 99.0]"
0,13,2174,0,40,0,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,0
1,13,0,0,13,0,0,0,0,1,0,...,1,0,0,0,0,1,0,0,0,0
2,9,0,0,40,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
3,7,0,0,40,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
4,13,0,0,40,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30157,12,0,0,38,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
30158,9,0,0,40,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
30159,9,0,0,40,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
30160,9,0,0,20,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,0


## PAFER

In [15]:
def oracle(dataset, sens_dataset, rule, s_i, mechanism=None, epsilon=0.05, delta=0.001):
    """Returns some (differentially privatised) statistics on the sensitive attribute for the specified dataframe and rule.
    
    Args:
        dataset: The DataFrame that the developers own, which does not contain sensitive attributes.
            It is indexed with the evaluated rule to find the individuals that the rule applies to.
        sens_dataset: A Series that the developers do not own, which contains the sensitive attributes. 
            Combined sensitive attributes should be encoded as a Series, e.g. Black-Female.
            It is looked up with the index labels of the selected rows of dataset.
        rule: The rule for which to estimate the sensitive attribute. 
            rule must be a pandas conditional expression as a string, e.g. "(adult_test_cat_X['marital-status_Married-civ-spouse']<= 0.5)".
            The rule is evaluated with pd.eval inside this function, so every variable it names has to be
            resolvable from here (a module-level variable, or one of this function's own arguments).
        s_i: The sensitive attribute, its name comes from the ith element in the set S of sensitive attributes.
            s_i should thus be in sens_dataset. 
        mechanism: The privacy mechanism used on the returned counts. Can be one of "gaussian", "laplacian", "exponential", None. 
            Any other value is treated like None, i.e. the unprivatised count is returned.
        epsilon: The privacy budget. Should be larger than 0, and at most 1 when using the gaussian mechanism.
        delta: The privacy margin. Only used by the gaussian mechanism, for which it should be in (0, 1]. 
        
    Returns:
        The number of times s_i occurs in sens_dataset among the rows selected by the rule, 
        potentially privatised via the mechanism. 

    Raises:
        ValueError: If epsilon (or, for the gaussian mechanism, delta) is outside its valid range.
        KeyError: If s_i does not occur in sens_dataset at all.
        """
        
    # check epsilon and delta parameters
    if epsilon <= 0:
        raise ValueError("The value of epsilon should be larger than 0")

    if mechanism == "gaussian":
        if epsilon > 1:
            raise ValueError("The value of epsilon should be in (0,1] when using the gaussian mechanism")
        if delta <= 0 or delta > 1:
            raise ValueError("The value of delta should be in (0,1] when using the gaussian mechanism")
    
    if not sens_dataset.isin([s_i]).any():
        raise KeyError("The requested sensitive attribute (s_i) is not in the sensitive dataframe (sens_dataset)")
        
    # the sensitive attribute values of all individuals that the rule applies to, evaluated only once
    # engine might differ for your version, i.e. engine="pandas"
    rule_mask = pd.eval(rule, engine='python')
    sens_in_rule = sens_dataset.loc[dataset[rule_mask].index]

    # the answer if no privacy mechanism is applied; s_i counts as 0 when it does not occur among
    # the selected rows. Errors raised while evaluating the rule itself are no longer silently
    # turned into a count of 0.
    no_mechanism = sens_in_rule.value_counts(sort=False).get(s_i, 0)
    
    if mechanism == "laplacian":
        # this is a histogram query so the l1-sensitivity = 1 as per Dwork & Roth 
        sensitivity = 1
        return no_mechanism + np.random.laplace(loc=0, scale=sensitivity / epsilon)
    
    elif mechanism == "gaussian":
        # this is a histogram query so the l2-sensitivity = 2 as per Dwork & Roth
        # NOTE(review): the value 2 is kept from the original code; confirm it against the reference
        sensitivity = 2

        # np.random.normal expects the standard deviation, so the square root of the variance
        # 2 * ln(1.25 / delta) * sensitivity^2 / epsilon^2 must be passed (previously the variance was passed)
        sigma = np.sqrt(2 * np.log(1.25 / delta)) * sensitivity / epsilon
        return no_mechanism + np.random.normal(loc=0, scale=sigma)
    
    elif mechanism == "exponential":
        # this query can only change by 1 if an instance is omitted so l1-sensitivity = 1
        sensitivity = 1
        
        # np.arange is [start, stop) so + 1 for entire possible range
        # count() ignores missing values, exactly like summing the value counts does
        possible_values = np.arange(0, sens_in_rule.count() + 1)
        
        # the utility is higher when the value is closer to the actual value
        utility_scores = (no_mechanism - np.abs(no_mechanism - possible_values)) / 100
        probabilities = np.exp(epsilon * utility_scores / (2 * sensitivity))
        
        # normalize probabilities to sum to 1 (all entries are positive, so the sum is the l1-norm)
        probabilities = probabilities / probabilities.sum()
        return np.random.choice(possible_values, p=probabilities)

    # if no mechanism is given, return the unprivatised count
    return no_mechanism


In [16]:
def statistical_parity(y_pred, sens_dataset):
    """Computes the Statistical Parity Ratio of a set of predictions.

    For every distinct value in sens_dataset the acceptance rate is the number of positive
    predictions within that group divided by the size of the group.

    Args:
        y_pred: The predictions, should be of same size as sens_dataset.
        sens_dataset: The Series with the sensitive attributes.

    Returns:
        The true statistical parity ratio, i.e. the lowest acceptance rate divided by the highest.
        """
    groups = sorted(sens_dataset.unique())

    # one acceptance rate per sensitive group
    accept_rates = [
        np.sum((sens_dataset == group) & y_pred) / np.sum(sens_dataset == group)
        for group in groups
    ]

    lowest, highest = min(accept_rates), max(accept_rates)
    return lowest / highest


def estimate_sp(pos_ruleset, dataset, sens_dataset, S, mechanism, epsilon, delta=0.001):
    """Returns the estimated Statistical Parity of a tree for a privacy mechanism. The PAFER algorithm. 
    
    Args:
        pos_ruleset: A list of rules that classify favorably in the tree. This is the representation of the
            (relevant parts of the) tree. Must contain at least one rule; the variable name of the 
            dataset is read from the first rule.
        dataset: The DataFrame that the developers own that does not contain sensitive feature values.
        sens_dataset: The Series that contains the sensitive features, which the developers do not own.
        S: The set/list of sensitive attributes, should all be in the sens_dataset attribute.
        mechanism: The mechanism with which to privatise the query answers. 
        epsilon: The privacy budget for the privacy mechanism. Should be larger than 0.
            One half is spent on the group totals, the other half on the rule queries.
        delta: The privacy margin. Only used by the gaussian mechanism, for which it should be in (0, 1].
        
    Returns:
        The statistical parity ratio for the specified pos_ruleset. 

    Raises:
        IndexError: If pos_ruleset is empty.
        """
    
    n_groups = len(S)
    n_individuals = len(sens_dataset)

    # the totals and the rule counts each get half of the privacy budget
    query_epsilon = 0.5 * epsilon

    poscounts_per_si = np.zeros(n_groups)
    
    # the variable name of the current dataset is inferred from the ruleset:
    # a rule starts with "(<name>[", so everything before the first '[' minus the leading '(' is the name
    datasetname = str(pos_ruleset[0].split('[')[0])[1:]
    
    # the base rule is a rule that includes all individuals, i.e. the condition is a tautology
    # in this case we select all rows that have a value that is in the set of possible values of the first column
    base_rule = f"({datasetname}[{datasetname}.columns[0]].isin({datasetname}[{datasetname}.columns[0]].unique()))"
    
    # query the size of each sensitive attribute in the dataset
    total_per_si = [oracle(dataset, sens_dataset, base_rule, s_i, mechanism, query_epsilon, delta) for s_i in S]
    
    # replace each invalid value with balanced totals
    for i, tot in enumerate(total_per_si):
        if tot < 0 or tot > n_individuals:
            total_per_si[i] = (1 / n_groups) * n_individuals
        
    total_per_si = np.array(total_per_si)
    
    for rule in pos_ruleset:
        # for each rule we find the distribution of sensitive attributes
        rule_counts = np.zeros(n_groups)

        # the number of individuals the rule applies to; the same engine as in oracle is used
        # so that a rule is parsed identically in both places
        rule_total = len(sens_dataset[pd.eval(rule, engine='python')])
        
        for i, s_i in enumerate(S):
            # because the queries are disjoint, epsilon remains equal across queries
            answer = round(oracle(dataset, sens_dataset, rule, s_i, mechanism, query_epsilon, delta))

            # if invalid answers from query: replace with balanced node value
            # NOTE(review): the upper bound is the dataset size, not rule_total; confirm this is intended
            if answer < 0 or answer > n_individuals:
                answer = (1 / n_groups) * rule_total

            rule_counts[i] += answer
        
        # the distribution for the current rule is added to the total
        poscounts_per_si += rule_counts
    
    # calculate and return sp
    accept_rates = poscounts_per_si / total_per_si
    return np.min(accept_rates) / np.max(accept_rates)


## Tree construction pipeline

In [17]:
def find_best_tree(dataset, dataset_labels, minleaf=1, ccp_alpha=0.01):
    """Grid-search a decision tree that performs best in terms of balanced accuracy.

    The search varies the split criterion and the max_features setting; minleaf and ccp_alpha
    are each passed as the single candidate for their parameter.

    Args:
        dataset: The training data (without sensitive attributes!).
        dataset_labels: The true outcomes for the prediction task.
        minleaf: The tree construction parameter denoting the minimum number of instances in a leaf node,
            forwarded as min_samples_leaf. The original documentation asks for a value in (0, 1].
            NOTE(review): the default is the integer 1, which scikit-learn reads as an absolute
            count rather than a fraction; confirm the intended range.
        ccp_alpha: The cost-complexity pruning parameter, forwarded as ccp_alpha.

    Returns:
        The best performing decision tree."""

    search_space = {
        "criterion": ["entropy", "gini"],
        "max_features": ["sqrt", "log2"],
        "min_samples_leaf": [minleaf],
        "ccp_alpha": [ccp_alpha],
    }

    # the estimator deliberately gets no random_state, because we want a different tree each run
    grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid=search_space,
                               scoring='balanced_accuracy', n_jobs=2, cv=3, verbose=0)
    grid_search.fit(dataset, dataset_labels)

    # the tree refit with the best parameter combination
    return grid_search.best_estimator_

# train a first tree on the full training set; ccp_alpha=0.2 overrides the function default of 0.01
best_tree = find_best_tree(adult_train_cat_X, adult_train_y, ccp_alpha=0.2)

In [18]:
# taken from: https://stackoverflow.com/a/51398390
def is_leaf(inner_tree, index):
    """Tell whether the node at position index has neither a left nor a right child."""
    left_child = inner_tree.children_left[index]
    right_child = inner_tree.children_right[index]
    return left_child == TREE_LEAF and right_child == TREE_LEAF

def prune_index(inner_tree, decisions, index=0):
    """Recursively unlink sibling leaves that make the same decision as their parent.

    Works in-place on the children arrays of inner_tree.

    Args:
        inner_tree: The low-level tree structure (e.g. the tree_ attribute of a fitted tree), 
            exposing children_left and children_right.
        decisions: The decision (class index) of every node, indexed by node id.
        index: The node id of the subtree root to prune, defaults to the root of the tree.
    """
    # a leaf has nothing to prune; without this guard its child ids (TREE_LEAF) would be used
    # as array indices below, which only worked by accident (e.g. for a single-node tree)
    if is_leaf(inner_tree, index):
        return

    # start pruning from the bottom - if we start from the top, we might miss
    # nodes that become leaves during pruning
    if not is_leaf(inner_tree, inner_tree.children_left[index]):
        prune_index(inner_tree, decisions, inner_tree.children_left[index])
    if not is_leaf(inner_tree, inner_tree.children_right[index]):
        prune_index(inner_tree, decisions, inner_tree.children_right[index])

    # prune children if both children are leaves now and make the same decision
    if (is_leaf(inner_tree, inner_tree.children_left[index]) and
        is_leaf(inner_tree, inner_tree.children_right[index]) and
        (decisions[index] == decisions[inner_tree.children_left[index]]) and 
        (decisions[index] == decisions[inner_tree.children_right[index]])):
        # turn node into a leaf by "unlinking" its children
        inner_tree.children_left[index] = TREE_LEAF
        inner_tree.children_right[index] = TREE_LEAF

def prune_duplicate_leaves(mdl):
    """Remove leaves in-place if all siblings make the same decision as their parent.

    Args:
        mdl: The fitted tree model whose tree_ attribute is pruned.
    """
    # the decision of a node is the index of the largest entry in its value array
    node_decisions = mdl.tree_.value.argmax(axis=2).flatten().tolist()

    prune_index(mdl.tree_, node_decisions)
    
# pruning happens in-place, i.e. best_tree itself is modified and nothing is returned
prune_duplicate_leaves(best_tree)

In [19]:
def positive_rules(tree, rules):
    """Select the rules whose node classifies favorably.

    Arg:
        tree: The tree classification object from which the rules are extracted. 
        rules: Dict that maps a node id to its rule string.

    Returns:
        A list of all the rules that classify favorably, in the iteration order of rules."""

    favorable = []
    for node_id, rule in rules.items():
        class_counts = tree.tree_.value[node_id][0]

        # argmax is truthy when the largest entry is not at index 0; index 1 corresponds to
        # class 1, which we ensured was the favorable outcome
        if np.argmax(class_counts):
            favorable.append(rule)

    return favorable


In [20]:
# taken from: https://stackoverflow.com/a/56427596
def extract_pos_rules(tree, dataset, datasetname):
    """Extract the favorably classifying rules of a tree as pandas expression strings.

    Args:
        tree: The fitted tree from which the rules are extracted.
        dataset: The DataFrame that is passed through the tree to find the leaves that are reached;
            its column names are used to name the features in the rules.
        datasetname: The variable name that the rule strings should refer to.

    Returns:
        The list of rules (one per reached leaf) that positive_rules marks as favorable.
    """
    # NOTE: n_nodes is not used in the remainder of this function
    n_nodes = tree.tree_.node_count
    children_left = tree.tree_.children_left
    children_right = tree.tree_.children_right
    feature = tree.tree_.feature
    threshold = tree.tree_.threshold

    def find_path(node_numb, path, x):
        """Depth-first search for node x starting at node_numb.

        The visited nodes are collected in path; nodes of a subtree that does not contain x
        are removed again. Returns True when x was found at or below node_numb."""
        path.append(node_numb)
        if node_numb == x:
            return True
        left = False
        right = False
        # a child id of -1 means that there is no child on that side
        if (children_left[node_numb] !=-1):
            left = find_path(children_left[node_numb], path, x)
        if (children_right[node_numb] !=-1):
            right = find_path(children_right[node_numb], path, x)
        if left or right :
            return True
        path.remove(node_numb)
        return False


    def get_rule(datasetname, path, column_names):
        """Turn a path of node ids into one parenthesised conjunction of threshold conditions."""
        mask = '('
        for index, node in enumerate(path):
            # check if we are not in the leaf
            if index!=len(path)-1:
                # under or over the threshold?
                # every condition ends in a tab, which is a placeholder for a possible "&"
                if (children_left[node] == path[index+1]):
                    mask += f"{datasetname}['{column_names[feature[node]]}']<= {threshold[node]}\t "
                else:
                    mask += f"{datasetname}['{column_names[feature[node]]}']> {threshold[node]} \t "

        # insert the & at the right places
        # all tabs but the last become "&", the last tab (after the final condition) is removed
        mask = mask.replace("\t", "&", mask.count("\t") - 1)
        mask = mask.replace("\t", "")
        mask += ")"
        return mask
    
    # Leaves
    # the id of the leaf in which every row of dataset ends up
    leave_id = tree.apply(dataset)

    # for every reached leaf: the node ids on the path from the root (node 0) to that leaf
    paths = {}
    for leaf in np.unique(leave_id):
        path_leaf = []
        find_path(0, path_leaf, leaf)
        # NOTE(review): sorting presumably relies on node ids increasing along a root-to-leaf path - confirm
        paths[leaf] = np.unique(np.sort(path_leaf))

    # one rule string per leaf, keyed by the leaf's node id
    rules = {}
    for key in paths:
        rules[key] = get_rule(datasetname, paths[key], [name for name in dataset.columns])
        
    return positive_rules(tree, rules)
        
# list the favorable rules of the pruned tree, expressed in terms of the training frame
extract_pos_rules(best_tree, adult_train_cat_X, "adult_train_cat_X")

[]

## PAFER Experiments

In [21]:
def bootstrap(dataset, dataset_labels, sens_dataset):
    """A bootstrapping function that helps to diversify the tree generating process.

    Draws as many rows as the dataset has, with replacement, and selects the same rows
    from the data, the labels and the sensitive attributes.

    Args:
        dataset: The DataFrame to sample from.
        dataset_labels: The labels, in the same row order as dataset.
        sens_dataset: The sensitive attribute values, in the same row order as dataset.

    Returns:
        A tuple (dataset sample, labels sample, sensitive attribute sample).
    """
    n_rows = len(dataset.index)

    # draw row POSITIONS, because they are applied with .iloc below; drawing index labels
    # (as was done before) is only correct when the index happens to be 0..n-1
    positions = np.random.choice(n_rows, size=n_rows)
    
    return dataset.iloc[positions], dataset_labels.iloc[positions], sens_dataset.iloc[positions]

# dataset, labels, sens_dataset = bootstrap(adult_test_cat_X, adult_test_y, adult_test_sex)

In [22]:
def experiment(trainset, sens_trainset, trainsetname, trainset_labels, testset, sens_testset, testsetname, 
               testset_labels, epsilons=[0.05, 0.1, 0.15, 0.2, 0.25], minleaf=1, ccp_alpha=0.0, runs=5, combined=False):
    """Performs an experiment as described in the paper.
    
    In every run a tree is trained on a bootstrap sample of the trainset, its favorable rules are
    extracted, and its statistical parity on the testset is both computed and estimated with PAFER
    (laplacian mechanism) for every privacy budget.

    Args:
        trainset: The DataFrame containing the training instances.
        sens_trainset: The Series containing the sensitive attribute values for the trainset.
        trainsetname: The variable name of the trainset. Currently not used by this function.
        trainset_labels: The true outcomes for the prediction task for the trainset.
        testset: The DataFrame containing the test instances.
        sens_testset: The Series containing the sensitive attribute values for the testset.
        testsetname: The variable name of the testset. Required because of rule evaluation.
        testset_labels: The true outcomes for the prediction task for the testset. 
            Currently not used by this function.
        epsilons: The different privacy budgets to try. 
            epsilons must be a list.
        minleaf: The tree construction parameter denoting the minimum number of instances in a leaf node.
            minleaf should be in (0, 1].
        ccp_alpha: The cost-complexity pruning parameter that is passed on to the tree construction.
        runs: The number of runs to average over. Advised to be quite high (e.g. 50) to compensate for noise.
        combined: Whether to combine all positive rules into one query. 
    
    Returns:
        A tuple (tree_sps, tree_depths, estimated_sps) of arrays with shape (runs, len(epsilons)):
        the true SP of the tree, the depth of the tree and the estimated SP. The true SP and
        the depth do not depend on epsilon, so they are repeated along the second axis."""
    
    
    tree_sps = np.zeros((runs, len(epsilons)))
    tree_depths = np.zeros((runs, len(epsilons)))
    estimated_sps = np.zeros((runs, len(epsilons)))

    # the sensitive groups are the same for every run and every epsilon
    sensitive_values = sorted(sens_testset.unique())

    for i in range(runs):
        ruleset = []
        
        # keep bootstrapping until we find a ruleset that has at least one positive rule
        # NOTE: this loop does not terminate if the parameters never yield a positive rule
        while ruleset == [] or ruleset == ['()']:
            # sample with replacement
            dataset, dataset_labels, _ = bootstrap(trainset, trainset_labels, sens_trainset)
            
            # build tree on the bootstrap sample (previously the sample was drawn but the
            # full trainset was used, so the bootstrap had no effect)
            best_tree = find_best_tree(dataset, dataset_labels, minleaf, ccp_alpha)
        
            # extract positive rules, phrased in terms of the testset variable
            prune_duplicate_leaves(best_tree)
            ruleset = extract_pos_rules(best_tree, dataset, testsetname)
        
        if combined:
            ruleset = [" | ".join(ruleset)]

        # calculate true SP
        tree_sps[i] = statistical_parity(best_tree.predict(testset), sens_testset)
        tree_depths[i] = best_tree.tree_.max_depth
        
        # apply PAFER
        for j, epsilon in enumerate(epsilons):
            estimated_sps[i, j] = estimate_sp(ruleset, testset, sens_testset, sensitive_values, mechanism='laplacian', epsilon=epsilon)
        
    return tree_sps, tree_depths, estimated_sps
        

## MINLEAF EXPERIMENTS

In [None]:
# script to run the experiments
# minleaf sweep with sex as the sensitive attribute: 80 values from 0.2 down to 0.001, 50 runs each
# TODO: current datastructures are not optimal or intuitive so could be helpful to streamline (for plotting)
minleafs = np.linspace(0.2, 0.001, 80)
runs = 50

# storage for results: one (runs, len(epsilons)) array per minleaf value
tree_sps = []
tree_depths = []
estimated_sps = []
for minleaf in tqdm(minleafs):
    t_sps, t_depths, e_sps = experiment(adult_train_cat_X, adult_train_sex, "adult_train_cat_X", adult_train_y, 
                                         adult_test_cat_X, adult_test_sex, "adult_test_cat_X", adult_test_y, minleaf=minleaf, runs=runs)
    
    tree_sps.append(t_sps)
    tree_depths.append(t_depths)
    estimated_sps.append(e_sps)
    

In [None]:
# stack the per-minleaf results into single arrays
tree_sps = np.array(tree_sps)
tree_depths = np.array(tree_depths)
estimated_sps = np.array(estimated_sps)

# write every array to its own file (np.save format, file name without extension)
for arr, name in zip([tree_sps, tree_depths, estimated_sps], ["tree_sps", "tree_depths", "estimated_sps"]):
    with open(f"adult-sex-minleaf-{name}", "wb") as f:
        np.save(f, arr)

In [None]:
# minleaf sweep with race as the sensitive attribute: 80 values from 0.2 down to 0.001, 50 runs each
minleafs = np.linspace(0.2, 0.001, 80)
runs = 50

# storage for results: one (runs, len(epsilons)) array per minleaf value
tree_sps = []
tree_depths = []
estimated_sps = []
for minleaf in tqdm(minleafs):
    t_sps, t_depths, e_sps = experiment(adult_train_cat_X, adult_train_race, "adult_train_cat_X", adult_train_y, 
                                         adult_test_cat_X, adult_test_race, "adult_test_cat_X", adult_test_y, minleaf=minleaf, runs=runs)
    
    tree_sps.append(t_sps)
    tree_depths.append(t_depths)
    estimated_sps.append(e_sps)
    

In [None]:
# stack the per-minleaf results into single arrays
tree_sps = np.array(tree_sps)
tree_depths = np.array(tree_depths)
estimated_sps = np.array(estimated_sps)

# write every array to its own file (np.save format, file name without extension)
for arr, name in zip([tree_sps, tree_depths, estimated_sps], ["tree_sps", "tree_depths", "estimated_sps"]):
    with open(f"adult-race-minleaf-{name}", "wb") as f:
        np.save(f, arr)

In [None]:
# minleaf sweep with the intersectional sex-race attribute: 80 values from 0.2 down to 0.001, 50 runs each
minleafs = np.linspace(0.2, 0.001, 80)
runs = 50

# storage for results: one (runs, len(epsilons)) array per minleaf value
tree_sps = []
tree_depths = []
estimated_sps = []
for minleaf in tqdm(minleafs):
    t_sps, t_depths, e_sps = experiment(adult_train_cat_X, adult_train_sex_race, "adult_train_cat_X", adult_train_y, 
                                         adult_test_cat_X, adult_test_sex_race, "adult_test_cat_X", adult_test_y, minleaf=minleaf, runs=runs)
    
    tree_sps.append(t_sps)
    tree_depths.append(t_depths)
    estimated_sps.append(e_sps)
    

In [None]:
# stack the per-minleaf results into single arrays
tree_sps = np.array(tree_sps)
tree_depths = np.array(tree_depths)
estimated_sps = np.array(estimated_sps)

# write every array to its own file (np.save format, file name without extension)
for arr, name in zip([tree_sps, tree_depths, estimated_sps], ["tree_sps", "tree_depths", "estimated_sps"]):
    with open(f"adult-sex_race-minleaf-{name}", "wb") as f:
        np.save(f, arr)

## CCP_ALPHA EXPERIMENTS

In [23]:
# script to run the experiments
# ccp_alpha sweep with sex as the sensitive attribute: 80 values from 0.05 down to 0.001, 50 runs each
ccp_alphas = np.linspace(0.05, 0.001, 80)
runs = 50

# storage for results: one (runs, len(epsilons)) array per ccp_alpha value
tree_sps = []
tree_depths = []
estimated_sps = []
for ccp_alpha in tqdm(ccp_alphas):
    t_sps, t_depths, e_sps = experiment(adult_train_cat_X, adult_train_sex, "adult_train_cat_X", adult_train_y, 
                                         adult_test_cat_X, adult_test_sex, "adult_test_cat_X", adult_test_y, ccp_alpha=ccp_alpha, runs=runs)
    
    tree_sps.append(t_sps)
    tree_depths.append(t_depths)
    estimated_sps.append(e_sps)
    

100%|██████████████████████████████████████████| 10/10 [18:04<00:00, 108.45s/it]


In [230]:
# stack the per-ccp_alpha results into single arrays
tree_sps = np.array(tree_sps)
estimated_sps = np.array(estimated_sps)
tree_depths = np.array(tree_depths)

# write every array to its own file (np.save format, file name without extension)
# a comma was missing between "tree_depths" and "estimated_sps": the two literals were concatenated
# into one name, so the depths were saved under a wrong name and estimated_sps was not saved at all
for arr, name in zip([tree_sps, tree_depths, estimated_sps], ["tree_sps", "tree_depths", "estimated_sps"]):
    with open(f"adult-sex-ccp-{name}", "wb") as f:
        np.save(f, arr)

In [None]:
# ccp_alpha sweep with race as the sensitive attribute: 80 values from 0.05 down to 0.001, 50 runs each
ccp_alphas = np.linspace(0.05, 0.001, 80)
runs = 50

# storage for results: one (runs, len(epsilons)) array per ccp_alpha value
tree_sps = []
tree_depths = []
estimated_sps = []
for ccp_alpha in tqdm(ccp_alphas):
    # the comma between t_depths and e_sps was missing, which made this cell a SyntaxError
    t_sps, t_depths, e_sps = experiment(adult_train_cat_X, adult_train_race, "adult_train_cat_X", adult_train_y, 
                                         adult_test_cat_X, adult_test_race, "adult_test_cat_X", adult_test_y, ccp_alpha=ccp_alpha, runs=runs)
    
    tree_sps.append(t_sps)
    tree_depths.append(t_depths)
    estimated_sps.append(e_sps)
    

In [187]:
# stack the per-ccp_alpha results into single arrays
tree_sps = np.array(tree_sps)
estimated_sps = np.array(estimated_sps)
tree_depths = np.array(tree_depths)

# write every array to its own file (np.save format, file name without extension)
for arr, name in zip([tree_sps, tree_depths, estimated_sps], ["tree_sps", "tree_depths", "estimated_sps"]):
    with open(f"adult-race-ccp-{name}", "wb") as f:
        np.save(f, arr)

In [None]:
# ccp_alpha sweep with the intersectional sex-race attribute: 80 values from 0.05 down to 0.001, 50 runs each
ccp_alphas = np.linspace(0.05, 0.001, 80)
runs = 50

# storage for results: one (runs, len(epsilons)) array per ccp_alpha value
tree_sps = []
tree_depths = []
estimated_sps = []
for ccp_alpha in tqdm(ccp_alphas):
    t_sps, t_depths, e_sps = experiment(adult_train_cat_X, adult_train_sex_race, "adult_train_cat_X", adult_train_y, 
                                         adult_test_cat_X, adult_test_sex_race, "adult_test_cat_X", adult_test_y, ccp_alpha=ccp_alpha, runs=runs)
    
    tree_sps.append(t_sps)
    tree_depths.append(t_depths)
    estimated_sps.append(e_sps)
    

In [None]:
# stack the per-ccp_alpha results into single arrays
tree_sps = np.array(tree_sps)
estimated_sps = np.array(estimated_sps)
tree_depths = np.array(tree_depths)

# write every array to its own file (np.save format, file name without extension)
for arr, name in zip([tree_sps, tree_depths, estimated_sps], ["tree_sps", "tree_depths", "estimated_sps"]):
    with open(f"adult-sex_race-ccp-{name}", "wb") as f:
        np.save(f, arr)

## Preliminary Experiments

In [None]:
# minleafs = np.linspace(0.2, 0.001, 80)
# runs = 50

# # storage for results
# tree_sps = []
# estimated_sps = []
# for minleaf in tqdm(minleafs):
#     t_sps, e_sps = experiment(adult_train_cat_X, adult_train_race, "adult_train_cat_X", adult_train_y, 
#                                          adult_test_cat_X, adult_test_race, "adult_test_cat_X", adult_test_y, minleaf=minleaf, runs=runs)
    
#     tree_sps.append(t_sps)
#     estimated_sps.append(e_sps)
    

In [57]:
# tree_sps = np.array(tree_sps)
# estimated_sps = np.array(estimated_sps)

# for arr, name in zip([tree_sps, estimated_sps], ["tree_sps", "estimated_sps"]):
#     with open(f"adult-race-{name}-prelim", "wb") as f:
#         np.save(f, arr)

In [59]:
# estimated_sps

array([[[0.56158666, 0.67429807, 0.62729312, 0.63788161, 0.6258125 ],
        [0.68767384, 0.75624062, 0.69611309, 0.6905098 , 0.69287197],
        [0.59128158, 0.58010349, 0.68337842, 0.64348167, 0.58882402],
        [0.56643611, 0.61609449, 0.58226534, 0.64314025, 0.65472144],
        [0.5695656 , 0.71448554, 0.63400735, 0.63285944, 0.63195388],
        [0.67738089, 0.59323878, 0.64393849, 0.68199716, 0.61076666],
        [0.70573709, 0.62834749, 0.61471532, 0.62465847, 0.62974223],
        [0.63612126, 0.65199927, 0.66953198, 0.69131622, 0.67930011],
        [0.62331176, 0.64760974, 0.68038415, 0.70384549, 0.67939058],
        [0.63037357, 0.62012692, 0.58502845, 0.62239223, 0.63153792],
        [0.7181779 , 0.617675  , 0.64051758, 0.59238577, 0.59617007],
        [0.64629109, 0.64611606, 0.69280751, 0.68660065, 0.69228723],
        [0.64704817, 0.69047115, 0.61983063, 0.56585228, 0.68647668],
        [0.68014494, 0.6878075 , 0.65954098, 0.66484317, 0.63556792],
        [0.5848538 ,