In [8]:
import os
import pandas as pd
import numpy as np
import random
import json

# Seed Python's built-in `random` generator so any later use of it is repeatable.
# NOTE(review): no visible cell draws from `random`; the train/test split below
# is made reproducible by its own random_state=42 argument instead.
random.seed(42)

# Pre-Process Income

In [9]:
def pre_process_income(df):
    """
    One-hot encode the categorical columns and min-max scale the continuous ones.

    Args:
        df: pandas DataFrame with the income data. It must contain the
            categorical columns "COW" and "SCHL" and the continuous columns
            "AGEP", "WKHP", "OCCP", "POBP" and "RELP". Every other column
            is passed through untouched.

    Returns:
        A new pandas DataFrame in which
        - "COW" and "SCHL" are replaced by integer 0/1 indicator columns;
        - each continuous column is rescaled to [0, 1] using the minimum and
          maximum of that column in ``df``. A column whose values are all
          equal is mapped to 0.0 (instead of the NaN produced by 0/0).
        The frame passed in is not modified.
    """
    categorical_columns = ["COW", "SCHL"]  # , "RAC1P"]
    continuous_columns = ["AGEP", "WKHP", "OCCP", "POBP", "RELP"]

    # get_dummies returns a new frame, so the assignments below never touch
    # the caller's data.
    df = pd.get_dummies(df, columns=categorical_columns, dtype=int)

    # normalize the continuous columns between 0 and 1
    for col in continuous_columns:
        col_min = df[col].min()
        col_range = df[col].max() - col_min
        shifted = df[col] - col_min
        if col_range == 0:
            # Constant column: every shifted value is already 0 (missing
            # values stay missing); dividing by the zero range would give NaN.
            df[col] = shifted.astype(float)
        else:
            df[col] = shifted / col_range

    return df


def pre_process_single_datasets(df):
    """
    Split one pre-processed frame into feature, label and group arrays.

    Args:
        df: pandas DataFrame containing at least the columns ">50K" (target)
            and "SEX", "MAR", "RAC1P" (sensitive attributes).

    Returns:
        A tuple ``(dataframes, labels, groups, second_groups, third_groups)``
        of five single-element lists; element 0 of each is a NumPy array:
        - dataframes[0]: every column of ``df`` except ">50K" (the sensitive
          columns are kept among the features);
        - labels[0]: ">50K" cast to int, one column;
        - groups[0]: 1 where "SEX" == 1 and 0 otherwise, one column;
        - second_groups[0]: 1 where "MAR" == 1 and 0 otherwise, one column;
        - third_groups[0]: "RAC1P" cast to int (not binarised), one column.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    label = df[">50K"].astype(int).to_frame()
    # Binary indicators; a missing value compares unequal to 1 and becomes 0.
    group = (df["SEX"] == 1).astype(int).to_frame()
    second_group = (df["MAR"] == 1).astype(int).to_frame()
    third_group = df["RAC1P"].astype(int).to_frame()

    # Only the target is removed from the feature matrix.
    dataframe = df.drop(columns=[">50K"])

    # All five outputs must describe the same rows (third_group included).
    assert (
        len(dataframe)
        == len(label)
        == len(group)
        == len(second_group)
        == len(third_group)
    )

    # Single-element lists are kept because callers read element [0].
    return (
        [dataframe.to_numpy()],
        [label.to_numpy()],
        [group.to_numpy()],
        [second_group.to_numpy()],
        [third_group.to_numpy()],
    )

In [10]:
from sklearn.model_selection import train_test_split
# Run configuration. With cross_silo=True every CSV read below is split 80/20
# into a train and a test frame; otherwise each CSV is used as-is.
cross_silo = True
# folder = "../data/cross_device_attribute_final/"
# folder = "../data/cross_device_value_final/"
folder = "../data/cross_silo_attribute_final/"
# folder = "../data/cross_silo_value_final/"
# List the folder in pure Python rather than through `!ls`, so the cell also
# runs outside IPython and on systems without a POSIX shell. Hidden entries are
# skipped and the names sorted to mirror what `ls` prints.
list_files = sorted(name for name in os.listdir(folder) if not name.startswith("."))
unfair_dfs = []
print(list_files)

# Two-letter codes processed below, in the order in which their partitions are
# loaded and numbered (51 entries).
states = [
    'CT', 'RI', 'VT', 'TX', 'GA', 'PR', 'OH', 'NE', 'HI', 'MO',
    'PA', 'DE', 'WV', 'MD', 'AZ', 'LA', 'WA', 'TN', 'MA', 'NJ',
    'ME', 'SC', 'MI', 'OK', 'IL', 'FL', 'UT', 'AK', 'WI', 'NH',
    'VA', 'SD', 'MS', 'ND', 'NC', 'AL', 'IA', 'ID', 'WY', 'NV',
    'NM', 'NY', 'CA', 'AR', 'MN', 'OR', 'MT', 'KY', 'KS', 'IN',
    'CO',
]

# Names ("<state>_<partition>") of the partitions that were actually loaded,
# in the same order as their frames are appended to unfair_dfs.
partitions_names = []

for state in states:
    partitions = set()
    for file in list_files:
        if not file.endswith(".csv"):
            continue
        # Partition number = last "_"-separated token of the file name.
        # NOTE(review): the number is taken from every CSV in the folder, not
        # only from this state's files; a state lacking that partition simply
        # ends up in the error branch below.
        partition = int(file.split("_")[-1].split(".")[0])
        if partition in partitions:
            continue
        partitions.add(partition)
        try:
            train = pd.read_csv(f"{folder}{state}_{partition}.csv")
            # split the train csv into train and test
            if cross_silo:
                train, test = train_test_split(train, test_size=0.2, random_state=42)
        except Exception as err:
            # Best effort: report and skip this partition. `Exception` (not a
            # bare except) lets Ctrl-C / kernel interrupts through.
            print(f"Error reading file {state}_{partition}.csv: {err}")
            continue

        # Record the name only after a successful load, so partitions_names
        # stays aligned with the frames in unfair_dfs (and with the client
        # indices derived from them later).
        partitions_names.append(f"{state}_{partition}")
        unfair_dfs.append(train)
        if cross_silo:
            unfair_dfs.append(test)

partitions_names

['AK_0.csv', 'AL_0.csv', 'AR_0.csv', 'AZ_0.csv', 'CA_0.csv', 'CO_0.csv', 'CT_0.csv', 'DE_0.csv', 'FL_0.csv', 'FL_data', 'GA_0.csv', 'HI_0.csv', 'IA_0.csv', 'ID_0.csv', 'IL_0.csv', 'IN_0.csv', 'KS_0.csv', 'KY_0.csv', 'LA_0.csv', 'MA_0.csv', 'MD_0.csv', 'ME_0.csv', 'MI_0.csv', 'MN_0.csv', 'MO_0.csv', 'MS_0.csv', 'MT_0.csv', 'NC_0.csv', 'ND_0.csv', 'NE_0.csv', 'NH_0.csv', 'NJ_0.csv', 'NM_0.csv', 'NV_0.csv', 'NY_0.csv', 'OH_0.csv', 'OK_0.csv', 'OR_0.csv', 'PA_0.csv', 'PR_0.csv', 'RI_0.csv', 'SC_0.csv', 'SD_0.csv', 'TN_0.csv', 'TX_0.csv', 'UT_0.csv', 'VA_0.csv', 'VT_0.csv', 'WA_0.csv', 'WI_0.csv', 'WV_0.csv', 'WY_0.csv']


['CT_0',
 'RI_0',
 'VT_0',
 'TX_0',
 'GA_0',
 'PR_0',
 'OH_0',
 'NE_0',
 'HI_0',
 'MO_0',
 'PA_0',
 'DE_0',
 'WV_0',
 'MD_0',
 'AZ_0',
 'LA_0',
 'WA_0',
 'TN_0',
 'MA_0',
 'NJ_0',
 'ME_0',
 'SC_0',
 'MI_0',
 'OK_0',
 'IL_0',
 'FL_0',
 'UT_0',
 'AK_0',
 'WI_0',
 'NH_0',
 'VA_0',
 'SD_0',
 'MS_0',
 'ND_0',
 'NC_0',
 'AL_0',
 'IA_0',
 'ID_0',
 'WY_0',
 'NV_0',
 'NM_0',
 'NY_0',
 'CA_0',
 'AR_0',
 'MN_0',
 'OR_0',
 'MT_0',
 'KY_0',
 'KS_0',
 'IN_0',
 'CO_0']

In [11]:
# Number of frames collected by the loading loop; with cross_silo=True every
# successfully read CSV contributes two frames (train, then test).
len(unfair_dfs)

102

In [12]:
# Stack all collected frames into one table with a fresh 0..n-1 index.
concatenated_df = pd.concat(unfair_dfs, ignore_index=True)

# Turn the target into 0/1 (1 where PINCP equals True, 0 for anything else),
# then expose it under the name ">50K".
concatenated_df["PINCP"] = concatenated_df["PINCP"].eq(True).astype("int64")
concatenated_df = concatenated_df.rename(columns={"PINCP": ">50K"})

In [13]:
# Inspect which distinct RAC1P values occur in the concatenated data.
concatenated_df["RAC1P"].unique()

array([1., 2.])

In [14]:
# Remove the "__index_level_0__" helper column
# (presumably a serialized pandas index carried over from the source files — confirm).
# Reassigning with errors="ignore" keeps the cell re-runnable: the original
# in-place drop raised KeyError on a second execution, once the column was gone.
concatenated_df = concatenated_df.drop(columns=["__index_level_0__"], errors="ignore")
# concatenated_df.drop(["__index_level_0__", "Unnamed: 0", "Unnamed: 0.1"], axis=1, inplace=True)

In [15]:
# Display the concatenated frame (pandas truncates the rendering to the first and last rows).
concatenated_df

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,>50K
0,66.0,1.0,16.0,1.0,7315.0,9.0,0.0,35.0,1.0,1.0,1
1,52.0,1.0,17.0,2.0,5610.0,53.0,0.0,6.0,2.0,1.0,0
2,63.0,1.0,20.0,1.0,3620.0,301.0,1.0,40.0,2.0,1.0,1
3,64.0,1.0,16.0,2.0,4030.0,9.0,5.0,32.0,2.0,1.0,0
4,26.0,1.0,24.0,2.0,3050.0,9.0,0.0,37.0,2.0,2.0,1
...,...,...,...,...,...,...,...,...,...,...,...
1626653,46.0,3.0,22.0,1.0,4252.0,24.0,0.0,60.0,1.0,1.0,1
1626654,20.0,3.0,19.0,2.0,2310.0,303.0,13.0,20.0,1.0,1.0,0
1626655,50.0,1.0,20.0,2.0,4720.0,2.0,13.0,40.0,1.0,1.0,1
1626656,21.0,4.0,19.0,2.0,2435.0,53.0,0.0,17.0,2.0,1.0,0


In [16]:
# Row count of the first collected frame (with cross_silo=True: the train part of the first loaded partition).
len(unfair_dfs[0])

15828

In [17]:
# Row count of the second collected frame (with cross_silo=True: the test part of the first loaded partition).
len(unfair_dfs[1])

3957

In [18]:
# Encode and scale the whole table at once, so every partition ends up with
# the same indicator columns and the same scaling.
pre_processed_df = pre_process_income(concatenated_df)

# Cut the processed table back into the original frames: frame k occupies the
# rows between the cumulative row counts k and k + 1.
row_counts = [len(frame) for frame in unfair_dfs]
boundaries = np.cumsum([0] + row_counts)
split_dfs = [
    pre_processed_df.iloc[first_row:stop_row]
    for first_row, stop_row in zip(boundaries[:-1], boundaries[1:])
]

In [19]:
# Sanity check: one slice per collected frame, so this should equal len(unfair_dfs).
print(len(split_dfs))

102


In [20]:
# Persist every client's arrays as .npy files under
# {folder}FL_data/federated/<client_id>/ and write the partition-name mapping
# next to them.
federated_dir = f"{folder}FL_data/federated"


def save_client_arrays(output_dir, client_id, split_name, arrays):
    """
    Write the five arrays of one client split to ``output_dir/client_id/``.

    Args:
        output_dir: directory that holds one sub-directory per client.
        client_id: integer used for the sub-directory and in the file names.
        split_name: suffix of the file names ("train" or "test").
        arrays: the five single-element lists returned by
            pre_process_single_datasets, in that order.
    """
    array_kinds = ("dataframes", "labels", "groups", "second_groups", "third_groups")
    client_dir = f"{output_dir}/{client_id}"
    os.makedirs(client_dir, exist_ok=True)
    for kind, single_array_list in zip(array_kinds, arrays):
        np.save(
            f"{client_dir}/income_{kind}_{client_id}_{split_name}.npy",
            single_array_list[0],
        )


if split_dfs:
    # save partitions_names once (it was rewritten on every loop iteration);
    # integer positions become string keys in the JSON file.
    os.makedirs(federated_dir, exist_ok=True)
    json_file = dict(enumerate(partitions_names))
    with open(f"{federated_dir}/partitions_names.json", "w") as f:
        json.dump(json_file, f)

if cross_silo:
    # Frames come in (train, test) pairs, so client k uses slices 2k and 2k + 1.
    for index in range(0, len(split_dfs), 2):
        train_state = split_dfs[index]
        test_state = split_dfs[index + 1]
        print(len(train_state), len(test_state))
        (
            train_data,
            train_labels,
            train_groups,
            train_second_groups,
            train_third_groups,
        ) = pre_process_single_datasets(train_state)
        (
            test_data,
            test_labels,
            test_groups,
            test_second_groups,
            test_third_groups,
        ) = pre_process_single_datasets(test_state)

        print(index // 2, train_data[0].shape, test_data[0].shape)

        save_client_arrays(
            federated_dir,
            index // 2,
            "train",
            (train_data, train_labels, train_groups, train_second_groups, train_third_groups),
        )
        save_client_arrays(
            federated_dir,
            index // 2,
            "test",
            (test_data, test_labels, test_groups, test_second_groups, test_third_groups),
        )
else:
    # One frame per client, train files only.
    for index in range(0, len(split_dfs)):
        train_state = split_dfs[index]
        (
            train_data,
            train_labels,
            train_groups,
            train_second_groups,
            train_third_groups,
        ) = pre_process_single_datasets(train_state)

        print(index, train_data[0].shape)

        save_client_arrays(
            federated_dir,
            index,
            "train",
            (train_data, train_labels, train_groups, train_second_groups, train_third_groups),
        )



15828 3957
0 (15828, 40) (3957, 40)
4569 1143
1 (4569, 40) (1143, 40)
3008 753
2 (3008, 40) (753, 40)
102981 25746
3 (102981, 40) (25746, 40)
40378 10095
4 (40378, 40) (10095, 40)
7092 1774
5 (7092, 40) (1774, 40)
46666 11667
6 (46666, 40) (11667, 40)
8628 2157
7 (8628, 40) (2157, 40)
6184 1547
8 (6184, 40) (1547, 40)
23646 5912
9 (23646, 40) (5912, 40)
51803 12951
10 (51803, 40) (12951, 40)
3660 916
11 (3660, 40) (916, 40)
6245 1562
12 (6245, 40) (1562, 40)
26433 6609
13 (26433, 40) (6609, 40)
26621 6656
14 (26621, 40) (6656, 40)
16354 4089
15 (16354, 40) (4089, 40)
30444 7612
16 (30444, 40) (7612, 40)
25729 6433
17 (25729, 40) (6433, 40)
32091 8023
18 (32091, 40) (8023, 40)
38224 9557
19 (38224, 40) (9557, 40)
5601 1401
20 (5601, 40) (1401, 40)
19903 4976
21 (19903, 40) (4976, 40)
37666 9417
22 (37666, 40) (9417, 40)
13848 3462
23 (13848, 40) (3462, 40)
51248 12812
24 (51248, 40) (12812, 40)
79140 19785
25 (79140, 40) (19785, 40)
13069 3268
26 (13069, 40) (3268, 40)
2836 710
27 (2836

In [21]:
# Shape of the training feature array left over from the final iteration of the save loop above.
train_data[0].shape

(25044, 40)

In [22]:
import numpy as np
# Read back client 0's saved training feature matrix to eyeball what was written.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# run arbitrary code from an untrusted file. The arrays saved above come from
# DataFrame.to_numpy(), so it is presumably unnecessary here — confirm and drop.
np.load(f"{folder}FL_data/federated/0/income_dataframes_0_train.npy", allow_pickle=True)

array([[0.62025316, 1.        , 0.74389002, ..., 0.        , 0.        ,
        0.        ],
       [0.44303797, 2.        , 0.57026477, ..., 0.        , 0.        ,
        0.        ],
       [0.58227848, 1.        , 0.36761711, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.4556962 , 1.        , 0.33044807, ..., 0.        , 0.        ,
        0.        ],
       [0.3164557 , 1.        , 0.12321792, ..., 0.        , 0.        ,
        0.        ],
       [0.63291139, 1.        , 0.31364562, ..., 0.        , 1.        ,
        0.        ]])

In [23]:
import numpy as np
# Read back client 0's saved training labels (one 0/1 value per row).
# NOTE(review): allow_pickle=True is presumably unnecessary for this numeric array — confirm.
np.load(f"{folder}FL_data/federated/0/income_labels_0_train.npy", allow_pickle=True)

array([[1],
       [0],
       [1],
       ...,
       [0],
       [1],
       [1]])

In [24]:
import numpy as np
# Read back client 0's saved training "groups" array (1 where SEX == 1, else 0).
# NOTE(review): allow_pickle=True is presumably unnecessary for this numeric array — confirm.
np.load(f"{folder}FL_data/federated/0/income_groups_0_train.npy", allow_pickle=True)

array([[1],
       [0],
       [0],
       ...,
       [0],
       [1],
       [1]])

In [25]:
import numpy as np
# Read back client 0's saved training "second groups" array (1 where MAR == 1, else 0).
# NOTE(review): allow_pickle=True is presumably unnecessary for this numeric array — confirm.
np.load(f"{folder}FL_data/federated/0/income_second_groups_0_train.npy", allow_pickle=True)

array([[1],
       [0],
       [1],
       ...,
       [1],
       [1],
       [1]])

In [26]:
import numpy as np
# Load client 0's saved training "third groups" array (RAC1P cast to int) for the check in the next cell.
# NOTE(review): allow_pickle=True is presumably unnecessary for this numeric array — confirm.
third = np.load(f"{folder}FL_data/federated/0/income_third_groups_0_train.npy", allow_pickle=True)

In [27]:
# Distinct values stored in the third-group array. `third` itself is left
# untouched: the original rebound it to a flat list, so running the cell a
# second time failed when indexing the now-scalar items.
set(np.ravel(third).tolist())

{1, 2}

In [28]:
import numpy as np
# Show client 0's saved test-split second-group array (test files exist only
# when cross_silo is set). Written as a conditional expression because a call
# nested inside an `if` block is not echoed by the notebook, so the original
# cell displayed nothing.
np.load(f"{folder}FL_data/federated/0/income_second_groups_0_test.npy", allow_pickle=True) if cross_silo else None

In [29]:
import numpy as np
# Show client 0's saved training third-group array when cross_silo is set.
# Written as a conditional expression because a call nested inside an `if`
# block is not echoed by the notebook, so the original cell displayed nothing.
np.load(f"{folder}FL_data/federated/0/income_third_groups_0_train.npy", allow_pickle=True) if cross_silo else None

In [30]:
import numpy as np
# Test files are only written in cross-silo mode, hence the guard; when
# cross_silo is False, `third` keeps whatever an earlier cell assigned to it.
if cross_silo:
    # NOTE(review): allow_pickle=True is presumably unnecessary for this numeric array — confirm.
    third = np.load(f"{folder}FL_data/federated/0/income_third_groups_0_test.npy", allow_pickle=True)

In [31]:
import numpy as np
# Show client 0's saved test-split group array (test files exist only when
# cross_silo is set). Written as a conditional expression because a call
# nested inside an `if` block is not echoed by the notebook, so the original
# cell displayed nothing.
np.load(f"{folder}FL_data/federated/0/income_groups_0_test.npy", allow_pickle=True) if cross_silo else None

In [32]:
import numpy as np
# Show client 0's saved test-split labels (test files exist only when
# cross_silo is set). Written as a conditional expression because a call
# nested inside an `if` block is not echoed by the notebook, so the original
# cell displayed nothing.
np.load(f"{folder}FL_data/federated/0/income_labels_0_test.npy", allow_pickle=True) if cross_silo else None

# Everything below is exploratory test code for the fairness metrics

In [None]:
from fairfl_data.fairness_computation import _compute_fairness

In [None]:
import pandas as pd
from sklearn.metrics import confusion_matrix
import numpy as np

def equalized_odds_difference_by_outcome(y_true, y_pred, sensitive_features):
    """
    Calculates the equalized odds difference, considering outcomes separately for each group.

    For every sensitive group the confusion matrix of its samples gives the
    TPR and FPR for outcome 1 and, when ``y_true`` contains two distinct
    values, the TNR and FNR as the corresponding rates for outcome 0. For
    each outcome the spread (max - min) of each rate across groups is taken,
    and the largest spread is returned.

    Parameters:
    y_true (array-like): True labels, expected to be 0/1.
    y_pred (array-like): Predicted labels, expected to be 0/1.
    sensitive_features (array-like or pd.Series): Group membership of each sample.

    Returns:
    float: The equalized odds difference (0.0 for empty input).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if not isinstance(sensitive_features, pd.Series):
        sensitive_features = pd.Series(sensitive_features)

    unique_groups = sensitive_features.unique()
    possible_outcomes = np.unique(y_true)

    tpr_values = {group: {} for group in unique_groups}
    fpr_values = {group: {} for group in unique_groups}

    for group in unique_groups:
        # Positional boolean mask: stays correct when sensitive_features is a
        # Series whose index is not 0..n-1 (index labels are not positions).
        group_mask = (sensitive_features == group).to_numpy()

        # labels=[0, 1] forces a 2x2 matrix even if this group contains a
        # single class; without it the 4-way unpacking below would fail.
        tn, fp, fn, tp = confusion_matrix(
            y_true[group_mask], y_pred[group_mask], labels=[0, 1]
        ).ravel()
        tpr_values[group][1] = tp / (tp + fn) if (tp + fn) > 0 else 0  # TPR for outcome 1
        fpr_values[group][1] = fp / (fp + tn) if (fp + tn) > 0 else 0  # FPR for outcome 1
        # For binary labels also report the rates with outcome 0 as the
        # "positive" class; they follow from the same four counts.
        if len(possible_outcomes) == 2:
            tpr_values[group][0] = tn / (tn + fp) if (tn + fp) > 0 else 0  # TNR (TPR for outcome 0)
            fpr_values[group][0] = fn / (fn + tp) if (fn + tp) > 0 else 0  # FNR (FPR for outcome 0)

    tpr_diffs = []
    fpr_diffs = []

    for outcome in possible_outcomes:
        tprs_outcome = [tpr_values[group].get(outcome, 0) for group in unique_groups]
        fprs_outcome = [fpr_values[group].get(outcome, 0) for group in unique_groups]

        tpr_diffs.append(max(tprs_outcome) - min(tprs_outcome))
        fpr_diffs.append(max(fprs_outcome) - min(fprs_outcome))

    if not tpr_diffs:
        # No samples at all: there is nothing to compare.
        return 0.0

    return max(abs(max(tpr_diffs)), abs(max(fpr_diffs)))

# Toy example: ten samples, group 0 = first four, group 1 = last six.
y_true = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1])
y_pred = np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1])
sensitive_features = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

# Worst-case aggregation
eo_diff_worst = equalized_odds_difference_by_outcome(y_true, y_pred, sensitive_features)
print(f"Equalized Odds Difference (by outcome, worst_case): {eo_diff_worst:.4f}")

In [None]:
# Confusion matrix over all samples of the toy example (no grouping), for comparison with the per-group rates.
confusion_matrix(y_true, y_pred)

In [None]:
import pandas as pd
from sklearn.metrics import confusion_matrix
import numpy as np

def equalized_odds_disparity_details(y_true, y_pred, sensitive_features):
    """
    Calculates the equalized odds difference and provides details on group-wise disparities.

    Parameters:
    y_true (array-like): True labels, expected to be 0/1.
    y_pred (array-like): Predicted labels, expected to be 0/1.
    sensitive_features (array-like or pd.Series): Group membership of each sample.

    Returns:
    dict: A dictionary containing:
        - 'equalized_odds_difference': The overall equalized odds difference.
        - 'group_metrics': TPR and FPR for each sensitive group and outcome
          (for outcome 0 these are the TNR and FNR).
        - 'max_tpr_disparity': outcome, (groups at the minimum, groups at the
          maximum) and size of the largest TPR difference.
        - 'max_fpr_disparity': the same for the largest FPR difference.
        When no difference is larger than 0, 'outcome' and 'groups' are None.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if not isinstance(sensitive_features, pd.Series):
        sensitive_features = pd.Series(sensitive_features)

    unique_groups = sorted(sensitive_features.unique())
    possible_outcomes = np.unique(y_true)

    group_metrics = {}
    for group in unique_groups:
        # Positional boolean mask: stays correct when sensitive_features is a
        # Series whose index is not 0..n-1 (index labels are not positions).
        group_mask = (sensitive_features == group).to_numpy()

        # labels=[0, 1] forces a 2x2 matrix even if this group contains a
        # single class; without it the 4-way unpacking below would fail.
        tn, fp, fn, tp = confusion_matrix(
            y_true[group_mask], y_pred[group_mask], labels=[0, 1]
        ).ravel()
        group_metrics[group] = {
            1: {'TPR': tp / (tp + fn) if (tp + fn) > 0 else 0,
                'FPR': fp / (fp + tn) if (fp + tn) > 0 else 0}
        }
        if len(possible_outcomes) == 2:
            # Rates with outcome 0 as the "positive" class, from the same counts.
            group_metrics[group][0] = {'TPR': tn / (tn + fp) if (tn + fp) > 0 else 0,  # TNR
                                       'FPR': fn / (fn + tp) if (fn + tp) > 0 else 0}  # FNR

    max_tpr_diff = 0
    max_tpr_groups = None
    max_tpr_outcome = None

    max_fpr_diff = 0
    max_fpr_groups = None
    max_fpr_outcome = None

    for outcome in possible_outcomes:
        tprs = {group: group_metrics[group].get(outcome, {}).get('TPR', 0) for group in unique_groups}
        fprs = {group: group_metrics[group].get(outcome, {}).get('FPR', 0) for group in unique_groups}

        lowest_tpr, highest_tpr = min(tprs.values()), max(tprs.values())
        current_tpr_diff = highest_tpr - lowest_tpr
        if abs(current_tpr_diff) > abs(max_tpr_diff):
            max_tpr_diff = current_tpr_diff
            min_tpr_group = [g for g, tpr in tprs.items() if tpr == lowest_tpr]
            max_tpr_group = [g for g, tpr in tprs.items() if tpr == highest_tpr]
            max_tpr_groups = (min_tpr_group, max_tpr_group)
            max_tpr_outcome = outcome

        lowest_fpr, highest_fpr = min(fprs.values()), max(fprs.values())
        current_fpr_diff = highest_fpr - lowest_fpr
        if abs(current_fpr_diff) > abs(max_fpr_diff):
            max_fpr_diff = current_fpr_diff
            min_fpr_group = [g for g, fpr in fprs.items() if fpr == lowest_fpr]
            max_fpr_group = [g for g, fpr in fprs.items() if fpr == highest_fpr]
            max_fpr_groups = (min_fpr_group, max_fpr_group)
            max_fpr_outcome = outcome

    overall_eo_diff = max(abs(max_tpr_diff), abs(max_fpr_diff))

    return {
        'equalized_odds_difference': overall_eo_diff,
        'group_metrics': group_metrics,
        'max_tpr_disparity': {
            'outcome': max_tpr_outcome,
            'groups': max_tpr_groups,
            'difference': max_tpr_diff
        },
        'max_fpr_disparity': {
            'outcome': max_fpr_outcome,
            'groups': max_fpr_groups,
            'difference': max_fpr_diff
        }
    }

# Same toy example as above: group 0 = first four samples, group 1 = last six.
y_true = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1])
y_pred = np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1])
sensitive_features = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

disparity_analysis = equalized_odds_disparity_details(y_true, y_pred, sensitive_features)
print(f"Equalized Odds Difference: {disparity_analysis['equalized_odds_difference']:.4f}")
print("\nGroup-wise Metrics:")
# NOTE(review): this prints the whole result dict, not only its 'group_metrics' entry.
print(disparity_analysis)

In [None]:
import numpy as np
import pandas as pd

def demographic_parity_difference_by_outcome(y_pred, sensitive_features):
    """
    Computes the demographic parity difference as:
    max(P(Ŷ=y | A=group)) - min(P(Ŷ=y | A=group))
    for each outcome y, and returns the maximum across outcomes,
    along with the (outcome, group_max, group_min) responsible.

    Parameters:
    y_pred (array-like): Predicted labels (can be binary or multiclass).
    sensitive_features (array-like or pd.Series): Sensitive feature defining the groups.

    Returns:
    float: The maximum demographic parity difference across outcomes.
    tuple: (outcome, group_max, group_min) responsible for this max difference;
           (None, None, None) when no outcome shows a difference above 0.
    """
    y_pred = np.asarray(y_pred)
    if not isinstance(sensitive_features, pd.Series):
        sensitive_features = pd.Series(sensitive_features)

    unique_groups = sensitive_features.unique()
    unique_outcomes = np.unique(y_pred)

    # Positional boolean masks, built once per group. They stay correct when
    # sensitive_features is a Series whose index is not 0..n-1, whereas
    # indexing y_pred with index labels would pick the wrong rows or fail.
    group_masks = {
        group: (sensitive_features == group).to_numpy() for group in unique_groups
    }

    max_diff = 0.0
    responsible_info = (None, None, None)  # (outcome, group_max, group_min)

    for outcome in unique_outcomes:
        # Share of each group's predictions that equal this outcome.
        outcome_probs = {
            group: np.mean(y_pred[mask] == outcome)
            for group, mask in group_masks.items()
        }

        group_max = max(outcome_probs, key=outcome_probs.get)
        group_min = min(outcome_probs, key=outcome_probs.get)
        diff = outcome_probs[group_max] - outcome_probs[group_min]

        # Strict comparison: on ties the first outcome found is kept.
        if diff > max_diff:
            max_diff = diff
            responsible_info = (outcome, group_max, group_min)

    return max_diff, responsible_info


In [None]:
# Toy example with three groups (A, B, C).
y_pred = np.array([1, 0, 1, 1, 0, 0, 1])
sensitive = np.array(['A', 'A', 'B', 'B', 'B', 'C', 'C'])

# Use the function defined above; it returns (difference, (outcome, group_max,
# group_min)). The previous call targeted `demographic_disparity_by_group`,
# which is not defined in the visible notebook, and unpacked only two groups.
disparity, (outcome, group1, group2) = demographic_parity_difference_by_outcome(y_pred, sensitive)
print(
    f"Max demographic disparity: {disparity:.3f} for outcome {outcome} "
    f"between groups {group1} and {group2}"
)


In [None]:
from itertools import product
import numpy as np 
import pandas as pd
from fairlearn.metrics import MetricFrame, selection_rate, true_positive_rate, false_positive_rate
# Toy data: seven samples in three groups; predictions equal the true labels.
y_pred = np.array([1, 0, 1, 1, 0, 0, 1])
sensitive = np.array(['A', 'A', 'B', 'B', 'B', 'C', 'C'])
y_true = np.array([1, 0, 1, 1, 0, 0, 1])

sf_data = pd.DataFrame({"DP_RACE": sensitive})

# Selection rate per sensitive group.
sel_rate = MetricFrame(
    metrics={"sel": selection_rate},
    y_true=y_true,
    y_pred=y_pred,
    sensitive_features=sf_data,
)
df = sel_rate.by_group

# Entry (i, j) of the matrix is rate[i] - rate[j]; once flattened row by row,
# each value is labelled "<group i>_<group j>".
group_rates = df['sel'].values
diff_matrix = group_rates[:, None] - group_rates[None, :]
index = df.index.values
column_names = [f"{first}_{second}" for first, second in product(index, repeat=2)]

pairwise_diffs = pd.Series(diff_matrix.flatten(), index=column_names)
# Keep only the largest difference and the label of the pair producing it.
# NOTE(review): the labels say "SEX" although the feature above is "DP_RACE" — confirm the intended name.
diff_df = pd.Series(
    [pairwise_diffs.max(), pairwise_diffs.idxmax()],
    index=["DP_SEX_DP", "DP_SEX_val"],
)
diff_df

In [None]:
# Second group of the "<group i>_<group j>" pair label. The label sits at
# position 1 of a Series indexed by strings, so select it with .iloc: an
# integer key on a label-indexed Series relies on a deprecated positional fallback.
diff_df.iloc[1].split("_")[1]