# Bias and Fairness Assessment (Binary Classification: Adult Income)

### Step 1: Install and import dependencies


In [1]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [2]:
# fetch dataset
adult = fetch_ucirepo(id=2)
adult = adult.data.features.join(adult.data.targets, how="inner")

In [3]:
adult.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K


## Basic Preprocessing Steps

### 1. Drop missing values

In [4]:
# Drop missing values
adult.dropna(inplace=True)

### 2. Copy DataFrame for posterity

In [5]:
df = adult.copy()

In [6]:
adult["income"].value_counts()

income
<=50K     24720
<=50K.    11360
>50K       7841
>50K.      3700
Name: count, dtype: int64

### 3. Encode categorical variables

In [7]:
def outcome_merge(val):
    if val == "<=50K" or val == "<=50K.":
        return 0
    else:
        return 1

In [8]:
df["income"] = df["income"].apply(outcome_merge)

In [9]:
#  sex, count and percentages above_50k

income_by_sex = df.groupby("sex")["income"].agg(
    ["count", lambda x: (x.sum() / x.count()) * 100]
)
income_by_sex.columns = ["count", "percentage_above_50k"]
income_by_sex

Unnamed: 0_level_0,count,percentage_above_50k
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,15684,11.068605
Male,31937,30.701068


In [10]:
#  race, count and percentages above_50k

income_by_race = df.groupby("race")["income"].agg(
    ["count", lambda x: (x.sum() / x.count()) * 100]
)
income_by_race.columns = ["count", "percentage_above_50k"]
income_by_race

Unnamed: 0_level_0,count,percentage_above_50k
race,Unnamed: 1_level_1,Unnamed: 2_level_1
Amer-Indian-Eskimo,460,11.956522
Asian-Pac-Islander,1447,27.436075
Black,4535,12.238148
Other,393,12.468193
White,40786,25.707351


In [11]:
df['race'] = df['race'].replace("Amer-Indian-Eskimo", "Native American or Inuit")

### 4. Split the data

In [12]:
# Split data
X = df.drop("income", axis=1)
y = df["income"]

In [13]:
for col in X.columns:
    if isinstance(X[col], object):
        X[col] = X[col].astype("category")

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
y_train.value_counts()

income
0    28910
1     9186
Name: count, dtype: int64

## Train XGBoost Model

In [16]:
model = XGBClassifier(eval_metric="logloss", random_state=42, enable_categorical=True)
model.fit(X_train, y_train)

## Evaluate XGBoost Model

In [17]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.94      0.91      7170
           1       0.77      0.64      0.70      2355

    accuracy                           0.86      9525
   macro avg       0.83      0.79      0.81      9525
weighted avg       0.86      0.86      0.86      9525



# Bias and Fairness Analysis with EquiBoots

**Equiboots supports a point estimate fairness analysis on a model's operating point (e.g., optimal threshold) as well as on multiple bootstraps with replacement.**


To initialize an analysis with equiboots:

1. Define a fairness Dataframe with the variables of interest.
2. Initialize an equiboots object using:
    - Ground truth (y_true)
    - Model probabilities (y_prob)
    - Model predictions (y_pred)
3. Identify the columns/variables that we will be assessing (e.g., race, sex)

In [18]:
import equiboots as eqb

In [19]:
# get predictions and true values
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]
y_test = y_test.to_numpy()

X_test[['race', 'sex']] = X_test[['race', 'sex']].astype(str)

## Point Estimates

In [20]:
sensitive_features = ['race', 'sex']

fairness_df = X_test[sensitive_features].reset_index(drop=True)

eq = eqb.EquiBoots(y_true=y_test,
                   y_pred=y_pred,
                   y_prob=y_prob,
                   fairness_df=fairness_df,
                   fairness_vars=sensitive_features)

eq.grouper(groupings_vars=sensitive_features)

Groups created


In [25]:
sliced_race_data = eq.slicer("race")
sliced_sex_data = eq.slicer('sex')

race_metrics = eq.get_metrics(sliced_race_data)
sex_metrics = eq.get_metrics(sliced_sex_data)

In [28]:
race_metrics['White']

{'Accuracy': 0.8551959114139693,
 'Precision': 0.7661334804191947,
 'Recall': 0.6445475638051044,
 'F1 Score': 0.7001008064516129,
 'Specificity': 0.9300676232888009,
 'TP Rate': 0.6445475638051044,
 'FP Rate': 0.06993237671119908,
 'FN Rate': 0.3554524361948956,
 'TN Rate': 0.9300676232888009,
 'TP': 1389,
 'FP': 424,
 'FN': 766,
 'TN': 5639,
 'Prevalence': 0.26222925285957654,
 'Predicted Prevalence': 0.22061328790459966,
 'ROC AUC': 0.9159919453625562,
 'Average Precision Score': 0.8240788056151153,
 'Log Loss': 0.3114641062357465,
 'Brier Score': 0.09921996666452443,
 'Calibration AUC': 0.03331544146342353}

In [29]:
test_config = {
    "test_type": "chi_square",
    "alpha": 0.05,
    "adjust_method": "bonferroni",
    "confidence_level": 0.95,
    "classification_task": "binary_classification"
}

In [30]:
stat_test_result_race = eq.analyze_statistical_significance(race_metrics,
                                                            "race", 
                                                            test_config,)

In [31]:
stat_test_result_sex = eq.analyze_statistical_significance(sex_metrics,
                                                           "sex",
                                                           test_config)

In [32]:
stat_test_result_race

{'omnibus': StatTestResult(statistic=145.20877121510574, p_value=2.651224460871548e-24, is_significant=True, test_name='Chi-Square Test', critical_value=None, effect_size=0.07128584560455012, confidence_interval=None),
 'Black': StatTestResult(statistic=116.58109757901026, p_value=2.1020300396154496e-24, is_significant=True, test_name='Chi-Square Test', critical_value=None, effect_size=0.1131799538986941, confidence_interval=None),
 'Asian-Pac-Islander': StatTestResult(statistic=0.3473008882417876, p_value=1.0, is_significant=False, test_name='Chi-Square Test', critical_value=None, effect_size=None, confidence_interval=None),
 'Native American or Inuit': StatTestResult(statistic=12.762260179009127, p_value=0.02589991404363808, is_significant=True, test_name='Chi-Square Test', critical_value=None, effect_size=0.039172423973108376, confidence_interval=None),
 'Other': StatTestResult(statistic=17.99166167282302, p_value=0.0022079739629796633, is_significant=True, test_name='Chi-Square Tes

In [33]:
int_list = np.linspace(0, len(y_test), num=len(y_test), dtype=int).tolist()

In [34]:
eqb_bootstrap = eqb.EquiBoots(
    y_true=y_test,
    y_pred=y_pred,
    y_prob=y_prob,
    fairness_df=fairness_df,
    fairness_vars=['race'],
    seeds=int_list,
    reference_groups=['White'],
    task='binary_classification',
    bootstrap_flag=True,
    num_bootstraps=5001,
    boot_sample_size=len(y_test),
    stratify_by_outcome=True
)

In [35]:
eqb_bootstrap.grouper(groupings_vars=['race'])

Bootstrapping iterations: 100%|██████████| 5001/5001 [01:38<00:00, 50.89it/s]

Groups created





In [36]:
bootstrap_race_data = eqb_bootstrap.slicer("race")



In [37]:
boots_race_metrics = eqb_bootstrap.get_metrics(bootstrap_race_data)

Calculating metrics for each bootstrap:


100%|██████████| 5001/5001 [02:37<00:00, 31.81it/s]


In [39]:
diffs = eqb_bootstrap.calculate_differences(boots_race_metrics, "race")

In [40]:
eqb.find_group_thresholds(y_true=y_test,
                          y_prob=y_prob,
                          reference_group='White',
                          group_vec=fairness_df['race'],
                          threshold_range=[0.1, 0.9],
                          n_steps=100,
                          default_threshold=0.5)

{'Asian-Pac-Islander': 0.4878787878787879,
 'Black': 0.407070707070707,
 'Native American or Inuit': 0.3909090909090909,
 'Other': 0.17272727272727273,
 'White': 0.5}