# Race

In [93]:
# General imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Data processing
from sklearn.model_selection import train_test_split

# Models
# LightGBM is a gradient boosting framework that uses tree based learning algorithms
import lightgbm as lgb
from sklearn.calibration import CalibratedClassifierCV

# Fairlearn algorithms and utils
from fairlearn.postprocessing import ThresholdOptimizer
from fairlearn.reductions import GridSearch, EqualizedOdds

# Metrics
from fairlearn.metrics import (
    MetricFrame,
    selection_rate, demographic_parity_difference, demographic_parity_ratio,
    true_positive_rate, false_positive_rate, false_negative_rate,
    false_positive_rate_difference, false_negative_rate_difference,
    equalized_odds_difference)
from sklearn.metrics import balanced_accuracy_score, roc_auc_score

import functools

import sklearn.metrics as skm
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector as selector
from sklearn.pipeline import Pipeline

from fairlearn.metrics import MetricFrame
from fairlearn.metrics import selection_rate, count

In [45]:
# load the clean data
dc = 'data/Fairlearn_DC.csv'
dc = pd.read_csv(dc, sep = ',')
dc.head()

Unnamed: 0,ethnicity,race,gender,action_taken,preapproval_requested,loan_type,loan_purpose,interest_only_payment,balloon_payment,debt_to_income_ratio,age,income,loan_to_value_ratio
0,0,1,0,1,0,1,1,2,2,3,3,1,1
1,0,1,1,1,0,3,3,2,2,4,3,3,1
2,0,2,0,1,0,1,3,2,2,4,2,3,3
3,0,1,0,1,0,1,3,2,2,2,3,5,1
4,0,3,0,1,0,1,1,2,2,4,3,2,1


In [46]:
# load the clean data
fl = 'data/Fairlearn_FL_PortSL.csv'
fl = pd.read_csv(fl, sep = ',')
fl.head()

Unnamed: 0,ethnicity,race,gender,action_taken,preapproval_requested,loan_type,loan_purpose,interest_only_payment,balloon_payment,debt_to_income_ratio,age,income,loan_to_value_ratio
0,0,1,1,1,0,1,1,2,2,3,3,1,1
1,0,1,1,1,0,1,1,2,2,4,2,2,3
2,0,1,0,1,0,1,1,2,2,4,2,3,1
3,0,1,0,1,0,1,3,2,2,2,2,3,1
4,0,1,0,1,0,1,1,2,2,2,3,2,1


In [47]:
# load the clean data
tx = 'data/Fairlearn_TX_Waco.csv'
tx = pd.read_csv(tx, sep = ',')
tx.head()

Unnamed: 0,ethnicity,race,gender,action_taken,preapproval_requested,loan_type,loan_purpose,interest_only_payment,balloon_payment,debt_to_income_ratio,age,income,loan_to_value_ratio
0,0,1,1,1,0,1,1,2,2,4,1,2,3
1,0,1,1,0,0,1,3,2,2,4,2,2,1
2,1,1,1,1,0,1,1,2,2,4,2,3,1
3,0,1,1,1,0,1,1,2,2,1,3,3,3
4,1,1,0,1,0,1,1,2,2,4,2,4,1


In [48]:
# load the clean data
il = 'data/Fairlearn_IL_Chicago.csv'
il = pd.read_csv(il, sep = ',')
il.head()

Unnamed: 0,ethnicity,race,gender,action_taken,preapproval_requested,loan_type,loan_purpose,interest_only_payment,balloon_payment,debt_to_income_ratio,age,income,loan_to_value_ratio
0,0,1,0,1,0,1,3,2,2,3,2,2,1
1,0,1,1,0,0,1,2,2,2,4,3,4,1
2,0,1,1,1,0,1,1,2,2,3,2,1,1
3,0,1,0,1,0,1,3,2,2,2,2,2,1
4,1,1,1,1,0,1,3,2,2,4,2,3,1


In [49]:
# training set = FL + TX + IL, testing set = DC
all_df = pd.concat([tx, fl, il, dc])

In [50]:
all_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 212714 entries, 0 to 89154
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype
---  ------                 --------------   -----
 0   ethnicity              212714 non-null  int64
 1   race                   212714 non-null  int64
 2   gender                 212714 non-null  int64
 3   action_taken           212714 non-null  int64
 4   preapproval_requested  212714 non-null  int64
 5   loan_type              212714 non-null  int64
 6   loan_purpose           212714 non-null  int64
 7   interest_only_payment  212714 non-null  int64
 8   balloon_payment        212714 non-null  int64
 9   debt_to_income_ratio   212714 non-null  int64
 10  age                    212714 non-null  int64
 11  income                 212714 non-null  int64
 12  loan_to_value_ratio    212714 non-null  int64
dtypes: int64(13)
memory usage: 22.7 MB


In [64]:
# Extract the sensitive feature - example: race
A = pd.DataFrame(all_df, columns=['race', 'ethnicity'])
A["race"] = all_df["race"].apply(lambda x:1 if x == 1 else 0)
A["ethnicity"] = all_df["ethnicity"].apply(lambda x: 0 if x == 1 else 1)
A

Unnamed: 0,race,ethnicity
0,1,1
1,1,1
2,1,0
3,1,1
4,1,0
...,...,...
89150,0,1
89151,1,1
89152,1,0
89153,0,1


In [66]:
A_str = A
A_str["race"] = A["race"].map({ 1:"White", 0:"Non-White"})
A_str["ethnicity"] = A["ethnicity"].map({1: "Non-Hispanic", 0:"Hispanic"})
A_str

Unnamed: 0,race,ethnicity
0,White,Non-Hispanic
1,White,Non-Hispanic
2,White,Hispanic
3,White,Non-Hispanic
4,White,Hispanic
...,...,...
89150,Non-White,Non-Hispanic
89151,White,Non-Hispanic
89152,White,Hispanic
89153,Non-White,Non-Hispanic


In [67]:
# Extract the target
y = all_df["action_taken"]

In [68]:
# Train-test split
X_train = all_df.drop(columns=['action_taken']).head(89155)
X_test = all_df.drop(columns=['action_taken']).tail(34404)
y_train = y.head(89155)
y_test = y.tail(34404)
A_train = A_str.head(89155)
A_test = A_str.tail(34404)

In [72]:
# Ensure indices are aligned between X, y and A,
# after all the slicing and splitting of DataFrames
# and Series

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
A_train = A_train.reset_index(drop=True)
A_test = A_test.reset_index(drop=True)

In [73]:
# %%
# Next, we build two :class:`~sklearn.pipeline.Pipeline` objects
# to process the columns, one for numeric data, and the other
# for categorical data. Both impute missing values; the difference
# is whether the data are scaled (numeric columns) or
# one-hot encoded (categorical columns). Imputation of missing
# values should generally be done with care, since it could
# potentially introduce biases. Of course, removing rows with
# missing data could also cause trouble, if particular subgroups
# have poorer data quality.

numeric_transformer = Pipeline(
    steps=[
        ("impute", SimpleImputer()),
        ("scaler", StandardScaler()),
    ]
)
categorical_transformer = Pipeline(
    [
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, selector(dtype_exclude="category")),
        ("cat", categorical_transformer, selector(dtype_include="category")),
    ]
)

In [74]:
# %%
# With our preprocessor defined, we can now build a
# new pipeline which includes an Estimator:

unmitigated_predictor = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "classifier",
            LogisticRegression(solver="liblinear", fit_intercept=True),
        ),
    ]
)

In [75]:
# %%
# With the pipeline fully defined, we can first train it
# with the training data, and then generate predictions
# from the test data.

unmitigated_predictor.fit(X_train, y_train)
y_pred = unmitigated_predictor.predict(X_test)

In [76]:
# %%
# Analysing the Model with Metrics
# ================================
#
# After our data manipulations and model training, we have the following
# from our test set:
#
# - A vector of true values called ``y_test``
# - A vector of model predictions called ``y_pred``
# - A DataFrame of categorical features relevant to fairness called ``A_test``
#
# In a traditional model analysis, we would now look at some metrics
# evaluated on the entire dataset. Suppose in this case, the relevant
# metrics are :func:`fairlearn.metrics.selection_rate` and
# :func:`sklearn.metrics.fbeta_score` (with
# ``beta=0.6``).
# We can evaluate these metrics directly:

print("Selection Rate:", selection_rate(y_test, y_pred))
print("fbeta:", skm.fbeta_score(y_test, y_pred, beta=0.6))

Selection Rate: 0.8672538076967794
fbeta: 0.8894649046075789


In [77]:
# %%
# We know that there are sensitive features in our data, and we want to
# ensure that we're not harming individuals due to membership in any of
# these groups. For this purpose, Fairlearn provides the
# :class:`fairlearn.metrics.MetricFrame`
# class. Let us construct an instance of this class, and then look at
# its capabilities:

fbeta_06 = functools.partial(skm.fbeta_score, beta=0.6)

metric_fns = {'selection_rate': selection_rate, 'fbeta_06': fbeta_06, 'count': count}

grouped_on_sex = MetricFrame(metrics=metric_fns,
                             y_true=y_test,
                             y_pred=y_pred,
                             sensitive_features=A_test['race'])


In [78]:
# %%
# The :class:`fairlearn.metrics.MetricFrame` object requires a
# minimum of four arguments:
#
# 1. The underlying metric function(s) to be evaluated
# 2. The true values
# 3. The predicted values
# 4. The sensitive feature values
#
# These are all passed as arguments to the constructor. If more than
# one underlying metric is required (as in this case), then we must
# provide them in a dictionary.
#
# The underlying metrics must have a signature ``fn(y_true, y_pred)``,
# so we have to use :func:`functools.partial` on ``fbeta_score()`` to
# furnish ``beta=0.6`` (we will show how to pass in extra array
# arguments such as sample weights shortly).
#
# We will now take a closer look at the :class:`fairlearn.metrics.MetricFrame`
# object. First, there is the ``overall`` property, which contains
# the metrics evaluated on the entire dataset. We see that this contains the
# same values calculated above:

assert grouped_on_sex.overall['selection_rate'] == selection_rate(y_test, y_pred)
assert grouped_on_sex.overall['fbeta_06'] == skm.fbeta_score(y_test, y_pred, beta=0.6)
print(grouped_on_sex.overall)

selection_rate    0.867254
fbeta_06          0.889465
count                34404
dtype: object


In [79]:
# %%
# The other property in the :class:`fairlearn.metrics.MetricFrame` object
# is ``by_group``. This contains the metrics evaluated on each subgroup defined
# by the categories in the ``sensitive_features=`` argument. Note that
# :func:`fairlearn.metrics.count` can be used to display the number of
# data points in each subgroup. In this case, we have results for males and females:

grouped_on_sex.by_group

Unnamed: 0_level_0,selection_rate,fbeta_06,count
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Non-White,0.768175,0.839628,15089
White,0.944654,0.921334,19315


In [80]:
# %%
# We can immediately see a substantial disparity in the selection rate between
# males and females.
#
# We can also create another :class:`fairlearn.metrics.MetricFrame` object
# using race as the sensitive feature:

grouped_on_race = MetricFrame(metrics=metric_fns,
                              y_true=y_test,
                              y_pred=y_pred,
                              sensitive_features=A_test['ethnicity'])


In [81]:
# %%
# The ``overall`` property is unchanged:
assert (grouped_on_sex.overall == grouped_on_race.overall).all()


In [82]:
# %%
# The ``by_group`` property now contains the metrics evaluated based on the 'race'
# column:
grouped_on_race.by_group

Unnamed: 0_level_0,selection_rate,fbeta_06,count
ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Hispanic,0.88171,0.894381,4024
Non-Hispanic,0.865339,0.888802,30380


In [83]:
# %%
# We see that there is also a significant disparity in selection rates when
# grouping by race.

# %%
# Sample weights and other arrays
# -------------------------------
#
# We noted above that the underlying metric functions passed to the
# :class:`fairlearn.metrics.MetricFrame` constructor need to be of
# the form ``fn(y_true, y_pred)`` - we do not support scalar arguments
# such as ``pos_label=`` or ``beta=`` in the constructor. Such
# arguments should be bound into a new function using
# :func:`functools.partial`, and the result passed in. However, we do
# support arguments which have one entry for each sample, with an array
# of sample weights being the most common example. These are divided
# into subgroups along with ``y_true`` and ``y_pred``, and passed along
# to the underlying metric.
#
# To use these arguments, we pass in a dictionary as the ``sample_params=``
# argument of the constructor. Let us generate some random weights, and
# pass these along:

random_weights = np.random.rand(len(y_test))

example_sample_params = {
    'selection_rate': {'sample_weight': random_weights},
    'fbeta_06': {'sample_weight': random_weights},
}


grouped_with_weights = MetricFrame(metrics=metric_fns,
                                   y_true=y_test,
                                   y_pred=y_pred,
                                   sensitive_features=A_test['race'],
                                   sample_params=example_sample_params)


In [84]:
# %%
# We can inspect the overall values, and check they are as expected:
assert grouped_with_weights.overall['selection_rate'] == \
    selection_rate(y_test, y_pred, sample_weight=random_weights)
assert grouped_with_weights.overall['fbeta_06'] == \
    skm.fbeta_score(y_test, y_pred, beta=0.6, sample_weight=random_weights)
print(grouped_with_weights.overall)

selection_rate    0.867461
fbeta_06          0.890338
count                34404
dtype: object


In [85]:
# %%
# We can also see the effect on the metric being evaluated on the subgroups:
grouped_with_weights.by_group

Unnamed: 0_level_0,selection_rate,fbeta_06,count
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Non-White,0.767454,0.841762,15089
White,0.945316,0.921255,19315


In [86]:
# %%
# Quantifying Disparities
# =======================
#
# We now know that our model is selecting individuals who are female far less
# often than individuals who are male. There is a similar effect when
# examining the results by race, with blacks being selected far less often than
# whites (and those classified as 'other'). However, there are many cases where
# presenting all these numbers at once will not be useful (for example, a high
# level dashboard which is monitoring model performance). Fairlearn provides
# several means of aggregating metrics across the subgroups, so that disparities
# can be readily quantified.
#
# The simplest of these aggregations is ``group_min()``, which reports the
# minimum value seen for a subgroup for each underlying metric (we also provide
# ``group_max()``). This is
# useful if there is a mandate that "no subgroup should have an ``fbeta_score()``
# of less than 0.6." We can evaluate the minimum values easily:
grouped_on_race.group_min()

selection_rate    0.865339
fbeta_06          0.888802
count                 4024
dtype: object

In [87]:
# %%
# As noted above, the selection rates varies greatly by race and by sex.
# This can be quantified in terms of a difference between the subgroup with
# the highest value of the metric, and the subgroup with the lowest value.
# For this, we provide the method ``difference(method='between_groups)``:
grouped_on_race.difference(method='between_groups')

selection_rate    0.016371
fbeta_06          0.005579
count                26356
dtype: object

In [88]:
# %%
# We can also evaluate the difference relative to the corresponding overall
# value of the metric. In this case we take the absolute value, so that the
# result is always positive:
grouped_on_race.difference(method='to_overall')

selection_rate    0.014456
fbeta_06          0.004916
count                30380
dtype: object

In [89]:
# %%
# There are situations where knowing the ratios of the metrics evaluated on
# the subgroups is more useful. For this we have the ``ratio()`` method.
# We can take the ratios between the minimum and maximum values of each metric:
grouped_on_race.ratio(method='between_groups')


selection_rate    0.981433
fbeta_06          0.993762
count             0.132456
dtype: object

In [90]:
# %%
# We can also compute the ratios relative to the overall value for each
# metric. Analogous to the differences, the ratios are always in the range
# :math:`[0,1]`:
grouped_on_race.ratio(method='to_overall')

selection_rate    0.983605
fbeta_06          0.994503
count             0.116963
dtype: float64

In [91]:
# %%
# Intersections of Features
# =========================
#
# So far we have only considered a single sensitive feature at a time,
# and we have already found some serious issues in our example data.
# However, sometimes serious issues can be hiding in intersections of
# features. For example, the
# `Gender Shades project <https://www.media.mit.edu/projects/gender-shades/overview/>`_
# found that facial recognition algorithms performed worse for blacks
# than whites, and also worse for women than men (despite overall high
# accuracy score). Moreover, performance on black females was *terrible*.
# We can examine the intersections of sensitive features by passing
# multiple columns to the :class:`fairlearn.metrics.MetricFrame`
# constructor:

grouped_on_race_and_sex = MetricFrame(metrics=metric_fns,
                                      y_true=y_test,
                                      y_pred=y_pred,
                                      sensitive_features=A_test[['race', 'ethnicity']])


In [92]:
# %%
# The overall values are unchanged, but the ``by_group`` table now
# shows the intersections between subgroups:
assert (grouped_on_race_and_sex.overall == grouped_on_race.overall).all()
grouped_on_race_and_sex.by_group

Unnamed: 0_level_0,Unnamed: 1_level_0,selection_rate,fbeta_06,count
race,ethnicity,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Non-White,Hispanic,0.592166,0.79202,434
Non-White,Non-Hispanic,0.773388,0.840754,14655
White,Hispanic,0.916713,0.902784,3590
White,Non-Hispanic,0.951033,0.925404,15725
