In [81]:
from sklearn.datasets import fetch_openml
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score
import numpy as np
import torch

In [145]:
dataset = fetch_openml('compas-two-years')
X, y = dataset.data, dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

# Let's train a simple predictor on that

In [6]:
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
clf.score(X_test, y_test)  # note: I think here >65 % accuracy is quite OK because it is a very hard task, checking: "compas dataset accuracy" on google it seems often around 65 %

# We try to detect if the classifier has a bad equality of "opportunity" bias

That is, if it will predict more false positive on African Americans than others: that is, if even if you're innocent, being African American is more risky for you if you go through the algorithm...

In [39]:
tn_aa, fp_aa, fn_aa, tp_aa = confusion_matrix(y_pred[X_test['race_African-American'] == '1'], y_test[X_test['race_African-American'] == '1']).ravel()
tn_w, fp_w, fn_w, tp_w = confusion_matrix(y_pred[X_test['race_Caucasian'] == '1'], y_test[X_test['race_Caucasian'] == '1']).ravel()

We compute the false positive rate on each category: 

In [46]:
fp_aa/(fp_aa+tn_aa)

0.3542713567839196

In [47]:
fp_w/(fp_w+tn_w)

0.3194805194805195

Therefore, we see that indeed, our classifier has a higher false positive rate for African Americans, meaning given a person is **not a recidivist**, the algorithm is more likely to (wrongly) classify it as a recidivist if they are African American than if they are White Caucasian.

Let's look at the difference of AUC too:

In [58]:
scores = clf.decision_function(X_test)

In [69]:
auc_1 = roc_auc_score(y_test[X_test['race_African-American'] == '1'], scores[np.array(X_test['race_African-American'] == '1')])
auc_2 = roc_auc_score(y_test[X_test['race_African-American'] == '0'], scores[np.array(X_test['race_African-American'] == '0')])
print('auc aa: {}'.format(auc_1))
print('auc w: {}'.format(auc_2))

auc aa: 0.7277288823407775
auc w: 0.6853405070754717


Interesting, I would've thought the AUC would be higher on w, since there are more false positives on aa... Anyways let's see if trying to put the same AUC for both would help improve fairness

# Now let's try to code a differentiable cost function that, if optimized for, can reduce this gap



We try to implement a differentiable, AUC based, metric for fairness, using for now section E.1 from https://arxiv.org/pdf/2002.08159.pdf

In [201]:
def sigmoid_torch(x):
    return 1/(1+torch.exp(-x))

def auc_approx_torch(scores, y_true):
    # y_true must be 0 or 1, scores must be from minus inf to plus inf (some decision function)
    # an approximated (differentiable) version of auc
    N_neg = (y_true == 1).sum()
    N_pos = (y_true == 0).sum()
    return 0.5 * 1/(N_neg + N_pos) * (sigmoid((scores[:, None] - score[None]) * (y_true[:, None] - y_true[None]))).sum()

# we also define the batch-wise version, for B pairs randomly sampled:
def auc_approx_batch_torch(scores_i, scores_j, y_true_i, y_true_j):
    # scores_i is the set of first scores of the first elements sampled, same for y_true_i
    B = scores_i.shape[0]
    if B != 0:
        return 1/B * (sigmoid_torch((scores_i - scores_j) * (y_true_i - y_true_j))).sum()
    else: 
        return 0


def diff_fairness_torch(sensitive_attr, scores_i, scores_j, y_true_i, y_true_j):
    auc_sensitive_1 = auc_approx_batch_torch(scores_i[sensitive_attr==0], 
                                       scores_j[sensitive_attr==0],
                                       y_true_i[sensitive_attr==0],
                                       y_true_j[sensitive_attr==0])
    auc_sensitive_2 = auc_approx_batch_torch(scores_i[sensitive_attr==1], 
                                       scores_j[sensitive_attr==1],
                                       y_true_i[sensitive_attr==1],
                                       y_true_j[sensitive_attr==1])
    # I add squared here so that minimizing it will try to put this term to zero
    return (auc_sensitive_1 - auc_sensitive_2)**2

Let us run a simple neural network, with the above penalty for fairness, see if it improves fairness or not

In [202]:
# we forget about the test set for now: we take the whole dataset
X_train = np.array(X, dtype=np.float32)
y_train = np.array(y, dtype=np.float32)[:, None]  # this shape is necessary for pytorch
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, shuffle=True)

In [203]:
# warning: the sampling for the computation of that metric should be some *pairs* from the validation set 
ce = torch.nn.BCELoss()
    
def total_loss(y_pred, y, sensitive_attr, scores_i, scores_j, y_true_i, y_true_j):
    return ce(y_pred, y) + 0.2 * diff_fairness_torch(sensitive_attr, scores_i, scores_j, y_true_i, y_true_j)

In [205]:
# took code from https://pytorch.org/tutorials/beginner/pytorch_with_examples.html

import torch
import math

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(13, 1),
    torch.nn.Sigmoid())

learning_rate = 1e-3
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)


len_train_data = X_train.shape[0]
batch_size = 10
pairs_batch_size = 10
val_cat = {0: None, 1:None}
val_cat[0] = X_valid[X_valid[:, 10] == 0]  # The tenth column corresponds to AfricanAmerican categorial variable
val_cat[1] = X_valid[X_valid[:, 10] == 1]
for t in range(2000):
    
    # we sample a batch of training samples and a batch of validation pairs
    BT = np.random.randint(len_train_data, size=batch_size)  # not efficient at all but we don t care
    xx = torch.tensor(X_train[BT])
    yy = torch.tensor(y_train[BT])
    pairs_x_i = []
    pairs_x_j = []
    y_i = []
    y_j = []
    sensitive_attr = []
    for _ in range(pairs_batch_size):  # super slow but we don't care
        sens_attr_chosen = np.random.randint(2) # we first sample a sensitive attribute (AA/W, here 0/1)
        # then we sample a pair of validation samples for this sensitive attribute
        val_slice_chosen = val_cat[np.random.randint(2)]
        pairs_idx = np.random.randint(val_slice_chosen.shape[0], size=2)
        pairs_x_i.append(val_slice_chosen[pairs_idx[0]])
        pairs_x_j.append(val_slice_chosen[pairs_idx[1]])
        y_i.append(y_valid[pairs_idx[0]])
        y_j.append(y_valid[pairs_idx[1]])
        sensitive_attr.append(sens_attr_chosen)
    pairs_x_i = torch.tensor(pairs_x_i)
    pairs_x_j = torch.tensor(pairs_x_j)
    y_i = torch.tensor(y_i)
    y_j = torch.tensor(y_j)
    sensitive_attr = torch.tensor(sensitive_attr)
    
    y_pred = model(xx)
    scores_i = model(pairs_x_i)
    scores_j = model(pairs_x_j)
    
    # Compute and print loss.
    loss = total_loss(y_pred, yy, sensitive_attr, scores_i, scores_j, y_i, y_j)
    if t % 100 == 99:
        print(t, loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


99 0.7628890872001648
199 0.757701575756073
299 0.5764566659927368
399 0.6320579648017883
499 0.6995129585266113
599 0.8715802431106567
699 0.5928434133529663
799 0.806164026260376
899 0.44155460596084595
999 0.8556731939315796
1099 0.6114168763160706
1199 0.837361752986908
1299 0.8014306426048279
1399 0.49813297390937805
1499 0.48914673924446106
1599 0.7856320738792419
1699 0.8316701650619507
1799 0.6041955947875977
1899 0.5523978471755981
1999 0.6871882677078247
