## Fairness processing

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklego.linear_model import EqualOpportunityClassifier
from fairlearn.postprocessing import ThresholdOptimizer

from utils import read_diabetes_dataset
from utils import statistical_parity
from utils import average_odds
from utils import average_predictive_value
from utils import theil_index

import pandas as pd
import numpy as np

### Dataset

In [2]:
X, y = read_diabetes_dataset(binary=True)

X_train, X_valtest, y_train, y_valtest = train_test_split(
    X, y, test_size=0.3, random_state=101)
X_val, X_test, y_val, y_test = train_test_split(
    X_valtest, y_valtest, test_size=0.5, random_state=101)


In [3]:
# Male: 1, Female: 0
sens_variable = 'gender'

Z_train = X_train[sens_variable]
Z_test = X_test[sens_variable]
Z_val = X_val[sens_variable]

# sens_columns = ['race_AfricanAmerican', 'race_Asian', 'race_Caucasian', 'race_Hispanic','race_Other']
# Z_train = np.argmax(X_train[sens_columns].values, axis=1)
# Z_test = np.argmax(X_test[sens_columns].values, axis=1)
# Z_val = np.argmax(X_val[sens_columns].values, axis=1)

### In-processing

- Without mitigation

In [4]:
# Decision Tree
dt = DecisionTreeClassifier(max_depth=7, random_state=123, class_weight=None)
dt.fit(X_train, y_train)

y_train_pred = dt.predict(X_train)

print('Accuracy', accuracy_score(y_train, y_train_pred))
print('Statistical parity', statistical_parity(y_train, y_train_pred, Z_train))
print('Average odds', average_odds(y_train, y_train_pred, Z_train))
print('Average predictive value', average_predictive_value(y_train, y_train_pred, Z_train))
print('Theil index', theil_index(y_train, y_train_pred))


Accuracy 0.6397467140745475
[0] Statistical parity [-0.01037807]
[0] Average odds [-0.00827762]
[0] Average predictive value [-0.01807308]
Theil index 0.3658010370394916


In [5]:
# Logistic Regression
lr = LogisticRegression(class_weight=None, max_iter=10**3)
lr.fit(X_train, y_train)

y_train_pred = lr.predict(X_train)

print('Accuracy', accuracy_score(y_train, y_train_pred))
print('Statistical parity', statistical_parity(y_train, y_train_pred, Z_train))
print('Average odds', average_odds(y_train, y_train_pred, Z_train))
print('Average predictive value', average_predictive_value(y_train, y_train_pred, Z_train))
print('Theil index', theil_index(y_train, y_train_pred))


Accuracy 0.6260437058048021
[0] Statistical parity [-0.02823257]
[0] Average odds [-0.02776224]
[0] Average predictive value [-0.00958463]
Theil index 0.41195578101024366


- Fairness mitigation

In [6]:
Xn_train = X_train.drop([sens_variable], axis=1)

In [7]:
# Decision Tree
dt = DecisionTreeClassifier(max_depth=7, random_state=123, class_weight=None)
dt.fit(Xn_train, y_train)

y_train_pred = dt.predict(Xn_train)

print('Accuracy', accuracy_score(y_train, y_train_pred))
print('Statistical parity', statistical_parity(y_train, y_train_pred, Z_train))
print('Average odds', average_odds(y_train, y_train_pred, Z_train))
print('Average predictive value', average_predictive_value(y_train, y_train_pred, Z_train))
print('Theil index', theil_index(y_train, y_train_pred))


Accuracy 0.6397467140745475
[0] Statistical parity [-0.01029785]
[0] Average odds [-0.00821108]
[0] Average predictive value [-0.01818144]
Theil index 0.3658010370394915


In [8]:
# Logistic Regression
lr = LogisticRegression(class_weight=None, max_iter=10**3)
lr.fit(Xn_train, y_train)

y_train_pred = lr.predict(Xn_train)

print('Accuracy', accuracy_score(y_train, y_train_pred))
print('Statistical parity', statistical_parity(y_train, y_train_pred, Z_train))
print('Average odds', average_odds(y_train, y_train_pred, Z_train))
print('Average predictive value', average_predictive_value(y_train, y_train_pred, Z_train))
print('Theil index', theil_index(y_train, y_train_pred))


Accuracy 0.6249850185769645
[0] Statistical parity [-0.00475122]
[0] Average odds [-0.00328614]
[0] Average predictive value [-0.01976025]
Theil index 0.41323594808904546


- Counterfactual appends

In [9]:
X_train_cf =  X_train.copy()
X_train_cf[sens_variable] = X_train_cf[sens_variable].replace({0:1, 1:0})
X_train_cf = pd.concat([X_train_cf, X_train])
y_train_cf = pd.concat([y_train, y_train])

In [10]:
# Decision Tree
dt = DecisionTreeClassifier(max_depth=7, random_state=123, class_weight=None)
dt.fit(X_train_cf, y_train_cf)

y_train_pred = dt.predict(X_train_cf)

print('Accuracy', accuracy_score(y_train_cf, y_train_pred))
print('Statistical parity', statistical_parity(y_train_cf, y_train_pred, Z_train))
print('Average odds', average_odds(y_train_cf, y_train_pred, Z_train))
print('Average predictive value', average_predictive_value(y_train_cf, y_train_pred, Z_train))
print('Theil index', theil_index(y_train_cf, y_train_pred))


Accuracy 0.6397467140745475
[0] Statistical parity [-0.01037807]
[0] Average odds [-0.00827762]
[0] Average predictive value [-0.01807308]
Theil index 0.3658010370394915


In [11]:
# Logistic Regression
lr = LogisticRegression(class_weight=None, max_iter=10**3)
lr.fit(X_train_cf, y_train_cf)

y_train_pred = lr.predict(X_train_cf)

print('Accuracy', accuracy_score(y_train_cf, y_train_pred))
print('Statistical parity', statistical_parity(y_train_cf, y_train_pred, Z_train))
print('Average odds', average_odds(y_train_cf, y_train_pred, Z_train))
print('Average predictive value', average_predictive_value(y_train_cf, y_train_pred, Z_train))
print('Theil index', theil_index(y_train_cf, y_train_pred))


Accuracy 0.6249750309616076
[0] Statistical parity [-0.00488636]
[0] Average odds [-0.00339224]
[0] Average predictive value [-0.01944092]
Theil index 0.41312842615755935


### Post-processing

- Model

In [12]:
# Decision Tree
dt = DecisionTreeClassifier(max_depth=7, random_state=123, class_weight=None)
dt.fit(X_train, y_train)

Z_train = X_train[sens_variable]==0
y_train_pred = dt.predict(X_train)

print('Accuracy', accuracy_score(y_train, y_train_pred))
print('Statistical parity', statistical_parity(y_train, y_train_pred, Z_train))
print('Average odds', average_odds(y_train, y_train_pred, Z_train))
print('Average predictive value', average_predictive_value(y_train, y_train_pred, Z_train))
print('Theil index', theil_index(y_train, y_train_pred))


Accuracy 0.6397467140745475
[True] Statistical parity [-0.01037807]
[True] Average odds [-0.00827762]
[True] Average predictive value [-0.01807308]
Theil index 0.3658010370394916


In [13]:
postprocess_est = ThresholdOptimizer(
    estimator=dt,
    constraints="false_negative_rate_parity",
    objective="balanced_accuracy_score",
    prefit=True,
    predict_method='predict_proba')
postprocess_est.fit(X_train, y_train, sensitive_features=Z_train)
y_train_pred = postprocess_est.predict(X_train, sensitive_features=Z_train)


In [14]:
print('Accuracy', accuracy_score(y_train, y_train_pred))
print('Statistical parity', statistical_parity(y_train, y_train_pred, Z_train))
print('Average odds', average_odds(y_train, y_train_pred, Z_train))
print('Average predictive value', average_predictive_value(y_train, y_train_pred, Z_train))
print('Theil index', theil_index(y_train, y_train_pred))


Accuracy 0.6015940234109703
[True] Statistical parity [0.00161892]
[True] Average odds [0.0042884]
[True] Average predictive value [-0.01910855]
Theil index 0.22769817789195285
