## Fairness processing

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklego.linear_model import EqualOpportunityClassifier

from utils import read_diabetes_dataset
from utils import statistical_parity
from utils import average_odds
from utils import average_predictive_value
from utils import theil_index

import pandas as pd

### Dataset

In [2]:
X, y = read_diabetes_dataset(binary=True)

X_train, X_valtest, y_train, y_valtest = train_test_split(
    X, y, test_size=0.3, random_state=101)
X_val, X_test, y_val, y_test = train_test_split(
    X_valtest, y_valtest, test_size=0.5, random_state=101)


In [3]:
# Male: 1, Female: 0
sens_variable = 'gender'

Z_train = X_train[sens_variable]
Z_test = X_test[sens_variable]
Z_val = X_val[sens_variable]

X['gender'].value_counts()

0    38028
1    33490
Name: gender, dtype: int64

### In-processing

- Without mitigation

In [4]:
# Decision Tree
dt = DecisionTreeClassifier(max_depth=7, random_state=123, class_weight=None)
dt.fit(X_train, y_train)

y_train_pred = dt.predict(X_train)

print('Accuracy', accuracy_score(y_train, y_train_pred))
print('Statistical parity', statistical_parity(y_train, y_train_pred, Z_train))
print('Average odds', average_odds(y_train, y_train_pred, Z_train))
print('Average predictive value', average_predictive_value(y_train, y_train_pred, Z_train))
print('Theil index', theil_index(y_train, y_train_pred))


Accuracy 0.6397467140745475
[0] Statistical parity [-0.01037807]
[0] Average odds [-0.00827762]
[0] Average predictive value [-0.01807308]
Theil index 0.3658010370394916


In [5]:
# Logistic Regression
lr = LogisticRegression(class_weight=None, max_iter=10**3)
lr.fit(X_train, y_train)

y_train_pred = lr.predict(X_train)

print('Accuracy', accuracy_score(y_train, y_train_pred))
print('Statistical parity', statistical_parity(y_train, y_train_pred, Z_train))
print('Average odds', average_odds(y_train, y_train_pred, Z_train))
print('Average predictive value', average_predictive_value(y_train, y_train_pred, Z_train))
print('Theil index', theil_index(y_train, y_train_pred))


Accuracy 0.6260636810355159
[0] Statistical parity [-0.02832001]
[0] Average odds [-0.02780472]
[0] Average predictive value [-0.00921451]
Theil index 0.41179237131693625


- Fairness mitigation

In [11]:
Xn_train = X_train.drop(['gender'], axis=1)

In [12]:
# Decision Tree
dt = DecisionTreeClassifier(max_depth=7, random_state=123, class_weight=None)
dt.fit(Xn_train, y_train)

y_train_pred = dt.predict(Xn_train)

print('Accuracy', accuracy_score(y_train, y_train_pred))
print('Statistical parity', statistical_parity(y_train, y_train_pred, Z_train))
print('Average odds', average_odds(y_train, y_train_pred, Z_train))
print('Average predictive value', average_predictive_value(y_train, y_train_pred, Z_train))
print('Theil index', theil_index(y_train, y_train_pred))


Accuracy 0.6397467140745475
[0] Statistical parity [-0.01029785]
[0] Average odds [-0.00821108]
[0] Average predictive value [-0.01818144]
Theil index 0.3658010370394915


In [13]:
# Logistic Regression
lr = LogisticRegression(class_weight=None, max_iter=10**3)
lr.fit(Xn_train, y_train)

y_train_pred = lr.predict(Xn_train)

print('Accuracy', accuracy_score(y_train, y_train_pred))
print('Statistical parity', statistical_parity(y_train, y_train_pred, Z_train))
print('Average odds', average_odds(y_train, y_train_pred, Z_train))
print('Average predictive value', average_predictive_value(y_train, y_train_pred, Z_train))
print('Theil index', theil_index(y_train, y_train_pred))


Accuracy 0.6250649194998202
[0] Statistical parity [-0.00432988]
[0] Average odds [-0.00288958]
[0] Average predictive value [-0.02029358]
Theil index 0.41307119582627405


- Counterfactual appends

In [15]:
X_train_cf =  X_train.copy()
X_train_cf['gender'] = X_train_cf['gender'].replace({0:1, 1:0})
X_train_cf = pd.concat([X_train_cf, X_train])
y_train_cf = pd.concat([y_train, y_train])

In [20]:
# Decision Tree
dt = DecisionTreeClassifier(max_depth=7, random_state=123, class_weight=None)
dt.fit(X_train_cf, y_train_cf)

y_train_pred = dt.predict(X_train_cf)

print('Accuracy', accuracy_score(y_train_cf, y_train_pred))
print('Statistical parity', statistical_parity(y_train_cf, y_train_pred, Z_train))
print('Average odds', average_odds(y_train_cf, y_train_pred, Z_train))
print('Average predictive value', average_predictive_value(y_train_cf, y_train_pred, Z_train))
print('Theil index', theil_index(y_train_cf, y_train_pred))


Accuracy 0.6397467140745475
[0] Statistical parity [-0.01037807]
[0] Average odds [-0.00827762]
[0] Average predictive value [-0.01807308]
Theil index 0.3658010370394915


In [21]:
# Logistic Regression
lr = LogisticRegression(class_weight=None, max_iter=10**3)
lr.fit(X_train_cf, y_train_cf)

y_train_pred = lr.predict(X_train_cf)

print('Accuracy', accuracy_score(y_train_cf, y_train_pred))
print('Statistical parity', statistical_parity(y_train_cf, y_train_pred, Z_train))
print('Average odds', average_odds(y_train_cf, y_train_pred, Z_train))
print('Average predictive value', average_predictive_value(y_train_cf, y_train_pred, Z_train))
print('Theil index', theil_index(y_train_cf, y_train_pred))


Accuracy 0.6249650433462507
[0] Statistical parity [-0.00422726]
[0] Average odds [-0.00273393]
[0] Average predictive value [-0.01992455]
Theil index 0.41324371932227494


- Optimization

In [34]:
y_train_v = y_train.values
y_train_v.shape = (y_train_v.shape[0], 1)

array([[0],
       [1],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [33]:
C = 1000000000
covariance_threshold = 0.000001

model = EqualOpportunityClassifier(sensitive_cols=0, positive_target=True, covariance_threshold=covariance_threshold, C=C, max_iter=10**5)
model.fit(X_train, y_train)

TypeError: argument of type 'int' is not iterable