#### This notebook provides a example of how to invoke the DRO methods in practice. There are three methods implemented in DROFairModels: DRFPROB, DRFSVM, DRFLR. 

In [1]:
%load_ext autoreload
%autoreload 2

In [6]:
from matplotlib.colors import ListedColormap
from matplotlib import colors as mcolors
import math
import numpy as np
import matplotlib.pyplot as plt  # for plotting stuff
from random import seed
from scipy.stats import multivariate_normal  # generating synthetic data
from collections import namedtuple
from DROFairModels import DROFairModels
from collections import defaultdict
from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from Linear_Ferm_SVM1 import Linear_FERM
# from fair_logloss import EOPP_fair_logloss_classifier
import pandas as pd
import os
from random import shuffle
import warnings
warnings.filterwarnings('ignore')
from toolbox import *

In [7]:
current_path=os.getcwd()
SEED = 1
seed(SEED)
np.random.seed(SEED)
rng = np.random.RandomState(SEED)

In [8]:
X, y, a = load_arrhythmia()
X, y, a = load_compas_data()

n_train_samples = 100
y[y==0] = -1
true_P_11, true_P_01, true_P_10, true_P_00 = get_marginals(a, y)
emp_marginals = [true_P_11, true_P_01, true_P_10, true_P_00]



X_train, a_train, y_train, X_test, a_test, y_test, threshold = stratified_sampling(
    X=X,
    a=a,
    y=y, emp_marginals=emp_marginals,
    n_train_samples=n_train_samples)

y_train[y_train==0] = -1
y_test[y_test==0] = -1
train_P_11, train_P_01, _, _ = get_marginals(a_train, y_train)
max_eta = min(train_P_11,train_P_01)/2


加载心律失常数据集..
敏感特征 1 的不同取值: {0.0, 1.0}


#### You can call the DROFairModels class with specific method name and selected parameters to run the fair classifiers. For the sake of simplicity, we eliminate the tunning process and use two-sided unfairness measure as an illustration. 

In [9]:
# An example of DRFPROB
clf=DROFairModels(reg=0.1, radius=0.01, epsilon=0.01,
                                      verbose=False, fit_intercept=True, mode = 'DRFPROB', side='two')
clf.fit(X=X_train, a=a_train, y=y_train)
print(f'Testing Accuracy of DRFPROB:{clf.score(X_test,y_test):.3f}')
print(f'Testing Fscore of DRFPROB:{clf.Fscore(X_test,y_test):.3f}')
print(f'Testing Fairness of DRFPROB:{clf.unfairness(X_test,a_test,y_test).det_unfairness:.3f}')

# # An example of DRFSVM
clf=DROFairModels(reg=1.1, radius=0.01, epsilon=0.01,
                                      verbose=False, fit_intercept=True, mode = 'DRFSVM', side='two')
clf.fit(X=X_train, a=a_train, y=y_train)
print(f'Testing Accuracy of DRFSVM:{clf.score(X_test,y_test):.3f}')
print(f'Testing Fscore of DRFSVM:{clf.Fscore(X_test,y_test):.3f}')
print(f'Testing Fairness of DRFSVM:{clf.unfairness(X_test,a_test,y_test).det_unfairness:.3f}')

# An example of DRFLR
clf=DROFairModels(reg=0.1, radius=0.01, epsilon=0.01,
                                      verbose=False, fit_intercept=True, mode = 'DRFLR')
clf.fit(X=X_train, a=a_train, y=y_train)

print(f'Testing Accuracy of DRFLR:{clf.score(X_test,y_test):.3f}')
print(f'Testing Fscore of DRFSVM:{clf.Fscore(X_test,y_test):.3f}')
print(f'Testing Fairness of DRFLR:{clf.unfairness(X_test,a_test,y_test).det_unfairness:.3f}')

Testing Accuracy of DRFPROB:0.640
Testing Fscore of DRFPROB:0.624
Testing Fairness of DRFPROB:0.219
Testing Accuracy of DRFSVM:0.636
Testing Fscore of DRFSVM:0.541
Testing Fairness of DRFSVM:0.243
Testing Accuracy of DRFLR:0.641
Testing Fscore of DRFSVM:0.538
Testing Fairness of DRFLR:0.235


#### For imbalanced dataset, decison makers can consider adopt balaced accuracy as the objective. In general, it yield much higher F-score

In [10]:
X, y, a = load_drug_data()
n_train_samples = 100
y[y==0] = -1
true_P_11, true_P_01, true_P_10, true_P_00 = get_marginals(a, y)
emp_marginals = [true_P_11, true_P_01, true_P_10, true_P_00]



X_train, a_train, y_train, X_test, a_test, y_test, threshold = stratified_sampling(
    X=X,
    a=a,
    y=y, emp_marginals=emp_marginals,
    n_train_samples=n_train_samples)

y_train[y_train==0] = -1
y_test[y_test==0] = -1
train_P_11, train_P_01, _, _ = get_marginals(a_train, y_train)
max_eta = min(train_P_11,train_P_01)/2

Loading Drug (black vs others) dataset...


In [22]:
# An example of DRFPROB
clf=DROFairModels(reg=0.1, radius=0.01, epsilon=0.01,
                                      verbose=False, fit_intercept=True, mode = 'DRFPROB', side='two',balanced_accuracy=True)
clf.fit(X=X_train, a=a_train, y=y_train)
print(f'Testing Accuracy of DRFPROB:{clf.score(X_test,y_test):.3f}')
print(f'Testing Fscore of DRFPROB:{clf.Fscore(X_test,y_test):.3f}')
print(f'Testing Fairness of DRFPROB:{clf.unfairness(X_test,a_test,y_test).det_unfairness:.3f}')


# # An example of DRFSVM
clf=DROFairModels(reg=1.1, radius=0.01, epsilon=0.01,
                                      verbose=False, fit_intercept=True, mode = 'DRFSVM', side='two',balanced_accuracy=True)
clf.fit(X=X_train, a=a_train, y=y_train)
print(f'Testing Accuracy of DRFSVM:{clf.score(X_test,y_test):.3f}')
print(f'Testing Fscore of DRFSVM:{clf.Fscore(X_test,y_test):.3f}')
print(f'Testing Fairness of DRFSVM:{clf.unfairness(X_test,a_test,y_test).det_unfairness:.3f}')

# An example of DRFLR
clf=DROFairModels(reg=0.1, radius=0.01, epsilon=0.01,
                                      verbose=False, fit_intercept=True, mode = 'DRFLR')
clf.fit(X=X_train, a=a_train, y=y_train)

print(f'Testing Accuracy of DRFLR:{clf.score(X_test,y_test):.3f}')
print(f'Testing Fscore of DRFSVM:{clf.Fscore(X_test,y_test):.3f}')
print(f'Testing Fairness of DRFLR:{clf.unfairness(X_test,a_test,y_test).det_unfairness:.3f}')

Testing Accuracy of DRFPROB:0.724
Testing Fscore of DRFPROB:0.500
Testing Fairness of DRFPROB:0.028
Testing Accuracy of DRFSVM:0.718
Testing Fscore of DRFSVM:0.540
Testing Fairness of DRFSVM:0.031
Testing Accuracy of DRFLR:0.774
Testing Fscore of DRFSVM:0.324
Testing Fairness of DRFLR:0.029
