In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklego.metrics import equal_opportunity_score
from sklego.metrics import p_percent_score
from sklearn.metrics import log_loss
from sklearn.utils.extmath import squared_norm
from moopt.scalarization_interface import scalar_interface, single_interface, w_interface
from moopt import monise
import numpy as np
import optuna, sklearn, sklearn.datasets
from fair_models import coefficient_of_variation, MOOLogisticRegression, FindCLogisticRegression, FindCCLogisticRegression,FairScalarization

## Data

In [3]:
mydata= pd.read_csv("Datasets/german_credit_data.csv")

In [4]:
mydata = mydata.drop(['Unnamed: 0', 'Purpose'], axis=1)
mydata = mydata.dropna()

mapping_Sex = {'male': 0, 'female': 1}
mapping_Housing = {'free': 1, 'rent': 2, 'own': 3}
mapping_Savings = {'little': 1, 'moderate': 2, 'quite rich': 3, 'rich': 4}
mapping_Checking = {'little': 1, 'moderate': 2, 'rich': 3}
mapping_Risk = {"bad": -1, "good": 1}

numerical_data = mydata.replace({'Sex': mapping_Sex, 'Housing': mapping_Housing, 'Saving accounts': mapping_Savings,
                'Checking account':mapping_Checking, 'Risk': mapping_Risk})

X = numerical_data.drop(['Risk'], axis=1)

y = numerical_data['Risk']

random_seed = np.random.randint(0, 1000)
random_seed2 = np.random.randint(0, 1000)

X_tv, X_test, y_tv, y_test = train_test_split(X, y, test_size=200, random_state = random_seed)
X_train, X_val, y_train, y_val = train_test_split(X_tv, y_tv, test_size=100, random_state = random_seed2)

## Model List

In [5]:
moo = monise(weightedScalar=FairScalarization(X_train, y_train, 'Sex'),
             singleScalar=FairScalarization(X_train, y_train, 'Sex'),
              nodeTimeLimit=2, targetSize=150,
              targetGap=0, nodeGap=0.01, norm=False)

moo.optimize()

sols = []

for solution in moo.solutionsList:
    sols.append(solution.x)

Using license file /opt/gurobi/gurobi.lic
Academic license - for non-commercial use only


## Voting Ensemble - All Models

In [6]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [7]:
models_t = []
for i in range(len(sols)):
    models_t.append(("Model "+str(i),sols[i]))

In [8]:
eclf1 = VotingClassifier(estimators=models_t, voting='hard')

In [9]:
eclf1 = eclf1.fit(X_train, y_train)

In [10]:
print("Metrics - Train Data")
print("Acc: ", eclf1.score(X_train, y_train))
print("Eq Opor: ", equal_opportunity_score(sensitive_column="Sex")(eclf1, X_train, y_train))
print("P Perc: ", p_percent_score(sensitive_column="Sex")(eclf1, X_train))
print("Coev Var: ", coefficient_of_variation(eclf1, X_train, y_train))

Metrics - Train Data
Acc:  0.6936936936936937
Eq Opor:  0.9819004524886877
P Perc:  0.8569157392686805
Coev Var:  0.6847719263017362


In [11]:
print("Metrics - Val Data")
print("Acc: ", eclf1.score(X_val, y_val))
print("Eq Opor: ", equal_opportunity_score(sensitive_column="Sex")(eclf1, X_val, y_val))
print("P Perc: ", p_percent_score(sensitive_column="Sex")(eclf1, X_val))
print("Coev Var: ", coefficient_of_variation(eclf1, X_val, y_val))

Metrics - Val Data
Acc:  0.59
Eq Opor:  0.9150326797385621
P Perc:  0.9229797979797979
Coev Var:  0.7655137765938702


In [12]:
print("Metrics - Test Data")
print("Acc: ", eclf1.score(X_test, y_test))
print("Eq Opor: ", equal_opportunity_score(sensitive_column="Sex")(eclf1, X_test, y_test))
print("P Perc: ", p_percent_score(sensitive_column="Sex")(eclf1, X_test))
print("Coev Var: ", coefficient_of_variation(eclf1, X_test, y_test))

Metrics - Test Data
Acc:  0.6
Eq Opor:  0.9694915254237289
P Perc:  0.8286951144094001
Coev Var:  0.7746269997882853


## Filter dominated models

In [13]:
def dominate(a, b):
    sense = np.array([1, 1, 1, -1])
    if all((sense*a)>=(sense*b)) and any((sense*a)>(sense*b)):
        return True
    else:
        return False

In [14]:
metrics = []

for i in range(len(sols)):
    metrics.append((sols[i].score(X_val, y_val),
                   equal_opportunity_score(sensitive_column="Sex")(sols[i], X_val, y_val),
                   p_percent_score(sensitive_column="Sex")(sols[i], X_val),
                   coefficient_of_variation(sols[i], X_val, y_val)))

  score = np.minimum(p_y1_z1 / p_y1_z0, p_y1_z0 / p_y1_z1)


In [15]:
par_models = []

for i in range(len(sols)):
    flag = True
    for j in range(len(sols)):
        if i != j:
            dom = dominate(metrics[j], metrics[i])
            if dom:
                flag = False
                break
    if flag:
        par_models.append(("Model "+str(i), sols[i]))

In [17]:
best_acc = []
best_eq = []
best_pp = []
best_var = []
best_acc_v = 0
best_eq_v = 0
best_pp_v = 0
best_var_v = 100

for s, model in par_models:
    if model.score(X_val, y_val) > best_acc_v:
        best_acc_v = model.score(X_val, y_val)
        best_acc = [model, model.score(X_val, y_val),
                   equal_opportunity_score(sensitive_column="Sex")(model, X_val, y_val),
                   p_percent_score(sensitive_column="Sex")(model, X_val),
                   coefficient_of_variation(model, X_val, y_val)]
    if equal_opportunity_score(sensitive_column="Sex")(model, X_val, y_val) > best_eq_v:
        best_eq_v = equal_opportunity_score(sensitive_column="Sex")(model, X_val, y_val)
        best_eq = [model, model.score(X_val, y_val),
                   equal_opportunity_score(sensitive_column="Sex")(model, X_val, y_val),
                   p_percent_score(sensitive_column="Sex")(model, X_val),
                   coefficient_of_variation(model, X_val, y_val)]
    if p_percent_score(sensitive_column="Sex")(model, X_val) > best_pp_v:
        best_pp_v = p_percent_score(sensitive_column="Sex")(model, X_val)
        best_pp = [model, model.score(X_val, y_val),
                   equal_opportunity_score(sensitive_column="Sex")(model, X_val, y_val),
                   p_percent_score(sensitive_column="Sex")(model, X_val),
                   coefficient_of_variation(model, X_val, y_val)]
    if coefficient_of_variation(model, X_val, y_val) < best_var_v:
        best_var_v = coefficient_of_variation(model, X_val, y_val) 
        best_var = [model, model.score(X_val, y_val),
                   equal_opportunity_score(sensitive_column="Sex")(model, X_val, y_val),
                   p_percent_score(sensitive_column="Sex")(model, X_val),
                   coefficient_of_variation(model, X_val, y_val)]

  score = np.minimum(p_y1_z1 / p_y1_z0, p_y1_z0 / p_y1_z1)
  score = np.minimum(p_y1_z1 / p_y1_z0, p_y1_z0 / p_y1_z1)
  score = np.minimum(p_y1_z1 / p_y1_z0, p_y1_z0 / p_y1_z1)


In [18]:
best_acc

[LogisticRegression(C=35.46152563066118, max_iter=10000,
                    multi_class='multinomial', tol=4.504504504504493e-09),
 0.63,
 0.8823529411764706,
 0.9264705882352942,
 0.7436788203240532]

In [19]:
best_eq

[LogisticRegression(C=1e-20, max_iter=10000, multi_class='multinomial', tol=0.0),
 0.48,
 1,
 0,
 1.471960144387974]

In [20]:
best_pp

[LogisticRegression(C=1.7970050087560796, max_iter=10000,
                    multi_class='multinomial', tol=4.504504504504502e-09),
 0.58,
 0.7918552036199095,
 0.994261119081779,
 0.8232168307397645]

In [21]:
best_var

[LogisticRegression(C=0.003964351541430085, max_iter=10000,
                    multi_class='multinomial', tol=4.504504504504486e-09),
 0.55,
 0.9714285714285714,
 0.8641251221896383,
 0.6422202325588763]

## Voting Ensemble - Non-Dominated Models

In [22]:
eclf2 = VotingClassifier(estimators=par_models, voting='hard')

In [23]:
eclf2 = eclf1.fit(X_train, y_train)

In [24]:
print("Metrics - Train Data")
print("Acc: ", eclf2.score(X_train, y_train))
print("Eq Opor: ", equal_opportunity_score(sensitive_column="Sex")(eclf2, X_train, y_train))
print("P Perc: ", p_percent_score(sensitive_column="Sex")(eclf2, X_train))
print("Coev Var: ", coefficient_of_variation(eclf2, X_train, y_train))

Metrics - Train Data
Acc:  0.6936936936936937
Eq Opor:  0.9819004524886877
P Perc:  0.8569157392686805
Coev Var:  0.6847719263017362


In [25]:
print("Metrics - Train Data")
print("Acc: ", eclf2.score(X_val, y_val))
print("Eq Opor: ", equal_opportunity_score(sensitive_column="Sex")(eclf2, X_val, y_val))
print("P Perc: ", p_percent_score(sensitive_column="Sex")(eclf2, X_val))
print("Coev Var: ", coefficient_of_variation(eclf2, X_val, y_val))

Metrics - Train Data
Acc:  0.59
Eq Opor:  0.9150326797385621
P Perc:  0.9229797979797979
Coev Var:  0.7655137765938702


In [26]:
print("Metrics - Test Data")
print("Acc: ", eclf2.score(X_test, y_test))
print("Eq Opor: ", equal_opportunity_score(sensitive_column="Sex")(eclf2, X_test, y_test))
print("P Perc: ", p_percent_score(sensitive_column="Sex")(eclf2, X_test))
print("Coev Var: ", coefficient_of_variation(eclf2, X_test, y_test))

Metrics - Test Data
Acc:  0.6
Eq Opor:  0.9694915254237289
P Perc:  0.8286951144094001
Coev Var:  0.7746269997882853
