In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklego.metrics import equal_opportunity_score
from sklego.metrics import p_percent_score
from sklearn.metrics import log_loss
from sklearn.utils.extmath import squared_norm
from moopt.scalarization_interface import scalar_interface, single_interface, w_interface
from moopt import monise
import numpy as np
import optuna, sklearn, sklearn.datasets
from fair_models import coefficient_of_variation, MOOLogisticRegression, FindCLogisticRegression, FindCCLogisticRegression,FairScalarization

Using Python-MIP package version 1.7.2




In [2]:
import sklearn
from scipy import stats
import math

class SimpleVoting():
    """Voting ensemble over classifiers that are used as given (no fitting).

    Parameters
    ----------
    estimators : sequence of (name, estimator) pairs
        Only the second item of each pair is used. Every estimator must
        provide ``predict`` (hard voting) or ``predict_proba`` (soft voting);
        the first one must also expose ``classes_``.
    voting : str, default 'hard'
        ``'soft'`` averages ``predict_proba`` outputs and picks the most
        probable class; any other value selects majority (mode) voting over
        the ``predict`` outputs.
    """

    def __init__(self, estimators, voting='hard'):
        self.estimators = estimators
        self.voting = voting
        # Class labels are taken from the first estimator; soft voting assumes
        # every estimator orders its probability columns the same way.
        self.classes_ = estimators[0][1].classes_

    def predict(self, X):
        """Return one predicted label per row of ``X`` as a 1-D array."""
        if self.voting == 'soft':
            mean_proba = np.mean([m[1].predict_proba(X) for m in self.estimators], axis=0)
            argmax = np.argmax(mean_proba, axis=1)
            y_pred = np.array([self.classes_[v] for v in argmax])
        else:
            votes = np.asarray([m[1].predict(X) for m in self.estimators])
            # stats.mode returns shape (1, n_samples) on older SciPy and
            # (n_samples,) on SciPy >= 1.11; flattening handles both, whereas
            # indexing with [0][0] collapses the newer result to one scalar.
            y_pred = np.asarray(stats.mode(votes, axis=0)[0]).reshape(-1)

        return y_pred

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against the labels ``y``."""
        y_pred = self.predict(X)
        return sklearn.metrics.accuracy_score(y, y_pred)

## Data

In [3]:
# Load the German credit data from a path relative to the notebook's working directory.
mydata= pd.read_csv("Datasets/german_credit_data.csv")

In [4]:
# Remove the 'Unnamed: 0' column (presumably the saved row index) and the
# 'Purpose' feature, then discard every row that still has a missing value.
mydata = mydata.drop(columns=['Unnamed: 0', 'Purpose']).dropna()

# Integer encodings for the categorical columns and for the target.
mapping_Sex = {'male': 0, 'female': 1}
mapping_Housing = {'free': 1, 'rent': 2, 'own': 3}
mapping_Savings = {'little': 1, 'moderate': 2, 'quite rich': 3, 'rich': 4}
mapping_Checking = {'little': 1, 'moderate': 2, 'rich': 3}
mapping_Risk = {"bad": -1, "good": 1}

column_encodings = {
    'Sex': mapping_Sex,
    'Housing': mapping_Housing,
    'Saving accounts': mapping_Savings,
    'Checking account': mapping_Checking,
    'Risk': mapping_Risk,
}
numerical_data = mydata.replace(column_encodings)

# Target is the encoded 'Risk' column; every other column is a feature.
y = numerical_data['Risk']
X = numerical_data.drop(columns=['Risk'])

# Fixed seeds keep both splits reproducible.
random_seed = 2000
random_seed2 = 2000

# First hold out 200 rows for testing, then 100 of the remainder for validation.
X_tv, X_test, y_tv, y_test = train_test_split(
    X, y, test_size=200, random_state=random_seed)
X_train, X_val, y_train, y_val = train_test_split(
    X_tv, y_tv, test_size=100, random_state=random_seed2)

## Model List

In [5]:
# Two separate scalarization objects are built from the training split with
# 'Sex' as the sensitive attribute: one for the weighted problem, one for the
# single-objective problem.
weighted_problem = FairScalarization(X_train, y_train, 'Sex')
single_problem = FairScalarization(X_train, y_train, 'Sex')

moo = monise(
    weightedScalar=weighted_problem,
    singleScalar=single_problem,
    nodeTimeLimit=2,
    targetSize=150,
    targetGap=0,
    nodeGap=0.01,
    norm=False,
)

moo.optimize()

# Keep the `x` attribute of every solution; later cells use these objects as classifiers.
sols = [solution.x for solution in moo.solutionsList]

Using license file /home/marcos/gurobi.lic
Academic license - for non-commercial use only


## Voting Ensemble - All Models

In [6]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [7]:
# Pair every optimizer solution with a positional name: ("Model <i>", model).
models_t = [(f"Model {idx}", model) for idx, model in enumerate(sols)]

In [8]:
# Soft-voting ensemble over every model returned by the optimizer (no filtering).
eclf1 = SimpleVoting(estimators=models_t, voting='soft')

In [9]:
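# Intentionally disabled: SimpleVoting defines no fit() method; it only combines the models it is given.
#eclf1 = eclf1.fit(X_val, y_val)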

In [10]:
# Accuracy and fairness metrics of the all-models ensemble on the training split.
# Each value is computed lazily so evaluation and printing stay interleaved.
train_report = (
    ("Acc: ", lambda: eclf1.score(X_train, y_train)),
    ("Eq Opor: ", lambda: equal_opportunity_score(sensitive_column="Sex")(eclf1, X_train, y_train)),
    ("P Perc: ", lambda: p_percent_score(sensitive_column="Sex")(eclf1, X_train)),
    ("Coev Var: ", lambda: coefficient_of_variation(eclf1, X_train, y_train)),
)
print("Metrics - Train Data")
for metric_label, compute_metric in train_report:
    print(metric_label, compute_metric())

Metrics - Train Data
Acc:  0.6531531531531531
Eq Opor:  0.9589490968801313
P Perc:  0.9692961738308926
Coev Var:  0.6761773706108338


In [11]:
# Accuracy and fairness metrics of the all-models ensemble on the validation split.
# Each value is computed lazily so evaluation and printing stay interleaved.
val_report = (
    ("Acc: ", lambda: eclf1.score(X_val, y_val)),
    ("Eq Opor: ", lambda: equal_opportunity_score(sensitive_column="Sex")(eclf1, X_val, y_val)),
    ("P Perc: ", lambda: p_percent_score(sensitive_column="Sex")(eclf1, X_val)),
    ("Coev Var: ", lambda: coefficient_of_variation(eclf1, X_val, y_val)),
)
print("Metrics - Val Data")
for metric_label, compute_metric in val_report:
    print(metric_label, compute_metric())

Metrics - Val Data
Acc:  0.68
Eq Opor:  0.9890109890109889
P Perc:  0.9629629629629629
Coev Var:  0.6614940032155477


In [12]:
# Accuracy and fairness metrics of the all-models ensemble on the held-out test split.
# Each value is computed lazily so evaluation and printing stay interleaved.
test_report = (
    ("Acc: ", lambda: eclf1.score(X_test, y_test)),
    ("Eq Opor: ", lambda: equal_opportunity_score(sensitive_column="Sex")(eclf1, X_test, y_test)),
    ("P Perc: ", lambda: p_percent_score(sensitive_column="Sex")(eclf1, X_test)),
    ("Coev Var: ", lambda: coefficient_of_variation(eclf1, X_test, y_test)),
)
print("Metrics - Test Data")
for metric_label, compute_metric in test_report:
    print(metric_label, compute_metric())

Metrics - Test Data
Acc:  0.61
Eq Opor:  0.995967741935484
P Perc:  0.9309523809523809
Coev Var:  0.6775074858941582


## Filter dominated models

In [13]:
def dominate(a, b):
    """Return True when metric vector ``a`` Pareto-dominates ``b``.

    Both arguments hold four values. The first three are treated as
    "higher is better" and the fourth as "lower is better". ``a`` dominates
    ``b`` when it is at least as good on every entry and strictly better on
    at least one.
    """
    # Flip the sign of the last entry so every comparison is a maximization.
    direction = np.array([1, 1, 1, -1])
    lhs = direction * a
    rhs = direction * b
    return bool(np.all(lhs >= rhs) and np.any(lhs > rhs))

In [14]:
# Validation metrics for every optimizer solution, one tuple per model:
# (accuracy, equal opportunity, p-percent, coefficient of variation).
eq_opportunity = equal_opportunity_score(sensitive_column="Sex")
p_percent = p_percent_score(sensitive_column="Sex")

metrics = [
    (
        model.score(X_val, y_val),
        eq_opportunity(model, X_val, y_val),
        p_percent(model, X_val),
        coefficient_of_variation(model, X_val, y_val),
    )
    for model in sols
]

  score = np.minimum(p_y1_z1 / p_y1_z0, p_y1_z0 / p_y1_z1)


## Finding non-dominated models

In [15]:
# Keep only the models whose validation metrics are not dominated by any other model.
par_models = []
selected_rows = []
n_models = len(sols)
for i in range(n_models):
    # any() stops at the first dominating model, like the explicit break it replaces.
    is_dominated = any(
        dominate(metrics[j], metrics[i]) for j in range(n_models) if j != i
    )
    if not is_dominated:
        selected_rows.append(metrics[i])
        par_models.append((f"Model {i}", sols[i]))

# Row k of this frame describes par_models[k].
metrics_selected = pd.DataFrame(selected_rows, columns=['Acc', 'Eq Opor', 'P Perc', 'Coev Var'])

## Removing models with too low performance

In [16]:
def percentile(data, percentile):
    """Return the nearest-rank percentile of ``data``.

    Parameters
    ----------
    data : sized iterable of comparable values
    percentile : number in the closed range [0, 100]

    Returns
    -------
    The element of ``data`` at rank ``ceil(len(data) * percentile / 100)``
    (1-based) in sorted order. A computed rank of 0 maps to the smallest
    element.

    Raises
    ------
    ValueError
        If ``percentile`` lies outside [0, 100].
    IndexError
        If ``data`` is empty.
    """
    if not 0 <= percentile <= 100:
        raise ValueError("percentile must be between 0 and 100")
    ordered = sorted(data)
    rank = int(math.ceil((len(ordered) * percentile) / 100))
    # Clamp to rank 1: an unclamped rank of 0 would index [-1] and return the maximum.
    return ordered[max(rank, 1) - 1]

# Discard non-dominated models that fall in the worst tail of any metric:
# below the 10th percentile for the three "higher is better" columns, or
# above the 90th percentile for the last "lower is better" column.
keep_mask = np.ones(len(metrics_selected), dtype=bool)
for column, direction in zip(metrics_selected.columns, (1, 1, 1, -1)):
    column_values = metrics_selected[column]
    if direction > 0:
        threshold = percentile(column_values, 10)
        keep_mask &= (column_values >= threshold).values
    else:
        threshold = percentile(column_values, 90)
        keep_mask &= (column_values <= threshold).values

# Positions of the surviving rows; row k of metrics_selected matches par_models[k].
all_idx = {position for position, keep in enumerate(keep_mask) if keep}

par_models_clean = [model for idx, model in enumerate(par_models) if idx in all_idx]

In [17]:
# For each metric, pick the filtered model that is best on the validation split.
# Every best_* list is [model, accuracy, equal opportunity, p-percent,
# coefficient of variation] and stays empty when no model is selected.
best_acc = []
best_eq = []
best_pp = []
best_var = []
# Infinite sentinels so no legitimate value (an all-zero score, or a
# coefficient of variation of 100 or more) is excluded by the starting threshold.
best_acc_v = float("-inf")
best_eq_v = float("-inf")
best_pp_v = float("-inf")
best_var_v = float("inf")

# Build the scorers once instead of on every comparison.
eq_opp_scorer = equal_opportunity_score(sensitive_column="Sex")
p_pct_scorer = p_percent_score(sensitive_column="Sex")

for _label, model in par_models_clean:
    # Evaluate each metric exactly once per model.
    acc = model.score(X_val, y_val)
    eq_opp = eq_opp_scorer(model, X_val, y_val)
    p_pct = p_pct_scorer(model, X_val)
    coef_var = coefficient_of_variation(model, X_val, y_val)
    summary = [model, acc, eq_opp, p_pct, coef_var]

    # Strict comparisons keep the first model encountered on ties.
    if acc > best_acc_v:
        best_acc_v = acc
        best_acc = list(summary)
    if eq_opp > best_eq_v:
        best_eq_v = eq_opp
        best_eq = list(summary)
    if p_pct > best_pp_v:
        best_pp_v = p_pct
        best_pp = list(summary)
    if coef_var < best_var_v:
        best_var_v = coef_var
        best_var = list(summary)

In [18]:
# Model with the highest validation accuracy, followed by its validation accuracy,
# equal-opportunity score, p-percent score and coefficient of variation.
best_acc

[LogisticRegression(C=3.9222543584631597, class_weight=None, dual=False,
                    fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                    max_iter=10000, multi_class='multinomial', n_jobs=None,
                    penalty='l2', random_state=None, solver='lbfgs',
                    tol=4.504504504504503e-09, verbose=0, warm_start=False),
 0.66,
 0.9890109890109889,
 0.936868686868687,
 0.6647007305466981]

In [19]:
# Model with the highest validation equal-opportunity score, followed by its validation
# accuracy, equal-opportunity score, p-percent score and coefficient of variation.
best_eq

[LogisticRegression(C=3.9222543584631597, class_weight=None, dual=False,
                    fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                    max_iter=10000, multi_class='multinomial', n_jobs=None,
                    penalty='l2', random_state=None, solver='lbfgs',
                    tol=4.504504504504503e-09, verbose=0, warm_start=False),
 0.66,
 0.9890109890109889,
 0.936868686868687,
 0.6647007305466981]

In [20]:
# Model with the highest validation p-percent score, followed by its validation
# accuracy, equal-opportunity score, p-percent score and coefficient of variation.
best_pp

[LogisticRegression(C=7.0061383517914555, class_weight=None, dual=False,
                    fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                    max_iter=10000, multi_class='multinomial', n_jobs=None,
                    penalty='l2', random_state=None, solver='lbfgs',
                    tol=4.504504504504505e-09, verbose=0, warm_start=False),
 0.65,
 0.96,
 0.989010989010989,
 0.7037571328799413]

In [21]:
# Model with the lowest validation coefficient of variation, followed by its validation
# accuracy, equal-opportunity score, p-percent score and coefficient of variation.
best_var

[LogisticRegression(C=0.004603800144057659, class_weight=None, dual=False,
                    fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                    max_iter=10000, multi_class='multinomial', n_jobs=None,
                    penalty='l2', random_state=None, solver='lbfgs',
                    tol=4.5045045045045035e-09, verbose=0, warm_start=False),
 0.66,
 0.9333333333333333,
 0.9494505494505494,
 0.5222329678670933]

## Voting Ensemble - Non-Dominated Models

In [22]:
# Soft-voting ensemble restricted to the non-dominated models that survived the percentile filter.
eclf2 = SimpleVoting(estimators=par_models_clean, voting='soft')

In [23]:
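# Intentionally disabled: SimpleVoting defines no fit() method; it only combines the models it is given.
#eclf2 = eclf2.fit(X_val, y_val)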

In [24]:
# Accuracy and fairness metrics of the filtered ensemble on the training split.
# Each value is computed lazily so evaluation and printing stay interleaved.
train_report_filtered = (
    ("Acc: ", lambda: eclf2.score(X_train, y_train)),
    ("Eq Opor: ", lambda: equal_opportunity_score(sensitive_column="Sex")(eclf2, X_train, y_train)),
    ("P Perc: ", lambda: p_percent_score(sensitive_column="Sex")(eclf2, X_train)),
    ("Coev Var: ", lambda: coefficient_of_variation(eclf2, X_train, y_train)),
)
print("Metrics - Train Data")
for metric_label, compute_metric in train_report_filtered:
    print(metric_label, compute_metric())

Metrics - Train Data
Acc:  0.6531531531531531
Eq Opor:  0.9323116219667943
P Perc:  0.961690271650801
Coev Var:  0.6675792642454637


In [25]:
# Accuracy and fairness metrics of the filtered ensemble on the validation split.
# Each value is computed lazily so evaluation and printing stay interleaved.
val_report_filtered = (
    ("Acc: ", lambda: eclf2.score(X_val, y_val)),
    ("Eq Opor: ", lambda: equal_opportunity_score(sensitive_column="Sex")(eclf2, X_val, y_val)),
    ("P Perc: ", lambda: p_percent_score(sensitive_column="Sex")(eclf2, X_val)),
    ("Coev Var: ", lambda: coefficient_of_variation(eclf2, X_val, y_val)),
)
print("Metrics - Validation Data")
for metric_label, compute_metric in val_report_filtered:
    print(metric_label, compute_metric())

Metrics - Validation Data
Acc:  0.66
Eq Opor:  0.9890109890109889
P Perc:  0.8792270531400966
Coev Var:  0.6647007305466981


In [26]:
# Accuracy and fairness metrics of the filtered ensemble on the held-out test split.
# Each value is computed lazily so evaluation and printing stay interleaved.
test_report_filtered = (
    ("Acc: ", lambda: eclf2.score(X_test, y_test)),
    ("Eq Opor: ", lambda: equal_opportunity_score(sensitive_column="Sex")(eclf2, X_test, y_test)),
    ("P Perc: ", lambda: p_percent_score(sensitive_column="Sex")(eclf2, X_test)),
    ("Coev Var: ", lambda: coefficient_of_variation(eclf2, X_test, y_test)),
)
print("Metrics - Test Data")
for metric_label, compute_metric in test_report_filtered:
    print(metric_label, compute_metric())

Metrics - Test Data
Acc:  0.625
Eq Opor:  0.9323308270676691
P Perc:  0.9807692307692308
Coev Var:  0.6575103548402857
