# Model comparison

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklego.metrics import equal_opportunity_score
from sklego.metrics import p_percent_score
from sklearn.metrics import log_loss, accuracy_score
from sklearn.utils.extmath import squared_norm
from moopt.scalarization_interface import scalar_interface, single_interface, w_interface
from moopt import monise
import numpy as np
import optuna, sklearn, sklearn.datasets
from fair_models import coefficient_of_variation, MOOLogisticRegression, FindCLogisticRegression, FindCCLogisticRegression
from fair_models import calc_reweight
from fair_models import FairScalarization, EqualScalarization

Using Python-MIP package version 1.7.2




In [2]:
import sys
sys.path.append("./MMFP/")

In [3]:
from MMPF.MinimaxParetoFair.MMPF_trainer import SKLearn_Weighted_LLR, APSTAR

In [4]:
%load_ext autoreload
%autoreload 2
%load_ext line_profiler

## Parte 1: Data treatment

In [5]:
mydata= pd.read_csv("Datasets/german_credit_data.csv")

Dados de pedidos de crédito. É um dos datasets mais utilizados para tutoriais em Fairness, como na biblioteca [$aif360$](https://github.com/IBM/AIF360/blob/master/examples/README.md). Dataset original disponível em [aqui](https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)), mas eu utilizei [este](https://www.kaggle.com/kabure/german-credit-data-with-risk), por estar em formato csv com os headers, embora omita informações do dataset original.

Originalmente possui 1000 dados, mas com vários valores NaN, ficando com 522 dados após remoção de dados com informações faltantes.

In [6]:
mydata.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [7]:
mydata = mydata.drop(['Unnamed: 0', 'Purpose'], axis=1)

In [8]:
mydata = mydata.dropna()

In [9]:
mapping_Sex = {'male': 0, 'female': 1}
mapping_Housing = {'free': 1, 'rent': 2, 'own': 3}
mapping_Savings = {'little': 1, 'moderate': 2, 'quite rich': 3, 'rich': 4}
mapping_Checking = {'little': 1, 'moderate': 2, 'rich': 3}
mapping_Risk = {"bad": -1, "good": 1}

numerical_data = mydata.replace({'Sex': mapping_Sex, 'Housing': mapping_Housing, 'Saving accounts': mapping_Savings,
                'Checking account':mapping_Checking, 'Risk': mapping_Risk})

In [10]:
X = numerical_data.drop(['Risk'], axis=1)

In [11]:
y = numerical_data['Risk']

In [12]:
# 395, 808
random_seed = 2000#np.random.randint(0, 1000)
random_seed

2000

In [13]:
random_seed2 = 2000#np.random.randint(0, 1000)
random_seed2

2000

In [14]:
X_tv, X_test, y_tv, y_test = train_test_split(X, y, test_size=200, random_state = random_seed)
X_train, X_val, y_train, y_val = train_test_split(X_tv, y_tv, test_size=100, random_state = random_seed2)

In [15]:
sample_weight = calc_reweight(X_train, y_train)

In [16]:
a_train = X_train['Sex']
a_val = X_val['Sex']

a_train[a_train==0] = -1
a_val[a_val==0] = -1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [42]:
C = 10**2
model = SKLearn_Weighted_LLR(X_train.values, y_train.values,
                             a_train.values, X_val.values,
                             y_val.values, a_val.values,
                             C_reg=C)

mua_ini = np.ones(a_val.max() + 1)
mua_ini /= mua_ini.sum()
results =APSTAR(model, mua_ini, niter=200, max_patience=200, Kini=1,
                      Kmin=20, alpha=0.5, verbose=False)
mu_best_list = results['mu_best_list']

mu_best = mu_best_list[-1]
model.weighted_fit(X_train.values, y_train.values, a_train.values, mu_best)

patience counter: 0 total iterations: 201
-----------------------------------------


In [43]:
accuracy_score(y_test, model.predict(X_test)), equal_opportunity_score(sensitive_column="Sex")(model, X_test, y_test), p_percent_score(sensitive_column="Sex")(model, X_test, y_test), coefficient_of_variation(model, X_test, y_test)

(0.635, 0.9215399610136453, 0.9540816326530612, 0.6959756269700479)

In [41]:
mu_best_list

[array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.16666667, 0.83333333]),
 array([0.16666667, 0.83333333]),
 array([0.08333333, 0.91666667]),
 array([0.08333333, 0.91666667]),
 array([0.05, 0.95]),
 array([0.05, 0.95]),
 array([0.03333333, 0.96666667]),
 array([0.03333333, 0.96666667]),
 array([0.02380952, 0.97619048]),
 array([0.02380952, 0.97619048]),
 array([0.01785714, 0.98214286]),
 array([0.01785714, 0.98214286]),
 array([0.01388889, 0.98611111]),
 array([0.01388889, 0.98611111]),
 array([0.01111111, 0.98888889]),
 array([0.01111111, 0.98888889]),
 array([0.00909091, 0.99090909]),
 array([0.00909091, 0.99090909]),
 array([0.00757576, 0.99242424]),
 array([0.00757576, 0.99242424]),
 array([0.00641026, 0.99358974]),
 array([0.00641026, 0.99358974]),
 array([0.00549451, 0.99450549]),
 array([0.00549451, 0.99450549]),
 array([0.0047619, 0.9952381]),
 array([0.0047619, 0.9952381]),
 array([0.00416667, 0.99583333]),
 array([0.00416667, 0.99583333]),
 array([0.00367647, 0.99632353]),
 a