# Model comparison

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklego.metrics import equal_opportunity_score
from sklego.metrics import p_percent_score
from sklearn.metrics import log_loss, accuracy_score
from sklearn.utils.extmath import squared_norm
from moopt.scalarization_interface import scalar_interface, single_interface, w_interface
from moopt import monise
import numpy as np
import optuna, sklearn, sklearn.datasets
from fair_models import coefficient_of_variation, MOOLogisticRegression, FindCLogisticRegression, FindCCLogisticRegression
from fair_models import calc_reweight
from fair_models import FairScalarization, EqualScalarization



In [2]:
import sys
sys.path.append("./MMFP/")

In [3]:
from MMPF.MinimaxParetoFair.MMPF_trainer import SKLearn_Weighted_LLR, APSTAR

In [4]:
%load_ext autoreload
%autoreload 2
%load_ext line_profiler

## Parte 1: Data treatment

In [5]:
mydata= pd.read_csv("Datasets - Limpos/german.csv")

In [6]:
mydata.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Risk
0,1,22,1,2,3,1,2,5951,48,-1
1,3,45,0,2,1,1,1,7882,42,1
2,4,53,0,2,1,1,1,4870,24,-1
3,7,35,0,3,2,1,2,6948,36,1
4,9,28,0,3,3,1,2,5234,30,-1


In [7]:
mydata = mydata.drop(['Unnamed: 0'], axis=1)

In [8]:
X = mydata.drop(['Risk'], axis=1)

In [9]:
y = mydata['Risk']

In [10]:
# 395, 808
random_seed = 2000#np.random.randint(0, 1000)
random_seed

2000

In [11]:
random_seed2 = 2000#np.random.randint(0, 1000)
random_seed2

2000

In [12]:
X_tv, X_test, y_tv, y_test = train_test_split(X, y, test_size=200, random_state = random_seed)
X_train, X_val, y_train, y_val = train_test_split(X_tv, y_tv, test_size=100, random_state = random_seed2)

In [13]:
sample_weight = calc_reweight(X_train, y_train, 'Sex')

In [14]:
X_train

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration
494,24,0,2,3,1,1,2145,36
44,51,0,2,3,1,2,4771,11
508,57,1,1,3,3,3,1258,24
355,34,0,2,3,4,2,3496,30
22,33,1,3,3,1,3,1474,12
...,...,...,...,...,...,...,...,...
499,45,0,1,3,1,1,4006,28
28,44,0,1,3,1,2,6204,18
468,43,0,2,2,1,1,2625,16
101,52,0,2,3,3,1,338,6


In [15]:
a_train = X_train['Sex']
a_val = X_val['Sex']

a_train[a_train==0] = -1
a_val[a_val==0] = -1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [16]:
C = 10**2
model = SKLearn_Weighted_LLR(X_train.values, y_train.values,
                             a_train.values, X_val.values,
                             y_val.values, a_val.values,
                             C_reg=C)

mua_ini = np.ones(a_val.max() + 1)
mua_ini /= mua_ini.sum()
results = APSTAR(model, mua_ini, niter=200, max_patience=200, Kini=1,
                      Kmin=20, alpha=0.5, verbose=False)
mu_best_list = results['mu_best_list']

mu_best = mu_best_list[-1]
model.weighted_fit(X_train.values, y_train.values, a_train.values, mu_best)

patience counter: 0 total iterations: 201
-----------------------------------------


In [17]:
accuracy_score(y_test, model.predict(X_test)), equal_opportunity_score(sensitive_column="Sex")(model, X_test, y_test), p_percent_score(sensitive_column="Sex")(model, X_test, y_test), coefficient_of_variation(model, X_test, y_test)

(0.635, 0.9215399610136453, 0.9540816326530612, 0.6959756269700474)

In [18]:
mu_best_list

[array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.16666667, 0.83333333]),
 array([0.16666667, 0.83333333]),
 array([0.08333333, 0.91666667]),
 array([0.08333333, 0.91666667]),
 array([0.05, 0.95]),
 array([0.05, 0.95]),
 array([0.03333333, 0.96666667]),
 array([0.03333333, 0.96666667]),
 array([0.02380952, 0.97619048]),
 array([0.02380952, 0.97619048]),
 array([0.01785714, 0.98214286]),
 array([0.01785714, 0.98214286]),
 array([0.01388889, 0.98611111]),
 array([0.01388889, 0.98611111]),
 array([0.01111111, 0.98888889]),
 array([0.01111111, 0.98888889]),
 array([0.00909091, 0.99090909]),
 array([0.00909091, 0.99090909]),
 array([0.00757576, 0.99242424]),
 array([0.00757576, 0.99242424]),
 array([0.00641026, 0.99358974]),
 array([0.00641026, 0.99358974]),
 array([0.00549451, 0.99450549]),
 array([0.00549451, 0.99450549]),
 array([0.0047619, 0.9952381]),
 array([0.0047619, 0.9952381]),
 array([0.00416667, 0.99583333]),
 array([0.00416667, 0.99583333]),
 array([0.00367647, 0.99632353]),
 a

In [19]:
reg_demo = FindCCLogisticRegression(X_train, y_train, X_val, y_val,"Sex", metric="accuracy", 
                                                base_model='minimax').tune()

In [20]:
print(reg_demo.score(X_test, y_test))
print(equal_opportunity_score(sensitive_column=fair_feature)(reg_demo,X_test, y_test))
print(p_percent_score(sensitive_column=fair_feature)(reg_demo, X_test))
print(coefficient_of_variation(reg_demo, X_test, y_test))


AttributeError: 'NoneType' object has no attribute 'score'

In [21]:
reg_demo