# Model comparison

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklego.metrics import equal_opportunity_score
from sklego.metrics import p_percent_score
from sklearn.metrics import log_loss
from sklearn.utils.extmath import squared_norm
from moopt.scalarization_interface import scalar_interface, single_interface, w_interface
from moopt import monise
import numpy as np
import optuna, sklearn, sklearn.datasets
from fair_models import coefficient_of_variation, MOOLogisticRegression, FindCLogisticRegression, FindCCLogisticRegression

In [14]:
# Development helpers for iterating on the local `fair_models` module.
%load_ext autoreload
# Mode 2: re-import all loaded modules before each cell execution, so edits to
# local .py files are picked up without restarting the kernel.
%autoreload 2
# Registers the line-profiler magics (e.g. %lprun).
# NOTE(review): no profiling magic is invoked in the visible cells — confirm this is still needed.
%load_ext line_profiler

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


## Parte 1: Tratamento dos dados

In [15]:
# Raw German credit data, read from a path relative to the notebook's working directory.
mydata = pd.read_csv("Datasets/german_credit_data.csv")

Dados de pedidos de crédito. É um dos datasets mais utilizados em tutoriais de Fairness, como os da biblioteca [`aif360`](https://github.com/IBM/AIF360/blob/master/examples/README.md). O dataset original está disponível [aqui](https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)), mas eu utilizei [esta versão](https://www.kaggle.com/kabure/german-credit-data-with-risk), por estar em formato CSV com os cabeçalhos, embora omita informações do dataset original.

O dataset possui originalmente 1000 registros, mas vários deles contêm valores NaN; após a remoção dos registros com informações faltantes, restam 522.

In [16]:
# Preview the first five rows of the raw frame.
mydata.head(n=5)

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [17]:
# Discard the leftover CSV index column and the 'Purpose' column.
mydata = mydata.drop(columns=['Unnamed: 0', 'Purpose'])

In [18]:
# Keep only rows in which every remaining column has a value.
mydata = mydata.dropna(how='any')

In [19]:
# Integer encodings for the categorical columns. 'Risk' is encoded as -1/+1.
mapping_Sex = {'male': 0, 'female': 1}
mapping_Housing = {'free': 1, 'rent': 2, 'own': 3}
mapping_Savings = {'little': 1, 'moderate': 2, 'quite rich': 3, 'rich': 4}
mapping_Checking = {'little': 1, 'moderate': 2, 'rich': 3}
mapping_Risk = {"bad": -1, "good": 1}

column_mappings = {'Sex': mapping_Sex, 'Housing': mapping_Housing, 'Saving accounts': mapping_Savings,
                   'Checking account': mapping_Checking, 'Risk': mapping_Risk}

# Encode column by column with `.map` rather than `DataFrame.replace`: replacing
# strings by ints via `replace` depends on implicit object-dtype downcasting
# (deprecated in pandas 2.2) and silently leaves unknown categories as strings.
numerical_data = mydata.copy()
for column_name, column_mapping in column_mappings.items():
    encoded_column = mydata[column_name].map(column_mapping)
    # A value that was present but has no code would otherwise become NaN unnoticed.
    unknown_values = mydata.loc[encoded_column.isna() & mydata[column_name].notna(), column_name]
    if not unknown_values.empty:
        raise ValueError(
            "Column %r contains categories without an encoding: %r"
            % (column_name, unknown_values.unique().tolist())
        )
    # Missing inputs stay missing (float column); fully mapped columns become integers.
    if not encoded_column.isna().any():
        encoded_column = encoded_column.astype('int64')
    numerical_data[column_name] = encoded_column

In [20]:
# Feature matrix: every encoded column except the target.
X = numerical_data.drop(columns=['Risk'])

In [21]:
# Target vector: the encoded 'Risk' column.
y = numerical_data.loc[:, 'Risk']

In [46]:
# Hold out 100 rows as the test split.
X_tv, X_test, y_tv, y_test = train_test_split(X, y, test_size=100, random_state=90)
# Take 100 of the remaining rows as the validation split. This split is seeded as
# well; without `random_state` it (and every result below) changes on each run.
X_train, X_val, y_train, y_val = train_test_split(X_tv, y_tv, test_size=100, random_state=90)

In [47]:
def calc_reweight(X, y):
    """Compute one reweighing weight per sample from its ``Sex`` group and label.

    A sample in group ``s`` with label ``c`` gets the weight

        w(s, c) = (N_s * N_c) / (N * N_sc)

    where ``N`` is the number of samples, ``N_s`` the size of the group,
    ``N_c`` the number of samples with that label and ``N_sc`` the number of
    samples having both. Only (group, label) combinations that actually occur
    are evaluated, so a split that lacks one group or one combination neither
    raises nor divides by zero.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; only its ``'Sex'`` column is read.
    y : array-like
        Labels, matched with the rows of ``X`` by position.

    Returns
    -------
    list of float
        One weight per row of ``X``, in row order.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not have the same number of entries.
    """
    groups = np.asarray(X['Sex']).ravel()
    labels = np.asarray(y).ravel()

    n_samples = len(groups)
    if len(labels) != n_samples:
        raise ValueError(
            "X and y must have the same length, got %d and %d" % (n_samples, len(labels))
        )
    if n_samples == 0:
        return []

    # Map every sample to the integer index of its group / label value.
    _, group_idx = np.unique(groups, return_inverse=True)
    _, label_idx = np.unique(labels, return_inverse=True)

    group_count = np.bincount(group_idx)
    label_count = np.bincount(label_idx)

    # Flatten (group, label) into a single cell id so one bincount counts the joint cells.
    cell_idx = group_idx * len(label_count) + label_idx
    cell_count = np.bincount(cell_idx)

    # cell_count[cell_idx] is >= 1 for every row, so the division is always defined.
    weights = (group_count[group_idx] * label_count[label_idx]) / (n_samples * cell_count[cell_idx])
    return weights.tolist()

In [48]:
# Per-sample weights for the training split, later passed to the reweighted model.
sample_weight = calc_reweight(X=X_train, y=y_train)

## Parte 2: Comparando os modelos

In [49]:
def report_fairness_metrics(label, model, X_eval, y_eval, sensitive_column="Sex"):
    """Print accuracy and the three fairness scores of ``model`` on one split.

    Parameters
    ----------
    label : str
        Short model name printed as the section header.
    model : estimator
        Fitted estimator exposing ``score``; it is also passed to the scorers.
    X_eval, y_eval
        Features and labels of the evaluation split.
    sensitive_column : str
        Name of the column of ``X_eval`` handed to the sklego scorers.
    """
    print(' - %s: ' % label)
    print('    - accuracy = ', model.score(X_eval, y_eval))
    print('    - equal_opportunity_score = ',
          equal_opportunity_score(sensitive_column=sensitive_column)(model, X_eval, y_eval))
    print('    - p_percent_score = ',
          p_percent_score(sensitive_column=sensitive_column)(model, X_eval))
    print('    - coefficient_of_variation = ', coefficient_of_variation(model, X_eval, y_eval))


# NOTE(review): the recorded output of this cell is a PulpSolverError; presumably
# it is raised inside one of the `.tune()` calls — confirm that the solver used by
# the optimisation backend is installed and runnable.
metrics = ['accuracy', 'equal_opportunity', 'p_percent', 'c_variation']
for metric in metrics:
    # Each model is tuned on the train/validation splits for the current selection metric.
    reg_log = FindCLogisticRegression(X_train, y_train, X_val, y_val, metric=metric).tune()
    reg_equal = FindCCLogisticRegression(X_train, y_train, X_val, y_val, metric=metric,
                                         base_model='equal').tune()
    reg_demo = FindCCLogisticRegression(X_train, y_train, X_val, y_val, metric=metric,
                                        base_model='demographic').tune()
    reg_rewe = FindCLogisticRegression(X_train, y_train, X_val, y_val, metric=metric,
                                       sample_weight=sample_weight).tune()
    reg_moo = MOOLogisticRegression(X_train, y_train, X_val, y_val, metric=metric).tune()

    print("------------------------------------")

    print('Metric: ', metric)

    # Every model is scored on the same held-out test split. The validation split
    # was already used for tuning, and mixing splits between models (or between
    # metrics of one model) makes the printed numbers incomparable.
    tuned_models = [
        ('Log', reg_log),
        ('Moo', reg_moo),
        ('Rewe', reg_rewe),
        ('Demo', reg_demo),
        ('Equal', reg_equal),
    ]
    for label, model in tuned_models:
        report_fairness_metrics(label, model, X_test, y_test)
        print()

PulpSolverError: Pulp: Error while trying to execute, use msg=True for more detailscbc