In [1]:
import os

# Move to the parent directory only while the data folder is not yet visible
# from the working directory. The original unconditional os.chdir("..")
# climbed one more level each time this cell was re-run, which broke the
# relative paths used below.
# NOTE(review): assumes "ratio_analysis_plots" sits in the directory the
# notebook is meant to run from (it is read from right after this) - confirm.
if not os.path.isdir("ratio_analysis_plots"):
    os.chdir("..")
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
data = pd.read_excel(os.path.join("ratio_analysis_plots", "d_processed.xlsx"))

In [3]:
data

Unnamed: 0,dataset,ml_name,method,lambda,k,Avg.e^2
0,adult,KNN,mean_v1,14.569371,1.933125e-06,0.424207
1,adult,KNN,mean_v2,-3.832102,3.892365e+00,0.332409
2,adult,KNN,similar_v1,6.629730,1.111963e-03,0.147048
3,adult,KNN,similar_v2,5.332352,2.483361e-03,0.241792
4,adult,KNN,multi_v1,5.095211,3.511095e-03,0.090114
...,...,...,...,...,...,...
211,bank,MLP,mean_v2,-8.510056,7.599501e+01,0.556829
212,bank,MLP,similar_v1,-0.694158,6.901930e-02,0.452284
213,bank,MLP,similar_v2,-7.076779,1.754112e+01,0.527676
214,bank,MLP,multi_v1,18.064996,4.123700e-09,0.623496


### Analysis of Output  
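The dataset is the dominant factor for both the exponential curve fit and the curve steepness: the group medians below vary far more across datasets than across ML models or imputation methods.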

#### Groupby ML

In [4]:
# Median `Avg.e^2` per ML model, ascending.
data.groupby("ml_name")[["Avg.e^2"]].median().sort_values("Avg.e^2")

Unnamed: 0_level_0,Avg.e^2
ml_name,Unnamed: 1_level_1
LinearSVC,0.325652
LogReg,0.325905
KNN,0.387774
MLP,0.403259
Tree,0.417961
Forest,0.423837


In [5]:
# Median `lambda` per ML model, ascending.
data.groupby("ml_name")[["lambda"]].median().sort_values("lambda")

Unnamed: 0_level_0,lambda
ml_name,Unnamed: 1_level_1
KNN,0.35397
LogReg,0.384772
LinearSVC,0.600193
MLP,1.727437
Tree,1.897911
Forest,3.016339


#### Groupby Dataset

In [6]:
# Median `Avg.e^2` per dataset, ascending.
data.groupby("dataset")[["Avg.e^2"]].median().sort_values("Avg.e^2")

Unnamed: 0_level_0,Avg.e^2
dataset,Unnamed: 1_level_1
compas,0.148974
adult,0.247918
communities,0.340193
titanic,0.448134
german,0.481169
bank,0.515395


In [7]:
# Median `lambda` per dataset, ascending.
data.groupby("dataset")[["lambda"]].median().sort_values("lambda")

Unnamed: 0_level_0,lambda
dataset,Unnamed: 1_level_1
titanic,-1.184967
bank,-0.785825
german,-0.611679
communities,2.770809
adult,5.71916
compas,6.417708


#### Groupby Imputation Method

In [8]:
# Median `Avg.e^2` per imputation method, ascending.
data.groupby("method")[["Avg.e^2"]].median().sort_values("Avg.e^2")

Unnamed: 0_level_0,Avg.e^2
method,Unnamed: 1_level_1
similar_v1,0.273038
multi_v1,0.280543
multi_v2,0.370827
mean_v1,0.406623
similar_v2,0.421221
mean_v2,0.470482


In [9]:
# Median `lambda` per imputation method, ascending.
data.groupby("method")[["lambda"]].median().sort_values("lambda")

Unnamed: 0_level_0,lambda
method,Unnamed: 1_level_1
mean_v2,-0.593358
similar_v2,0.539437
similar_v1,2.004557
multi_v1,2.12135
mean_v1,2.244799
multi_v2,2.481914


### Analysis of Datasets  
1. Correlation with the protected attribute can affect the initial bias, but it is not related to curve steepness.

In [10]:
from utils.data import create_adult_dataset, create_bank_dataset
from utils.data import create_communities_dataset, create_compas_dataset
from utils.data import create_german_dataset, create_titanic_dataset
from sklearn.preprocessing import LabelEncoder

In [11]:
def drop_na(data):
    """Return a copy of ``data`` without the rows that contain missing values.

    Features and target are joined side by side first, so a row is removed
    from both ``X`` and ``y`` whenever any feature value or the target is NaN.

    Parameters
    ----------
    data : dataset object
        Must provide ``copy()``, a DataFrame attribute ``X`` and an
        array-like attribute ``y`` with one label per row of ``X``.

    Returns
    -------
    The copied dataset, with ``X`` re-indexed 0..n-1 and ``y`` as a
    1-D NumPy array holding the labels of the surviving rows.
    """
    data = data.copy()
    # Build the target on X's own index: pd.concat(axis=1) aligns on index
    # labels, so a default RangeIndex on the target would pair labels with
    # the wrong rows (or with none) whenever X is not indexed 0..n-1.
    target = pd.DataFrame(data.y, columns=["_TARGET_"], index=data.X.index)
    complete = pd.concat([data.X, target], axis=1).dropna().reset_index(drop=True)
    data.X = complete.drop(columns=["_TARGET_"])
    data.y = complete["_TARGET_"].to_numpy().ravel()
    return data

In [12]:
def convert_protected(data):
    """Label-encode every protected feature of ``data`` as integer codes.

    Parameters
    ----------
    data : dataset object
        Must provide ``copy()``, a DataFrame attribute ``X`` and a
        ``protected_features`` list of column names present in ``X``.

    Returns
    -------
    (dataset, encoder)
        The copied dataset with the protected columns replaced by integer
        codes, and the ``LabelEncoder`` used. The same encoder is refitted
        for each feature, so its ``classes_`` describe only the last entry
        of ``protected_features``; if that list is empty the encoder is
        returned unfitted.
    """
    data = data.copy()
    # Detach X before writing columns into it. Whether data.copy() already
    # duplicates the frame is not visible here; without this, a shallow copy
    # would let the assignment below alter the caller's DataFrame.
    data.X = data.X.copy()
    encoder = LabelEncoder()
    for feature in data.protected_features:
        data.X[feature] = encoder.fit_transform(data.X[feature])
    return data, encoder

In [13]:
def concat(data):
    """Return one DataFrame with the features of ``data`` plus a ``_TARGET_`` column.

    Parameters
    ----------
    data : dataset object
        Must provide a DataFrame attribute ``X`` and an array-like attribute
        ``y`` with one label per row of ``X``.

    Returns
    -------
    pandas.DataFrame
        A new frame carrying ``X``'s index and columns, with the labels
        appended as the last column ``_TARGET_``. ``data`` is not modified.
    """
    # The target is built on X's index because pd.concat(axis=1) aligns on
    # index labels; a default RangeIndex would mis-join the labels whenever
    # X is not indexed 0..n-1. No copy of `data` is taken: nothing is written
    # back to it.
    target = pd.DataFrame(data.y, columns=["_TARGET_"], index=data.X.index)
    return pd.concat([data.X, target], axis=1)

### Adult Dataset

In [14]:
# Adult: load, drop rows with missing values, label-encode the protected feature(s).
data_adult, encoder = convert_protected(drop_na(create_adult_dataset()))
# Features and target in one frame for the summary cells that follow.
data_a = concat(data_adult)
data_adult.protected_features

['sex']

In [15]:
encoder.classes_

array([' Female', ' Male'], dtype=object)

In [16]:
data_a.describe()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,hours-per-week,_TARGET_
count,30718.0,30718.0,30718.0,30718.0,30718.0,30718.0,30718.0,30718.0,30718.0,30718.0,30718.0
mean,38.443584,3.100332,10.328602,10.130314,2.583143,5.967088,1.417247,3.670161,0.676737,40.949313,0.24904
std,13.118227,1.139557,3.809556,2.562469,1.495674,4.025999,1.601399,0.844063,0.46773,11.985382,0.432464
min,17.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,28.0,3.0,9.0,9.0,2.0,2.0,0.0,4.0,0.0,40.0,0.0
50%,37.0,3.0,11.0,10.0,2.0,6.0,1.0,4.0,1.0,40.0,0.0
75%,47.0,3.0,12.0,13.0,4.0,9.0,3.0,4.0,1.0,45.0,0.0
max,90.0,7.0,15.0,16.0,6.0,13.0,5.0,4.0,1.0,99.0,1.0


In [17]:
data_a["_TARGET_"].value_counts()

0    23068
1     7650
Name: _TARGET_, dtype: int64

In [18]:
data_a[data_adult.protected_features[0]].value_counts()

1    20788
0     9930
Name: sex, dtype: int64

In [19]:
data_a.groupby(["_TARGET_", data_adult.protected_features[0]]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,age,workclass,education,education-num,marital-status,occupation,relationship,race,hours-per-week
_TARGET_,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,8803,8803,8803,8803,8803,8803,8803,8803,8803
0,1,14265,14265,14265,14265,14265,14265,14265,14265,14265
1,0,1127,1127,1127,1127,1127,1127,1127,1127,1127
1,1,6523,6523,6523,6523,6523,6523,6523,6523,6523


In [20]:
data_a.groupby(["_TARGET_", data_adult.protected_features[0]]).nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,hours-per-week,_TARGET_
_TARGET_,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0,71,7,16,16,7,13,6,5,1,78,1
0,1,71,7,16,16,7,14,6,5,1,90,1
1,0,53,6,14,14,7,13,5,5,1,53,1
1,1,65,6,15,15,7,13,6,5,1,76,1


In [21]:
data_a.corr()[data_adult.protected_features[0]]

age               0.082117
workclass         0.072763
education        -0.028155
education-num     0.007443
marital-status   -0.120360
occupation        0.062068
relationship     -0.585792
race              0.086147
sex               1.000000
hours-per-week    0.230321
_TARGET_          0.216626
Name: sex, dtype: float64

### Compas Dataset

In [22]:
# Compas: load, drop rows with missing values, label-encode the protected feature(s).
data_compas, encoder = convert_protected(drop_na(create_compas_dataset()))
# Features and target in one frame for the summary cells that follow.
data_c = concat(data_compas)
data_compas.protected_features

['race']

In [23]:
encoder.classes_

array(['African-American', 'Asian', 'Caucasian', 'Hispanic',
       'Native American', 'Other'], dtype=object)

In [24]:
data_c.describe()

Unnamed: 0,age,age_cat,c_charge_degree,priors_count,juv_misd_count,juv_fel_count,juv_other_count,days_b_screening_arrest,sex,race,length_of_stay,_TARGET_
count,9388.0,9388.0,9388.0,9388.0,9388.0,9388.0,9388.0,9388.0,9388.0,9388.0,9388.0,9388.0
mean,34.760865,0.642522,0.35098,3.022262,0.078718,0.058266,0.098743,-1.800597,0.794099,1.274819,523.774782,0.339476
std,11.813824,0.811018,0.477302,4.585775,0.463982,0.451685,0.467258,5.024736,0.40438,1.45465,1725.08868,0.473557
min,18.0,0.0,0.0,0.0,0.0,0.0,0.0,-30.0,0.0,0.0,-11.704722,0.0
25%,25.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,0.0,21.388889,0.0
50%,32.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,1.0,1.0,29.148889,0.0
75%,42.0,1.0,1.0,4.0,0.0,0.0,0.0,-1.0,1.0,2.0,168.065069,1.0
max,96.0,2.0,1.0,38.0,13.0,20.0,11.0,30.0,1.0,5.0,51670.880278,1.0


In [25]:
data_c["_TARGET_"].value_counts()

0    6201
1    3187
Name: _TARGET_, dtype: int64

In [26]:
data_c[data_compas.protected_features[0]].value_counts()

0    4672
2    3253
3     817
5     571
1      48
4      27
Name: race, dtype: int64

In [27]:
data_c.groupby(["_TARGET_", data_compas.protected_features[0]]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,age,age_cat,c_charge_degree,priors_count,juv_misd_count,juv_fel_count,juv_other_count,days_b_screening_arrest,sex,length_of_stay
_TARGET_,race,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0,2781,2781,2781,2781,2781,2781,2781,2781,2781,2781
0,1,38,38,38,38,38,38,38,38,38,38
0,2,2323,2323,2323,2323,2323,2323,2323,2323,2323,2323
0,3,609,609,609,609,609,609,609,609,609,609
0,4,19,19,19,19,19,19,19,19,19,19
0,5,431,431,431,431,431,431,431,431,431,431
1,0,1891,1891,1891,1891,1891,1891,1891,1891,1891,1891
1,1,10,10,10,10,10,10,10,10,10,10
1,2,930,930,930,930,930,930,930,930,930,930
1,3,208,208,208,208,208,208,208,208,208,208


In [28]:
data_c.groupby(["_TARGET_", data_compas.protected_features[0]]).nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,age,age_cat,c_charge_degree,priors_count,juv_misd_count,juv_fel_count,juv_other_count,days_b_screening_arrest,sex,race,length_of_stay,_TARGET_
_TARGET_,race,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,0,59,3,2,30,8,8,8,43,2,1,2765,1
0,1,24,3,2,3,1,1,1,6,2,1,38,1
0,2,62,3,2,25,7,5,7,42,2,1,2313,1
0,3,54,3,2,18,4,3,6,28,2,1,607,1
0,4,16,3,2,7,1,2,2,3,2,1,19,1
0,5,51,3,2,13,2,4,3,17,2,1,431,1
1,0,52,3,2,35,10,9,7,48,2,1,1885,1
1,1,10,3,2,7,2,1,2,2,2,1,10,1
1,2,52,3,2,29,4,7,7,45,2,1,928,1
1,3,41,3,2,18,4,3,3,24,2,1,208,1


In [29]:
# NOTE(review): `race` holds LabelEncoder integer codes for six unordered
# categories (see encoder.classes_), so these Pearson coefficients depend on
# the order in which the codes were assigned; treat them as indicative only.
data_c.corr().loc[:, data_compas.protected_features[0]]

age                        0.137085
age_cat                   -0.023542
c_charge_degree            0.099344
priors_count              -0.206018
juv_misd_count            -0.074672
juv_fel_count             -0.048369
juv_other_count           -0.050273
days_b_screening_arrest   -0.041953
sex                       -0.016440
race                       1.000000
length_of_stay            -0.058908
_TARGET_                  -0.129416
Name: race, dtype: float64

### Communities Dataset

In [30]:
# Communities: load, drop rows with missing values, label-encode the protected feature(s).
data_communities, encoder = convert_protected(drop_na(create_communities_dataset()))
# Features and target in one frame for the summary cells that follow.
data_co = concat(data_communities)
data_communities.protected_features

['race_c']

In [31]:
encoder.classes_

array(['asian', 'black', 'hispanic', 'white'], dtype=object)

In [32]:
data_co.describe()

Unnamed: 0,population,householdsize,agePct12t21,agePct12t29,agePct16t24,agePct65up,numbUrban,pctUrban,medIncome,pctWWage,...,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,LandArea,PopDens,PctUsePubTrans,LemasPctOfficDrugUn,race_c,_TARGET_
count,1993.0,1993.0,1993.0,1993.0,1993.0,1993.0,1993.0,1993.0,1993.0,1993.0,...,1993.0,1993.0,1993.0,1993.0,1993.0,1993.0,1993.0,1993.0,1993.0,1993.0
mean,0.057612,0.463437,0.42421,0.493914,0.336297,0.423086,0.064104,0.696618,0.361259,0.558314,...,0.608776,0.534967,0.626322,0.65147,0.065243,0.23291,0.161741,0.094099,2.591069,0.355243
std,0.126935,0.163747,0.155234,0.143584,0.16654,0.179196,0.12828,0.444648,0.209327,0.18282,...,0.204314,0.18136,0.20052,0.198253,0.109485,0.203127,0.229099,0.240379,0.851918,0.478707
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.01,0.35,0.34,0.41,0.25,0.3,0.0,0.0,0.2,0.44,...,0.47,0.42,0.52,0.56,0.02,0.1,0.02,0.0,3.0,0.0
50%,0.02,0.44,0.4,0.48,0.29,0.42,0.03,1.0,0.32,0.56,...,0.63,0.54,0.67,0.7,0.04,0.17,0.07,0.0,3.0,0.0
75%,0.05,0.54,0.47,0.54,0.36,0.53,0.07,1.0,0.49,0.69,...,0.77,0.66,0.77,0.79,0.07,0.28,0.19,0.0,3.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0


In [33]:
data_co["_TARGET_"].value_counts()

0    1285
1     708
Name: _TARGET_, dtype: int64

In [34]:
data_co[data_communities.protected_features[0]].value_counts()

3    1572
1     218
2     115
0      88
Name: race_c, dtype: int64

In [35]:
data_co.groupby(["_TARGET_", data_communities.protected_features[0]]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,population,householdsize,agePct12t21,agePct12t29,agePct16t24,agePct65up,numbUrban,pctUrban,medIncome,pctWWage,...,NumStreet,PctForeignBorn,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,LandArea,PopDens,PctUsePubTrans,LemasPctOfficDrugUn
_TARGET_,race_c,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,0,43,43,43,43,43,43,43,43,43,43,...,43,43,43,43,43,43,43,43,43,43
0,1,30,30,30,30,30,30,30,30,30,30,...,30,30,30,30,30,30,30,30,30,30
0,2,24,24,24,24,24,24,24,24,24,24,...,24,24,24,24,24,24,24,24,24,24
0,3,1188,1188,1188,1188,1188,1188,1188,1188,1188,1188,...,1188,1188,1188,1188,1188,1188,1188,1188,1188,1188
1,0,45,45,45,45,45,45,45,45,45,45,...,45,45,45,45,45,45,45,45,45,45
1,1,188,188,188,188,188,188,188,188,188,188,...,188,188,188,188,188,188,188,188,188,188
1,2,91,91,91,91,91,91,91,91,91,91,...,91,91,91,91,91,91,91,91,91,91
1,3,384,384,384,384,384,384,384,384,384,384,...,384,384,384,384,384,384,384,384,384,384


In [36]:
data_co.groupby(["_TARGET_", data_communities.protected_features[0]]).nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,population,householdsize,agePct12t21,agePct12t29,agePct16t24,agePct65up,numbUrban,pctUrban,medIncome,pctWWage,...,PctBornSameState,PctSameHouse85,PctSameCity85,PctSameState85,LandArea,PopDens,PctUsePubTrans,LemasPctOfficDrugUn,race_c,_TARGET_
_TARGET_,race_c,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,0,17,32,25,28,24,32,20,2,28,33,...,23,32,35,32,12,34,26,8,1,1
0,1,9,22,22,20,23,25,8,3,23,25,...,24,19,25,22,11,21,16,3,1,1
0,2,7,20,19,16,17,16,7,2,16,19,...,22,23,16,19,6,16,12,3,1,1
0,3,25,81,85,83,87,93,28,64,96,90,...,99,99,96,92,36,76,90,53,1,1
1,0,25,35,25,27,24,26,26,2,31,29,...,25,34,30,29,19,33,30,15,1,1
1,1,47,48,50,46,52,53,48,6,39,57,...,63,53,60,63,40,54,58,43,1,1
1,2,24,38,35,30,29,34,25,3,36,43,...,45,45,41,43,17,49,41,16,1,1
1,3,44,62,63,59,59,78,43,10,58,82,...,87,74,85,80,43,67,56,49,1,1


In [37]:
# 20 most negative correlations with the protected attribute.
# NOTE(review): `race_c` holds LabelEncoder integer codes for four unordered
# categories (see encoder.classes_), so these Pearson coefficients depend on
# the order in which the codes were assigned; treat them as indicative only.
(
    data_co.corr()
    .loc[:, data_communities.protected_features[0]]
    .sort_values()
    .head(20)
)

PctIlleg              -0.550950
PctPersDenseHous      -0.478658
PctLargHouseFam       -0.431739
PctRecImmig10         -0.430873
PctRecImmig8          -0.420352
pctWPubAsst           -0.404295
PctRecImmig5          -0.393945
_TARGET_              -0.384646
PctLargHouseOccup     -0.381241
PctForeignBorn        -0.380517
PctRecentImmig        -0.380115
PctNotSpeakEnglWell   -0.356021
PctPopUnderPov        -0.335970
NumIlleg              -0.329182
PersPerRentOccHous    -0.327487
NumImmig              -0.322164
PctUnemployed         -0.309424
PopDens               -0.300363
PersPerFam            -0.298405
NumUnderPov           -0.294006
Name: race_c, dtype: float64

In [38]:
# 20 most positive correlations with the protected attribute.
# NOTE(review): `race_c` holds LabelEncoder integer codes for four unordered
# categories (see encoder.classes_), so these Pearson coefficients depend on
# the order in which the codes were assigned; treat them as indicative only.
(
    data_co.corr()
    .loc[:, data_communities.protected_features[0]]
    .sort_values()
    .tail(20)
)

agePct65up          0.093867
pctWFarmSelf        0.095757
PctOccupMgmtProf    0.096813
PctBornSameState    0.102681
PctWorkMom          0.110446
medIncome           0.128469
perCapInc           0.132593
PctEmploy           0.145985
medFamInc           0.154943
MedNumBR            0.163703
pctWRetire          0.175166
PctHousOwnOcc       0.270120
PctPersOwnOccup     0.316089
pctWInvInc          0.344894
PctSpeakEnglOnly    0.358004
PctYoungKids2Par    0.379854
PctTeen2Par         0.411145
PctFam2Par          0.419595
PctKids2Par         0.465343
race_c              1.000000
Name: race_c, dtype: float64

### Titanic Dataset

In [39]:
# Titanic: load, drop rows with missing values, label-encode the protected feature(s).
data_titanic, encoder = convert_protected(drop_na(create_titanic_dataset()))
# Features and target in one frame for the summary cells that follow.
data_t = concat(data_titanic)
data_titanic.protected_features

['Sex']

In [40]:
encoder.classes_

array(['female', 'male'], dtype=object)

In [41]:
data_t.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,_TARGET_
count,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0
mean,2.240169,0.636236,29.642093,0.514045,0.432584,34.567251,1.595506,0.404494
std,0.836854,0.48142,14.492933,0.930692,0.854181,52.938648,0.779038,0.491139
min,1.0,0.0,0.42,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,20.0,0.0,0.0,8.05,2.0,0.0
50%,2.0,1.0,28.0,0.0,0.0,15.64585,2.0,0.0
75%,3.0,1.0,38.0,1.0,1.0,33.0,2.0,1.0
max,3.0,1.0,80.0,5.0,6.0,512.3292,2.0,1.0


In [42]:
data_t["_TARGET_"].value_counts()

0    424
1    288
Name: _TARGET_, dtype: int64

In [43]:
data_t[data_titanic.protected_features[0]].value_counts()

1    453
0    259
Name: Sex, dtype: int64

In [44]:
data_t.groupby(["_TARGET_", data_titanic.protected_features[0]]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Pclass,Age,SibSp,Parch,Fare,Embarked
_TARGET_,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,64,64,64,64,64,64
0,1,360,360,360,360,360,360
1,0,195,195,195,195,195,195
1,1,93,93,93,93,93,93


In [45]:
data_t.groupby(["_TARGET_", data_titanic.protected_features[0]]).nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,_TARGET_
_TARGET_,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,3,1,38,6,7,38,3,1
0,1,3,1,74,6,6,140,3,1
1,0,3,1,54,5,5,114,3,1
1,1,3,1,49,4,3,59,3,1


In [46]:
data_t.corr()[data_titanic.protected_features[0]]

Pclass      0.150826
Sex         1.000000
Age         0.099037
SibSp      -0.106296
Parch      -0.249543
Fare       -0.182457
Embarked    0.109639
_TARGET_   -0.536762
Name: Sex, dtype: float64

### German Credit Dataset

In [47]:
# German Credit: load, drop rows with missing values, label-encode the protected feature(s).
data_german, encoder = convert_protected(drop_na(create_german_dataset()))
# Features and target in one frame for the summary cells that follow.
data_g = concat(data_german)
data_german.protected_features

['Age']

In [48]:
encoder.classes_

array(['elder', 'young'], dtype=object)

In [49]:
data_g.describe()

Unnamed: 0,Status_account,Duration_month,Credit_history,Purpose,Credit_amount,Savings_account,Employment_since,Installment_rate,Personal_status,Debtors_guarantors,...,Property,Age,Installment_plans,Housing,Number_credits,Job,Num_liable_people,Telephone,Foreign,_TARGET_
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,1.577,20.903,2.545,3.277,3271.258,1.105,2.384,2.973,1.682,0.145,...,1.358,0.19,1.675,0.929,1.407,1.904,1.155,0.404,0.037,0.3
std,1.257638,12.058814,1.08312,2.739302,2822.736876,1.580023,1.208306,1.118715,0.70808,0.477706,...,1.050209,0.392497,0.705601,0.531264,0.577654,0.653614,0.362086,0.490943,0.188856,0.458487
min,0.0,4.0,0.0,0.0,250.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
25%,0.0,12.0,2.0,1.0,1365.5,0.0,2.0,2.0,1.0,0.0,...,0.0,0.0,2.0,1.0,1.0,2.0,1.0,0.0,0.0,0.0
50%,1.0,18.0,2.0,3.0,2319.5,0.0,2.0,3.0,2.0,0.0,...,1.0,0.0,2.0,1.0,1.0,2.0,1.0,0.0,0.0,0.0
75%,3.0,24.0,4.0,4.0,3972.25,2.0,4.0,4.0,2.0,0.0,...,2.0,0.0,2.0,1.0,2.0,2.0,1.0,1.0,0.0,1.0
max,3.0,72.0,4.0,9.0,18424.0,4.0,4.0,4.0,3.0,2.0,...,3.0,1.0,2.0,2.0,4.0,3.0,2.0,1.0,1.0,1.0


In [50]:
data_g["_TARGET_"].value_counts()

0    700
1    300
Name: _TARGET_, dtype: int64

In [51]:
data_g[data_german.protected_features[0]].value_counts()

0    810
1    190
Name: Age, dtype: int64

In [52]:
data_g.groupby(["_TARGET_", data_german.protected_features[0]]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Status_account,Duration_month,Credit_history,Purpose,Credit_amount,Savings_account,Employment_since,Installment_rate,Personal_status,Debtors_guarantors,Residence_since,Property,Installment_plans,Housing,Number_credits,Job,Num_liable_people,Telephone,Foreign
_TARGET_,Age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,0,590,590,590,590,590,590,590,590,590,590,590,590,590,590,590,590,590,590,590
0,1,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110,110
1,0,220,220,220,220,220,220,220,220,220,220,220,220,220,220,220,220,220,220,220
1,1,80,80,80,80,80,80,80,80,80,80,80,80,80,80,80,80,80,80,80


In [53]:
data_g.groupby(["_TARGET_", data_german.protected_features[0]]).nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,Status_account,Duration_month,Credit_history,Purpose,Credit_amount,Savings_account,Employment_since,Installment_rate,Personal_status,Debtors_guarantors,...,Property,Age,Installment_plans,Housing,Number_credits,Job,Num_liable_people,Telephone,Foreign,_TARGET_
_TARGET_,Age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,0,4,31,5,10,555,5,5,4,4,3,...,4,1,3,3,4,4,2,2,2,1
0,1,4,19,5,8,109,5,5,4,4,3,...,4,1,3,3,3,4,2,2,2,1
1,0,4,21,5,9,219,5,5,4,4,3,...,4,1,3,3,4,4,2,2,2,1
1,1,4,18,5,9,78,5,5,4,3,3,...,4,1,3,3,2,4,2,2,1,1


In [54]:
data_g.corr()[data_german.protected_features[0]]

Status_account       -0.137146
Duration_month       -0.007946
Credit_history       -0.114317
Purpose               0.005931
Credit_amount        -0.045989
Savings_account      -0.058027
Employment_since     -0.175101
Installment_rate     -0.061256
Personal_status      -0.077726
Debtors_guarantors    0.018419
Residence_since      -0.012824
Property             -0.080186
Age                   1.000000
Installment_plans     0.046084
Housing              -0.290479
Number_credits       -0.138322
Job                  -0.112219
Num_liable_people    -0.165169
Telephone            -0.164986
Foreign              -0.054422
_TARGET_              0.127938
Name: Age, dtype: float64

### Bank Dataset

In [55]:
# Bank: load, drop rows with missing values, label-encode the protected feature(s).
data_bank, encoder = convert_protected(drop_na(create_bank_dataset()))
# Features and target in one frame for the summary cells that follow.
data_b = concat(data_bank)
data_bank.protected_features

['age']

In [56]:
encoder.classes_

array(['elder', 'young'], dtype=object)

In [57]:
data_b.describe()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,_TARGET_
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,0.332065,4.339762,1.167725,1.224813,0.018027,1362.272058,0.555838,0.160226,0.640242,15.806419,5.523014,258.16308,2.763841,40.197828,0.580323,2.559974,0.116985
std,0.470959,3.272657,0.60823,0.747997,0.133049,3044.765829,0.496878,0.36682,0.897951,8.322476,3.006911,257.527812,3.098021,100.128746,2.303441,0.989059,0.321406
min,0.0,0.0,0.0,0.0,0.0,-8019.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-1.0,0.0,0.0,0.0
25%,0.0,1.0,1.0,1.0,0.0,72.0,0.0,0.0,0.0,8.0,3.0,103.0,1.0,-1.0,0.0,3.0,0.0
50%,0.0,4.0,1.0,1.0,0.0,448.0,1.0,0.0,0.0,16.0,6.0,180.0,2.0,-1.0,0.0,3.0,0.0
75%,1.0,7.0,2.0,2.0,0.0,1428.0,1.0,0.0,2.0,21.0,8.0,319.0,3.0,-1.0,0.0,3.0,0.0
max,1.0,11.0,2.0,3.0,1.0,102127.0,1.0,1.0,2.0,31.0,11.0,4918.0,63.0,871.0,275.0,3.0,1.0


In [58]:
data_b["_TARGET_"].value_counts()

0    39922
1     5289
Name: _TARGET_, dtype: int64

In [59]:
data_b[data_bank.protected_features[0]].value_counts()

0    30198
1    15013
Name: age, dtype: int64

In [60]:
data_b.groupby(["_TARGET_", data_bank.protected_features[0]]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
_TARGET_,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,26889,26889,26889,26889,26889,26889,26889,26889,26889,26889,26889,26889,26889,26889,26889
0,1,13033,13033,13033,13033,13033,13033,13033,13033,13033,13033,13033,13033,13033,13033,13033
1,0,3309,3309,3309,3309,3309,3309,3309,3309,3309,3309,3309,3309,3309,3309,3309
1,1,1980,1980,1980,1980,1980,1980,1980,1980,1980,1980,1980,1980,1980,1980,1980


In [61]:
data_b.groupby(["_TARGET_", data_bank.protected_features[0]]).nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,_TARGET_
_TARGET_,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,0,1,12,3,4,2,5980,2,2,3,31,12,1191,44,443,36,4,1
0,1,1,12,3,4,2,3657,2,2,3,31,12,976,43,388,34,4,1
1,0,1,12,3,4,2,1777,2,2,3,31,12,1180,20,346,22,4,1
1,1,1,12,3,4,2,1159,2,2,3,31,12,913,18,266,22,4,1


In [62]:
data_b.corr()[data_bank.protected_features[0]]

age          1.000000
job          0.043744
marital      0.380677
education    0.089655
default      0.008954
balance     -0.053570
housing      0.074313
loan         0.000707
contact     -0.065795
day          0.012924
month        0.014213
duration     0.017457
campaign    -0.021444
pdays        0.020864
previous     0.004824
poutcome    -0.010869
_TARGET_     0.032689
Name: age, dtype: float64