In [1]:
import pandas as pd
import numpy as np

# Base URL of the dataset directory; file names are appended to it when reading.
url = 'https://raw.githubusercontent.com/vladoxNCL/fairCorrect/master/Datasets/'

In [2]:

# Read the raw file; it has no header row, so columns start out numbered
df = pd.read_csv(url + 'adult.data', header=None)

# Column names, in file order
names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
         'occupation', 'relationship', 'race', 'sex', 'capital-gain',
         'capital-loss', 'hours-per-week', 'native-country', 'label']

df.columns = names

# Text columns whose values need surrounding whitespace removed
col = ['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country', 'label']

for c in col:
    df[c] = df[c].apply(str.strip)

# A bare '?' marks a missing value: turn it into NaN
df = df.replace({'?': np.nan})

In [11]:
# Share of each label value within every race group
df.groupby('race')['label'].value_counts(normalize=True)

race                label
Amer-Indian-Eskimo  <=50K    0.884244
                    >50K     0.115756
Asian-Pac-Islander  <=50K    0.734360
                    >50K     0.265640
Black               <=50K    0.876120
                    >50K     0.123880
Other               <=50K    0.907749
                    >50K     0.092251
White               <=50K    0.744140
                    >50K     0.255860
Name: label, dtype: float64

In [4]:
# Columns for which per-group positive-rate differences are computed and summed into PAval.
PAs = ['sex', 'native-country', 'race', 'age']

In [17]:
# Give missing native-country values their own 'Unknown' category.
# The result is assigned back rather than modified in place on the selected
# column: with pandas Copy-on-Write an in-place call on `df['native-country']`
# acts on a temporary object and leaves `df` unchanged.
df['native-country'] = df['native-country'].fillna('Unknown')

In [18]:
# Positive-outcome indicator. The label column still holds the raw strings
# at this point, so the positive class is selected by name instead of by its
# position in a frequency-sorted value_counts() result (position 1 is just
# "the second most frequent label", which is not always '>50K').
is_positive = df['label'] == '>50K'

# Positive rate of the whole population
popPR = is_positive.mean()

# For every protected attribute, map each of its values to
# (positive rate within that group) - (population positive rate).
# A group without any positive row gets -popPR; a group made only of
# positive rows gets 1 - popPR.
# groupby leaves out missing values, so they get no entry here.
PAdiffs = {p: (is_positive.groupby(df[p]).mean() - popPR).to_dict() for p in PAs}

{'sex': {'Male': 0.06492701896517339, 'Female': -0.13134896882844013},
 'native-country': {'United-States': 0.005025204295490859,
  'Cuba': 0.0223483372908177,
  'Jamaica': -0.11735276732256761,
  'India': 0.15919044255397563,
  'Unknown': 0.009619259020527904,
  'Mexico': -0.18948762898568225,
  'South': -0.04080955744602438,
  'Puerto-Rico': -0.13554639955128756,
  'Honduras': -0.16388648052294746,
  'England': 0.09252377588730892,
  'Canada': 0.08150449214075245,
  'Germany': 0.08035832576565444,
  'Iran': 0.17779509371676633,
  'Philippines': 0.06727125063478367,
  'Italy': 0.10165619597863312,
  'Poland': -0.04080955744602438,
  'Columbia': -0.20691125236127864,
  'Cambodia': 0.12761149518555454,
  'Thailand': -0.07414289077935773,
  'Ecuador': -0.09795241458888154,
  'Laos': -0.1296984463349133,
  'Taiwan': 0.15134730529907364,
  'Haiti': -0.14990046653693348,
  'Portugal': -0.13270144933791628,
  'Dominican-Republic': -0.2122381288745958,
  'El-Salvador': -0.15590389706866592,
 

In [19]:
# Per-row score: sum of the group differences of the row's sex, race,
# native-country and age values (added in that order).
df['PAval'] = df[PAs].apply(
    lambda row: sum(PAdiffs[attr][row[attr]]
                    for attr in ('sex', 'race', 'native-country', 'age')),
    axis=1,
)

In [21]:
# Binary flag: 1 where the score is strictly positive, 0 otherwise
df['PA'] = (df['PAval'] > 0).astype('int64')

In [22]:
# Number of rows per PA value
df['PA'].value_counts()

1    17581
0    14980
Name: PA, dtype: int64

In [23]:
# Share of each label value within the PA = 0 and PA = 1 groups
df.groupby('PA').label.value_counts(normalize=True)

PA  label
0   <=50K    0.920227
    >50K     0.079773
1   <=50K    0.621978
    >50K     0.378022
Name: label, dtype: float64

In [24]:
# The intermediate score column is removed; only the binary PA flag is kept
df.drop(columns='PAval', inplace=True)

In [26]:
# One-hot encode categorical variables
col = ['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'native-country',]

# Build the indicator columns of every categorical variable, then replace the
# source columns with them in a single concat (indicators are appended in the
# order of `col`). dtype='uint8' pins 0/1 integer indicators: without it,
# newer pandas versions produce bool columns, which would be written to CSV
# as True/False.
dummy_blocks = [pd.get_dummies(df[c], dtype='uint8') for c in col]
df = pd.concat([df.drop(columns=col)] + dummy_blocks, axis=1)

# Assign sex and label binary values 0 and 1.
# Explicit mappings yield numeric columns directly instead of relying on
# replace() downcasting an object column after the substitution.
df['sex'] = df['sex'].map({'Male': 1, 'Female': 0})
df['label'] = df['label'].map({'<=50K': 0, '>50K': 1})

# Drop fnlwgt variable
df = df.drop(columns=['fnlwgt'])

In [27]:
# Write the processed frame, without the index, into the Datasets directory
savepath = '../Datasets/'
df.to_csv(f'{savepath}income_mod_onehot.csv', index=False)

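## Older version below (superseded — re-run the data-loading cell first; these cells expect the raw `df` with its original `native-country` and `race` columns)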

In [3]:
# 1 for every row whose native-country is not 'United-States' (missing values included), else 0
df['foreigner'] = (df['native-country'] != 'United-States').astype(int)

In [4]:
# 1 where race is 'White', else 0
df['white'] = (df['race'] == 'White').astype(int)

In [5]:
# 1 where age is 35 or more, else 0
df['over35'] = df['age'].ge(35).astype(int)

In [6]:
# One-hot encode categorical variables
col = ['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'native-country',]

# Build the indicator columns of every categorical variable, then replace the
# source columns with them in a single concat (indicators are appended in the
# order of `col`). dtype='uint8' pins 0/1 integer indicators: without it,
# newer pandas versions produce bool columns, which would be written to CSV
# as True/False.
dummy_blocks = [pd.get_dummies(df[c], dtype='uint8') for c in col]
df = pd.concat([df.drop(columns=col)] + dummy_blocks, axis=1)

# Assign sex and label binary values 0 and 1.
# Explicit mappings yield numeric columns directly instead of relying on
# replace() downcasting an object column after the substitution.
df['sex'] = df['sex'].map({'Male': 1, 'Female': 0})
df['label'] = df['label'].map({'<=50K': 0, '>50K': 1})

# Drop fnlwgt variable
df = df.drop(columns=['fnlwgt'])

In [7]:
def findFav(df, pa, label):
    """Return the value of column `pa` whose group has the highest positive rate.

    The positive rate of a group is the number of its rows with
    `df[label] == 1` divided by the number of rows in the group.
    When several groups share the highest rate, the one that appears
    first in `df[pa].unique()` is returned.
    """
    groups = df[pa].unique()
    has_positive_label = df[label] == 1

    rates = []
    for group in groups:
        in_group = df[pa] == group
        rates.append(len(df[in_group & has_positive_label]) / len(df[in_group]))

    # Position of the first group reaching the maximum rate
    top = max(range(len(rates)), key=rates.__getitem__)
    return groups[top]

In [8]:
def findUnfav(df, pa, label):
    """Return the value of column `pa` whose group has the lowest positive rate.

    The positive rate of a group is the number of its rows with
    `df[label] == 1` divided by the number of rows in the group.
    When several groups share the lowest rate, the one that appears
    first in `df[pa].unique()` is returned.
    """
    groups = df[pa].unique()
    has_positive_label = df[label] == 1

    rates = []
    for group in groups:
        in_group = df[pa] == group
        rates.append(len(df[in_group & has_positive_label]) / len(df[in_group]))

    # Position of the first group reaching the minimum rate (the unfavoured group)
    bottom = min(range(len(rates)), key=rates.__getitem__)
    return groups[bottom]

In [9]:
PAs = ['sex', 'foreigner', 'white', 'over35']
PAfav, PAdiffs = [], []

for p in PAs:
    # Groups of attribute p with the highest / lowest positive rate
    f = findFav(df, p, 'label')
    u = findUnfav(df, p, 'label')

    # Share of label 1 in the favoured group, the unfavoured group and the whole frame
    fpr = df.loc[df[p] == f, 'label'].value_counts(normalize=True)[1]
    upr = df.loc[df[p] == u, 'label'].value_counts(normalize=True)[1]
    ppr = df['label'].value_counts(normalize=True)[1]

    # Recorded difference: unfavoured-group rate minus population rate.
    # (fpr belongs to the alternative definition fpr - upr, which is not used.)
    diff = upr - ppr

    PAfav += [f]
    PAdiffs += [diff]

In [10]:
# Favoured group value collected for each attribute, in the order of PAs
PAfav

[1, 0, 1, 1]

In [11]:
# Rate difference collected for each attribute, in the order of PAs
PAdiffs

[-0.13134896882844013,
 -0.04322772317884657,
 -0.08822789253559235,
 -0.13489454570141624]

In [12]:
# Start from a zero score, then process one protected attribute at a time
df['PAval'] = 0
for PA, fav, diff in zip(PAs, PAfav, PAdiffs):
    # Rows in the favoured group of this attribute gain diff, all others lose diff
    df['PAval'] = df['PAval'] + np.where(df[PA] == fav, diff, -diff)

In [13]:
# Number of rows per distinct score
df['PAval'].value_counts()

 0.585075    10811
 0.110539     6842
-0.282013     4011
 0.192523     3957
 0.378518      972
-0.014034      937
-0.096017      863
 0.488569      825
-0.488569      777
 0.014034      696
 0.282013      441
 0.096017      366
-0.192523      340
-0.378518      308
-0.585075      212
-0.110539      203
Name: PAval, dtype: int64

In [15]:
# Share of each label value for every distinct score
df.groupby('PAval').label.value_counts(normalize=True)

PAval      label
-0.585075  0        0.952830
           1        0.047170
-0.488569  0        0.971686
           1        0.028314
-0.378518  0        0.948052
           1        0.051948
-0.282013  0        0.937671
           1        0.062329
-0.192523  0        0.867647
           1        0.132353
-0.110539  0        0.837438
           1        0.162562
-0.096017  0        0.913094
           1        0.086906
-0.014034  0        0.908218
           1        0.091782
 0.014034  0        0.906609
           1        0.093391
 0.096017  0        0.868852
           1        0.131148
 0.110539  0        0.853113
           1        0.146887
 0.192523  0        0.819560
           1        0.180440
 0.282013  0        0.616780
           1        0.383220
 0.378518  0        0.707819
           1        0.292181
 0.488569  0        0.655758
           1        0.344242
 0.585075  0        0.562020
           1        0.437980
Name: label, dtype: float64

In [15]:
# Binary flag: 1 where the score is strictly positive, 0 otherwise
df['PA'] = (df['PAval'] > 0).astype('int64')

In [16]:
# Number of rows per PA value
df['PA'].value_counts()

1    24910
0     7651
Name: PA, dtype: int64

In [17]:
# Share of each label value within the PA = 0 and PA = 1 groups
df.groupby('PA').label.value_counts(normalize=True)

PA  label
0   0        0.929813
    1        0.070187
1   0        0.706784
    1        0.293216
Name: label, dtype: float64

In [20]:
# Remove the helper columns; the binary PA flag stays in the frame
df.drop(columns=['foreigner', 'white', 'over35', 'PAval'], inplace=True)

In [22]:
# Write the processed frame, without the index, into the Datasets directory
savepath = '../Datasets/'
df.to_csv(f'{savepath}income_mod_onehot.csv', index=False)