- Reads raw data and processes it for model training
- Drops records with missing values
- Rearrange columns so that the second last column is the sensitive attribute, and the last column is the label
    - Both sensitive attribute and labels should be 0 or 1
- Converts text data to numerical values

In [7]:
import pandas as pd

---

# Adult dataset
- Dataset taken from UCI ML repository
- Sensitive attribute: gender
- Combining both provided datasets, assuming that the training script will split as necessary

In [8]:
data_1 = pd.read_csv("adult/adult.data", header=None, index_col=None)
data_2 = pd.read_csv("adult/adult.test", header=None, index_col=None)
data = pd.concat((data_1, data_2))
data.dropna()
for col in data.columns:
    data = data[data[col] != "?"]
    data = data[data[col] != " ?"]
data[1] = pd.factorize(data[1])[0]
data[3] = pd.factorize(data[3])[0]
data[5] = pd.factorize(data[5])[0]
data[6] = pd.factorize(data[6])[0]
data[7] = pd.factorize(data[7])[0]
data[8] = pd.factorize(data[8])[0]
data[9] = data[9].replace({" Female": 1, " Male": 0})
data[13] = pd.factorize(data[13])[0]
data[14] = data[14].replace({" <=50K": 0, " >50K": 1, " <=50K.": 0, " >50K.": 1})

del data[4]  # education and education-num are redundant
data = data.reindex(columns=[0, 1, 2, 3, 5, 6, 7, 8, 10, 11, 12, 13, 9, 14])
print(data.head(10))
print(data.shape)
data.to_csv("adult.csv", header=None, index=None)

   0   1       2   3   5   6   7   8      10  11  12  13  9   14
0  39   0   77516   0   0   0   0   0   2174   0  40   0   0   0
1  50   1   83311   0   1   1   1   0      0   0  13   0   0   0
2  38   2  215646   1   2   2   0   0      0   0  40   0   0   0
3  53   2  234721   2   1   2   1   1      0   0  40   0   0   0
4  28   2  338409   0   1   3   2   1      0   0  40   1   1   0
5  37   2  284582   3   1   1   2   0      0   0  40   0   1   0
6  49   2  160187   4   3   4   0   1      0   0  16   2   1   0
7  52   1  209642   1   1   1   1   0      0   0  45   0   0   1
8  31   2   45781   3   0   3   0   0  14084   0  50   0   1   1
9  42   2  159449   0   1   1   1   0   5178   0  40   0   0   1
(45222, 14)


  data[9] = data[9].replace({" Female": 1, " Male": 0})
  data[14] = data[14].replace({" <=50K": 0, " >50K": 1, " <=50K.": 0, " >50K.": 1})


---

# Bank Marketing dataset
- Dataset taken from UCI ML repository
- Bank-additional dataset used
- Sensitive attribute: age (<25 and >=60) is 0 else 1

In [9]:
data = pd.read_csv("bank/bank.csv", sep=";")
data = data.dropna()
for c in [
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "month",
    "day_of_week",
    "poutcome",
]:
    data[c] = pd.factorize(data[c])[0]
data["y"] = data["y"].replace({"yes": 1, "no": 0})
data["age_group"] = pd.cut(
    data.age, [0, 26, 60, 100], right=False, labels=[1, 0, 1], ordered=False
)
c = list(data.columns)
c = c[:-2] + [c[-1]] + [c[-2]]
data = data.reindex(columns=c)
print(data.head(10))
print(data.shape)
data.to_csv("bank.csv", header=None, index=None)

  data["y"] = data["y"].replace({"yes": 1, "no": 0})


   age  job  marital  education  default  housing  loan  contact  month  \
0   56    0        0          0        0        0     0        0      0   
1   57    1        0          1        1        0     0        0      0   
2   37    1        0          1        0        1     0        0      0   
3   40    2        0          2        0        0     0        0      0   
4   56    1        0          1        0        0     1        0      0   
5   45    1        0          3        1        0     0        0      0   
6   59    2        0          4        0        0     0        0      0   
7   41    3        0          5        1        0     0        0      0   
8   24    4        1          4        0        1     0        0      0   
9   25    1        1          1        0        1     0        0      0   

   day_of_week  ...  pdays  previous  poutcome  emp.var.rate  cons.price.idx  \
0            0  ...    999         0         0           1.1          93.994   
1            0

---

# Compas dataset
- Dataset taken from propublic compas analysis github repository
- Features used by IBM AIF360 kept
- Sensitive attribute: race

In [10]:
data = pd.read_csv("compas/compas.csv")
data = data[
    [
        "sex",
        "age",
        "age_cat",
        "juv_fel_count",
        "juv_misd_count",
        "juv_other_count",
        "priors_count",
        "c_charge_degree",
        "c_charge_desc",
        "race",
        "two_year_recid",
    ]
]
data = data.dropna()
data["sex"] = pd.factorize(data["sex"])[0]
data["age_cat"] = pd.factorize(data["age_cat"])[0]
data["c_charge_degree"] = pd.factorize(data["c_charge_degree"])[0]
data["c_charge_desc"] = pd.factorize(data["c_charge_desc"])[0]
data["race"] = data["race"].replace(
    {
        "Caucasian": 0,
        "Other": 1,
        "African-American": 1,
        "Hispanic": 1,
        "Native American": 1,
        "Asian": 1,
    }
)

print(data.head(10))
print(data.shape)
data.to_csv("compas.csv", header=None, index=None)

   sex  age  age_cat  juv_fel_count  juv_misd_count  juv_other_count  \
0    0   69        0              0               0                0   
1    0   34        1              0               0                0   
2    0   24        2              0               0                1   
3    0   23        2              0               1                0   
4    0   43        1              0               0                0   
5    0   44        1              0               0                0   
6    0   41        1              0               0                0   
7    0   43        1              0               0                0   
8    1   39        1              0               0                0   
9    0   21        2              0               0                0   

   priors_count  c_charge_degree  c_charge_desc  race  two_year_recid  
0             0                0              0     1               0  
1             0                0              1     1          

  data["race"] = data["race"].replace(


---

# Default of Credit Card Clients dataset
- Dataset taken from UCI ML repository
- Sensitive attribute: gender

In [11]:
data = pd.read_csv("default/default.csv", index_col="ID")
data = data.dropna()
data["SEX"] = data["SEX"] - 1
c = list(data.columns)
c = [c[0]] + c[2:-1] + [c[1]] + [c[-1]]
data = data.reindex(columns=c)
print(data.head(10))
print(data.shape)
data.to_csv("default.csv", index=None, header=None)

    LIMIT_BAL  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  PAY_4  PAY_5  \
ID                                                                           
1       20000          2         1   24      2      2     -1     -1     -2   
2      120000          2         2   26     -1      2      0      0      0   
3       90000          2         2   34      0      0      0      0      0   
4       50000          2         1   37      0      0      0      0      0   
5       50000          2         1   57     -1      0     -1      0      0   
6       50000          1         2   37      0      0      0      0      0   
7      500000          1         2   29      0      0      0      0      0   
8      100000          2         2   23      0     -1     -1      0      0   
9      140000          3         1   28      0      0      2      0      0   
10      20000          3         2   35     -2     -2     -2     -2     -1   

    PAY_6  ...  BILL_AMT5  BILL_AMT6  PAY_AMT1  PAY_AMT2  PAY_A

---

# German Credit dataset
- Dataset taken from UCI ML repository
- Sensitive attribute: gender

In [12]:
data = pd.read_csv("german/german.data", header=None, index_col=None, sep=" ")
data = data.dropna()

data[0] = pd.factorize(data[0])[0]
data[2] = pd.factorize(data[2])[0]
data[3] = pd.factorize(data[3])[0]
data[5] = pd.factorize(data[5])[0]
data[6] = pd.factorize(data[6])[0]
data[9] = pd.factorize(data[9])[0]
data[11] = pd.factorize(data[11])[0]
data[13] = pd.factorize(data[13])[0]
data[14] = pd.factorize(data[14])[0]
data[16] = pd.factorize(data[16])[0]
data[18] = pd.factorize(data[18])[0]
data[19] = pd.factorize(data[19])[0]

data[8] = data[8].replace({"A91": 0, "A92": 1, "A93": 0, "A94": 0, "A95": 1})
data[20] = data[20].replace({1: 0, 2: 1})

data = data.reindex(
    columns=[0, 2, 3, 5, 6, 9, 11, 13, 14, 16, 18, 19, 1, 4, 7, 10, 12, 15, 17, 8, 20]
)
print(data.head(10))
print(data.shape)
data.to_csv("german.csv", header=None, index=None)

   0   2   3   5   6   9   11  13  14  16  ...  19  1     4   7   10  12  15  \
0   0   0   0   0   0   0   0   0   0   0  ...   0   6  1169   4   4  67   2   
1   1   1   0   1   1   0   0   0   0   0  ...   0  48  5951   2   2  22   1   
2   2   0   1   1   2   0   0   0   0   1  ...   0  12  2096   2   3  49   1   
3   0   1   2   1   2   1   1   0   1   0  ...   0  42  7882   2   4  45   1   
4   0   2   3   1   1   0   2   0   1   0  ...   0  24  4870   3   4  53   2   
5   2   1   1   0   1   0   2   0   1   1  ...   0  36  9055   2   4  35   1   
6   2   1   2   2   0   0   1   0   0   0  ...   0  24  2835   3   4  53   1   
7   1   1   4   1   1   0   3   0   2   2  ...   0  36  6948   2   2  35   1   
8   2   1   0   3   2   0   0   0   0   1  ...   0  12  3059   2   4  61   1   
9   1   0   3   1   3   0   3   0   0   2  ...   0  30  5234   4   2  28   2   

   17  8   20  
0   1   0   0  
1   1   1   1  
2   2   0   0  
3   2   0   0  
4   2   0   1  
5   2   0   0  
6   1  

  data[8] = data[8].replace({"A91": 0, "A92": 1, "A93": 0, "A94": 0, "A95": 1})
