# Scoring Function  

Measure the importance of each data entry in a dataset with respect to model accuracy and bias.  

In [1]:
# Environment setup: move the working directory up one level and import the
# array / data-frame libraries used by the cells below.
import os
# NOTE(review): presumably this makes the project-level `utils` package
# importable — confirm. Re-running this cell moves up one more directory
# each time, so run it only once per kernel session.
os.chdir("..")
import sys
import numpy as np
import pandas as pd

In [2]:
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [3]:
from utils.data import Dataset, create_adult_dataset
from utils.completer import complete_by_similar_row

In [4]:
# Build the dataset object through the project helper, then echo its feature
# table (the bare expression on the last line is what the notebook displays).
data = create_adult_dataset()
data.X

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,hours-per-week
0,39,6.0,9,13,4,0.0,1,4,Male,40
1,50,5.0,9,13,2,3.0,0,4,Male,13
2,38,3.0,11,9,0,5.0,1,4,Male,40
3,53,3.0,1,7,2,5.0,0,2,Male,40
4,28,3.0,9,13,2,9.0,5,2,Female,40
...,...,...,...,...,...,...,...,...,...,...
32556,27,3.0,7,12,2,12.0,5,4,Female,38
32557,40,3.0,11,9,2,6.0,0,4,Male,40
32558,58,3.0,11,9,6,0.0,4,4,Female,40
32559,22,3.0,11,9,4,0.0,3,4,Male,20


In [5]:
# Glue the target onto the features so that a row with a missing value in
# either one is dropped from both, then re-number the surviving rows from 0.
target_frame = pd.DataFrame(data.y, columns=["_TARGET_"])
tmp_concat = (
    pd.concat([data.X, target_frame], axis=1)
    .dropna()
    .reset_index(drop=True)
)

# Split the cleaned table back into features and a flat target array.
data.y = tmp_concat["_TARGET_"].copy().to_numpy().ravel()
data.X = tmp_concat.drop(columns=["_TARGET_"]).copy()

# Per-column count of remaining missing values (displayed by the notebook).
data.X.isnull().sum()

age               0
workclass         0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
hours-per-week    0
dtype: int64

In [6]:
# Check which container type holds the feature table after the cleanup.
type(data.X)

pandas.core.frame.DataFrame

In [7]:
class Experiment:
    """Train a classifier on a dataset and report its accuracy and group bias.

    The protected column is kept out of the model's features; it is used only
    to partition the validation rows into two groups whose error rates are
    compared.

    Typical use: call ``split()`` to draw a train/validation partition, then
    ``report(protectedA, protectedB)`` to fit, evaluate and print the scores.
    """

    def __init__(self, Xdata: pd.DataFrame, Ydata: np.ndarray, protected):
        """Store the data and create a fresh classifier and scaler.

        Args:
            Xdata: feature table; must contain the ``protected`` column.
            Ydata: targets, one per row of ``Xdata``.
            protected: name of the column holding the protected attribute.
        """
        assert len(Xdata) == len(Ydata), "Xdata and Ydata differ in length"
        assert protected in Xdata.columns.tolist(), "protected column not in Xdata"
        self.X = Xdata
        self.Y = Ydata
        self.p = protected
        self.clf = RandomForestClassifier()
        self.scaler = StandardScaler()
        self.reset()

    def bias(self, data, A=1, B=1):
        """Return the weighted gap in error rates between two groups.

        Computes ``A*|FPR_A - FPR_B| + B*|FNR_A - FNR_B|``.

        Args:
            data: flattened 2x2 confusion matrices of both groups, in the
                order ``[TN_A, FP_A, FN_A, TP_A, TN_B, FP_B, FN_B, TP_B]``.
            A: weight of the false-positive-rate gap.
            B: weight of the false-negative-rate gap.

        Raises:
            IndexError: if ``data`` has fewer than 8 entries.
            ZeroDivisionError: if ``data`` holds plain Python numbers and a
                group has no actual negatives or no actual positives.
        """
        fpr_a = data[1] / (data[1] + data[0])
        fnr_a = data[2] / (data[2] + data[3])
        fpr_b = data[5] / (data[5] + data[4])
        fnr_b = data[6] / (data[6] + data[7])
        return A * abs(fpr_a - fpr_b) + B * abs(fnr_a - fnr_b)

    def reset(self):
        """Discard the current train/validation partition."""
        self.X_train = None
        self.Y_train = None
        self.X_val = None
        self.Y_val = None

    def split(self, test_size=0.3, random_state=None):
        """Draw a new random train/validation partition.

        Args:
            test_size: forwarded to ``train_test_split``.
            random_state: forwarded to ``train_test_split``; pass an int to
                make the partition reproducible. ``None`` keeps the previous
                unseeded behaviour.
        """
        self.X_train, self.X_val, self.Y_train, self.Y_val = train_test_split(
            self.X, self.Y, test_size=test_size, random_state=random_state
        )
        # Re-number the rows from 0 so that row labels coincide with positions
        # in the accompanying target arrays.
        self.X_train = self.X_train.reset_index(drop=True)
        self.X_val = self.X_val.reset_index(drop=True)

    def report(self, protectedA, protectedB):
        """Fit on the training part and print accuracy, bias and elapsed time.

        The protected column is dropped, the remaining features are
        standardised with statistics fitted on the training part, and the
        classifier is trained. Bias is computed from the validation confusion
        matrices of the two groups; when it cannot be computed (for example a
        group without positives or negatives) the reason is printed and the
        bias is reported as NaN.

        Args:
            protectedA: value of the protected column identifying group A.
            protectedB: value of the protected column identifying group B.

        Raises:
            RuntimeError: if ``split()`` has not been called since the last
                ``reset()``.
        """
        started = time.time()
        if self.X_train is None or self.X_val is None:
            raise RuntimeError("No train/validation partition: call split() before report()")
        group_values = self.X[self.p].unique().tolist()
        assert protectedA in group_values, "protectedA not found in protected column"
        assert protectedB in group_values, "protectedB not found in protected column"

        # The model never sees the protected attribute.
        X_train = self.X_train.drop(columns=[self.p])
        X_val = self.X_val.drop(columns=[self.p])

        # Scaling statistics come from the training part only.
        self.scaler.fit(X_train)
        X_train = self.scaler.transform(X_train)
        X_val = self.scaler.transform(X_val)

        self.clf.fit(X_train, self.Y_train)
        pred = self.clf.predict(X_val)
        acc = accuracy_score(self.Y_val, pred)

        # Positional boolean masks select each group's rows; the predictions
        # already computed above are reused instead of predicting again.
        mask_a = (self.X_val[self.p] == protectedA).to_numpy()
        mask_b = (self.X_val[self.p] == protectedB).to_numpy()

        # Passing the full label set keeps the matrix shape fixed even when a
        # group happens to contain (or be predicted as) a single class. The
        # bias formula reads each matrix as 2x2, so a binary target is expected.
        labels = np.unique(self.Y)
        matrix_a = confusion_matrix(self.Y_val[mask_a], pred[mask_a], labels=labels)
        matrix_b = confusion_matrix(self.Y_val[mask_b], pred[mask_b], labels=labels)
        bias_data = matrix_a.ravel().tolist() + matrix_b.ravel().tolist()

        try:
            bias = self.bias(bias_data)
        except (ZeroDivisionError, IndexError) as e:
            print("Bias Exception: {}".format(e))
            bias = np.nan
        elapsed = time.time() - started
        print("Accuracy = {:.3f} \t Bias = {:.3f} \t Time = {:.3f}s".format(acc, bias, elapsed))


In [8]:
# Build the experiment on the dataset's features and targets, using the first
# protected feature listed by the dataset as the group column.
e = Experiment(data.X, data.y, data.protected_features[0])

In [9]:
# Clear any previous partition, draw a new train/validation split with the
# default test_size, then fit and print accuracy, bias and elapsed time.
e.reset()
e.split()
# Note the leading space in both group labels; report() asserts that each
# label occurs in the protected column.
e.report(protectedA=" Male", protectedB=" Female")

Accuracy = 0.817 	 Bias = 0.198 	 Time = 1.610s
