In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
import sys
sys.path.append("/content/drive/MyDrive/fairness_discrepancy")

In [12]:
import numpy as np
import matplotlib.pyplot as plt
from contextlib import redirect_stdout
from random import seed
from scipy.optimize import minimize
from sklearn.metrics.pairwise import rbf_kernel

from prep_adult_data import *
import utils as ut

from loss_wrapper import loss_wrapper

SEED = 1122334455
seed(SEED) # set the random seed so that the random permutations can be reproduced again
np.random.seed(SEED)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
# @jit(nopython=True, parallel=True)
def discrepancy_loss(c, x, y, x_control, alpha, sensitive_attrs):
    svm_loss = 0.0
    coloring_loss = 0.0
    # assert no of samples
    temp_matrix = (c*y).T @ kernel_matrix
    svm_loss = .5*np.dot(temp_matrix, c*y)
    svm_loss -= c.sum()

    b = (temp_matrix.sum() - y.sum())/x.shape[0]

    for attr in sensitive_attrs:
        if(attr=="sex"):
            cond = x_control[attr] == 1.0
            # male_y = y[cond]	
            # female_y = y[~cond]
            male_kernel_matrix = kernel_matrix[cond]
            female_kernel_matrix = kernel_matrix[~cond]
            male_loss  = 0.0
            female_loss = 0.0

            male_loss = np.tanh(((c*y).T @ male_kernel_matrix.T) - b).sum()
            female_loss = np.tanh(((c*y).T @ female_kernel_matrix.T) - b).sum()
            # for i in range(male_kernel_matrix.shape[0]):
            #     temp = np.dot(c*y, male_kernel_matrix[i]) - b
            #     male_loss += np.tanh(temp)
                
            # for i in range(female_kernel_matrix.shape[0]):
            #     temp = np.dot(c*y, female_kernel_matrix[i]) - b
            #     female_loss += np.tanh(temp)
                
            coloring_loss = max(abs(male_loss), abs(female_loss))

    loss = (1-alpha)*svm_loss + alpha*coloring_loss
    print("loss: ", loss)
    return loss	

In [14]:
loss_function = discrepancy_loss
sensitive_attrs = ["sex"]

In [15]:
def load_data(data_size):
    """ Load the adult data """
    X, y, x_control = load_adult_data(data_size) # set the argument to none, or no arguments if you want to test with the whole data -- we are subsampling for performance speedup
    ut.compute_p_rule(x_control["sex"], y) # compute the p-rule in the original data

    """ Split the data into train and test """
    train_fold_size = 0.7
    # x_train, y_train, x_control_train, x_test, y_test, x_control_test = ut.split_into_train_test(X, y, x_control, train_fold_size)
    return ut.split_into_train_test(X, y, x_control, train_fold_size)

In [16]:
data_size = 20222
method = 'cobyla'
initiator = 0.00001
l = 1
catol = 0.001
alpha = 0.5
max_iter = 100

In [17]:
x_train, y_train, x_control_train, x_test, y_test, x_control_test = load_data(data_size)
kernel_matrix = rbf_kernel(x_train, x_train)

Looking for file 'adult.data' in the current directory...
File found in current directory..
Looking for file 'adult.test' in the current directory...
File found in current directory..
Loading only 20222 examples from the data
Total data points: 20222
# non-protected examples: 13736
# protected examples: 6486
Non-protected in positive class: 4212 (31%)
Protected in positive class: 743 (11%)
P-rule is: 37%


In [18]:
n = x_train.shape[0]
m = x_test.shape[0]
c_init = np.random.rand(n,1)*initiator

print("iter: ", max_iter, ", lambda: ", l, ", alpha: ", alpha, ", kernel: rbf"\
        " method: ", method, ", catol: ", catol, ", initiator: ", initiator)
print("c_init: ", c_init)

y_test = y_test.reshape(m, 1)
x_control_test["sex"] = x_control_test["sex"].reshape(m, 1)

y_train = y_train.reshape(n, 1)
x_control_train["sex"] = x_control_train["sex"].reshape(n, 1)

loss_func_args=(x_train, y_train, x_control_train, alpha, sensitive_attrs)
kernel_obj = loss_wrapper(loss_function)
c = c_init

iter:  100 , lambda:  1 , alpha:  0.5 , kernel: rbf method:  cobyla , catol:  0.001 , initiator:  1e-05
c_init:  [[6.22300636e-06]
 [1.69693936e-06]
 [6.47108542e-06]
 ...
 [7.86326139e-06]
 [1.70882711e-06]
 [1.70806183e-06]]


In [19]:
print(x_control_train["sex"].shape)

(14155, 1)


In [23]:
c = minimize(fun = kernel_obj.simulate,
            x0 = c_init,
            args = loss_func_args,
            method = method,
            options = {"maxiter":max_iter}
            #bounds = [(0, 1/(2*n*l)) for i in range(n)]
            # constraints = constraints
            )
        
print()
print("weights: ", c.x)
print("cy dot constraint :", np.dot(c.x,y))
print()

if(c.success != True):
    print("Optimization problem did not converge.. Check the solution returned by the optimizer.")
    print("Returned solution is:")
    print(c)
    print()

IndexError: ignored