# The code of this notebook is identical to `LR constrianed.ipynb` except it does not implement the constraints in the paper.
This was just made as reference to compare the fairness of the logistic regression algorithm towards different races, with and without the constraints.

In [2]:
import pandas as pd
import numpy as np

In [3]:
# df = pd.read_csv('C:/Users/Brendan/Downloads/compas-scores-two-years.csv')
df = pd.read_csv('compas-scores-two-years.csv')

In [4]:
#data wrangling 
df['race'] = df['race'].replace('African-American', 1).replace('Caucasian', 0)

df = df[(df['race'] == 0) | (df['race'] == 1)]

df['sex'] = df['sex'].replace('Male', 1).replace('Female', 0)

df['score_text'] = df['score_text'].replace('High', 1).replace('Medium', 0).replace('Low', -1)

df['c_charge_degree'] = df['c_charge_degree'].replace('M',1).replace('F',0)

df['days_in_jail'] = (pd.to_datetime(df['c_jail_out'])-pd.to_datetime(df['c_jail_in'])).dt.days

In [16]:
# Identifying columns with a significant amount of NA values
df.isna().sum()

id                            0
name                          0
first                         0
last                          0
compas_screening_date         0
sex                           0
dob                           0
age                           0
age_cat                       0
race                          0
juv_fel_count                 0
decile_score                  0
juv_misd_count                0
juv_other_count               0
priors_count                  0
days_b_screening_arrest     235
c_jail_in                   235
c_jail_out                  235
c_case_number                14
c_offense_date              999
c_arrest_date              5165
c_days_from_compas           14
c_charge_degree               0
c_charge_desc                21
is_recid                      0
r_case_number              3089
r_charge_degree            3089
r_days_from_arrest         4087
r_offense_date             3089
r_charge_desc              3141
r_jail_in                  4087
r_jail_o

In [5]:
# dropping rows with NA values as the optimizer doesn't work with them
df = df[['race','age', 'c_charge_degree', 'score_text', 'sex', 'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid','two_year_recid']].dropna()

X = df[['race','age', 'c_charge_degree', 'score_text', 'sex', 'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid']]
z = df['race']
y = df['two_year_recid']

from sklearn.model_selection import train_test_split
X_train, X_test, z_train, z_test, y_train, y_test = train_test_split(X, z, y, test_size=0.25, random_state=5243)
print("Rows in data:",len(df))

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.fit_transform(X_test)

Rows in data: 5915


In [6]:
np.random.seed(1000)
theta = np.random.rand(X_train_normalized.shape[1])
print(theta)

import scipy
from scipy.optimize import minimize
from numpy.core.fromnumeric import transpose

# constraints
def condition_1(theta, X, Z, c):
  return -np.matmul((Z-Z.mean()),np.matmul(theta,transpose(X)))/X.shape[0] + c

def condition_2(theta, X, Z, c):
  return np.matmul((Z-Z.mean()),np.matmul(theta,transpose(X)))/X.shape[0] + c

# pdf
def p(theta,X):
  return 1/(1 + np.exp(np.matmul(X, -theta)))

# function to minimize
def minimizer(theta,X):
  return -sum(np.log(p(theta,X)))

# Using scipy to minimize the loss function with constraints defined earlier
optimization = scipy.optimize.minimize(
    minimizer,
    args=X_train_normalized,
    x0=theta,
    method='SLSQP',
#     constraints=(
#         {'type': 'ineq', 'fun': condition_1, 'args': (X_train_normalized, z_train , 0.8)},
#         {'type': 'ineq', 'fun': condition_2, 'args': (X_train_normalized, z_train , 0.8)}),
        options={"maxiter": 100000})
theta = optimization.x
theta

[0.65358959 0.11500694 0.95028286 0.4821914  0.87247454 0.21233268
 0.04070962 0.39719446 0.2331322 ]


  return 1/(1 + np.exp(np.matmul(X, -theta)))
  return -sum(np.log(p(theta,X)))


array([-6.10841041e-07,  5.84076738e-07,  3.75009022e-08, -1.93468017e-06,
       -1.40779694e-08, -9.49493990e-08,  6.26809049e-07,  2.48949457e-06,
       -4.08445167e-07])

In [8]:
# Preprocessing results
X_test_r0 = scaler.fit_transform(X_test)[X_test.reset_index()[X_test.reset_index()['race']==0].index]
X_test_r1 = scaler.fit_transform(X_test)[X_test.reset_index()[X_test.reset_index()['race']==1].index]
y_test_r0 = y_test.iloc[X_test.reset_index()[X_test.reset_index()['race']==0].index]
y_test_r1 = y_test.iloc[X_test.reset_index()[X_test.reset_index()['race']==1].index]

In [7]:
# Accuracy
preds = 1/(1+np.exp(np.matmul(X_test_normalized, theta)))
print("Overall accuracy:",((preds > 0.5) == y_test).mean())
# Results may vary when re-run

Overall accuracy: 0.6260987153482083


In [9]:
# Calibration: accuracies by race
preds_r0 = 1/(1+np.exp(np.matmul(X_test_r0, theta)))
preds_r1 = 1/(1+np.exp(np.matmul(X_test_r1, theta)))
print("Accuracy for Caucasian:",((preds_r0 > 0.5) == y_test_r0).mean())
print("Accuracy for African-American",((preds_r1 > 0.5) == y_test_r1).mean())

Accuracy for Caucasian: 0.6622296173044925
Accuracy for African-American 0.6013667425968109


In [10]:
# Parity
print("Probability of predicting two_year_recid = 1 for Caucasian:",((preds_r0 > 0.5) == 1).mean())
print("Probability of predicting two_year_recid = 1 for African-American:",((preds_r1 > 0.5) == 1).mean())

Probability of predicting two_year_recid = 1 for Caucasian: 0.23960066555740434
Probability of predicting two_year_recid = 1 for African-American: 0.7391799544419134


In [11]:
# Odds
test_results = pd.DataFrame({"X":(1/(1+np.exp(np.matmul(X_test_normalized, theta)))>0.5),"z":z_test,"y":y_test})
rs0 = test_results[test_results['y']==0]
print("Accuracy for Caucasian when true y = 0:",(rs0[rs0['z']==0]['X'] == 0).mean())
print("Accuracy for African-American when true y = 0:",(rs0[rs0['z']==1]['X'] == 0).mean())

rs1 = test_results[test_results['y']==1]
print("Accuracy for Caucasian when true y = 1:",(rs1[rs1['z']==0]['X']).mean())
print("Accuracy for African-American when true y = 1:",(rs1[rs1['z']==1]['X']).mean())

Accuracy for Caucasian when true y = 0: 0.8691860465116279
Accuracy for African-American when true y = 0: 0.3452685421994885
Accuracy for Caucasian when true y = 1: 0.3852140077821012
Accuracy for African-American when true y = 1: 0.8069815195071869
