In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor, GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression

In [2]:
# Load the law-school CSV, using its first column as the row index, and
# display per-column summary statistics as the cell output.
data = pd.read_csv('law_data.csv',index_col=0)
data.describe()

Unnamed: 0,sex,LSAT,UGPA,ZFYA,sander_index,first_pf
count,21791.0,21791.0,21791.0,21791.0,21791.0,21791.0
mean,1.562342,36.772383,3.226589,0.096426,0.766949,0.88844
std,0.49611,5.446659,0.414182,0.932631,0.086736,0.314831
min,1.0,11.0,0.0,-3.35,0.3875,0.0
25%,1.0,33.0,3.0,-0.55,0.711607,1.0
50%,2.0,37.0,3.3,0.09,0.769643,1.0
75%,2.0,41.0,3.5,0.75,0.827381,1.0
max,2.0,48.0,4.2,3.48,1.0,1.0


In [3]:
def onehottify(df):
    """Filter, one-hot encode and split the law-school frame.

    Rows whose ``region_first`` equals ``'PO'`` are dropped. ``race`` is expanded
    into one lower-cased 0/1 indicator column per category present in the data,
    and ``sex`` into ``male`` (``sex == 2``) and ``female`` (``sex == 1``)
    indicator columns. The original ``race`` and ``sex`` columns are removed.

    The split is positional and unshuffled: the first 80% of the remaining rows
    become the training frame, the rest the test frame. The input frame is not
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``region_first``, ``race`` and ``sex`` columns.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_train, df_test)``; together they contain every filtered row
        exactly once.
    """
    #based on the data formatting in the original code repo https://github.com/mkusner/counterfactual-fairness/blob/master/law_school_classifiers.R
    df = df[df.region_first != 'PO'].copy()
    #one-hot encode race and gender
    # dtype=int keeps the race indicators as 0/1 integers, matching male/female,
    # regardless of the pandas version's default dummy dtype.
    race_df = pd.get_dummies(df['race'], dtype=int)
    for col in race_df.columns:
        df[col.lower()] = race_df[col].values
    df['male'] = (df['sex'] == 2).astype(int)
    df['female'] = (df['sex'] == 1).astype(int)
    df = df.drop(['race','sex'],axis=1)
    #train-test split (original split is done via an R-function and isn't pre-provided)
    #original code filters first_pf = 1 for train data (I don't know why)
    # Split by position rather than by dropping the training labels: a label-based
    # drop would also remove test rows that share an index label with a training row.
    split_at = int(df.shape[0]*.8)
    df_train = df.iloc[:split_at].copy()
    df_test = df.iloc[split_at:].copy()
    return df_train, df_test

# Build the encoded train/test frames, print their shapes, and display the
# training frame as the cell output.
train, test = onehottify(data)
print([train.shape,test.shape])
train

[(17432, 16), (4358, 16)]


Unnamed: 0,LSAT,UGPA,region_first,ZFYA,sander_index,first_pf,amerindian,asian,black,hispanic,mexican,other,puertorican,white,male,female
0,39.0,3.1,GL,-0.98,0.782738,1.0,0,0,0,0,0,0,0,1,0,1
1,36.0,3.0,GL,0.09,0.735714,1.0,0,0,0,0,0,0,0,1,0,1
2,30.0,3.1,MS,-0.35,0.670238,1.0,0,0,0,0,0,0,0,1,1,0
5,39.0,2.2,NE,0.58,0.697024,1.0,0,0,0,1,0,0,0,0,1,0
6,37.0,3.4,GL,-1.26,0.786310,1.0,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22478,40.0,2.7,SC,0.22,0.757143,1.0,0,0,0,0,0,0,0,1,1,0
22479,44.0,2.6,SC,0.84,0.797619,1.0,0,0,0,0,0,0,0,1,1,0
22480,31.0,3.8,SC,-0.65,0.749405,1.0,0,0,0,0,0,0,0,1,0,1
22481,42.0,3.8,SC,0.84,0.886905,1.0,0,0,0,0,0,0,0,1,0,1


In [4]:
def protected_cols():
    """Return a new list naming the sex and race indicator columns (the protected attributes)."""
    sex_flags = ['male', 'female']
    race_flags = ['amerindian', 'asian', 'black', 'hispanic',
                  'mexican', 'other', 'puertorican', 'white']
    return sex_flags + race_flags

def unprotected_cols():
    """Return a new list naming the non-protected feature columns."""
    score_features = ['LSAT', 'UGPA']
    return score_features

def ycol():
    """Return a new one-element list naming the target column."""
    target_names = ['ZFYA']
    return target_names

def lr():
    """Return a freshly constructed ``LinearRegression`` built with no arguments."""
    estimator = LinearRegression()
    return estimator

def rmse(x,x2):
    """Return the root-mean-square of the element-wise difference ``x - x2``."""
    residual = x - x2
    mean_squared = np.mean(residual ** 2)
    return np.sqrt(mean_squared)

def normalize(xtrain,xtest):
    """Standardize both inputs using statistics computed from ``xtrain`` only.

    The per-column mean and standard deviation come from ``xtrain.mean(axis=0)``
    and ``xtrain.std(axis=0)``; the same shift and scale are then applied to
    ``xtest`` so no test-set statistics are used.

    Columns whose training standard deviation is exactly zero are only centred
    (their scale is treated as 1) instead of being divided by zero, which would
    otherwise put NaN/inf into the features.

    Parameters
    ----------
    xtrain, xtest : pandas.DataFrame or numpy.ndarray
        Objects with matching columns that support ``mean``/``std`` along axis 0.

    Returns
    -------
    tuple
        ``(normalized_xtrain, normalized_xtest)``.
    """
    mean = xtrain.mean(axis=0)
    std = xtrain.std(axis=0)
    # Replace a zero spread by 1 so constant columns map to 0 rather than NaN.
    if isinstance(std, pd.Series):
        std = std.mask(std == 0, 1.0)
    else:
        std = np.where(std == 0, 1.0, std)

    def norm(x):
        return (x - mean)/std

    return norm(xtrain),norm(xtest)

def make_split(dtrain,dtest,xcols,yvar):
    """Select feature and target columns from both frames as floats.

    Features (``xcols``) are passed through ``normalize`` so they are scaled
    with training statistics; targets (``yvar``) are cast to float but not
    rescaled.

    Returns ``(xtrain, xtest, ytrain, ytest)``.
    """
    frames = (dtrain, dtest)
    features = [frame[xcols].astype(float) for frame in frames]
    targets = [frame[yvar].astype(float) for frame in frames]
    scaled_train, scaled_test = normalize(features[0], features[1])
    return scaled_train, scaled_test, targets[0], targets[1]

In [5]:
def unfair_aware_model(dtrain,dtest):
    """Fit a linear regression on the protected plus unprotected columns.

    The model is trained on ``dtrain`` and evaluated on ``dtest`` against the
    target returned by ``ycol()``.

    Returns ``(test_predictions, test_rmse)``.
    """
    feature_names = protected_cols() + unprotected_cols()
    x_fit, x_eval, y_fit, y_eval = make_split(dtrain, dtest, feature_names, ycol())
    regressor = lr()
    regressor.fit(x_fit.values, y_fit.values)
    predictions = regressor.predict(x_eval.values)
    error = rmse(predictions, y_eval.values)
    return predictions, error

# Evaluate the model that uses both protected and unprotected features; the cell
# output is the (test predictions, test RMSE) tuple.
unfair_aware_model(train,test)

(array([[ 0.05884847],
        [ 0.34801669],
        [ 0.34801669],
        ...,
        [ 0.31648194],
        [-0.02759233],
        [-0.07595873]]),
 0.8774416616997881)

In [6]:
def unfair_unaware_model(dtrain,dtest):
    """Fit a linear regression on the unprotected columns only.

    The model is trained on ``dtrain`` and evaluated on ``dtest`` against the
    target returned by ``ycol()``; the protected columns are not used as inputs.

    Returns ``(test_predictions, test_rmse)``.
    """
    feature_names = unprotected_cols()
    x_fit, x_eval, y_fit, y_eval = make_split(dtrain, dtest, feature_names, ycol())
    regressor = lr()
    regressor.fit(x_fit.values, y_fit.values)
    predictions = regressor.predict(x_eval.values)
    error = rmse(predictions, y_eval.values)
    return predictions, error

# Evaluate the model that uses only the unprotected features; the cell output is
# the (test predictions, test RMSE) tuple.
unfair_unaware_model(train,test)

(array([[-0.05815671],
        [ 0.30538129],
        [ 0.30538129],
        ...,
        [ 0.28900084],
        [-0.21824016],
        [-0.29313182]]),
 0.9020494345037581)

In [7]:
def score_residual(dtrain,dtest,y):
    """Return the part of column ``y`` not explained by the protected columns.

    A linear regression from the protected columns to ``y`` is fitted on
    ``dtrain``; its predictions are subtracted from the observed ``y`` in both
    frames.

    Returns ``(train_residuals, test_residuals)`` as arrays.
    """
    predictors = protected_cols()
    x_fit, x_eval, y_fit, y_eval = make_split(dtrain, dtest, predictors, [y])
    regressor = lr()
    regressor.fit(x_fit.values, y_fit.values)

    fitted_train = regressor.predict(x_fit.values)
    fitted_test = regressor.predict(x_eval.values)

    return (y_fit - fitted_train).values, (y_eval - fitted_test).values

def fair_deterministic_model(dtrain,dtest):
    """Fit a linear regression on the UGPA and LSAT residuals.

    Each score is first regressed on the protected columns via
    ``score_residual``; the residuals are stored as ``gpa_residual`` and
    ``lsat_residual`` on copies of the inputs, and those two columns are the only
    features of the final model. The caller's frames are not modified.

    Returns ``(test_predictions, test_rmse)``.
    """
    train_aug = dtrain.copy()
    test_aug = dtest.copy()

    # Compute every residual before adding any column, then attach them.
    residual_specs = [('gpa_residual', 'UGPA'), ('lsat_residual', 'LSAT')]
    computed = [
        (new_col, score_residual(train_aug, test_aug, score))
        for new_col, score in residual_specs
    ]
    for new_col, (res_train, res_test) in computed:
        train_aug[new_col] = res_train
        test_aug[new_col] = res_test

    feature_names = [new_col for new_col, _ in residual_specs]
    x_fit, x_eval, y_fit, y_eval = make_split(train_aug, test_aug, feature_names, ycol())
    regressor = lr()
    regressor.fit(x_fit.values, y_fit.values)
    predictions = regressor.predict(x_eval.values)
    return predictions, rmse(predictions, y_eval.values)

# Evaluate the residual-based model; the cell output is the
# (test predictions, test RMSE) tuple.
fair_deterministic_model(train,test)

(array([[-0.04684315],
        [ 0.22009449],
        [ 0.22009449],
        ...,
        [ 0.20928633],
        [-0.15796058],
        [-0.20862172]]),
 0.9290829349674626)

In [None]:
from causalnex.structure import StructureModel
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE
from IPython.display import Image
from causalnex.estimator.em import EMSingleLatentVariable
from causalnex.network import BayesianNetwork

def find_latent_knowledge(dtrain,dtest):
    """Fit a Bayesian network with a latent node ``'u'`` feeding the unprotected columns.

    The observed structure has an edge from every protected column to every
    unprotected column. After the observed node states and CPDs are fitted, the
    node ``'u'`` is added with edges to each unprotected column and its CPDs are
    fitted with ``fit_latent_cpds`` (``n_runs=3``).

    Only ``dtrain`` is used; ``dtest`` is accepted but not read.

    Returns the fitted ``BayesianNetwork``.
    """
    latent = 'u'
    used_cols = protected_cols() + unprotected_cols() + ycol()
    observed = dtrain.copy()
    observed = observed[used_cols].astype(float)

    # Observed structure: every protected attribute points at every unprotected score.
    structure = StructureModel()
    structure.add_edges_from(
        [(source, target) for source in protected_cols() for target in unprotected_cols()]
    )
    bn = BayesianNetwork(structure)
    bn.fit_node_states_and_cpds(observed)

    # Attach the latent node as a parent of each unprotected score.
    latent_edges = [(latent, target) for target in unprotected_cols()]
    bn.add_node(node=latent, edges_to_add=latent_edges, edges_to_remove=[])

    # Candidate states for the latent node: the default np.linspace grid over [-0.99, 0.99].
    options = list(np.linspace(-.99,.99))

    def states_with_latent():
        # A fresh mapping per call: the fitted node states plus the latent node's states.
        return {**bn.node_states, latent: set(options)}

    boundaries = EMSingleLatentVariable.get_default_box(
        sm=bn.structure,
        node_states=states_with_latent(),
        lv_name=latent,
    )
    priors = EMSingleLatentVariable.get_default_priors(
        sm=bn.structure,
        node_states=states_with_latent(),
        lv_name=latent,
    )
    bn.fit_latent_cpds(
        lv_name=latent,
        lv_states=options,
        data=observed,
        box_constraints=boundaries,
        priors=priors,
        n_runs=3,
    )
    return bn

# Bind the fitted network to its own name. Assigning it to `test` would replace
# the held-out DataFrame that the model cells above take as input, so they would
# fail if re-run after this cell.
latent_bn = find_latent_knowledge(train,test)
latent_bn.cpds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(self._node_states[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(self._node_states[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(self._node_states[col])
A value is trying to be set on a copy of a slice from a DataFrame.


In [14]:
# Preview the evenly spaced grid over [-0.99, 0.99]; find_latent_knowledge builds
# its `options` list from this same call.
np.linspace(-.99,.99)

array([-0.99      , -0.94959184, -0.90918367, -0.86877551, -0.82836735,
       -0.78795918, -0.74755102, -0.70714286, -0.66673469, -0.62632653,
       -0.58591837, -0.5455102 , -0.50510204, -0.46469388, -0.42428571,
       -0.38387755, -0.34346939, -0.30306122, -0.26265306, -0.2222449 ,
       -0.18183673, -0.14142857, -0.10102041, -0.06061224, -0.02020408,
        0.02020408,  0.06061224,  0.10102041,  0.14142857,  0.18183673,
        0.2222449 ,  0.26265306,  0.30306122,  0.34346939,  0.38387755,
        0.42428571,  0.46469388,  0.50510204,  0.5455102 ,  0.58591837,
        0.62632653,  0.66673469,  0.70714286,  0.74755102,  0.78795918,
        0.82836735,  0.86877551,  0.90918367,  0.94959184,  0.99      ])