In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, fbeta_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
f_half_scorer = make_scorer(fbeta_score, beta=0.5)

%matplotlib inline

  return f(*args, **kwds)


# Load and clean the data

In [None]:
df = pd.read_csv('data/OH.csv')

### These are the columns that will be used in the model

In [None]:
workable_cols = ['stop_date','stop_time','location_raw','driver_gender','driver_race','driver_race_raw','violation',
                 'search_conducted','contraband_found','is_arrested','drugs_related_stop']

df_reduced = df[workable_cols].copy()

### Convert time into hours and date into month

In [None]:
df_reduced['stop_hour'] = df_reduced['stop_time'].apply(lambda x: x.split(':')[0])
df_reduced['stop_month'] = df_reduced['stop_date'].apply(lambda x: x.split('-')[1])
df_reduced.drop('stop_time',axis=1,inplace=True)
df_reduced.drop('stop_date',axis=1,inplace=True)

### Save the 20 most frequent locations, which account for ~90% of all the locations. The remainder is set to null, to be imputed later

In [None]:
print(df_reduced['location_raw'].value_counts()[:20].sum()/df_reduced.shape[0])
keep_locations = list(df_reduced['location_raw'].value_counts().keys()[:20])
df_reduced['location_raw'] = df_reduced['location_raw'].apply(lambda x: int(x) if x in keep_locations else None)

### There are two columns for driver race. Impute the 'other' values in one, with the values from the other

In [None]:
df_reduced['driver_race'].loc[df_reduced['driver_race']=='Other'] = df_reduced['driver_race_raw'].loc[df_reduced['driver_race']=='Other']
#df_reduced['driver_race'] = df_reduced.apply(lambda x: x['driver_race_raw'] if x['driver_race'] == 'Other' else x['driver_race'],axis=1)
df_reduced.drop('driver_race_raw',axis=1,inplace=True)

### There is a realtively small number of unique violations

In [None]:
violations_unique = []
for l in df_reduced['violation'].dropna().unique():
    violations_unique = violations_unique + l.lower().split(',')
violations_unique = set(violations_unique)
print(violations_unique)

### Create a mapping between violations to integer values, which are ranked from worst to least
### The worst one will be selected in each row

In [None]:
def violations_denoter(x):
    
    violations_severity = {
        'dui':0,
        'speeding':1,
        'stop sign/light':2,
        'license':3,
        'cell phone':4,
        'paperwork':5,
        'registration/plates':6,
        'safe movement':7,
        'seat belt':8,    
        'equipment':9,
        'lights':10,
        'truck':11,
        'other':12,
        'other (non-mapped)':13
    }
    
    violations = []
    for k,v in violations_severity.items():
        if (k in x.lower()):
            violations.append(v)
            
    return min(violations)

In [None]:
df_reduced['violations_numbered'] = df_reduced['violation'].fillna('other (non-mapped)').apply(violations_denoter)
df_reduced.drop('violation',axis=1,inplace=True)

In [None]:
categorical = list(df_reduced.columns)
categorical.remove('is_arrested')

In [None]:
def replace_na_categorical(df,column):

    prob = dict(df[column].value_counts()/len(df))
    
    keys = list(prob.keys())
    
    sum_prob = sum(prob.values())

    for k,v in prob.items():
        prob[k] = prob[k]/sum_prob

    prob_list = list(prob.values())

    to_fillin = np.random.choice(keys, len(df[column].loc[df[column].isnull()]), p = prob_list)
    
    df[column].loc[df[column].isnull()] = to_fillin
    
    return df

In [None]:
for c in categorical:
    df_reduced = replace_na_categorical(df_reduced,c)

In [None]:
dummy = pd.get_dummies(df_reduced,columns=categorical)
dummy.to_csv('data/df_cleaned.csv',index=False)
dummy_sampled = dummy.sample(frac=0.1,random_state=2018).reset_index(drop=True)
dummy_sampled.to_csv('data/df_sampled_cleaned.csv',index=False)

## Set up the model training and testing

In [481]:
def run_HPS_search(train_X,train_Y,model,n_iter,score,cv,weights=None):
    
    model_selection, params = get_model_and_params(model)
    
    RS = RandomizedSearchCV(model_selection,param_distributions=params,n_iter=n_iter,scoring=score,cv=cv,error_score='0')
    RS.fit(train_X,train_Y,sample_weight=weights)
    print("Best params - ", RS.best_params_)
    print("Highest %s = %s"%(score,RS.best_score_))
    
    return RS.best_estimator_

In [482]:
def get_model_and_params(model):
    
    model_selection = {
        'RFC':RandomForestClassifier(),
        'GBC':GradientBoostingClassifier(),
        'LR':LogisticRegression()
    }
    
    model_hyperparameters = {
        'RFC':{
            'n_estimators':range(1,101),
            'max_depth':range(1,50),
            'n_jobs':[-1],
            'criterion':['gini','entropy'],
            'class_weight':['balanced_subsample']
        },
        'GBC':{
            'loss':['deviance','exponential'],
            'learning_rate':10**np.linspace(-4,-1,10),
            'n_estimators':range(50,150)
        },
        'LR':{
            'C':10.**np.linspace(-3,3,20),
            'tol':10**np.linspace(-5,-1,20),
            'penalty':['l2'],
            'class_weight':['balanced']
        }
    }
    
    return model_selection[model], model_hyperparameters[model]

In [685]:
def run_clf(model,train_X,train_Y,params,weights=None):
    
    clf = get_model_and_params(model)[0]
    
    clf.set_params(**params)
    clf.fit(train_X,train_Y,sample_weight=weights)
    
    return clf

In [645]:
def check_error_and_discrimination(clf,test_X,test_Y,df_test,sensitive_features):

    Y_predict = clf.predict(test_X)

    print("F1 score = %s" %f1_score(test_Y,Y_predict))
    print("Precision score = %s" %precision_score(test_Y,Y_predict))
    print("Recall score = %s" %recall_score(test_Y,Y_predict))
    print("Accuracy score = %s" %accuracy_score(test_Y,Y_predict))

    print()

    s = np.asarray(df_test[sensitive_features])
    print("Discrimination_ratio = %s\nDiscrimination_normalized = %s" %(discrimination(np.expand_dims(Y_predict,1),s)))

    print()

    features = ['driver_race_Asian','driver_race_Black','driver_race_White','driver_race_Hispanic']
    
    bias = [
    features,
    list(df_test[features].mean()),
    list(df_test[features].loc[Y_predict==1].mean()),
    list(df_test[features].loc[Y_predict==0].mean())
    ]
        
    return bias

In [485]:
def discrimination(y,s):
    
    disc_ratio = float(len(s[s==0]))*float(y[s==1].sum())/(float(y[s==0].sum())*float(len(s[s==1])))
    disc_norm = np.abs(float(y[s==0].sum())/float(len(s[s==0])) - float(y[s==1].sum())/float(len(s[s==1])))/float(y.mean())
    
    return disc_ratio, disc_norm

In [None]:
def balance_df(df,f_balance):
    
    if (f_balance == -1):
        return df
    
    cond = df['is_arrested']==True

    df_balanced = pd.concat(
        [df.loc[cond],
         df.loc[~cond].sample(n=df.loc[cond].shape[0]*f_balance)],
        axis=0
    ).sample(frac=1).reset_index(drop=True)
    
    return df_balanced

In [618]:
df = pd.read_csv('data/df_sampled_cleaned.csv')

In [619]:
df_train, df_test = train_test_split(df,test_size=0.3)

In [620]:
target_feature = 'is_arrested'
sensitive_features = ['driver_race_Black']

In [621]:
train_X = np.asarray(df_train.drop([target_feature],axis=1))
train_Y = np.asarray(df_train[target_feature]).astype(int)

test_X = np.asarray(df_test.drop([target_feature],axis=1))
test_Y = np.asarray(df_test[target_feature]).astype(int)

In [622]:
params = {'n_jobs': -1, 'n_estimators': 47, 'max_depth': 8, 'criterion': 'gini', 'class_weight': None}
weights = np.ones(train_Y.shape[0])
weights[train_Y==1] = 1.
weights[train_Y==0] = 0.5
RFC = run_clf('RFC',train_X,train_Y,params,weights=weights)
bias = check_error_and_discrimination(RFC,test_X,test_Y,df_test,sensitive_features)
df_bias = pd.DataFrame(bias).transpose()
df_bias.columns = ["Group","N_group/N_total","N_group/N_total|(predicted arrest)","N_group/N_total|(predicted not arrest)"]

F1 score = 0.228466707392
Precision score = 0.52791718946
Recall score = 0.145777392811
Accuracy score = 0.993855011353

Discrimination ratio = 2.653648918290935
Discrimination (normalized) = 1.37081408312



In [623]:
df_bias.head()

Unnamed: 0,Group,N_group/N_total,N_group/N_total|(predicted arrest),N_group/N_total|(predicted not arrest)
0,driver_race_Asian,0.013555,0.00313676,0.013573
1,driver_race_Black,0.12477,0.274467,0.124512
2,driver_race_White,0.83926,0.696989,0.839506
3,driver_race_Hispanic,0.0217688,0.0244668,0.0217642


## Begin setting up the VFAE

In [717]:
def gen_weights_biases(shape):
    return tf.Variable(tf.random_normal(shape, stddev=tf.sqrt(0.5 / float(shape[0]))))

In [718]:
def activate(x,activation):
    if (activation == 'relu'):
        return tf.nn.relu(x)
    elif (activation == 'sigmoid'):
        return tf.nn.sigmoid(x)
    elif (activation == 'tanh'):
        return tf.nn.tanh(x)
    elif (activation == 'softmax'):
        return tf.nn.softmax(x)
    elif (activation == 'linear'):
        return x

In [719]:
def get_batch(list_arrays,batch_size,index_shuffled,b):
    
    return [x[index_shuffled[b*batch_size:(b+1)*batch_size]] for x in list_arrays]

In [720]:
def initialize_params(dims,N_epochs=1000,print_freq=100,batch_size=100,lr=1e-3,alpha=1.,beta=0.):

    params = {
        'enc1':{
            'in_dim':dims['x']+dims['s'],
            'hid_dim':dims['enc1_hid'],
            'out_dim':dims['z1'],
            'act':{
                'hid':'relu',
                'mu':'linear',
                'log_sigma':'linear'
            }
        },   
        'enc2':{
            'in_dim':dims['z1']+1,
            'hid_dim':dims['enc2_hid'],
            'out_dim':dims['z2'],
            'act':{
                'hid':'relu',
                'mu':'linear',
                'log_sigma':'linear'
            }
        },
        'dec1':{
            'in_dim':dims['z2']+1,
            'hid_dim':dims['dec1_hid'],
            'out_dim':dims['z1'],
            'act':{
                'hid':'relu',
                'mu':'linear',
                'log_sigma':'linear'
            }
        },
        'dec2':{
            'in_dim':dims['z1']+dims['s'],
            'hid_dim':dims['dec2_hid'],
            'out_dim':dims['x']+dims['s'],
            'act':{
                'hid':'relu',
                'mu':'sigmoid',
                'log_sigma':'sigmoid'
            }
        },
        'us':{
            'in_dim':dims['z1'],
            'hid_dim':dims['us_hid'],
            'out_dim':dims['y_cat'],
            'act':{
                'hid':'relu',
                'mu':'softmax',
                'log_sigma':'softmax'
            }
        },
        'N_epochs':N_epochs,
        'print_frequency':print_freq,
        'batch_size':batch_size,
        'lr':lr,
        'alpha':alpha,
        'beta':beta
    }
    
    return params

In [721]:
def initialize_weights_biases(params):

    weights = {
        'enc1':{
            'hid':gen_weights_biases([params['enc1']['in_dim'],params['enc1']['hid_dim']]),
            'mu':gen_weights_biases([params['enc1']['hid_dim'],params['enc1']['out_dim']]),
            'log_sigma':gen_weights_biases([params['enc1']['hid_dim'],params['enc1']['out_dim']])
        },
        'enc2':{
            'hid':gen_weights_biases([params['enc2']['in_dim'],params['enc2']['hid_dim']]),
            'mu':gen_weights_biases([params['enc2']['hid_dim'],params['enc2']['out_dim']]),
            'log_sigma':gen_weights_biases([params['enc2']['hid_dim'],params['enc2']['out_dim']])
        },
        'dec1':{
            'hid':gen_weights_biases([params['dec1']['in_dim'],params['dec1']['hid_dim']]),
            'mu':gen_weights_biases([params['dec1']['hid_dim'],params['dec1']['out_dim']]),
            'log_sigma':gen_weights_biases([params['dec1']['hid_dim'],params['dec1']['out_dim']])
        },
        'dec2':{
            'hid':gen_weights_biases([params['dec2']['in_dim'],params['dec2']['hid_dim']]),
            'mu':gen_weights_biases([params['dec2']['hid_dim'],params['dec2']['out_dim']]),
            'log_sigma':gen_weights_biases([params['dec2']['hid_dim'],params['dec2']['out_dim']])
        },
        'us':{
            'hid':gen_weights_biases([params['us']['in_dim'],params['us']['hid_dim']]),
            'mu':gen_weights_biases([params['us']['hid_dim'],params['us']['out_dim']]),
            'log_sigma':gen_weights_biases([params['us']['hid_dim'],params['us']['out_dim']])
        }       
    }

    bias = {
        'enc1':{
            'hid':gen_weights_biases([params['enc1']['hid_dim']]),
            'mu':gen_weights_biases([params['enc1']['out_dim']]),
            'log_sigma':gen_weights_biases([params['enc1']['out_dim']])
        },
        'enc2':{
            'hid':gen_weights_biases([params['enc2']['hid_dim']]),
            'mu':gen_weights_biases([params['enc2']['out_dim']]),
            'log_sigma':gen_weights_biases([params['enc2']['out_dim']])
        },
        'dec1':{
            'hid':gen_weights_biases([params['dec1']['hid_dim']]),
            'mu':gen_weights_biases([params['dec1']['out_dim']]),
            'log_sigma':gen_weights_biases([params['dec1']['out_dim']])
        },
        'dec2':{
            'hid':gen_weights_biases([params['dec2']['hid_dim']]),
            'mu':gen_weights_biases([params['dec2']['out_dim']]),
            'log_sigma':gen_weights_biases([params['dec2']['out_dim']])
        },
        'us':{
            'hid':gen_weights_biases([params['us']['hid_dim']]),
            'mu':gen_weights_biases([params['us']['out_dim']]),
            'log_sigma':gen_weights_biases([params['us']['out_dim']])
        }
    }
    
    return weights, bias

In [722]:
def MLP(x_in,weights,bias,activation,epsilon):
    
    hidden_en = activate(tf.matmul(x_in,weights['hid'])+bias['hid'],activation['hid'])

    mu = activate(tf.matmul(hidden_en,weights['mu'])+bias['mu'],activation['mu'])

    log_sigma = activate(tf.matmul(hidden_en,weights['log_sigma'])+bias['log_sigma'],activation['log_sigma'])

    return mu + tf.exp(log_sigma / 2) * epsilon, mu, log_sigma

In [723]:
def KL(mu1,log_sigma_sq1,mu2=0.,log_sigma_sq2=0.):
    return 0.5*tf.reduce_sum(log_sigma_sq2-log_sigma_sq1-1+(tf.exp(log_sigma_sq1)+tf.pow(mu1-mu2,2))/tf.exp(log_sigma_sq2),axis=1)

In [724]:
def LH(x,mu,log_sigma):
    return 0.5 * tf.reduce_sum(np.log(2 * np.pi) + log_sigma + tf.pow(x - mu,2) / tf.exp(log_sigma), axis=1)

In [725]:
def train_VFAE(train_X,train_Y,train_s,test_X,test_Y,test_s,weights,bias,params,dims):

    x = tf.placeholder(tf.float32,shape=[None,dims['x']],name='x')
    s = tf.placeholder(tf.float32,shape=[None,dims['s']],name='s')
    y = tf.placeholder(tf.float32,shape=[None,1],name='y')
    index = tf.placeholder(tf.int32,shape=[None,1],name='index')
    
    epsilon0 = tf.random_normal([params['enc1']['out_dim']], dtype=tf.float32, name='epsilon0')
    z1_enc, z1_enc_mu, z1_enc_log_sigma = MLP(tf.concat([x,s],axis=1),weights['enc1'],bias['enc1'],params['enc1']['act'],epsilon0)
    
    epsilon1 = tf.random_normal([params['enc2']['out_dim']], dtype=tf.float32, name='epsilon1')
    z2_enc, z2_enc_mu, z2_enc_log_sigma = MLP(tf.concat([z1_enc,y],axis=1),weights['enc2'],bias['enc2'],params['enc2']['act'],epsilon1)
    
    epsilon2 = tf.random_normal([params['dec1']['out_dim']], dtype=tf.float32, name='epsilon2')
    z1_dec, z1_dec_mu, z1_dec_log_sigma = MLP(tf.concat([z2_enc,y],axis=1),weights['dec1'],bias['dec1'],params['dec1']['act'],epsilon2)
    
    epsilon3 = tf.zeros([params['dec2']['out_dim']], dtype=tf.float32, name='epsilon3')
    x_out = MLP(tf.concat([z1_dec,s],axis=1),weights['dec2'],bias['dec2'],params['dec2']['act'],epsilon3)[0]

    epsilon4 = tf.zeros([params['us']['out_dim']], dtype=tf.float32, name='epsilon4')
    y_us = MLP(z1_enc,weights['us'],bias['us'],params['us']['act'],epsilon4)[0]
    
    KL_z1 = KL(z1_enc_mu,z1_enc_log_sigma,z1_dec_mu,z1_dec_log_sigma)
    KL_z2 = KL(z2_enc_mu,z2_enc_log_sigma)
    
    LH_x = tf.reduce_sum(tf.concat([x,s],axis=1) * tf.log(1e-10+x_out) + (1 - tf.concat([x,s],axis=1)) * tf.log(1e-10+1 - x_out),axis=1)
    
    idx = tf.stack([index, tf.cast(y,tf.int32)], axis=-1)

    LH_y = tf.reduce_sum(tf.log(1e-10+tf.gather_nd(y_us, idx)),axis=1)

    MMD = 0.
    
    loss = -(-tf.reduce_mean(KL_z1)-tf.reduce_mean(KL_z2)+tf.reduce_mean(LH_x)-params['alpha']*tf.reduce_mean(LH_y))# - beta*MMD
    -tf.reduce_mean(-KL_z1-KL_z2+LH_x-params['alpha']*LH_y)# - params['beta']*MMD

    optimizer = tf.train.AdamOptimizer(learning_rate=params['lr'])

    train = optimizer.minimize(loss)

    sess = tf.Session()

    sess.run(tf.global_variables_initializer())
    
    index_shuffled = np.arange(train_X.shape[0])
    np.random.shuffle(index_shuffled)

    N_batches = int(float(train_X.shape[0])/float(params['batch_size']))
    
    for i in range(params['N_epochs']):
        
        for b in range(N_batches):
            
            batch_X, batch_Y, batch_s = get_batch([train_X,train_Y,train_s],params['batch_size'],index_shuffled,b)

            batch_dict = {x:batch_X,y:batch_Y,s:batch_s,index:np.arange(batch_Y.shape[0]).reshape(batch_Y.shape[0],1)}
            train_dict = {x:train_X,y:train_Y,s:train_s,index:np.arange(train_Y.shape[0]).reshape(train_Y.shape[0],1)}

            sess.run(train,feed_dict=batch_dict)

            if ((i % params['print_frequency'] == 0) and (b == N_batches-1)):

                print("Epoch %s: batch loss = %s and global loss = %s"%(i,
                        sess.run(loss,feed_dict=batch_dict),
                        sess.run(loss,feed_dict=train_dict)))
                print(sess.run([tf.reduce_mean(KL_z1),tf.reduce_mean(KL_z2),tf.reduce_mean(LH_x),tf.reduce_mean(LH_y)],feed_dict=batch_dict))
                test_dict = {x:test_X,y:test_Y,s:test_s,index:np.arange(test_Y.shape[0]).reshape(test_Y.shape[0],1)}
                np.save('output/latent_x_epoch_%s'%(i),sess.run(z1_enc,feed_dict=test_dict))
            
    test_dict = {x:test_X,y:test_Y,s:test_s,index:np.arange(test_Y.shape[0]).reshape(test_Y.shape[0],1)}
    return sess.run([x_out,z1_enc,loss],feed_dict=test_dict)

### Split the dataset into a training and cross-validation sets, to determine the optimal hyperparameters for the VFAE

In [726]:
def obtain_X_Y_s(df,target_feature,sensitive_features):
    
    X = np.asarray(df.drop([target_feature]+sensitive_features,axis=1))
    Y = np.expand_dims(np.asarray(df[target_feature]).astype(int),1)
    s = np.expand_dims(np.asarray(df[sensitive_features]).astype(int),1)

    return X, Y, s

In [704]:
target_feature = 'is_arrested'
sensitive_features = ['driver_race_Black']

df_VFAE_train, df_VFAE_CV = train_test_split(df,train_size=0.33)
df_VFAE_CV, df_VFAE_test = train_test_split(df_VFAE_CV,test_size=0.5)

df_VFAE_train = df_VFAE_train.reset_index(drop=True)
df_VFAE_CV = df_VFAE_CV.reset_index(drop=True)
df_VFAE_test = df_VFAE_test.reset_index(drop=True)

VFAE_train_X, VFAE_train_Y, VFAE_train_s = obtain_X_Y_s(df_VFAE_train,target_feature,sensitive_features)
VFAE_CV_X, VFAE_CV_Y, VFAE_CV_s = obtain_X_Y_s(df_VFAE_CV,target_feature,sensitive_features)
VFAE_test_X, VFAE_test_Y, VFAE_test_s = obtain_X_Y_s(df_VFAE_test,target_feature,sensitive_features)

In [707]:
tf.reset_default_graph()

# Hidden units in the hidden layers
# [z1, z2, encoding1, encoding2, decoding1, decoding2, unsupervised]
hidden_dim_list = [
    [50,50,100,100,100,100,100]
]

lr_list = [1e-3]

alpha_list = [1e-1]

loss_min = 1e3

for d in hidden_dim_list:
    
    for lr in lr_list:
        
        for alpha in alpha_list:
        
            print("Begin analysis for dims = %s, lr = %s, and alpha = %s:"%(d,lr,alpha))

            dims = {
                'x':VFAE_train_X.shape[1],
                'y_cat':len(np.unique(VFAE_train_Y)),
                's':VFAE_train_s.shape[1],
                'z1':d[0],
                'z2':d[1],
                'enc1_hid':d[2],
                'enc2_hid':d[3],
                'dec1_hid':d[4],
                'dec2_hid':d[5],
                'us_hid':d[6]
            }

            params = initialize_params(dims,N_epochs=30,lr=lr,print_freq=5,batch_size=10000,alpha=alpha)
            weights, bias = initialize_weights_biases(params)
            enc_X, latent_X, loss = train_VFAE(VFAE_train_X,VFAE_train_Y,VFAE_train_s,VFAE_CV_X,VFAE_CV_Y,VFAE_CV_s,weights,bias,params,dims)

            print("The CV loss = %s"%(loss))
            print()
            print()

            if (loss < loss_min):
                loss_min = loss
                d_optimal = d
                lr_optimal = lr
                alpha_optimal = alpha
                latent_X_optimal = latent_X
                enc_X_optimal = enc_X
            
print("The minimum loss of %s was obtained using dims = %s, lr = %s, and alpha = %s"%(loss_min,d_optimal,lr_optimal,alpha_optimal))            

Begin analysis for dims = [50, 50, 100, 100, 100, 100, 100], lr = 0.001, and alpha = 0.1:
Epoch 0: batch loss = 66.4549 and global loss = 64.8756
[6.3467026, 2.9205317, -55.297737, -1.4578407]
Epoch 5: batch loss = 16.6861 and global loss = 16.3218
[1.1768936, 0.52575457, -16.81337, -22.89934]
Epoch 10: batch loss = 14.9242 and global loss = 15.4115
[1.1704465, 0.21019964, -16.432467, -22.899651]
Epoch 15: batch loss = 15.2118 and global loss = 15.047
[0.86113799, 0.37193912, -16.182674, -22.899651]
Epoch 20: batch loss = 16.262 and global loss = 14.3666
[0.23268989, 0.059066754, -16.054325, -22.899651]
Epoch 25: batch loss = 13.8851 and global loss = 13.8839
[0.27517316, 0.022389447, -16.078943, -22.899651]


ValueError: too many values to unpack (expected 3)

In [None]:
# Run with the optimal params
dims = {
    'x':VFAE_test_X.shape[1],
    'y_cat':len(np.unique(VFAE_test_Y)),
    's':VFAE_test_s.shape[1],
    'z1':d_optimal[0],
    'z2':d_optimal[1],
    'enc1_hid':d_optimal[2],
    'enc2_hid':d_optimal[3],
    'dec1_hid':d_optimal[4],
    'dec2_hid':d_optimal[5],
    'us_hid':d_optimal[6]
}

params = initialize_params(dims,N_epochs=40,lr=lr_optimal,print_freq=10,batch_size=1000,alpha=alpha_optimal)
weights, bias = initialize_weights_biases(params)
enc_X, latent_X, loss, y_us, weights_post, bias_post = train_VFAE(VFAE_test_X,VFAE_test_Y,VFAE_test_s,VFAE_test_X,VFAE_test_Y,VFAE_test_s,weights,bias,params,dims)

In [710]:
train_index, test_index = train_test_split(np.arange(VFAE_CV_X.shape[0]),test_size=0.3)
RF_train_X, RF_test_X = VFAE_CV_X[train_index], VFAE_CV_X[test_index]
RF_train_Y, RF_test_Y = VFAE_CV_Y[train_index].ravel(), VFAE_CV_Y[test_index].ravel()
RF_train_s, RF_test_s = VFAE_CV_s[train_index], VFAE_CV_s[test_index]
df_train, df_test = df_VFAE_CV.loc[train_index], df_VFAE_CV.loc[test_index]

enc_train_X, enc_test_X = latent_X[train_index], latent_X[test_index]

In [693]:
params = {'n_jobs': -1, 'n_estimators': 47, 'max_depth': 8, 'criterion': 'gini', 'class_weight': None}
weights = np.ones(RF_train_Y.shape[0])
weights[RF_train_Y==1] = 1.
weights[RF_train_Y==0] = 0.5
RFC = run_clf('RFC',enc_train_X,RF_train_Y,params,weights=weights)
bias = check_error_and_discrimination(RFC,enc_test_X,RF_test_Y,df_test,sensitive_features)
df_bias = pd.DataFrame(bias).transpose()
df_bias.columns = ["Group","N_group/N_total","N_group/N_total|(predicted arrest)","N_group/N_total|(predicted not arrest)"]

F1 score = 0.309278350515
Precision score = 0.590551181102
Recall score = 0.209497206704
Accuracy score = 0.993963311349

Discrimination ratio = 1.956154987824032
Discrimination (normalized) = 0.853114563411



In [694]:
df_bias.head()

Unnamed: 0,Group,N_group/N_total,N_group/N_total|(predicted arrest),N_group/N_total|(predicted not arrest)
0,driver_race_Asian,0.0125599,0.0,0.0125887
1,driver_race_Black,0.12632,0.220472,0.126104
2,driver_race_White,0.839262,0.779528,0.839399
3,driver_race_Hispanic,0.0211014,0.0,0.0211498
