In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv('law_data.csv')
df = df.drop(columns = ['Unnamed: 0'])
df = df[df['region_first'] != 'PO']

In [2]:
race_coded = pd.get_dummies(df['race'])
df = pd.concat([df,race_coded],axis=1)

In [3]:
gender_coded = pd.get_dummies(df['sex'])
gender_coded.columns = ['female', 'male']
df = pd.concat([df, gender_coded],axis=1)

In [4]:
df = df.drop(columns = ['race', 'sex'])

In [5]:
sense_cols = ['Amerindian', 'Asian', 'Black', 'Hispanic', 'Mexican', 'Other',
       'Puertorican', 'White', 'female', 'male']

In [6]:
from sklearn.model_selection import train_test_split

X = df.loc[:, df.columns !='ZFYA']
y = df['ZFYA']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [7]:
n = len(X_train)
ne = len(X_test)

K = len(sense_cols)  #latent variable knowledge that affects gpa, last and fya, but is not related to race and sex

In [8]:
X_train['LSAT'] = X_train['LSAT'].astype(int)
X_test['LSAT'] = X_test['LSAT'].astype(int)

In [9]:
law_stan_data = {
    'N' : n, #number of observations
    'K' : K, 
    'a' : np.array(X_train[sense_cols]), #protected variable race and sex
    'ugpa' : np.array(X_train['UGPA']), 
    'lsat' : np.array(X_train['LSAT']),
    'zfya' : np.array(y_train)
       }

In [10]:
model= """
data {
  int<lower = 0> N; // number of observations
  int<lower = 0> K; // number of covariates
  matrix[N, K]   a; // sensitive variables
  real           ugpa[N]; // UGPA
  int            lsat[N]; // LSAT
  real           zfya[N]; // ZFYA
  
}

transformed data {
  
 vector[K] zero_K;
 vector[K] one_K;
 
 zero_K = rep_vector(0,K);
 one_K = rep_vector(1,K);

}

parameters {

  vector[N] u;

  real ugpa0;
  real eta_u_ugpa;
  real lsat0;
  real eta_u_lsat;
  real eta_u_zfya;
  
  vector[K] eta_a_ugpa;
  vector[K] eta_a_lsat;
  vector[K] eta_a_zfya;
  
  
  real<lower=0> sigma_g_Sq;
}

transformed parameters  {
 // Population standard deviation (a positive real number)
 real<lower=0> sigma_g;
 // Standard deviation (derived from variance)
 sigma_g = sqrt(sigma_g_Sq);
}

model {
  
  // don't have data about this
  u ~ normal(0, 1);
  
  ugpa0 ~ normal(0, 1);
  eta_u_ugpa ~ normal(0, 1);
  lsat0 ~ normal(0, 1);
  eta_u_lsat ~ normal(0, 1);
  eta_u_zfya ~ normal(0, 1);

  eta_a_ugpa ~ normal(zero_K, one_K);
  eta_a_lsat ~ normal(zero_K, one_K);
  eta_a_zfya ~ normal(zero_K, one_K);

  sigma_g_Sq ~ inv_gamma(1, 1);

  // have data about these
  ugpa ~ normal(ugpa0 + eta_u_ugpa * u + a * eta_a_ugpa, sigma_g);
  lsat ~ poisson(exp(lsat0 + eta_u_lsat * u + a * eta_a_lsat));
  zfya ~ normal(eta_u_zfya * u + a * eta_a_zfya, 1);

}
"""

In [11]:
import stan
import nest_asyncio
nest_asyncio.apply()

In [12]:
#import asyncio
#asyncio.sleep(100)
posterior = stan.build(model, data=law_stan_data, random_seed=1)
fit = posterior.sample(num_chains= 1, num_samples=30)
la_law_train = fit["eta"]


Building...



Building: 30.3s, done.Messages from stanc:
Sampling:   0%
Sampling:   0% (1/1030)
Sampling:  10% (100/1030)
Sampling:  19% (200/1030)
Sampling:  29% (300/1030)
Sampling:  39% (400/1030)
Sampling:  49% (500/1030)
Sampling:  58% (600/1030)
Sampling:  68% (700/1030)
Sampling:  78% (800/1030)
Sampling:  87% (900/1030)
Sampling:  97% (1000/1030)
Sampling:  97% (1001/1030)
Sampling: 100% (1030/1030)
Sampling: 100% (1030/1030), done.
Messages received during sampling:
  Gradient evaluation took 0.009241 seconds
  1000 transitions using 10 leapfrog steps per transition would take 92.41 seconds.
  Adjust your expectations accordingly!


AssertionError: eta

In [31]:
U_train = fit['u'].mean(axis = 0)
pd.DataFrame(U_train).to_csv('U_train.csv')

ugpa0 = fit['ugpa0'].mean()
eta_u_gpa = fit['eta_u_ugpa'].mean()
eta_a_ugpa = fit['eta_a_ugpa'].mean(axis = 0)

lsat0 = fit['lsat0'].mean()
eta_u_lsat = fit['eta_u_lsat'].mean()
eta_a_lsat = fit['eta_a_lsat'].mean(axis = 0)

sigma_g = fit['sigma_g'].mean()