In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from model import VFAE, train
from dataset import DictionaryDataset
from checkpoint import Checkpoint
from test import Test
from loss import VFAE_loss
from torch.utils.data import DataLoader

### VFAE training

In [2]:
# Load the cleaned CSV and separate it into model inputs (x), the sensitive
# attribute (s) and the prediction target (y).
df = pd.read_csv('OH_sampled_cleaned.csv')
target = 'is_arrested'
sensitive = 'driver_race_Black'
# Every column other than the target and the sensitive attribute is a feature.
features = [col for col in df.columns if col not in (target, sensitive)]
x, s, y = df[features], df[sensitive], df[target]

# Fixed seed: without it each kernel restart produces a different 80/20 split,
# so the metrics reported further down could not be reproduced.
SPLIT_SEED = 42
Xtrain, Xtest, strain, stest, ytrain, ytest = train_test_split(
    x, s, y, test_size=0.2, random_state=SPLIT_SEED
)

In [3]:
def _as_float32_dict(inputs, sensitive_attr, labels):
    """Pack one data split into a dict of float32 NumPy arrays.

    'x' keeps the shape of ``inputs``; 's' and 'y' are reshaped into
    single-column 2-D arrays.
    """
    return {
        'x': np.array(inputs, dtype=np.float32),
        's': np.array(sensitive_attr, dtype=np.float32).reshape(-1, 1),
        'y': np.array(labels, dtype=np.float32).reshape(-1, 1),
    }


# Same packing for both halves of the split.
train_data = _as_float32_dict(Xtrain, strain, ytrain)
test_data = _as_float32_dict(Xtest, stest, ytest)

In [4]:
# Hyperparameters

# Sizes handed to the VFAE constructor.
# NOTE(review): x_dim is hard-coded; presumably it must equal the number of
# feature columns in the 'x' arrays — confirm against the data.
x_dim, s_dim, y_dim = 62, 1, 1
z1_enc_dim = z2_enc_dim = z1_dec_dim = x_dec_dim = 100
z_dim = 50
dropout_rate = 0.0

# Arguments handed to VFAE_loss (mmd_dims is passed as `dims_out`).
alpha, beta, gamma = 1, 0.0, 1.0
mmd_dims = 500

In [5]:
# Seed torch's global RNG before the model is built so that weight
# initialisation is repeatable across kernel restarts. A shuffling DataLoader
# created without its own generator also draws from this RNG, so batch order
# becomes repeatable too when the cells are run in order.
torch.manual_seed(0)

# Build the model from the hyperparameters defined above and let Adam
# optimise all of its parameters.
vfae = VFAE(
    x_dim, s_dim, y_dim,
    z1_enc_dim, z2_enc_dim, z1_dec_dim, x_dec_dim,
    z_dim, dropout_rate,
)
optim = torch.optim.Adam(vfae.parameters(), lr=1e-3)

In [6]:
# Wrap the training arrays and iterate over them in shuffled batches.
dataset = DictionaryDataset(train_data)
train_dataloader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=75000,
)

# Loss configured from the hyperparameter cell; checkpoints are written
# under the 'checkpoints' path given to Checkpoint.
loss_function = VFAE_loss(
    alpha=alpha,
    beta=beta,
    gamma=gamma,
    dims_out=mmd_dims,
)
checkpointer = Checkpoint('checkpoints')

In [5]:
# Training schedule.
N_EPOCHS = 100
PRINT_FREQ = 250000

# Switch the model to training mode (assuming VFAE follows the usual torch
# nn.Module convention — confirm), then run every epoch. After each epoch the
# returned loss, the model and the epoch index are handed to the checkpointer.
vfae.train()
for e in range(N_EPOCHS):
    loss = train(
        e,
        vfae,
        train_dataloader,
        loss_function,
        optim,
        print_freq=PRINT_FREQ,
    )
    checkpointer(loss, vfae, e)

NameError: name 'vfae' is not defined

In [6]:
# Re-create a VFAE with the same architecture and hand it, together with a
# saved checkpoint file, to the Test helper.
vfae = VFAE(
    x_dim, s_dim, y_dim,
    z1_enc_dim, z2_enc_dim, z1_dec_dim, x_dec_dim,
    z_dim, dropout_rate,
)
# NOTE(review): this path is hard-coded from one particular training run; the
# file name appears to embed the epoch and loss, so a re-run will likely
# produce a different name — confirm the file exists before executing.
checkpoint_path = 'checkpoints/epoch-7-0.0001428541581844911.pth'
tester = Test(vfae, checkpoint_path)

In [7]:
# Evaluate on the held-out split, one sample per batch and in original order.
test_dataset = DictionaryDataset(test_data)
test_dataloader = DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=1,
)
# NOTE(review): the recorded output is (1.0, 1.0, 1.0, 1.0); a perfect score on
# every returned value is worth double-checking (e.g. for feature leakage or a
# degenerate metric) — confirm.
tester(test_dataloader)

(1.0, 1.0, 1.0, 1.0)

### Training regular classifiers

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support

In [16]:
# Baseline: a random forest trained on every column except the target.
# Unlike the VFAE inputs above, this feature set keeps the sensitive
# attribute column.
# NOTE: this cell rebinds df, x, y, Xtrain, Xtest, ytrain and ytest, replacing
# the objects of the same names created in the VFAE section.
df = pd.read_csv('OH_sampled_cleaned.csv')
x = df.drop(columns='is_arrested')
y = df['is_arrested']

# Seed both the split and the forest; otherwise every run yields a different
# test set and a different model, and the reported metrics cannot be reproduced.
BASELINE_SEED = 42
Xtrain, Xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.2, random_state=BASELINE_SEED
)
rf = RandomForestClassifier(random_state=BASELINE_SEED)
rf.fit(Xtrain, ytrain)
y_pred = rf.predict(Xtest)

NameError: name 'ypred' is not defined

In [22]:
# Binary-averaged precision, recall and F-score of the random-forest
# predictions on the held-out labels; the fourth returned value is discarded.
prf_scores = precision_recall_fscore_support(ytest, y_pred, average='binary')
precision, recall, fscore, _ = prf_scores

In [23]:
# Display the random-forest precision computed in the previous cell.
precision

0.42857142857142855