In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from model import VFAE, train
from dataset import DictionaryDataset
from checkpoint import Checkpoint
from test import Test
from loss import VFAE_loss
from torch.utils.data import DataLoader

### VAE training

In [2]:
def relabel(y):
    if y == 1: return 0
    elif y == 2:return 1

In [3]:
df = pd.read_csv('OH_sampled_cleaned.csv')
df

Unnamed: 0,is_arrested,location_raw_9.0,location_raw_13.0,location_raw_18.0,location_raw_21.0,location_raw_22.0,location_raw_25.0,location_raw_31.0,location_raw_45.0,location_raw_47.0,...,violations_numbered_3,violations_numbered_4,violations_numbered_5,violations_numbered_6,violations_numbered_7,violations_numbered_8,violations_numbered_9,violations_numbered_10,violations_numbered_11,violations_numbered_12
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
616595,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
616596,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
616597,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
616598,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
df = pd.read_csv('OH_sampled_cleaned.csv')
x, s, y = df[[1,4,7,10]], df[12],df[20]

x=(x-x.min())/(x.max() - x.min())
s=(s-s.min())/(s.max() - s.min())
y = y.apply(relabel)

Xtrain, Xtest, strain, stest, ytrain, ytest = train_test_split(x,s,y,test_size=0.2)

train_data = {
    'x': np.array(Xtrain, dtype=np.float32),
    's': np.reshape(np.array(strain, dtype=np.float32), (-1,1),), 
    'y': np.reshape(np.array(ytrain, dtype=np.float32), (-1,1)) 
}

test_data = {
    'x': np.array(Xtest, dtype=np.float32),
    's': np.reshape(np.array(stest, dtype=np.float32), (-1,1),), 
    'y': np.reshape(np.array(ytest, dtype=np.float32), (-1,1)) 
}

In [4]:
# Hyperparams
x_dim = 4
s_dim = 1
y_dim = 1
z1_enc_dim = 60
z2_enc_dim = 60
z1_dec_dim = 60
x_dec_dim = 60
z_dim = 50
dropout_rate = 0.0

alpha = 1
beta = 0
gamma = 1.0
mmd_dims = 500

In [5]:
vfae = VFAE(x_dim, s_dim, y_dim, z1_enc_dim, z2_enc_dim, z1_dec_dim, x_dec_dim, z_dim, dropout_rate, )
optim = torch.optim.Adam(vfae.parameters(),lr=1e-3)

In [11]:
dataset = DictionaryDataset(train_data)
train_dataloader = DataLoader(dataset, batch_size=250, shuffle=True)
loss_function = VFAE_loss(alpha=alpha, beta=beta, gamma=gamma, dims_out=mmd_dims)
checkpointer = Checkpoint('checkpoints')

In [13]:
for e in range(100):
    loss = train(e, vfae, train_dataloader, loss_function, optim, print_freq=1)
    checkpointer(loss, vfae, e)

Train Epoch: 0 Average loss: 0.1287
Train Epoch: 1 Average loss: 0.1051
Train Epoch: 2 Average loss: 0.0853
Train Epoch: 3 Average loss: 0.0647
Train Epoch: 4 Average loss: 0.0440
Train Epoch: 5 Average loss: 0.0274
Train Epoch: 6 Average loss: 0.0266
Train Epoch: 7 Average loss: 0.0554
Train Epoch: 8 Average loss: 0.1016
Train Epoch: 9 Average loss: 0.0961
Train Epoch: 10 Average loss: 0.0711
Train Epoch: 11 Average loss: 0.0426
Train Epoch: 12 Average loss: 0.0378
Train Epoch: 13 Average loss: 0.0692
Train Epoch: 14 Average loss: 0.1142
Train Epoch: 15 Average loss: 0.1167
Train Epoch: 16 Average loss: 0.0792
Train Epoch: 17 Average loss: 0.0569
Train Epoch: 18 Average loss: 0.0535
Train Epoch: 19 Average loss: 0.0588
Train Epoch: 20 Average loss: 0.0632
Train Epoch: 21 Average loss: 0.0880
Train Epoch: 22 Average loss: 0.1266
Train Epoch: 23 Average loss: 0.1386
Train Epoch: 24 Average loss: 0.0815
Train Epoch: 25 Average loss: 0.0529
Train Epoch: 26 Average loss: 0.1143
Train Epoch

In [11]:
vfae = VFAE(x_dim, s_dim, y_dim, z1_enc_dim, z2_enc_dim, z1_dec_dim, x_dec_dim, z_dim, dropout_rate, )
infer = Inference(vfae, 'checkpoints/epoch-32-0.06483417749404907.pth',)

In [12]:
test_dataset = DictionaryDataset(test_data)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)
infer(test_dataloader)

0.62

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
df = pd.read_csv('german.data', sep = ' ', header = None)
x, y = df[[1,4,7,10,12]],df[20]
x=(x-x.min())/(x.max() - x.min())
y = y.apply(relabel)

Xtrain, Xtest, ytrain, ytest = train_test_split(x,y,test_size=0.2)
rf = RandomForestClassifier()
rf.fit(Xtrain, ytrain)
rf.score(Xtest, ytest)


0.68