In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from model import VFAE, train
from dataset import DictionaryDataset
from checkpoint import Checkpoint
from data_generator import DataGen
from loss import VFAE_loss
from torch.utils.data import DataLoader

### VAE training

In [2]:
df = pd.read_csv('data/adult_cleaned.csv')
df.drop(['Unnamed: 0'], inplace=True, axis=1)

target = 'income'
sensitive = 'age'

features = [col for col in df.columns.tolist() if col not in [target, sensitive]]
x, s, y = df[features], df[sensitive], df[target]

Xtrain, Xtest, strain, stest, ytrain, ytest = train_test_split(x,s,y,test_size=0.3)

In [3]:
train_data = {
    'x': np.array(Xtrain, dtype=np.float32),
    's': np.reshape(np.array(strain, dtype=np.float32), (-1,1),), 
    'y': np.reshape(np.array(ytrain, dtype=np.float32), (-1,1)) 
}

test_data = {
    'x': np.array(Xtest, dtype=np.float32),
    's': np.reshape(np.array(stest, dtype=np.float32), (-1,1),), 
    'y': np.reshape(np.array(ytest, dtype=np.float32), (-1,1)) 
}

In [4]:
# Hyperparams
x_dim = 98
s_dim = 1
y_dim = 1
z1_enc_dim = 100
z2_enc_dim = 100
z1_dec_dim = 100
x_dec_dim = 100
z_dim = 50
dropout_rate = 0.2

alpha = 1.0
beta = 0.1
gamma = 1
mmd_dims = 500

In [5]:
vfae = VFAE(x_dim, s_dim, y_dim, z1_enc_dim, z2_enc_dim, z1_dec_dim, x_dec_dim, z_dim, dropout_rate, )
optim = torch.optim.Adam(vfae.parameters(),lr=1e-3)

In [6]:
dataset = DictionaryDataset(train_data)
train_dataloader = DataLoader(dataset, batch_size=25000, shuffle=True)
loss_function = VFAE_loss(alpha=alpha, beta=beta, gamma=gamma, dims_out=mmd_dims)
checkpointer = Checkpoint('checkpoints')

In [8]:
vfae.train()
for e in range(100):
    loss = train(e, vfae, train_dataloader, loss_function, optim, print_freq=250000)
    checkpointer(loss, vfae, e)

Train Epoch: 0 Average loss: 26.9044
Train Epoch: 1 Average loss: 23.9343
Train Epoch: 2 Average loss: 21.4870
Train Epoch: 3 Average loss: 19.1071
Train Epoch: 4 Average loss: 16.7572
Train Epoch: 5 Average loss: 14.4594
Train Epoch: 6 Average loss: 12.0019
Train Epoch: 7 Average loss: 9.4378
Train Epoch: 8 Average loss: 6.9370
Train Epoch: 9 Average loss: 4.8284
Train Epoch: 10 Average loss: 3.4604
Train Epoch: 11 Average loss: 3.9002
Train Epoch: 12 Average loss: 7.0763
Train Epoch: 13 Average loss: 13.7333
Train Epoch: 14 Average loss: 21.3963
Train Epoch: 15 Average loss: 24.1779
Train Epoch: 16 Average loss: 20.6935


KeyboardInterrupt: 

In [7]:
vfae = VFAE(x_dim, s_dim, y_dim, z1_enc_dim, z2_enc_dim, z1_dec_dim, x_dec_dim, z_dim, dropout_rate, )
generate = DataGen(vfae, 'checkpoints/epoch-10-3.460378646850586.pth',)

In [8]:
test_dataset = DictionaryDataset(test_data)
test_dataloader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False)
output = generate(test_dataloader)

#### Prediction with a classifier head from encoded x

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression

In [17]:
X_train, X_test, y_train, y_test = train_test_split(output['x_pred'],ytest,test_size=0.2)

lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
precision_recall_fscore_support(y_test, y_pred, average = 'weighted')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


(array([0.75461066, 0.        ]),
 array([0.99864407, 0.        ]),
 array([0.859644, 0.      ]),
 array([1475,  479]))