## Import Libraries and Datasets

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
import sklearn.preprocessing as skpre
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import time

import DataProcessor
import SRMSE
import VAE

In [2]:
# Load the PUMS extract for WA state into h_df.
# NOTE(review): the earlier comment called this the "persons" dataset, but the
# file name (psam_h53.csv) and the variable name (h_df) suggest it is the
# household-level file - confirm against the PUMS download.
h_df = pd.read_csv("data/PUMS_2017_5YR_WA/psam_h53.csv")

## Choose PUMS and Preprocess

In [3]:
# Keep only the variables that will be modelled (numeric first, then categorical)
selected_vars = ['HINCP', 'NP', 'VEH', 'BLD', 'ACCESS']
pums_data = h_df.loc[:, selected_vars].copy()

# Bin household income into 10 quantile bins; keep the bin edges for later reference
hincp_binned, hincp_bins = pd.qcut(pums_data['HINCP'], q=10, labels=False, retbins=True)
pums_data['HINCP'] = hincp_binned

# Drop rows with any missing value, reporting the row count on both sides
print(f"Dataset n={len(pums_data)} pre-cleaning")
pums_data = pums_data.dropna()
print(f"Dataset n={len(pums_data)} post-cleaning")

Dataset n=164836 pre-cleaning
Dataset n=140586 post-cleaning


In [4]:
VAR_NAMES = pums_data.columns
VAR_DIM = len(VAR_NAMES)

# For every variable keep (a) the sorted array of values it takes and
# (b) its one-hot encoded frame; CAT_LENGTHS is the number of classes per variable
pums_encodings_list = [np.unique(pums_data[var].values) for var in VAR_NAMES]
dummies_list = [pd.get_dummies(pums_data[var], prefix=f"{var}_") for var in VAR_NAMES]
CAT_LENGTHS = [ohe_frame.shape[1] for ohe_frame in dummies_list]

In [5]:
# Final data frame after one-hot encoding: all per-variable dummy frames side by side.
# A single concat replaces growing the frame inside a loop, which re-copied
# every previously added column on each iteration.
model_data_df = pd.concat(dummies_list, axis=1)
MANIFEST_DIM = model_data_df.shape[1]  # total number of one-hot columns fed to the model

# Preview data that will be fed into model
model_data_df

Unnamed: 0,HINCP__0.0,HINCP__1.0,HINCP__2.0,HINCP__3.0,HINCP__4.0,HINCP__5.0,HINCP__6.0,HINCP__7.0,HINCP__8.0,HINCP__9.0,...,BLD__4.0,BLD__5.0,BLD__6.0,BLD__7.0,BLD__8.0,BLD__9.0,BLD__10.0,ACCESS__1.0,ACCESS__2.0,ACCESS__3.0
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161771,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
161772,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
161774,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
161775,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
# Train/test split: the first 10% of rows (in file order) are used for
# training and every remaining row is held out for testing.
model_data = model_data_df.to_numpy()
train_idx = round(len(model_data)*.10)
train_data = model_data[:train_idx]
test_data = model_data[train_idx:]

print(f"Training on {train_idx} samples")

Training on 14059 samples


## Set Parameters and Define Model

In [11]:
# Grid Search SRMSE 1: every combination of the lists below is trained and scored
BATCH_SIZES = [64, 128, 256, 512, 1024]
EPOCHS = [1000]
LATENT_DIMS = [2, 3, 4]
HIDDEN_DIMS = [8, 16, 32, 64]
KL_WEIGHTS = [0.01, 0.05, 0.1, 1.0]

# Rows recorded from an earlier grid search, in the layout the grid-search cell stores:
#   [train_time, gen_time, srmse_uni, srmse_bi, [batch_size, epochs, latent_dim, hidden_dim, kl_weight]]
# (presumably the best-univariate, best-bivariate and best-time-efficiency rows, in that order)
# [155.67232656478882, 1.8501038551330566, 0.05296959012880786, 0.02081236519716125, [128, 1000, 2, 32, 0.05]]
# [98.54442024230957, 1.7626943588256836, 0.06421160670330948, 0.020779257538261826, [256, 1000, 4, 32, 0.1]]
# [44.24084186553955, 2.3369486331939697, 0.06478558327639009, 0.02501131310084622, [1024, 1000, 4, 16, 0.1]]

# Best single configuration found so far:
# BATCH_SIZE = 512
# EPOCHS = 1000
# LATENT_DIM = 3
# HIDDEN_DIM = 16
# KL_WEIGHT = .05

## Training

In [12]:
# Validation history shared with the plotting cell; the callback appends to these
val_loss = []
val_epochs = []
class validationCallback(keras.callbacks.Callback):
    """Record the reconstruction loss on a random test sample every 10th epoch.

    Results are appended to the module-level ``val_epochs`` / ``val_loss`` lists.
    The model being validated is ``self.model`` (the model Keras attaches to the
    callback during ``fit``) rather than whatever the global ``vae`` happens to
    reference, so the callback always scores the model that is actually training.
    """
    def on_epoch_end(self, epoch, logs=None):
        if epoch % 10 != 0:
            return
        val_epochs.append(epoch)
        # Use random sample of 1000 rows (drawn with replacement) from test data to validate the model
        idx = np.random.randint(test_data.shape[0], size=1000)
        val_data = test_data[idx]
        results = self.model.predict(val_data)
        loss_cat = VAE.get_reconstruction_loss(val_data, results, CAT_LENGTHS)
        val_loss.append(float(loss_cat))

In [13]:
def testModel(test_vae, latent_dim):
    """Generate a synthetic population with ``test_vae`` and score it against PUMS.

    Args:
        test_vae: trained model exposing a ``decoder`` with a ``predict`` method.
        latent_dim: number of latent variables the decoder expects per row.

    Returns:
        Tuple ``(gen_time, srmse_uni, srmse_bi)``: seconds spent in the decoder,
        then the values returned by ``SRMSE.calculateSRMSE`` and
        ``SRMSE.calculateBivariateSRMSE`` when comparing the synthetic frame to a
        seeded sample of ``pums_data``.

    Note:
        Reseeds NumPy's global random state with 42 on every call.
    """
    # How many synthetic households to generate
    POP_SIZE = 100000

    # One standard-normal latent vector per synthetic household (row)
    np.random.seed(42)
    latent_shape = (POP_SIZE, latent_dim)
    inputs = np.random.normal(loc=0, scale=1, size=latent_shape)

    # Time only the decoder call; the second input is an empty (zero-column) slice
    start_time = time.time()
    results = test_vae.decoder.predict([inputs, inputs[:,latent_dim:]])
    gen_time = time.time() - start_time

    # Decode each variable's output rows to category values, one frame per variable
    decoded_frames = [
        pd.DataFrame(
            np.apply_along_axis(DataProcessor.logit_to_val, 1, x, pums_encodings_list[i])
        ).add_prefix(f"{VAR_NAMES[i]}_")
        for i, x in enumerate(results[0])
    ]
    synth_hh_df = pd.concat(decoded_frames, axis=1)
    synth_hh_df.columns = VAR_NAMES

    # Univariate and bivariate SRMSE of the synthetic frame vs. a seeded PUMS sample
    srmse_uni = SRMSE.calculateSRMSE(synth_hh_df, pums_data.sample(POP_SIZE, random_state=42))
    srmse_bi = SRMSE.calculateBivariateSRMSE(synth_hh_df, pums_data.sample(POP_SIZE, random_state=42))

    return gen_time, srmse_uni, srmse_bi

In [14]:
HH_IDX = len(CAT_LENGTHS)  # Number of household variables (used in CVAE)

# Enumerate every hyperparameter combination up front, in the same order as
# nesting the loops batch size -> epochs -> latent dim -> hidden dim -> KL weight
param_grid = [
    (bs, e, ld, hd, klw)
    for bs in BATCH_SIZES
    for e in EPOCHS
    for ld in LATENT_DIMS
    for hd in HIDDEN_DIMS
    for klw in KL_WEIGHTS
]

# Train and score one model per combination; each stored row is
# [train_time, gen_time, srmse_uni, srmse_bi, [bs, e, ld, hd, klw]]
grid_search_results = []
for bs, e, ld, hd, klw in param_grid:
    print(f"[{bs}, {e}, {ld}, {hd}, {klw}]")
    start_time = time.time()
    vae = VAE.VAE(MANIFEST_DIM, hd, ld, CAT_LENGTHS, HH_IDX, klw)
    vae.compile(optimizer=keras.optimizers.Adam())
    history = vae.fit(train_data, epochs=e, batch_size=bs, callbacks=[validationCallback()], verbose=0)
    train_time = (time.time()-start_time)
    gen_time, srmse_uni, srmse_bi = testModel(test_vae=vae, latent_dim=ld)
    results = [train_time, gen_time, srmse_uni, srmse_bi, [bs,e,ld,hd,klw]]
    grid_search_results.append(results)

# Find the best results for each metric in the grid search.
# Each row is [train_time, gen_time, srmse_uni, srmse_bi, params].
min_srmse_uni_idx = 0
min_srmse_uni_value = float("inf")
min_srmse_bi_idx = 0
min_srmse_bi_value = float("inf")
min_eff_idx = 0
min_eff_value = float("inf")
for i, result in enumerate(grid_search_results):
    run_train_time, run_gen_time, run_srmse_uni, run_srmse_bi = result[:4]
    tot_time = run_train_time + run_gen_time
    # Time efficiency = mean of the two SRMSE values scaled by total run time.
    # The sum is parenthesised so both terms are averaged; previously only the
    # bivariate SRMSE was halved because `/` binds tighter than `+`.
    time_eff = ((run_srmse_uni + run_srmse_bi) / 2) * tot_time
    if run_srmse_uni < min_srmse_uni_value:
        min_srmse_uni_idx = i
        min_srmse_uni_value = run_srmse_uni
    if run_srmse_bi < min_srmse_bi_value:
        min_srmse_bi_idx = i
        min_srmse_bi_value = run_srmse_bi
    if time_eff < min_eff_value:
        min_eff_idx = i
        min_eff_value = time_eff

# Display the results: best univariate, best bivariate, best time efficiency
print(grid_search_results[min_srmse_uni_idx])
print(grid_search_results[min_srmse_bi_idx])
print(grid_search_results[min_eff_idx])

[64, 1000, 2, 8, 0.01]
Univariate (marginal) SRMSE: 0.18266375772771345
Bivariate (joint) SRMSE: 0.07766168227327241
[64, 1000, 2, 8, 0.05]
Univariate (marginal) SRMSE: 0.12233834089567842
Bivariate (joint) SRMSE: 0.051495970661296346
[64, 1000, 2, 8, 0.1]
Univariate (marginal) SRMSE: 0.10762163751907212
Bivariate (joint) SRMSE: 0.04802911833197508
[64, 1000, 2, 8, 1.0]
Univariate (marginal) SRMSE: 0.2527292133790892
Bivariate (joint) SRMSE: 0.12284909416837787
[64, 1000, 2, 16, 0.01]
Univariate (marginal) SRMSE: 0.1823068960096491
Bivariate (joint) SRMSE: 0.05496593321293818
[64, 1000, 2, 16, 0.05]
Univariate (marginal) SRMSE: 0.09886210294303013
Bivariate (joint) SRMSE: 0.03020264579870063
[64, 1000, 2, 16, 0.1]
Univariate (marginal) SRMSE: 0.0733500327553266
Bivariate (joint) SRMSE: 0.025360487236754783
[64, 1000, 2, 16, 1.0]
Univariate (marginal) SRMSE: 0.19345168868155746
Bivariate (joint) SRMSE: 0.08981012785848091
[64, 1000, 2, 32, 0.01]
Univariate (marginal) SRMSE: 0.1686216051

Univariate (marginal) SRMSE: 0.08071270628657648
Bivariate (joint) SRMSE: 0.028749101043706544
[128, 1000, 3, 16, 1.0]
Univariate (marginal) SRMSE: 0.24600110449715049
Bivariate (joint) SRMSE: 0.12113547758815871
[128, 1000, 3, 32, 0.01]
Univariate (marginal) SRMSE: 0.24580364553182987
Bivariate (joint) SRMSE: 0.04244048577712241
[128, 1000, 3, 32, 0.05]
Univariate (marginal) SRMSE: 0.07427251689158931
Bivariate (joint) SRMSE: 0.02597968200524275
[128, 1000, 3, 32, 0.1]
Univariate (marginal) SRMSE: 0.08283307995501396
Bivariate (joint) SRMSE: 0.022521986707724005
[128, 1000, 3, 32, 1.0]
Univariate (marginal) SRMSE: 0.1766694842421073
Bivariate (joint) SRMSE: 0.08349930169189795
[128, 1000, 3, 64, 0.01]
Univariate (marginal) SRMSE: 0.17992427183109766
Bivariate (joint) SRMSE: 0.05302217534031103
[128, 1000, 3, 64, 0.05]
Univariate (marginal) SRMSE: 0.15304666339082684
Bivariate (joint) SRMSE: 0.03167127079901344
[128, 1000, 3, 64, 0.1]
Univariate (marginal) SRMSE: 0.08930395384421042
Bi

Bivariate (joint) SRMSE: 0.07608189968701039
[256, 1000, 4, 64, 0.01]
Univariate (marginal) SRMSE: 0.1637403136183867
Bivariate (joint) SRMSE: 0.048750858108583345
[256, 1000, 4, 64, 0.05]
Univariate (marginal) SRMSE: 0.1306019526310267
Bivariate (joint) SRMSE: 0.029273241465207066
[256, 1000, 4, 64, 0.1]
Univariate (marginal) SRMSE: 0.10846624554154306
Bivariate (joint) SRMSE: 0.02341336515772016
[256, 1000, 4, 64, 1.0]
Univariate (marginal) SRMSE: 0.15705420270247927
Bivariate (joint) SRMSE: 0.07020346848553674
[512, 1000, 2, 8, 0.01]
Univariate (marginal) SRMSE: 0.17827484136346955
Bivariate (joint) SRMSE: 0.06878227576790139
[512, 1000, 2, 8, 0.05]
Univariate (marginal) SRMSE: 0.21221146704044047
Bivariate (joint) SRMSE: 0.0806342191140838
[512, 1000, 2, 8, 0.1]
Univariate (marginal) SRMSE: 0.10707274886172925
Bivariate (joint) SRMSE: 0.0408052087427108
[512, 1000, 2, 8, 1.0]
Univariate (marginal) SRMSE: 0.23620614978666277
Bivariate (joint) SRMSE: 0.11621398270737046
[512, 1000, 2

Univariate (marginal) SRMSE: 0.105217552556737
Bivariate (joint) SRMSE: 0.04483078011301998
[1024, 1000, 3, 8, 0.1]
Univariate (marginal) SRMSE: 0.14948467426917575
Bivariate (joint) SRMSE: 0.05865380433095758
[1024, 1000, 3, 8, 1.0]
Univariate (marginal) SRMSE: 0.2354553599897616
Bivariate (joint) SRMSE: 0.11710711685456722
[1024, 1000, 3, 16, 0.01]
Univariate (marginal) SRMSE: 0.18019981102718072
Bivariate (joint) SRMSE: 0.0404438587453928
[1024, 1000, 3, 16, 0.05]
Univariate (marginal) SRMSE: 0.08481853527191582
Bivariate (joint) SRMSE: 0.028951097780146814
[1024, 1000, 3, 16, 0.1]
Univariate (marginal) SRMSE: 0.1553014557965534
Bivariate (joint) SRMSE: 0.029496138484919863
[1024, 1000, 3, 16, 1.0]
Univariate (marginal) SRMSE: 0.2350367119638734
Bivariate (joint) SRMSE: 0.11560108879704244
[1024, 1000, 3, 32, 0.01]
Univariate (marginal) SRMSE: 0.15902634509469893
Bivariate (joint) SRMSE: 0.05745973126819499
[1024, 1000, 3, 32, 0.05]
Univariate (marginal) SRMSE: 0.08743659108836774
B

In [None]:
# Train the model using a single set of hyperparameters.
# These are the "best results so far" values noted next to the grid definition.
# They are defined here so this cell (and the cells below that read LATENT_DIM)
# run on a fresh kernel.
BATCH_SIZE = 512
N_EPOCHS = 1000  # scalar epoch count; EPOCHS is the grid-search list and cannot be passed to fit()
LATENT_DIM = 3
HIDDEN_DIM = 16
KL_WEIGHT = .05

HH_IDX = len(CAT_LENGTHS)  # Number of household variables (used in CVAE)

# Discard validation points recorded by earlier training runs (e.g. the grid
# search) so the training-history plot only shows this run.
val_loss.clear()
val_epochs.clear()

start_time = time.time()

vae = VAE.VAE(MANIFEST_DIM, HIDDEN_DIM, LATENT_DIM, CAT_LENGTHS, HH_IDX, KL_WEIGHT)
vae.compile(optimizer=keras.optimizers.Adam())
history = vae.fit(train_data, epochs=N_EPOCHS, batch_size=BATCH_SIZE, callbacks=[validationCallback()])

train_time = (time.time()-start_time)
print(train_time)

In [None]:
# Training progress: the three losses logged by fit(), plus the sampled validation loss
for loss_key in ('tot_loss', 'rec_loss', 'kl_loss'):
    plt.plot(history.history[loss_key])
plt.plot(val_epochs, val_loss)  # recorded only on the epochs where the callback ran

plt.title("Training History")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend(["total","rec","kl","val"], loc="upper right")
plt.show()

## Testing

In [None]:
# Encode the test rows; the encoder returns three arrays, unpacked here as
# the latent mean, log-variance and the latent values themselves
encoder_outputs = vae.encoder.predict(test_data[:,:MANIFEST_DIM])
z_mean, z_logvar, z = encoder_outputs

# One histogram panel per latent variable
fig, axes = plt.subplots(nrows=1, ncols=LATENT_DIM, figsize=(10,3))
fig.tight_layout(w_pad=2, h_pad=10)

for i in range(LATENT_DIM):
    latent_ax = axes[i]
    sns.histplot(z[:,i], ax=latent_ax, bins=50, stat="probability")
    latent_ax.set_title(f"Latent Variable {i+1} in Test Data")

In [None]:
# Run the full model on the held-out test rows. `results` is reused by the
# following cells, which iterate over results[0] one variable at a time.
results = vae.predict(test_data)
# Reconstruction loss between the test rows and the model output, computed by the
# project helper using the per-variable class counts in CAT_LENGTHS
loss_cat = VAE.get_reconstruction_loss(test_data, results, CAT_LENGTHS)
print(f"Categorical Variable Loss: {loss_cat}")

In [None]:
# Convert the testing data from one-hot encoding back to category values.
# Previously this frame was decoded from the model output (results[0]), which made
# it identical to test_results_df and the comparison below meaningless.
# test_data is split into one column block per variable using CAT_LENGTHS; the
# position of the 1 in each row indexes into that variable's sorted value array.
# (pums_encodings_list comes from np.unique and the dummy columns from
# pd.get_dummies on the same column, so both are assumed to share sorted order.)
ohe_boundaries = np.cumsum(CAT_LENGTHS)[:-1]
test_data_df = pd.concat(
    [
        pd.Series(pums_encodings_list[i][np.argmax(ohe_block, axis=1)])
        for i, ohe_block in enumerate(np.split(test_data, ohe_boundaries, axis=1))
    ],
    axis=1,
)
test_data_df.columns = VAR_NAMES

# Convert the test results from the model's per-variable outputs to category values
test_results_df = pd.concat(
    [
        pd.DataFrame(np.apply_along_axis(DataProcessor.logit_to_val, 1, x, pums_encodings_list[i]))
        for i, x in enumerate(results[0])
    ],
    axis=1,
)
test_results_df.columns = VAR_NAMES

In [None]:
# Per-variable histograms: model output (left column) next to test data (right column)
fig, axes = plt.subplots(nrows=len(VAR_NAMES), ncols=2, figsize=(10,10))
fig.tight_layout(h_pad=4)

for i, col in enumerate(VAR_NAMES):
    # Both panels share the x-range of the test data column
    x_limits = (min(test_data_df[col]), max(test_data_df[col]))
    for panel_idx, frame in enumerate((test_results_df, test_data_df)):
        panel_ax = axes[i,panel_idx]
        sns.histplot(frame[col], ax=panel_ax, stat='probability', bins=100).set(xlim=x_limits)
        panel_ax.set_title(col)
        panel_ax.set_xlabel(None)

## Generating Synthetic Population

In [None]:
# Size of the synthetic household population to generate
POP_SIZE = 100000

# One row of standard-normal latent values per synthetic household
latent_shape = (POP_SIZE, LATENT_DIM)
inputs = np.random.normal(loc=0, scale=1, size=latent_shape)

# The decoder takes two inputs; the second is a zero-column slice (nothing is actually in it)
empty_second_input = inputs[:,LATENT_DIM:]

start_time = time.time()

# Decode every latent row into a household and report how long it took
results = vae.decoder.predict([inputs, empty_second_input])

print(f"--- {time.time() - start_time} seconds ---")

In [None]:
# One-hot form of the generated households, saved as conditional inputs for the person CVAE.
# Columns are named "<variable>_<position>" within each variable's block.
ohe_frames = [
    pd.DataFrame(np.apply_along_axis(DataProcessor.logit_to_ohe, 1, x)).add_prefix(f"{VAR_NAMES[i]}_")
    for i, x in enumerate(results[0])
]
synth_hh_ohe_df = pd.concat(ohe_frames, axis=1)

# Category-value form of the same households: one column per variable
value_frames = [
    pd.DataFrame(
        np.apply_along_axis(DataProcessor.logit_to_val, 1, x, pums_encodings_list[i])
    ).add_prefix(f"{VAR_NAMES[i]}_")
    for i, x in enumerate(results[0])
]
synth_hh_df = pd.concat(value_frames, axis=1)
synth_hh_df.columns = VAR_NAMES

# Write both representations to disk without the row index
synth_hh_df.to_csv('data/synthetic_populations/wa_households.csv', index=False)
synth_hh_ohe_df.to_csv('data/synthetic_populations/wa_households_raw.csv', index=False)

In [None]:
# Empirical CDF of every household variable: PUMS vs. VAE-generated population
fig, axes = plt.subplots(nrows=len(VAR_NAMES), ncols=1, figsize=(5,10))
fig.tight_layout(h_pad=4)

for i, var in enumerate(VAR_NAMES):
    # Both curves are clipped to the range observed in the PUMS data
    x_limits = (min(pums_data[var]), max(pums_data[var]))
    sns.ecdfplot(data=pums_data, x=var, ax=axes[i]).set(xlim=x_limits)
    sns.ecdfplot(data=synth_hh_df, x=var, ax=axes[i]).set(xlim=x_limits)
    axes[i].set_title(var)
    axes[i].set_xlabel(None)
    # Attach the legend to this subplot; plt.legend() acts on the current axes
    # (the last subplot created), so the other panels never received one.
    axes[i].legend(labels=['pums','vae'])

In [None]:
# Per-variable histograms: synthetic households (left column) next to PUMS (right column)
fig, axes = plt.subplots(nrows=len(VAR_NAMES), ncols=2, figsize=(10,10))
fig.tight_layout(h_pad=2, w_pad=2)

for i, col in enumerate(VAR_NAMES):
    # Both panels share the x-range of the PUMS column
    x_limits = (min(pums_data[col]), max(pums_data[col]))
    for panel_idx, frame in enumerate((synth_hh_df, pums_data)):
        panel_ax = axes[i,panel_idx]
        sns.histplot(frame[col], ax=panel_ax, stat='probability', bins=100).set(xlim=x_limits)
        panel_ax.set_title(col)
        panel_ax.set_xlabel(None)

In [None]:
# Score the generated population against a PUMS sample of the same size.
# The sample is seeded (random_state=42, as in testModel) so both metrics use the
# same reference rows and the numbers are reproducible and comparable with the
# grid-search results.
SRMSE.calculateSRMSE(synth_hh_df, pums_data.sample(POP_SIZE, random_state=42))
SRMSE.calculateBivariateSRMSE(synth_hh_df, pums_data.sample(POP_SIZE, random_state=42))