In [None]:
import os
import sys
sys.path.append(os.path.join(os.getcwd(), '..'))

import numpy as np
import pandas as pd

# Import data and create DataFrame of data to augment.
# Each .npz archive is opened exactly once and closed as soon as both arrays
# have been read, instead of being re-opened per key and left open.
with np.load('../data/processed/train.npz') as train_npz:
    X_train = train_npz['x']
    y_train = train_npz['y']

with np.load('../data/processed/v.npz') as v_npz:
    X_v = v_npz['x']
    y_v = v_npz['y']

In [None]:
from sdv.metadata import SingleTableMetadata

# Training table: one column per feature of X_train (default integer column
# labels) followed by the label column, stored under the name 'target'.
X_train_df = pd.DataFrame(X_train)
X_train_df['target'] = y_train

# Let SDV infer the metadata (categorical vs. numeric features) directly
# from the assembled frame.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(X_train_df)

In [None]:
from sdv.single_table import TVAESynthesizer

# Weights handed to the synthesizer: the 'gain' column of the stored
# feature-importance table.
# NOTE(review): presumably one weight per modelled column, in column order;
# confirm against the TVAESynthesizer implementation in use.
feature_weights = pd.read_csv('../outputs/results/feature_importances.csv')['gain'].values

# NOTE(review): save_path, patience and weights do not look like stock SDV
# TVAESynthesizer arguments, so this presumably targets a project-specific
# build of the library. Verify before upgrading SDV.
tvae_settings = dict(
    embedding_dim=64,
    compress_dims=(128, 128),
    decompress_dims=(128, 128),
    epochs=500,
    batch_size=100000,
    patience=50,
    cuda='cuda:2',
    verbose=True,
    save_path='../outputs/synthesizers/tvae_tot.pkl',
    weights=feature_weights,
)

synthesizer = TVAESynthesizer(metadata, **tvae_settings)  # metadata is required
synthesizer.fit(X_train_df)

In [None]:
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt

losses = synthesizer.get_loss_values().groupby('Epoch').mean()

plt.rc('font', family='serif')
plt.rc('axes', grid=True)
plt.rcParams['grid.color'] = (0.5, 0.5, 0.5, 0.2)
plt.rc('ytick', direction='out', color='gray')
plt.rc('xtick', direction='out', color='gray')
plt.rcParams.update({'font.size': 12})

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
axes[0].plot(range(1, len(losses) + 1), losses['Loss'], lw=1, color='lightskyblue')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss')

axes[1].plot(range(1, len(losses) + 1), losses['KS'], lw=1, color='lightskyblue')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Weighted Kolgomorov-Smirnov statistic')

fig.savefig('../outputs/figures/tvae_loss_tot.pdf', dpi=300, bbox_inches='tight');

In [None]:
from ctgan import TVAE

# Rebind `synthesizer` to the model stored on disk, read from the same path
# that was passed as save_path when the synthesizer was configured.
# NOTE(review): loading presumably unpickles the file, so only point this at
# artifacts produced by this project.
synthesizer = TVAE.load(
    '../outputs/synthesizers/tvae_tot.pkl'
)

In [None]:
# Draw as many synthetic rows as there are real training rows.
synthetic_data = synthesizer.sample(len(X_train_df))

# Use string column labels on both frames so they can be indexed uniformly
# with '0' and 'target'. X_train_df is relabelled in place.
X_train_df.columns = [str(c) for c in X_train_df.columns]
synthetic_data.columns = [str(c) for c in synthetic_data.columns]

# Feature 0, split by class (target == 1: defaulters, target == 0: non-defaulters).
true_def_0 = X_train_df['0'][X_train_df['target'] == 1]
synth_def_0 = synthetic_data['0'][synthetic_data['target'] == 1]

true_nondef_0 = X_train_df['0'][X_train_df['target'] == 0]
synth_nondef_0 = synthetic_data['0'][synthetic_data['target'] == 0]

# Feature 0: 30 equal-width bins spanning [min_v, max_v].
# np.linspace pins the number of edges to n_bins + 1. Building the edges with
# np.arange and a fractional step can add a spurious extra edge through
# floating-point rounding.
min_v, max_v = -0.5, 1.1
n_bins = 30
bin_width = (max_v - min_v) / n_bins
bin_edges = np.linspace(min_v, max_v, n_bins + 1)

# Styling shared by every histogram in the figure.
hist_style = dict(bins=bin_edges, edgecolor='dimgray', lw=0.1, alpha=0.6, density=True)
y_ticks = [0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
panels = (
    (axes[0], true_def_0, synth_def_0, 'Feature 0 - defaulters'),
    (axes[1], true_nondef_0, synth_nondef_0, 'Feature 0 - non-defaulters'),
)
for ax, true_values, synth_values, xlabel in panels:
    # Real data in blue, synthetic data in orange, overlaid on the same bins.
    ax.hist(true_values.values, color='lightskyblue', label='True', **hist_style)
    ax.hist(synth_values.values, color='sandybrown', label='Synthetic', **hist_style)
    ax.set_xlabel(xlabel)
    # Called exactly once per axes, as in the original cell.
    ax.grid(axis='x')
    ax.set_yticks(y_ticks)

# The y label and the legend are shown on the left panel only.
axes[0].set_ylabel('Density')
axes[0].legend(loc='upper left')

fig.savefig('../outputs/figures/tvae_hist_tot.pdf', dpi=300, bbox_inches='tight');