In [None]:
import os
import warnings

# Ignore warnings whose message matches one of these two patterns.
warnings.filterwarnings(action='ignore', message='numpy.dtype size changed')
warnings.filterwarnings(action='ignore', message='compiletime version 3.5 of module')

# Record the starting directory only once, so that re-running this cell does not
# climb one directory higher on every execution; then work from its parent directory.
if 'workbookDir' not in globals():
    workbookDir = os.getcwd()
os.chdir(os.path.dirname(workbookDir))

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from scipy.stats import ks_2samp
import seaborn as sns
from synthesized import BasicSynthesizer
from synthesized.testing.synthetic_distributions import *
from synthesized.testing.synthetic_distributions import _plot_data

In [None]:
def log_values(synthesizer):
    """Print the name and string form of every entry in ``synthesizer.values``.

    The listing is introduced by a ``value types:`` header, every line is indented,
    and a blank line is printed at the end.
    """
    indent = ' '
    print(indent, 'value types:')
    for entry in synthesizer.values:
        print(indent, entry.name, entry)
    print()

def log_distance(df_original, df_synthesized):
    """Print the two-sample Kolmogorov-Smirnov statistic of every column on one line.

    For each name obtained by iterating ``df_original``, the KS statistic between
    ``df_original[name]`` and ``df_synthesized[name]`` is formatted with three decimals.
    One ``*`` is appended when it is below 0.05 and a second one when it is below 0.01.
    The per-column entries are joined with ``/`` and followed by two spaces instead of a
    newline, so successive calls accumulate on the same output line.
    """
    def _describe(column):
        ks_stat = ks_2samp(df_original[column], df_synthesized[column]).statistic
        if ks_stat < 0.01:
            stars = '**'
        elif ks_stat < 0.05:
            stars = '*'
        else:
            stars = ''
        return '{:.3f}{}'.format(ks_stat, stars)

    entries = [_describe(column) for column in df_original]
    print('/'.join(entries), end='  ')

In [None]:
# Value passed as `size=` to every create_* dataset helper in the cells below.
size = 100000

def synthesize(df_original, num_iterations, num_logging, fn_logging=log_distance, **kwargs):
    """Train a ``BasicSynthesizer`` on ``df_original`` and report progress while training.

    Training is split into chunks of ``num_logging`` iterations. ``fn_logging`` is called
    once before any training and once after every chunk, each time with a freshly
    synthesized sample of ``len(df_original)`` rows. If ``num_iterations`` is not a
    multiple of ``num_logging``, the leftover iterations are run as a final, shorter
    chunk instead of being silently dropped.

    Args:
        df_original: Data to learn from; its length sets the number of synthesized rows.
        num_iterations: Total number of training iterations.
        num_logging: Number of training iterations between two progress reports.
        fn_logging: Callable invoked as ``fn_logging(df_original, df_synthesized)``.
        **kwargs: Forwarded unchanged to ``BasicSynthesizer``.

    Returns:
        Tuple ``(df_synthesized, value_types)``: a final synthesized sample and a dict
        mapping each value's name to the type of the value object.

    Raises:
        ValueError: If ``num_logging`` is not a positive number.
    """
    # Local import: the notebook never imports ``time`` explicitly and would otherwise
    # depend on the star import happening to expose it.
    import time

    if num_logging <= 0:
        raise ValueError('num_logging must be positive, got {!r}'.format(num_logging))

    # Full chunks of ``num_logging`` iterations, plus one shorter chunk for the rest.
    num_chunks, remainder = divmod(max(num_iterations, 0), num_logging)
    chunk_sizes = [num_logging] * num_chunks
    if remainder:
        chunk_sizes.append(remainder)

    num_rows = len(df_original)
    start = time.time()
    with BasicSynthesizer(df_original, **kwargs) as synthesizer:
        log_values(synthesizer)
        value_types = {value.name: type(value) for value in synthesizer.values}

        # Baseline report before any training has happened.
        df_synthesized = synthesizer.synthesize(num_rows=num_rows)
        fn_logging(df_original, df_synthesized)

        for chunk_size in chunk_sizes:
            synthesizer.learn(data=df_original, num_iterations=chunk_size)
            df_synthesized = synthesizer.synthesize(num_rows=num_rows)
            assert len(df_synthesized) == num_rows
            fn_logging(df_original, df_synthesized)

        print()
        print(' ', 'took', time.time() - start, 's')
        # A fresh sample is drawn for the caller, as in the original implementation.
        return synthesizer.synthesize(num_rows=num_rows), value_types

def plot(data, synthesized, value_types):
    """Draw the original and the synthesized data in two panels of one figure.

    The left panel is titled ``original`` and the right one ``synthesized``; both
    panels share their x and y axes. The actual drawing is delegated to ``_plot_data``,
    which receives ``value_types`` unchanged for both panels.
    """
    _fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharex=True, sharey=True)
    panels = (('original', data), ('synthesized', synthesized))
    for axis, (title, frame) in zip(axes, panels):
        axis.set_title(title)
        _plot_data(frame, ax=axis, value_types=value_types)

## 1. Single-column categorical

Interesting hyperparameters: entropy regularization, learning rate and decay, beta loss.

Only change between different categorical distributions: entropy regularization.

### (a) Bernoulli(0.3)

In [None]:
# Experiment 1(a): dataset from create_bernoulli with probability=0.3.
# `synthesize` trains for 1000 iterations and reports after every 20; every other
# keyword argument is forwarded to BasicSynthesizer through **kwargs.
df_original = create_bernoulli(probability=0.3, size=size)
df_synthesized, value_types = synthesize(
    df_original=df_original, summarizer='summaries', num_iterations=1000, num_logging=20,
    # VAE distribution
    distribution='normal', latent_size=512,
    # Network
    network='mlp', capacity=256, depth=4, batchnorm=True, activation='relu',
    # Optimizer
    optimizer='adam', learning_rate=1e-4, decay_steps=500, decay_rate=0.5,
    clip_gradients=1.0, batch_size=128,
    # Losses
    categorical_weight=1.0, continuous_weight=0.1, beta=5e-3, weight_decay=3e-5,
    # Categorical
    smoothing=0.0, moving_average=False, similarity_regularization=0.0,
    entropy_regularization=0.0
)

### (b) Bernoulli(0.1)

In [None]:
# Experiment 1(b): dataset from create_bernoulli with probability=0.1.
# The synthesize arguments are the same as in experiment 1(a); only the dataset differs.
df_original = create_bernoulli(probability=0.1, size=size)
df_synthesized, value_types = synthesize(
    df_original=df_original, summarizer='summaries', num_iterations=1000, num_logging=20,
    # VAE distribution
    distribution='normal', latent_size=512,
    # Network
    network='mlp', capacity=256, depth=4, batchnorm=True, activation='relu',
    # Optimizer
    optimizer='adam', learning_rate=1e-4, decay_steps=500, decay_rate=0.5,
    clip_gradients=1.0, batch_size=128,
    # Losses
    categorical_weight=1.0, continuous_weight=0.1, beta=5e-3, weight_decay=3e-5,
    # Categorical
    smoothing=0.0, moving_average=False, similarity_regularization=0.0,
    entropy_regularization=0.0
)

### (c) Categorical([0.5, 0.25, 0.125, ...])

In [29]:
# Experiment 1(c): dataset from create_categorical with five class probabilities.
# Compared with the Bernoulli cells, this call uses a larger learning rate, faster decay
# and larger loss weights, sets weight_decay to 0.0, and additionally passes
# `initial_boost` and `temperature`.
df_original = create_categorical(probabilities=[0.5, 0.25, 0.125, 0.0625, 0.0625], size=size)
df_synthesized, value_types = synthesize(
    df_original=df_original, summarizer='summaries', num_iterations=1000, num_logging=20,
    # VAE distribution
    distribution='normal', latent_size=512,
    # Network
    network='mlp', capacity=256, depth=4, batchnorm=True, activation='relu',
    # Optimizer
    optimizer='adam', learning_rate=3e-4, decay_steps=300, decay_rate=0.5,
    initial_boost=True, clip_gradients=1.0, batch_size=128,
    # Losses
    categorical_weight=40.0, continuous_weight=12.0, beta=5e-3, weight_decay=0.0,
    # Categorical
    temperature=0.2, smoothing=0.0, moving_average=False,
    similarity_regularization=0.0, entropy_regularization=0.0
)

  value types:
  x categorical5-229

0.479  0.051  0.065  0.045*  0.051  0.064  0.043*  0.039*  0.026*  0.031*  0.033*  0.033*  0.026*  0.033*  0.031*  0.028*  0.033*  0.031*  0.027*  0.030*  0.035*  0.030*  0.025*  0.017*  0.009**  0.007**  0.010**  0.014*  0.012*  0.013*  0.014*  0.014*  0.015*  0.016*  0.016*  0.016*  0.016*  0.016*  0.015*  0.013*  0.013*  0.015*  0.014*  0.015*  0.014*  0.013*  0.011*  0.010*  0.010**  0.011*  0.012*  
  took 77.79881310462952 s


## 2. Single-column continuous

Interesting hyperparameters: learning rate and decay, beta loss

### (a) Gaussian(0, 1)

In [None]:
# Experiment 2(a): dataset from create_1d_gaussian with mean=0.0 and std=1.0.
# Relative to the categorical cells, the continuous loss weight and beta are raised
# (continuous_weight=20.0, beta=1.0) and weight decay is disabled.
df_original = create_1d_gaussian(mean=0.0, std=1.0, size=size)
df_synthesized, value_types = synthesize(
    df_original=df_original, summarizer='summaries', num_iterations=1000, num_logging=20,
    # VAE distribution
    distribution='normal', latent_size=512,
    # Network
    network='mlp', capacity=256, depth=4, batchnorm=True, activation='relu',
    # Optimizer
    optimizer='adam', learning_rate=1e-4, decay_steps=500, decay_rate=0.5,
    clip_gradients=1.0, batch_size=128,
    # Losses
    categorical_weight=1.0, continuous_weight=20.0, beta=1.0, weight_decay=0.0,
    # Categorical
    smoothing=0.0, moving_average=False, similarity_regularization=0.0,
    entropy_regularization=0.0
)

  value types:
  x continuous-gamma

0.249  0.147  0.176  0.151  0.156  0.156  0.159  0.134  0.140  0.140  0.129  0.126  0.147  0.115  0.133  0.119  0.106  0.104  0.112  0.098  

### (b) Gaussian mixture N(0, 1) and N(3, 1)

Please note that distribution fitting is not stable: for the same parameters of the original distribution, we can get a different fitted distribution. In this case we should not actually fit any distribution at all. Should we decrease the threshold for distribution fitting?

In [None]:
# Experiment 2(b): dataset from create_two_gaussian_mixtures with components
# (mean 0, std 1) and (mean 3, std 1).
# Uses the hyperparameters of experiment 1(a) except for a larger learning rate (1e-3).
df_original = create_two_gaussian_mixtures(mean1=0.0, std1=1.0, mean2=3.0, std2=1.0, size=size)
df_synthesized, value_types = synthesize(
    df_original=df_original, summarizer='summaries', num_iterations=1000, num_logging=20,
    # VAE distribution
    distribution='normal', latent_size=512,
    # Network
    network='mlp', capacity=256, depth=4, batchnorm=True, activation='relu',
    # Optimizer
    optimizer='adam', learning_rate=1e-3, decay_steps=500, decay_rate=0.5,
    clip_gradients=1.0, batch_size=128,
    # Losses
    categorical_weight=1.0, continuous_weight=0.1, beta=5e-3, weight_decay=3e-5,
    # Categorical
    smoothing=0.0, moving_average=False, similarity_regularization=0.0,
    entropy_regularization=0.0
)

#### (i) Production-like settings which should produce a reasonable result

In [None]:
# Experiment 2(b)(i): the same Gaussian-mixture dataset parameters as in 2(b), with the
# settings the heading calls production-like: no summarizer, 5000 iterations reported
# every 100, a smaller network (capacity=128, depth=2), batch_size=64, slower learning
# rate decay and moving_average=True.
df_original = create_two_gaussian_mixtures(mean1=0.0, std1=1.0, mean2=3.0, std2=1.0, size=size)
df_synthesized, value_types = synthesize(
    df_original=df_original, summarizer=None, num_iterations=5000, num_logging=100,
    # VAE distribution
    distribution='normal', latent_size=512,
    # Network
    network='mlp', capacity=128, depth=2, batchnorm=True, activation='relu',
    # Optimizer
    optimizer='adam', learning_rate=3e-4, decay_steps=1000, decay_rate=0.9,
    clip_gradients=1.0, batch_size=64,
    # Losses
    categorical_weight=1.0, continuous_weight=0.1, beta=0.001, weight_decay=1e-5,
    # Categorical
    smoothing=0.0, moving_average=True, similarity_regularization=0.0,
    entropy_regularization=0.0
)

In [None]:
# Histograms with 100 bins of the original data and of the synthesized data, for a
# visual comparison of the result of the most recently run experiment cell.
df_original.hist(bins=100)
df_synthesized.hist(bins=100)

## 3. Two-column product distributions

### (a) Bernoulli(0.1) x Bernoulli(0.1)

Same hyperparameters as for the single-column case, but with a slightly increased entropy regularization value.

In [None]:
# Experiment 3(a): `product` of two create_bernoulli datasets, both with probability=0.1.
# Trained for 1000 iterations with a report after every 20; no summarizer is used.
df_original = product(
    df1=create_bernoulli(probability=0.1, size=size),
    df2=create_bernoulli(probability=0.1, size=size)
)
df_synthesized, value_types = synthesize(
    df_original=df_original, summarizer=None, num_iterations=1000, num_logging=20,
    # VAE distribution
    distribution='normal', latent_size=512,
    # Network
    network='mlp', capacity=256, depth=2, batchnorm=True, activation='relu',
    # Optimizer
    optimizer='adam', learning_rate=1e-5,
    decay_steps=200,  # !!! NOTE(review): flagged by the author without explanation; confirm why
    decay_rate=0.5, clip_gradients=1.0, batch_size=128,
    # Losses
    categorical_weight=1.0, continuous_weight=0.1, beta=5.0, weight_decay=1e-3,
    # Categorical
    smoothing=0.0, moving_average=True, similarity_regularization=0.0,
    entropy_regularization=0.06
)

### (b) Gaussian(1, 1) x Gaussian(-1, 1)

In [None]:
# Experiment 3(b): `product` of two create_1d_gaussian datasets (means 1.0 and -1.0,
# both with std=1.0).
# NOTE(review): from this cell on, the keyword names differ from the earlier cells
# (`network_type`, `encoding_type`, `encoding_size`, `encoding_kwargs` instead of
# `network`, `distribution`, `latent_size`, `beta`) — presumably written against a
# different BasicSynthesizer version; confirm which set the installed version accepts.
df_original = product(
    df1=create_1d_gaussian(mean=1.0, std=1.0, size=size),
    df2=create_1d_gaussian(mean=-1.0, std=1.0, size=size)
)
df_synthesized, value_types = synthesize(
    df_original=df_original, num_iterations=1000, num_logging=20,
    # encoder/decoder
    network_type='mlp', capacity=512, depth=2, layer_type='dense',
    batchnorm=True, activation='relu', weight_decay=1e-3,
    # encoding
    encoding_type='variational', encoding_size=512, encoding_kwargs=dict(beta=0.0004),
    # optimizer
    optimizer='adam', learning_rate=3e-3, decay_steps=50, decay_rate=0.5,
    clip_gradients=1.0, batch_size=128
)

### (c) Bernoulli x Gaussian

#### (i) Uniform loss weights

In [None]:
# Experiment 3(c)(i): `product` of a create_bernoulli dataset (probability=0.1) and a
# create_1d_gaussian dataset (mean=0.0, std=1.0), trained with equal categorical and
# continuous loss weights (both 1.0).
df_original = product(
    df1=create_bernoulli(probability=0.1, size=size),
    df2=create_1d_gaussian(mean=0.0, std=1.0, size=size)
)
df_synthesized, value_types = synthesize(
    df_original=df_original, num_iterations=1000, num_logging=20,
    # encoder/decoder
    network_type='mlp', capacity=512, depth=2, layer_type='dense',
    batchnorm=True, activation='relu', weight_decay=1e-3,
    # encoding
    encoding_type='variational', encoding_size=512, encoding_kwargs=dict(beta=5.0),
    # optimizer
    optimizer='adam', learning_rate=1e-5, decay_steps=200, decay_rate=0.5,
    clip_gradients=1.0, batch_size=128,
    # losses
    categorical_weight=1.0, continuous_weight=1.0,
    # categorical
    smoothing=0.0, moving_average=True, similarity_regularization=0.0,
    entropy_regularization=0.05
)

#### (ii) Biased loss weights

In [None]:
# Experiment 3(c)(ii): the same Bernoulli x Gaussian dataset parameters as in 3(c)(i),
# but with the continuous loss weighted far below the categorical one (0.04 vs 1.0),
# a deeper network (depth=4), a larger batch (512) and no weight decay or entropy
# regularization.
df_original = product(
    df1=create_bernoulli(probability=0.1, size=size),
    df2=create_1d_gaussian(mean=0.0, std=1.0, size=size)
)
df_synthesized, value_types = synthesize(
    df_original=df_original, summarizer='summaries', num_iterations=1000, num_logging=20,
    # encoder/decoder
    network_type='mlp', capacity=512, depth=4, layer_type='dense',
    batchnorm=True, activation='relu', weight_decay=0.0,
    # encoding
    encoding_type='variational', encoding_size=512, encoding_kwargs=dict(beta=2e-5),
    # optimizer
    optimizer='adam', learning_rate=3e-4, decay_steps=300, decay_rate=0.5,
    clip_gradients=1.0, batch_size=512,
    # losses
    categorical_weight=1.0, continuous_weight=0.04,
    # categorical
    smoothing=0.0, moving_average=True, similarity_regularization=0.0,
    entropy_regularization=0.0
)

In [None]:
# Category counts of column 'x1' in the original data. The series is passed as `x=`
# explicitly because newer seaborn releases no longer accept it as a positional `x`.
sns.countplot(x=df_original['x1'])

In [None]:
# Category counts of column 'x1' in the synthesized data. The series is passed as `x=`
# explicitly because newer seaborn releases no longer accept it as a positional `x`.
sns.countplot(x=df_synthesized['x1'])

In [None]:
# Distribution plots of column 'x2' for the original and the synthesized data, issued
# one after the other in the same cell.
# NOTE(review): `distplot` is deprecated in newer seaborn releases — confirm the
# installed version before re-running.
sns.distplot(df_original['x2'])
sns.distplot(df_synthesized['x2'])