In [None]:
import os
import warnings

# Ignore warnings whose text starts with one of these two messages.
warnings.filterwarnings('ignore', message='numpy.dtype size changed')
warnings.filterwarnings('ignore', message='compiletime version 3.5 of module')

# Capture the starting directory only once per kernel: re-running this cell then
# keeps changing into the same parent instead of climbing one level higher each time.
if 'workbookDir' not in globals():
    workbookDir = os.getcwd()
os.chdir(os.path.dirname(workbookDir))

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from scipy.stats import ks_2samp
from synthesized.testing.synthetic_distributions import *

Copyright (C) Synthesized Ltd. - All Rights Reserved
License key: EE6B-6720-67A2-32F3-3139-2D31-322D-B531
Expires at: 2019-12-31 00:00:00

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [3]:
def log_values(synthesizer):
    """Print a header, then one line per entry of ``synthesizer.values``.

    Each line shows the entry's ``name`` attribute followed by the entry itself
    (as rendered by ``print``); a blank line closes the listing.
    """
    indent = ' '
    print(indent, 'value types:')
    for entry in synthesizer.values:
        print(indent, entry.name, entry)
    print()

def log_distance(data, synthesized):
    """Print the two-sample Kolmogorov-Smirnov statistic between the ``'x'`` columns.

    Both arguments are indexed with ``['x']``; nothing is returned.
    """
    ks_result = ks_2samp(data['x'], synthesized['x'])
    print(' ', 'distance:', ks_result.statistic)

In [4]:
def synthesize(data, num_iterations, num_logging, fn_logging=log_distance, **kwargs):
    """Train a ``BasicSynthesizer`` on ``data`` and return a freshly synthesized sample.

    Training is split into chunks of ``num_logging`` iterations. Before training and
    after every chunk, ``len(data)`` rows are synthesized and handed to ``fn_logging``
    together with the original data.

    Args:
        data: Training data; passed to ``BasicSynthesizer`` and to ``learn``. Must
            support ``len()``.
        num_iterations: Total number of training iterations to run.
        num_logging: Number of training iterations between two logging calls
            (must be at least 1).
        fn_logging: Callable ``fn_logging(data, synthesized)`` invoked for every
            logging step.
        **kwargs: Forwarded unchanged to ``BasicSynthesizer``.

    Returns:
        The result of a final ``synthesizer.synthesize(n=len(data))`` call.

    Raises:
        ValueError: If ``num_logging`` is smaller than 1.
    """
    # Imported locally so the timing does not depend on `time` leaking into the
    # notebook namespace through a star import.
    import time

    if num_logging < 1:
        raise ValueError('num_logging must be >= 1, got {!r}'.format(num_logging))

    start = time.time()
    with BasicSynthesizer(data, **kwargs) as synthesizer:
        log_values(synthesizer)
        # Baseline measurement of the untrained model.
        synthesized = synthesizer.synthesize(n=len(data))
        fn_logging(data, synthesized)

        remaining = num_iterations
        while remaining > 0:
            # The last chunk may be shorter, so that exactly `num_iterations`
            # iterations are trained even when it is not a multiple of `num_logging`.
            chunk = min(num_logging, remaining)
            synthesizer.learn(data=data, num_iterations=chunk)
            synthesized = synthesizer.synthesize(n=len(data))
            fn_logging(data, synthesized)
            remaining -= chunk

        print()
        print(' ', 'took', time.time() - start, 's')
        return synthesizer.synthesize(n=len(data))

def plot():
    """Record the mean KS distance and draw original vs. synthesized data side by side.

    Takes no arguments: ``synthesizer``, ``data``, ``synthesized``, ``name`` and
    ``evaluation`` are all read from the surrounding namespace, as are ``np``,
    ``plt`` and ``_plot_data``. The mean of the per-column KS statistics is stored
    under ``evaluation[name + '_avg_distance']``.

    NOTE(review): the visible cells never assign ``synthesizer``, ``name`` or
    ``evaluation`` at top level, nor import ``np``/``plt``/``_plot_data`` explicitly;
    confirm they exist before calling this function.
    """
    kinds_by_column = dict()
    for value in synthesizer.values:
        kinds_by_column[value.name] = type(value)

    ks_statistics = []
    for column in data.columns:
        ks_statistics.append(ks_2samp(data[column], synthesized[column])[0])
    evaluation[name + '_avg_distance'] = np.mean(ks_statistics)

    figure, axes = plt.subplots(1, 2, figsize=(15, 5), sharex=True, sharey=True)
    original_ax, synthesized_ax = axes
    for axis, title in ((original_ax, 'original'), (synthesized_ax, 'synthesized')):
        axis.set_title(title)
    for frame, axis in ((data, original_ax), (synthesized, synthesized_ax)):
        _plot_data(frame, ax=axis, value_types=kinds_by_column)

In [5]:
# Number of rows requested from the data generators in the cells that use `size`.
size = 10000


def fn_logging(data, synthesized):
    """Print the KS statistic of the ``'x'`` columns on the current output line.

    The value is formatted with three decimals and followed by one ``*`` when it
    is below 0.05 and two when it is below 0.01. The output ends with a space
    rather than a newline, so successive calls build up a single row.
    """
    distance = ks_2samp(data['x'], synthesized['x']).statistic
    stars = ''.join('*' for threshold in (0.05, 0.01) if distance < threshold)
    print('  {:.3f}{}'.format(distance, stars), end=' ')

### Bernoulli

In [6]:
# Bernoulli experiment: generate the data set, then train for 1000 iterations
# and report the KS distance via `fn_logging` after every 20 of them.
data = create_bernoulli(probability=0.1, size=size)
synthesized = synthesize(
    data=data,
    summarizer=True,
    num_iterations=1000,
    num_logging=20,
    fn_logging=fn_logging,
    # --- encoder/decoder network ---
    network_type='mlp',
    capacity=512,
    depth=2,
    layer_type='dense',
    batchnorm=True,
    activation='relu',
    weight_decay=1e-3,
    # --- latent encoding ---
    encoding_type='variational',
    encoding_size=512,
    encoding_kwargs=dict(beta=5.0),
    # --- optimizer ---
    optimizer='adam',
    learning_rate=1e-5,
    decay_steps=200,
    decay_rate=0.5,
    clip_gradients=1.0,
    batch_size=128,
    # --- categorical value settings ---
    smoothing=0.0,
    moving_average=True,
    similarity_regularization=0.0,
    entropy_regularization=0.05
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.
  value types:
  x categorical2-281

  0.456   0.322   0.227   0.164   0.110   0.079   0.060   0.048*   0.037*   0.031*   0.025*   0.016*   0.018*   0.015*   0.008**   0.012*   0.009**   0.009**   0.006**   0.010*   0.006**   0.003**   0.006**   0.006**   0.006**   0.009**   0.012*   0.004**   0.000**   0.002**   0.004**   0.002**   0.007**   0.001**   0.003**   0.001**   0.002**   0.003**   0.002**   0.000**   0.001**   0.003**   0.002**   0.001**   0.001**   0.004**   0.004**   0.003**   0.003**   0.000**   0.001** 
  took 28.573664665222168 s


### Categorical

In [7]:
# Categorical experiment with five classes: same training setup as the Bernoulli
# run (1000 iterations, logged every 20), except for `entropy_regularization`.
data = create_categorical(probabilities=[0.5, 0.25, 0.125, 0.0625, 0.0625], size=size)
synthesized = synthesize(
    data=data,
    summarizer=True,
    num_iterations=1000,
    num_logging=20,
    fn_logging=fn_logging,
    # --- encoder/decoder network ---
    network_type='mlp',
    capacity=512,
    depth=2,
    layer_type='dense',
    batchnorm=True,
    activation='relu',
    weight_decay=1e-3,
    # --- latent encoding ---
    encoding_type='variational',
    encoding_size=512,
    encoding_kwargs=dict(beta=5.0),
    # --- optimizer ---
    optimizer='adam',
    learning_rate=1e-5,
    decay_steps=200,
    decay_rate=0.5,
    clip_gradients=1.0,
    batch_size=128,
    # --- categorical value settings ---
    smoothing=0.0,
    moving_average=True,
    similarity_regularization=0.0,
    entropy_regularization=0.16
)

  value types:
  x categorical5-458

  0.449   0.400   0.336   0.266   0.211   0.163   0.125   0.091   0.080   0.070   0.061   0.056   0.049*   0.049*   0.041*   0.041*   0.035*   0.032*   0.023*   0.030*   0.017*   0.018*   0.020*   0.018*   0.012*   0.011*   0.012*   0.007**   0.010*   0.010*   0.008**   0.014*   0.009**   0.009**   0.014*   0.016*   0.017*   0.014*   0.015*   0.017*   0.022*   0.016*   0.022*   0.023*   0.016*   0.023*   0.014*   0.014*   0.015*   0.019*   0.020* 
  took 30.78738307952881 s


### 1-d Gaussian

In [None]:
# `evaluation` is not assigned anywhere in the visible notebook, so on a fresh
# kernel the call below would fail with a NameError. Create it once if missing.
# NOTE(review): assumes a plain dict is acceptable, matching how `plot()` writes
# `evaluation[name + '_avg_distance']` — confirm against `synthesize_and_plot`.
if 'evaluation' not in globals():
    evaluation = dict()

# Use the notebook-wide `size` rather than repeating the literal 10000.
data = create_1d_gaussian(mean=1000, std=100, size=size)
synthesize_and_plot(data, name='1d_gaussian', evaluation=evaluation)