In [34]:
import os
import warnings

# Suppress two specific warnings (matched against the start of the warning message)
# so they do not clutter the cell outputs.
warnings.filterwarnings(action='ignore', message='numpy.dtype size changed')
warnings.filterwarnings(action='ignore', message='compiletime version 3.5 of module')

# Record the notebook's starting directory only once: because of this guard,
# re-running the cell does not walk one directory further up on every execution.
if not 'workbookDir' in globals():
    workbookDir = os.getcwd()
# Switch to the parent of the starting directory (presumably the project root, so
# that the `synthesized` package can be imported -- TODO confirm).
os.chdir(os.path.split(workbookDir)[0])

# Reload modified modules automatically before executing code (autoreload mode 2)
# and render matplotlib figures inline in the notebook.
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
from scipy.stats import ks_2samp
from synthesized.testing.synthetic_distributions import *

In [36]:
def log_values(synthesizer):
    """Print one indented line per entry of ``synthesizer.values``.

    Emits a ``value types:`` header, then each value's ``name`` followed by
    the value itself, and finishes with a blank line.

    Args:
        synthesizer: Any object exposing an iterable ``values`` attribute
            whose items have a ``name`` attribute.
    """
    pad = ' '
    print(pad, 'value types:')
    for entry in synthesizer.values:
        print(pad, entry.name, entry)
    # Trailing blank line separates this listing from whatever is logged next.
    print()

def log_distance(data, synthesized):
    """Print the two-sample Kolmogorov-Smirnov statistic for column ``'x'``.

    Args:
        data: Original data, indexable by the key ``'x'``.
        synthesized: Synthesized data, indexable by the key ``'x'``.
    """
    original_x = data['x']
    synthetic_x = synthesized['x']
    ks_result = ks_2samp(original_x, synthetic_x)
    # The first element of the result is the KS statistic; the p-value is not used.
    print(' ', 'distance:', ks_result[0])

In [37]:
def synthesize(data, num_iterations, num_logging, fn_logging=log_distance, **kwargs):
    """Train a ``BasicSynthesizer`` on ``data``, logging progress at regular intervals.

    Training runs in chunks of ``num_logging`` iterations. Before training and
    after every chunk, ``len(data)`` rows are synthesized and passed to
    ``fn_logging`` together with ``data``. If ``num_iterations`` is not a
    multiple of ``num_logging``, a final shorter chunk trains the remaining
    iterations (the previous floor division silently skipped them).

    Args:
        data: Training data; passed to ``BasicSynthesizer`` and to ``learn``.
        num_iterations: Total number of training iterations (>= 0).
        num_logging: Number of training iterations between two progress
            reports (> 0).
        fn_logging: Callable ``fn_logging(data, synthesized)`` used to report
            progress. Defaults to ``log_distance``.
        **kwargs: Forwarded unchanged to the ``BasicSynthesizer`` constructor.

    Returns:
        The result of a final ``synthesizer.synthesize(n=len(data))`` call.

    Raises:
        ValueError: If ``num_logging`` is not positive or ``num_iterations``
            is negative.
    """
    # Imported locally so the function does not depend on `time` leaking into
    # the notebook namespace through a star import.
    import time

    if num_logging <= 0:
        raise ValueError('num_logging must be a positive integer, got {!r}'.format(num_logging))
    if num_iterations < 0:
        raise ValueError('num_iterations must be non-negative, got {!r}'.format(num_iterations))

    num_chunks, remainder = divmod(num_iterations, num_logging)
    # Full-size chunks first, then one shorter chunk for any leftover iterations.
    chunk_sizes = [num_logging] * num_chunks
    if remainder:
        chunk_sizes.append(remainder)

    start = time.time()
    with BasicSynthesizer(data, **kwargs) as synthesizer:
        log_values(synthesizer)
        # Baseline report before any training has happened.
        fn_logging(data, synthesizer.synthesize(n=len(data)))
        for chunk_size in chunk_sizes:
            synthesizer.learn(data=data, num_iterations=chunk_size)
            fn_logging(data, synthesizer.synthesize(n=len(data)))
        print()
        print(' ', 'took', time.time() - start, 's')
        return synthesizer.synthesize(n=len(data))

def plot(data=None, synthesized=None, synthesizer=None, name=None, evaluation=None):
    """Plot original and synthesized data side by side and record the mean KS distance.

    Previously this function took no arguments and read ``synthesizer``,
    ``data``, ``synthesized``, ``name`` and ``evaluation`` from notebook
    globals, so it only worked when earlier cells happened to leave those
    names behind. Every input can now be passed explicitly; an argument left
    as ``None`` still falls back to the global of the same name, so a bare
    ``plot()`` call keeps its old behaviour.

    Args:
        data: Original data; must expose ``.columns`` and column indexing.
        synthesized: Synthesized data, indexable by the columns of ``data``.
        synthesizer: Object whose ``values`` are iterated to build a mapping
            from ``value.name`` to ``type(value)``.
        name: String prefix of the key written into ``evaluation``.
        evaluation: Mutable mapping; ``name + '_avg_distance'`` is set to the
            mean two-sample KS statistic over all columns of ``data``.

    Raises:
        NameError: If an input is neither passed nor defined as a global.

    Note:
        ``np``, ``plt``, ``ks_2samp`` and ``_plot_data`` are still resolved as
        globals. ``np``, ``plt`` and ``_plot_data`` are not defined in the
        visible cells (presumably provided by the star import -- TODO confirm).
    """
    explicit = {
        'data': data,
        'synthesized': synthesized,
        'synthesizer': synthesizer,
        'name': name,
        'evaluation': evaluation,
    }
    scope = globals()
    resolved = {}
    for key, value in explicit.items():
        # `is None` rather than truthiness: an empty dict or frame is a valid input.
        if value is None:
            if key not in scope:
                raise NameError(
                    "plot() needs '{0}': pass it as an argument or define a global "
                    "named '{0}'".format(key)
                )
            value = scope[key]
        resolved[key] = value
    data = resolved['data']
    synthesized = resolved['synthesized']
    synthesizer = resolved['synthesizer']
    name = resolved['name']
    evaluation = resolved['evaluation']

    value_types = {value.name: type(value) for value in synthesizer.values}
    # One KS statistic per column; element [0] of the result is the statistic.
    distances = [ks_2samp(data[col], synthesized[col])[0] for col in data.columns]
    evaluation[name + '_avg_distance'] = np.mean(distances)

    # Shared axes make the two panels directly comparable.
    _fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5), sharex=True, sharey=True)
    ax1.set_title('original')
    ax2.set_title('synthesized')
    _plot_data(data, ax=ax1, value_types=value_types)
    _plot_data(synthesized, ax=ax2, value_types=value_types)

### Bernoulli

In [42]:
# Value passed as `size` to create_bernoulli in this cell (presumably the number of rows).
size = 10000
def fn_logging(data, synthesized):
    """Print how far apart the ``True`` counts of the two ``'x'`` columns are.

    The distance is ``|#True(data) - #True(synthesized)| / len(data)``. It is
    printed on the current line (terminated by a space, not a newline) and
    suffixed with ``*`` when it is below 0.01.

    Args:
        data: Original frame with a column ``'x'``.
        synthesized: Synthesized frame with a column ``'x'``.
    """
    def count_true(frame):
        counts = frame['x'].value_counts()
        # value_counts() has no entry for True when the column contains none.
        return counts[True] if True in counts else 0

    # Normalise by the rows actually received rather than the notebook-level
    # `size` global, which gives wrong results (or a NameError) whenever it does
    # not match the frame. max(..., 1) avoids dividing by zero on an empty frame.
    num_rows = max(len(data), 1)
    distance = abs(count_true(data) - count_true(synthesized)) / num_rows
    marker = '*' if distance < 0.01 else ''
    print(' ', str(distance) + marker, end=' ')

# Bernoulli test data; `probability=0.1` is presumably the chance of True in
# column 'x' -- TODO confirm against create_bernoulli.
data = create_bernoulli(probability=0.1, size=size)
# Train for 1000 iterations and report through the cell-local fn_logging after every
# 20 iterations. Every keyword other than data / num_iterations / num_logging /
# fn_logging (including summarizer=True) is forwarded by synthesize() to the
# BasicSynthesizer constructor; the meaning of those hyperparameters is defined
# there, not in this notebook.
synthesize(
    data=data, summarizer=True, num_iterations=1000, num_logging=20, fn_logging=fn_logging,
    network_type='mlp', capacity=512, depth=2, layer_type='dense',
    batchnorm=True, activation='relu', weight_decay=1e-3,
    encoding_type='variational', encoding_size=512, encoding_kwargs=dict(beta=5.0),
    optimizer='adam', learning_rate=1e-5, decay_steps=200, decay_rate=0.5,
    clip_gradients=1.0, batch_size=128
)

INFO:tensorflow:Summary name synthesizer/x/embeddings:0-gradient-norm is illegal; using synthesizer/x/embeddings_0-gradient-norm instead.
INFO:tensorflow:Summary name synthesizer/encoder/layer0/weight:0-gradient-norm is illegal; using synthesizer/encoder/layer0/weight_0-gradient-norm instead.
INFO:tensorflow:Summary name synthesizer/encoder/layer0/bias:0-gradient-norm is illegal; using synthesizer/encoder/layer0/bias_0-gradient-norm instead.
INFO:tensorflow:Summary name synthesizer/encoder/layer0/offset:0-gradient-norm is illegal; using synthesizer/encoder/layer0/offset_0-gradient-norm instead.
INFO:tensorflow:Summary name synthesizer/encoder/layer0/scale:0-gradient-norm is illegal; using synthesizer/encoder/layer0/scale_0-gradient-norm instead.
INFO:tensorflow:Summary name synthesizer/encoder/layer1/weight:0-gradient-norm is illegal; using synthesizer/encoder/layer1/weight_0-gradient-norm instead.
INFO:tensorflow:Summary name synthesizer/encoder/layer1/bias:0-gradient-norm is illegal;

INFO:tensorflow:Summary name synthesizer/encoder/layer1/scale:0-gradient-norm is illegal; using synthesizer/encoder/layer1/scale_0-gradient-norm instead.
INFO:tensorflow:Summary name synthesizer/encoding/mean/weight:0-gradient-norm is illegal; using synthesizer/encoding/mean/weight_0-gradient-norm instead.
INFO:tensorflow:Summary name synthesizer/encoding/mean/bias:0-gradient-norm is illegal; using synthesizer/encoding/mean/bias_0-gradient-norm instead.
INFO:tensorflow:Summary name synthesizer/encoding/stddev/weight:0-gradient-norm is illegal; using synthesizer/encoding/stddev/weight_0-gradient-norm instead.
INFO:tensorflow:Summary name synthesizer/encoding/stddev/bias:0-gradient-norm is illegal; using synthesizer/encoding/stddev/bias_0-gradient-norm instead.
INFO:tensorflow:Summary name synthesizer/decoder/layer0/weight:0-gradient-norm is illegal; using synthesizer/decoder/layer0/weight_0-gradient-norm instead.
INFO:tensorflow:Summary name synthesizer/decoder/layer0/bias:0-gradient-no

Unnamed: 0,x
0,False
1,False
2,False
3,True
4,True
5,False
6,False
7,False
8,False
9,False


### Categorical

In [None]:
# NOTE(review): neither `synthesize_and_plot` nor `evaluation` is defined in the
# cells visible here -- confirm both exist before this cell runs on a fresh kernel.
data = create_categorical(probabilities=[0.5, 0.25, 0.125, 0.0625, 0.0625], size=10000)
synthesize_and_plot(data, name='categorical', evaluation=evaluation)

### 1-d Gaussian

In [None]:
# NOTE(review): neither `synthesize_and_plot` nor `evaluation` is defined in the
# cells visible here -- confirm both exist before this cell runs on a fresh kernel.
data = create_1d_gaussian(mean=1000,  std=100, size=10000)
synthesize_and_plot(data, name='1d_gaussian', evaluation=evaluation)