In [None]:
# Shell escape: restyle the notebook UI with the "grade3" theme and the Open Sans
# notebook font. Presumably `jt` is the jupyterthemes CLI -- it must be on PATH.
! jt -t grade3 -nf opensans

<h1><center>1. Load Data</center></h1>



In [None]:
import os
import warnings
import pandas as pd

warnings.filterwarnings(action='ignore', message='numpy.dtype size changed')
warnings.filterwarnings(action='ignore', message='compiletime version 3.5 of module')

if not 'workbookDir' in globals():
    workbookDir = os.getcwd()
os.chdir(os.path.split(workbookDir)[0])

# autoreload mode 2: re-import edited modules before each cell execution.
# Matplotlib figures are rendered inline in the notebook output.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from synthesized.testing.evaluation import Evaluation
# To run this notebook manually, construct the evaluation with an explicit config instead:
# evaluation = Evaluation(config_path='configs/evaluation/dataset_evaluation.json', name='james')
evaluation = Evaluation()

In [None]:
# Read the configured dataset, discard the columns the config lists as ignored,
# and keep only rows that have no missing values.
data = (
    pd.read_csv(evaluation.config['data'])
    .drop(evaluation.config['ignore_columns'], axis=1)
    .dropna()
)
data.head(5)





<h1><center>2. Train model and generate synthetic data</center></h1>

In [None]:
from sklearn.model_selection import train_test_split
import synthesized

In [None]:
# Hold out 20% of the rows as the test split; the fixed random_state keeps the
# split identical across runs.
train, test = train_test_split(data, test_size=0.2, random_state=0)

In [None]:
def train_and_synthesize(synthesizer_class=evaluation.config["synthesizer_class"]):
    """Train one synthesizer on the training split and sample synthetic data from it.

    Args:
        synthesizer_class: Name of the class to look up on the ``synthesized``
            package; must be "BasicSynthesizer" or "SeriesSynthesizer". The
            default is read from the evaluation config once, when this
            function is defined.

    Returns:
        A ``(synthesizer, synthesized_data, loss_history)`` tuple, where
        ``loss_history`` holds one losses mapping per callback invocation.

    Raises:
        AssertionError: If ``synthesizer_class`` is not one of the two
            supported names.
    """
    assert synthesizer_class in {"BasicSynthesizer", "SeriesSynthesizer"}
    make_synthesizer = getattr(synthesized, synthesizer_class)
    loss_history = []

    def callback(synth, iteration, losses):
        # The first report is stored as-is and fixes the set of loss names;
        # later reports are cut down to those names so all records share keys.
        if loss_history:
            tracked_names = loss_history[0]
            loss_history.append({name: losses[name] for name in tracked_names})
        else:
            loss_history.append(losses)

    # NOTE(review): the synthesizer is handed back to the caller after this
    # context manager has exited, and later cells still call methods on it --
    # confirm the library supports use after exit.
    with make_synthesizer(df=data, **evaluation.config['params']) as synthesizer:
        num_iterations = evaluation.config['num_iterations']
        synthesizer.learn(
            df_train=train,
            num_iterations=num_iterations,
            callback=callback,
            callback_freq=100,
        )
        if synthesizer_class != "BasicSynthesizer":
            # Series case: request one series per identifier value, sized by
            # the per-identifier row count taken from the first counted column
            # of the full dataset.
            identifier_label = evaluation.config["params"]["identifier_label"]
            counts_per_series = data.groupby(identifier_label).count().to_numpy()
            series_lengths = list(counts_per_series.transpose()[0])
            synthesized_data = synthesizer.synthesize(series_lengths=series_lengths)
        else:
            # Basic case: generate as many rows as the held-out test split has.
            synthesized_data = synthesizer.synthesize(num_rows=len(test))
        return synthesizer, synthesized_data, loss_history

In [None]:
%%capture
# One independent train-and-sample pass per configured run; every entry is the
# (synthesizer, synthesized_data, loss_history) tuple from train_and_synthesize().
synthesizers_and_results = [
    train_and_synthesize()
    for _ in range(evaluation.config['num_passes'])
]
# The first run's synthesizer is the one used by the detailed sections below.
synthesizer = synthesizers_and_results[0][0]

## Plot losses

In [None]:
# Loss records collected by the training callback during the first run.
loss_history = synthesizers_and_results[0][2]
# Each column (loss name) is drawn against the record index.
pd.DataFrame.from_records(loss_history).plot(figsize=(15,7))

## Display aggregated statistics 

In [None]:
from scipy.stats import ks_2samp
import numpy as np
import seaborn as sns

In [None]:
def plot_avg_distances():
    """Print and plot the mean KS distance between test and synthetic data per run.

    For every entry in ``synthesizers_and_results`` the test split and that
    run's synthetic data are passed through the run's
    ``synthesizer.preprocess``. The two-sample Kolmogorov-Smirnov statistic is
    then computed column by column and averaged. Each average is printed,
    assigned to ``evaluation['avg_distance']``, and drawn as one horizontal
    bar per run. Nothing is returned.
    """
    records = []
    for run_number, (run_synthesizer, synthetic_raw, _) in enumerate(synthesizers_and_results, start=1):
        test_prepared = run_synthesizer.preprocess(test)
        synthetic_prepared = run_synthesizer.preprocess(synthetic_raw)

        # Element 0 of the ks_2samp result is the KS statistic.
        ks_statistics = []
        for column in synthetic_prepared.columns:
            ks_result = ks_2samp(test_prepared[column], synthetic_prepared[column])
            ks_statistics.append(ks_result[0])
        avg_distance = np.mean(ks_statistics)

        print('run: {}, AVG distance: {}'.format(run_number, avg_distance))
        records.append({'run': run_number, 'avg_distance': avg_distance})
        # NOTE(review): assigned once per run under the same key -- whether this
        # overwrites or accumulates depends on Evaluation.__setitem__; confirm.
        evaluation['avg_distance'] = avg_distance

    summary = pd.DataFrame.from_records(records)
    # Categorical run labels, so each run gets its own bar.
    summary['run'] = summary['run'].astype('category')
    axes = sns.barplot(y='run', x='avg_distance', data=summary)
    # The KS statistic lies in [0, 1]; pin the axis so runs are comparable.
    axes.set_xlim(0.0, 1.0)

In [None]:
# Print the per-run average KS distance and draw the bar chart.
plot_avg_distances()

## Details for the 1st Run 

In [None]:
from synthesized.testing import UtilityTesting
# Pair the first run's synthesizer with the synthetic data that same run produced,
# alongside the original train/test splits.
testing = UtilityTesting(synthesizer, train, test, synthesizers_and_results[0][1])

In [None]:
# Presumably shows per-column distribution distances between original and
# synthetic data -- confirm the exact metric in UtilityTesting.
testing.show_distribution_distances()

In [None]:
# NOTE(review): remove_outliers=0.01 presumably trims the most extreme 1% of
# values before plotting -- confirm against UtilityTesting.show_distributions.
testing.show_distributions(remove_outliers=0.01)

## Display correlations

In [None]:
# Presumably summarises how far the synthetic data's column correlations are
# from the original's -- confirm in UtilityTesting.
testing.show_corr_distances()

In [None]:
# Presumably renders the correlation matrices of the original and synthetic
# data for visual comparison -- confirm in UtilityTesting.
testing.show_corr_matrices()

## Demonstrate the utility for training ML models

In [None]:
# Record the ML-utility metric for the configured target column. A failure here
# must not prevent the remaining metrics from being written, so fall back to
# 0.0 -- but report why. Only ordinary errors are caught: a bare ``except``
# would also swallow KeyboardInterrupt and SystemExit.
try:
    evaluation['utility'] = testing.utility(target=evaluation.config['target'])
except Exception as error:
    warnings.warn('utility evaluation failed ({!r}); recording 0.0'.format(error))
    evaluation['utility'] = 0.0

In [None]:
# Write out the metrics assigned on `evaluation` in the cells above
# ('avg_distance', 'utility'); the destination is decided by Evaluation.
evaluation.write_metrics()