In [None]:
# Restyle the notebook UI through the jupyterthemes CLI (`jt`): theme "grade3", notebook font "opensans".
# NOTE(review): this shells out and requires `jt` to be installed; it presumably rewrites the
# user's local Jupyter styling as a side effect — confirm it belongs in an evaluation notebook.
! jt -t grade3 -nf opensans

<h1><center>1. Load Data</center></h1>



In [None]:
import os
import json
import warnings
import pandas as pd
import time


warnings.filterwarnings(action='ignore', message='numpy.dtype size changed')
warnings.filterwarnings(action='ignore', message='compiletime version 3.5 of module')

if not 'workbookDir' in globals():
    workbookDir = os.getcwd()
os.chdir(os.path.split(workbookDir)[0])

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from synthesized.testing.evaluation import Evaluation
# To run this notebook manually, use the following line instead of the
# environment-driven setup below:
# evaluation = Evaluation(config_path='configs/evaluation/dataset_evaluation.json', name='james')

# Run parameters come from environment variables; each one falls back to 'n/a'.
evaluation_name = os.getenv('EVALUATION_NAME', 'n/a')
branch = os.getenv('EVALUATION_BRANCH', 'n/a')
revision = os.getenv('EVALUATION_REVISION', 'n/a')
evaluation = Evaluation(group="dataset_evaluation", branch=branch, revision=revision)

# Parse the JSON config file, then register only the instance entry that
# belongs to this evaluation.
config_path = os.getenv('EVALUATION_CONFIG_PATH', 'n/a')
with open(config_path, 'r') as f:
    configs = json.load(f)
evaluation.record_config(
    evaluation=evaluation_name,
    config=configs["instances"][evaluation_name],
)
        

In [None]:
# Read the CSV named by the evaluation config, remove the columns the config
# lists under 'ignore_columns', and discard every row with a missing value.
data = (
    pd.read_csv(evaluation.configs[evaluation_name]['data'])
    .drop(evaluation.configs[evaluation_name]['ignore_columns'], axis=1)
    .dropna()
)
# Preview the first five rows.
data.head()





<h1><center>2. Train model and generate synthetic data</center></h1>

In [None]:
from sklearn.model_selection import train_test_split
import synthesized

In [None]:
# Hold out 20% of the rows as the test set; random_state=0 pins the split so reruns are comparable.
train, test = train_test_split(data, test_size=0.2, random_state=0)

In [None]:
def train_and_synthesize(synthesizer_class=evaluation.configs[evaluation_name]["synthesizer_class"]):
    """Train one synthesizer and sample synthetic data from it.

    Reads the notebook-level ``data``, ``train`` and ``test`` frames and the
    settings stored under ``evaluation.configs[evaluation_name]``.

    Args:
        synthesizer_class: Name of the class looked up on the ``synthesized``
            package; must be "HighDimSynthesizer" or "SeriesSynthesizer". The
            default is read from the evaluation config once, when this cell
            is executed.

    Returns:
        Tuple ``(synthesizer, synthesized_data, loss_history, training_time)``:
        the synthesizer object (handed back after its ``with`` block has
        exited), the generated data, a list holding one dict of losses per
        callback invocation, and the seconds spent inside ``learn``.
    """
    assert synthesizer_class in {"HighDimSynthesizer", "SeriesSynthesizer"}, \
        "unsupported synthesizer_class: {!r}".format(synthesizer_class)
    config = evaluation.configs[evaluation_name]
    synthesizer_constructor = getattr(synthesized, synthesizer_class)
    loss_history = list()

    def callback(synth, iteration, losses):
        # Every record keeps the loss names seen in the first report, so the
        # history forms a rectangular table. A fresh dict is built each time,
        # so the stored history cannot change if the caller reuses `losses`.
        names = loss_history[0] if loss_history else losses
        loss_history.append({name: losses[name] for name in names})

    with synthesizer_constructor(df=data, **config['params']) as synthesizer:
        t = time.time()
        synthesizer.learn(
            df_train=train, num_iterations=config['num_iterations'],
            callback=callback, callback_freq=100
        )
        training_time = time.time() - t
        if synthesizer_class == "HighDimSynthesizer":
            # Generate as many rows as the held-out test set contains.
            synthesized_data = synthesizer.synthesize(num_rows=len(test))
        else:
            # One length per series: the number of rows in each identifier
            # group. `size()` counts rows directly, so it still works when the
            # frame has no other column and is not affected by missing values.
            identifier = config["params"]["identifier_label"]
            series_lengths = list(data.groupby(identifier).size().to_numpy())
            synthesized_data = synthesizer.synthesize(series_lengths=series_lengths)
        return synthesizer, synthesized_data, loss_history, training_time


In [None]:
%%capture
synthesizers_and_results = [train_and_synthesize() for i in range(evaluation.configs[evaluation_name]['num_passes'])]
synthesizer = synthesizers_and_results[0][0]

## Plot losses

In [None]:
# Loss records of the first run: one dict per invocation of the training callback.
loss_history = synthesizers_and_results[0][2]
# One curve per loss name, with the callback invocation number on the x axis.
pd.DataFrame.from_records(loss_history).plot(figsize=(15,7))

### Register training time

In [None]:
# Seconds spent inside `synthesizer.learn` during the first run (element 3 of its result tuple).
training_time = synthesizers_and_results[0][3]
evaluation.record_metric(evaluation=evaluation_name, key='training_time', value=training_time)

## Display aggregated statistics 

In [None]:
from scipy.stats import ks_2samp, wasserstein_distance
import numpy as np
import seaborn as sns

In [None]:
def plot_avg_distances():
    """Compare each run's synthetic data with the test set and record summary metrics.

    For every entry of the notebook-level ``synthesizers_and_results`` the test
    set and the synthetic data are passed through ``synthesizer.preprocess``
    and compared column by column with the Kolmogorov-Smirnov statistic and
    the Wasserstein distance. The absolute difference between the two
    correlation matrices is summarised as well. The aggregates are sent to
    ``evaluation.record_metric`` and the average KS distance of each run is
    drawn as a horizontal bar chart.

    Columns (or correlation matrices) that cannot be compared are skipped with
    a warning; a metric with nothing to aggregate is recorded as NaN.
    """
    nan = float('nan')
    result = []
    for i, (synthesizer, synthesized_, _, _) in enumerate(synthesizers_and_results):
        test_ = synthesizer.preprocess(test)
        synthesized_ = synthesizer.preprocess(synthesized_)

        ks_distances = []
        wss_distances = []
        for col in synthesized_.columns:
            try:
                # Compute both distances before storing either one so the two
                # lists always describe the same set of columns.
                ks = ks_2samp(test_[col], synthesized_[col])[0]
                # wasserstein_distance returns a plain scalar (nothing to index).
                wss = wasserstein_distance(test_[col], synthesized_[col])
            except Exception as exc:
                warnings.warn('run {}: skipping column {!r}: {!r}'.format(i + 1, col, exc))
                continue
            ks_distances.append(ks)
            wss_distances.append(wss)

        # np.max raises on an empty list, so fall back to NaN when no column
        # could be compared.
        avg_ks_distance = np.mean(ks_distances) if ks_distances else nan
        max_ks_distance = np.max(ks_distances) if ks_distances else nan
        avg_wss_distance = np.mean(wss_distances) if wss_distances else nan

        # Defaults keep the metric calls below well-defined even when the
        # correlation comparison fails.
        avg_corr = max_corr = nan
        try:
            corr = np.abs((test_.corr() - synthesized_.corr()).to_numpy())
            # Entries are NaN where a correlation is undefined; ignore those
            # entries instead of letting them turn the whole summary into NaN.
            if corr.size and not np.isnan(corr).all():
                avg_corr = np.nanmean(corr)
                max_corr = np.nanmax(corr)
        except Exception as exc:
            warnings.warn('run {}: correlation comparison failed: {!r}'.format(i + 1, exc))

        # The per-run figure that is printed and plotted is the average KS distance.
        avg_distance = avg_ks_distance
        print('run: {}, AVG distance: {}'.format(i+1, avg_distance))
        result.append({'run': i+1, 'avg_distance': avg_distance})

        # NOTE(review): every run records under the same keys; whether
        # record_metric overwrites or accumulates is not visible here.
        metrics = (
            ('avg_ks_distance', avg_ks_distance),
            ('max_ks_distance', max_ks_distance),
            ('avg_wasserstein_distance', avg_wss_distance),
            ('avg_correlations', avg_corr),
            ('max_correlations', max_corr),
        )
        for key, value in metrics:
            evaluation.record_metric(evaluation=evaluation_name, key=key, value=value)

    if not result:
        # No runs were produced, so there is nothing to plot.
        return

    df = pd.DataFrame.from_records(result)
    df['run'] = df['run'].astype('category')
    g = sns.barplot(y='run', x='avg_distance', data=df)
    g.set_xlim(0.0, 1.0)

In [None]:
# Compute, record and plot the per-run distance metrics for all training passes.
plot_avg_distances()

## Details for the 1st Run 

In [None]:
from synthesized.testing import UtilityTesting
# Detailed report helper for the first run only: that run's synthesizer, the
# train/test split, and that run's synthetic data (element 1 of its result tuple).
testing = UtilityTesting(synthesizer, train, test, synthesizers_and_results[0][1])

In [None]:
# First-run report; presumably per-column distances between original and synthetic
# distributions — confirm against UtilityTesting, which is not shown here.
testing.show_distribution_distances()

In [None]:
# NOTE(review): remove_outliers=0.01 presumably trims the most extreme 1% of values
# before plotting — confirm against UtilityTesting.show_distributions.
testing.show_distributions(remove_outliers=0.01)

## Display correlations

In [None]:
# First-run report; presumably the differences between original and synthetic
# correlations — confirm against UtilityTesting.
testing.show_corr_distances()

In [None]:
# First-run report; presumably the original and synthetic correlation matrices
# side by side — confirm against UtilityTesting.
testing.show_corr_matrices()

## Demonstrate the utility for training ML models

In [None]:
# Utility score of the first run for the configured `target`. A failure is
# tolerated: the metric falls back to 0.0 so the remaining metrics can still
# be written. The failure is reported instead of being swallowed silently, and
# KeyboardInterrupt/SystemExit are no longer trapped.
try:
    utility = testing.utility(target=evaluation.configs[evaluation_name]['target'])
except Exception as exc:
    warnings.warn('utility evaluation failed, recording 0.0: {!r}'.format(exc))
    utility = 0.0
evaluation.record_metric(evaluation=evaluation_name, key='utility', value=utility)

In [None]:
# Hand the recorded metrics to the Evaluation helper for writing; where they are
# stored is decided by that class and is not visible in this notebook.
evaluation.write_metrics()