In [None]:
# Shell out to the `jt` CLI (jupyterthemes, must be installed) to select the
# "grade3" theme with the "opensans" notebook font (-nf).
! jt -t grade3 -nf opensans

<h1><center>1. Load Data</center></h1>



In [None]:
import os
import json
import warnings
import pandas as pd

warnings.filterwarnings(action='ignore', message='numpy.dtype size changed')
warnings.filterwarnings(action='ignore', message='compiletime version 3.5 of module')

if not 'workbookDir' in globals():
    workbookDir = os.getcwd()
os.chdir(os.path.split(workbookDir)[0])

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from synthesized.testing.evaluation import Evaluation
from synthesized.testing import evaluation_utils as eval_utils
from synthesized.testing import plotting as syn_plot
# To run this notebook manually, build the evaluation directly instead, e.g.:
# evaluation = Evaluation(config_path='configs/evaluation/dataset_evaluation.json', name='james')

# Run metadata is read from environment variables; 'n/a' is the placeholder
# used when a variable is not set.
evaluation_name = os.environ.get('EVALUATION_NAME', 'n/a')
branch = os.environ.get('EVALUATION_BRANCH', 'n/a')
revision = os.environ.get('EVALUATION_REVISION', 'n/a')
evaluation = Evaluation(branch=branch, revision=revision, group="dataset_evaluation")

# Load the JSON config. A missing EVALUATION_CONFIG_PATH falls back to 'n/a',
# which would otherwise surface as a confusing "No such file: 'n/a'" error,
# so re-raise the same exception type with an actionable message.
config_path = os.environ.get('EVALUATION_CONFIG_PATH', 'n/a')
try:
    with open(config_path, 'r') as f:
        configs = json.load(f)
except FileNotFoundError as err:
    raise FileNotFoundError(
        "Evaluation config file {!r} not found; set EVALUATION_CONFIG_PATH "
        "to the path of a JSON evaluation config".format(config_path)
    ) from err

# Same idea for the evaluation name: still a KeyError (as before), but one
# that says which names are actually available in the config.
if evaluation_name not in configs["instances"]:
    raise KeyError(
        "Evaluation {!r} is not defined under 'instances' in {!r}; set "
        "EVALUATION_NAME to one of: {}".format(
            evaluation_name, config_path, sorted(configs["instances"])
        )
    )

# The file handle is only needed for json.load, so the config is recorded
# after the file has been closed.
evaluation.record_config(evaluation=evaluation_name, config=configs["instances"][evaluation_name])
        

In [None]:
# Read the CSV named in this evaluation's config, remove the configured
# 'ignore_columns', and discard every row that contains a missing value.
data = (
    pd.read_csv(evaluation.configs[evaluation_name]['data'])
    .drop(evaluation.configs[evaluation_name]['ignore_columns'], axis=1)
    .dropna()
)
# Preview the first five rows as the cell output.
data.head(n=5)





<h1><center>2. Train model and generate synthetic data</center></h1>

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as the test set; random_state=0 fixes the shuffle
# so the split is the same on every run.
train, test = train_test_split(data, test_size=0.2, random_state=0)

In [None]:
%%capture
synthesizers_and_results = [eval_utils.train_and_synthesize(data=data, evaluation=evaluation, 
                                                            evaluation_name=evaluation_name, 
                                                            test=test, train=train) 
                            for i in range(evaluation.configs[evaluation_name]['num_passes'])]
synthesizer = synthesizers_and_results[0][0]

## Plot losses

In [None]:
# Element [2] of the first pass's result holds the loss history records.
loss_history = synthesizers_and_results[0][2]
# Turn the records into a DataFrame and draw it as a 15x7 line plot.
loss_frame = pd.DataFrame.from_records(loss_history)
loss_frame.plot(figsize=(15, 7))

## Register training time

In [None]:
# Element [3] of the first pass's result is stored as the 'training_time'
# metric for this evaluation (units are not visible here — confirm against
# train_and_synthesize).
training_time = synthesizers_and_results[0][3]
evaluation.record_metric(evaluation=evaluation_name, key='training_time', value=training_time)

## Display aggregated statistics 

In [None]:
# Hand the results of all passes plus the held-out test set to the plotting
# helper (presumably it averages distance metrics across passes — confirm
# against synthesized.testing.plotting).
syn_plot.plot_avg_distances(evaluation=evaluation, evaluation_name=evaluation_name, 
                            test=test, results=synthesizers_and_results)

## Details for the 1st Run 

In [None]:
from synthesized.testing import UtilityTesting
# Build the testing helper for the first pass only: its synthesizer, the
# train/test split, and element [1] of the first pass's result (presumably
# the synthesized data — confirm against train_and_synthesize).
testing = UtilityTesting(synthesizer, train, test, synthesizers_and_results[0][1])

In [None]:
# NOTE(review): presumably plots per-column distribution distances between
# original and synthesized data — confirm against UtilityTesting.
testing.show_distribution_distances()

In [None]:
# NOTE(review): remove_outliers=0.01 presumably trims the most extreme 1% of
# values before plotting — confirm against UtilityTesting.show_distributions.
testing.show_distributions(remove_outliers=0.01)

## Display correlations

In [None]:
# NOTE(review): presumably summarises how far the synthesized correlations
# are from the original ones — confirm against UtilityTesting.
testing.show_corr_distances()

In [None]:
# NOTE(review): presumably renders the correlation matrices of the original
# and synthesized data for comparison — confirm against UtilityTesting.
testing.show_corr_matrices()

In [None]:
# NOTE(review): exact output is defined by UtilityTesting.show_auto_associations;
# not visible from this notebook.
testing.show_auto_associations()

## Demonstrate the utility for training ML models

In [None]:
# Utility is recorded on a best-effort basis: if it cannot be computed for
# the configured 'target', fall back to 0.0 so the remaining metrics are
# still written.
try:
    utility = testing.utility(target=evaluation.configs[evaluation_name]['target'])
except Exception as err:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still stop the run, and surface the
    # failure reason instead of silently hiding it behind the fallback.
    warnings.warn("Utility evaluation failed, recording utility=0.0: {!r}".format(err))
    utility = 0.0
evaluation.record_metric(evaluation=evaluation_name, key='utility', value=utility)

In [None]:
# Persist everything recorded via record_config / record_metric above
# (the destination is decided by the Evaluation object).
evaluation.write_metrics()