In [None]:
from zenml.datasources import CSVDatasource
from zenml.pipelines import TrainingPipeline
from zenml.steps.evaluator import TFMAEvaluator
from zenml.steps.preprocesser import StandardPreprocesser
from zenml.steps.split import RandomSplit
from zenml.steps.trainer import TFFeedForwardTrainer
from zenml.repo import Repository

## Create first pipeline

In [None]:
# Create the pipeline object for the first experiment; the cells below attach
# a datasource and the individual steps to it.
training_pipeline = TrainingPipeline(name='Experiment 1')

#### Add a datasource. This will automatically track and version it.

In [None]:
# Register the CSV datasource under a fixed name. If construction fails
# (presumably because a datasource with this name was already registered by
# an earlier run of this notebook -- TODO confirm), fall back to loading the
# existing one from the repository by name.
#
# `except Exception` is used instead of a bare `except:` so that
# KeyboardInterrupt and SystemExit still propagate rather than being
# silently turned into a repository lookup.
try:
    ds = CSVDatasource(
        name='Pima Indians Diabetes',
        path='gs://zenml_quickstart/diabetes.csv',
    )
except Exception:
    repo: Repository = Repository.get_instance()
    ds = repo.get_datasource_by_name('Pima Indians Diabetes')
training_pipeline.add_datasource(ds)

#### Add a split step to partition data into train and eval

In [None]:
# Attach a RandomSplit step whose split_map assigns 0.7 to 'train' and 0.3 to
# 'eval'.
training_pipeline.add_split(RandomSplit(split_map={'train': 0.7, 'eval': 0.3}))

#### Add a preprocessing step to transform the data into a model-ready format

In [None]:
# Columns passed to the preprocesser as input features.
feature_columns = [
    'times_pregnant', 'pgc', 'dbp', 'tst',
    'insulin', 'bmi', 'pedigree', 'age',
]

# Per-column override: the label column is given the 'no_transform' method
# with no parameters.
label_overrides = {
    'has_diabetes': {
        'transform': [{'method': 'no_transform', 'parameters': {}}],
    },
}

diabetes_preprocesser = StandardPreprocesser(
    features=feature_columns,
    labels=['has_diabetes'],
    overwrite=label_overrides,
)
training_pipeline.add_preprocesser(diabetes_preprocesser)

#### Add a trainer which defines model and training

In [None]:
# Trainer for the first experiment: single sigmoid output unit trained with
# binary cross-entropy for 5 epochs, reporting accuracy.
baseline_trainer = TFFeedForwardTrainer(
    loss='binary_crossentropy',
    last_activation='sigmoid',
    output_units=1,
    metrics=['accuracy'],
    epochs=5,
)
training_pipeline.add_trainer(baseline_trainer)

#### Add an evaluator to calculate slicing metrics

In [None]:
# Metrics requested for the 'has_diabetes' label.
label_metrics = ['binary_crossentropy', 'binary_accuracy']

# Evaluator configured with a single slice on the 'has_diabetes' column.
diabetes_evaluator = TFMAEvaluator(
    slices=[['has_diabetes']],
    metrics={'has_diabetes': label_metrics},
)
training_pipeline.add_evaluator(diabetes_evaluator)

#### Run and evaluate

In [None]:
# Run the first pipeline with the datasource and steps attached in the cells
# above.
training_pipeline.run()

In [None]:
# View the statistics of the first pipeline.
# NOTE(review): magic=True presumably renders the output inline in the
# notebook -- confirm against the ZenML API.
training_pipeline.view_statistics(magic=True)

In [None]:
# Open the evaluation results of the first pipeline.
# NOTE(review): magic=True presumably renders the output inline in the
# notebook -- confirm against the ZenML API.
training_pipeline.evaluate(magic=True)

## Inspect datasource

#### Get reference to repository

In [None]:
# Same import as in the first cell of the notebook, repeated in this section.
from zenml.repo import Repository

# Repository handle obtained through Repository.get_instance(); used by the
# cells below to list datasources and compare pipelines.
repo: Repository = Repository.get_instance()

#### Load datasource as DataFrame

In [None]:
# Fetch the repository's datasources and print the first one.
# NOTE(review): indexing with [0] assumes the list is non-empty and that its
# first entry is the datasource added earlier in this notebook -- confirm the
# ordering if the repository can hold more than one datasource.
datasources = repo.get_datasources()
datasource = datasources[0]
print(datasource)

In [None]:
# Sample the datasource and preview the result; per the section heading the
# sample is a DataFrame. The trailing expression is the cell's display value.
df = datasource.sample_data()
df.head()

In [None]:
# Show the column labels of the sampled data.
df.columns

## Create another pipeline

#### Change one hyper-parameter

In [None]:
# Copy the first pipeline under a new name and attach a trainer whose
# arguments match the first experiment's except for epochs (30 instead of 5).
training_pipeline_2 = training_pipeline.copy('Experiment 2')

longer_trainer = TFFeedForwardTrainer(
    loss='binary_crossentropy',
    last_activation='sigmoid',
    output_units=1,
    metrics=['accuracy'],
    epochs=30,
)
training_pipeline_2.add_trainer(longer_trainer)

In [None]:
# Run the second pipeline (the copy with the 30-epoch trainer).
training_pipeline_2.run()

In [None]:
# Open the evaluation results of the second pipeline.
# NOTE(review): magic=True presumably renders the output inline in the
# notebook -- confirm against the ZenML API.
training_pipeline_2.evaluate(magic=True)

## Post-training

#### Verify there's still only one datasource

In [None]:
# Re-query the repository after both pipeline runs and report how many
# datasources it now holds.
datasources = repo.get_datasources()
print(f"We have {len(datasources)} datasources")

#### Compare pipelines

In [None]:
# Invoke the repository's pipeline comparison.
# NOTE(review): presumably this covers both experiments run above -- confirm
# what compare_pipelines() includes by default.
repo.compare_pipelines()