# Login

In [None]:
import getpass
import os

import cengine

# Credentials must not be committed with the notebook. They are taken from the
# CENGINE_USERNAME / CENGINE_PASSWORD environment variables (a convention of
# this notebook, not something cengine reads by itself); if a variable is unset
# or empty, the user is prompted instead. getpass keeps the password off-screen.
client = cengine.Client(
    username=os.environ.get('CENGINE_USERNAME') or input('cengine username: '),
    password=os.environ.get('CENGINE_PASSWORD') or getpass.getpass('cengine password: '),
)

# Providers

#### Conceptually, every provider configures where our pipelines are running and persisting artifacts

In [None]:
# BUCKET_NAME ='gs://BUCKET_NAME'
# SERVICE_ACCOUNT = '/SERVICE_ACCOUNT.json'
# PROVIDER_NAME = 'PROVIDER_NAME'

In [None]:
# my_provider = client.create_provider(name=PROVIDER_NAME, 
#                                      provider_type='gcp', 
#                                      args={'service_account': SERVICE_ACCOUNT, 
#                                            'artifact_store': BUCKET_NAME})

In [None]:
# Use the first entry of the provider list returned for this account
providers = client.get_providers()
my_provider = providers[0]

print(my_provider)

# Workspaces

#### Workspaces were created to maintain an organized and efficient structure within the ML workflow of your organization.

In [None]:
# Create a workspace
# active_workspace = client.create_workspace(name="ShowcaseWorkspace", 
                                           # provider_id=my_provider.id)
# print(active_workspace)

In [None]:
# Select the workspace at index 1, i.e. the second entry that is returned.
# Ensure this is the `HelloWorkspace`; change the index if it is not.
workspaces = client.get_workspaces()
active_workspace = workspaces[1]

print(active_workspace)

# Datasources

#### Datasources are configurable, and fully versionable

In [None]:
# new_datasource = client.create_datasource(name='QuickstartDataset',
#                                           provider_id=my_provider.id,
#                                           source='bq',
#                                           type='tabular', 
#                                           args={"dataset": "ml_datasets", 
#                                                 "table": "census_adult_income", 
#                                                 "project": "bigquery-public-data"})
                                                
# print(new_datasource)

In [None]:
# new_datasource_commit = client.commit_datasource(new_datasource.id)

# print(new_datasource_commit)

In [None]:
# Pick the datasource at index 1 of the returned datasource list
datasources = client.get_datasources()
new_datasource = datasources[1]

# Pick the version of the data we want: the first commit listed for it
commits = client.get_datasource_commits(new_datasource.id)
new_datasource_commit = commits[0]

print(new_datasource)

In [None]:
import pandas as pd

# Peek at the chosen datasource commit and show the result as a DataFrame
sample = client.peek_datasource_commit(
    new_datasource.id,
    new_datasource_commit.id,
)

pd.DataFrame(sample)

# Pipelines

#### Pipelines define an end-to-end training experiment from splitting to evaluation.

In [None]:
from cengine import PipelineConfig

# Start with a template built from the selected datasource and commit
c = PipelineConfig.from_datasource(client=client,
                                   datasource_id=new_datasource.id,
                                   commit_id=new_datasource_commit.id)

# Configure your dataset split
c.split.categorize.by = 'marital_status'  # group by marital status, to represent this equally in train/eval
c.split.ratio = {'train': 0.8, 'eval': 0.2}

# Each feature can undergo multiple, individual transforms
# For now, we configure a non-default preprocessing with a built-in method
c.features['education_num'].transform.add_methods(
    {'method':'compute_and_apply_vocabulary'})
    
# Configure your labels
# income_bracket is removed from the features and registered as the label instead
del c.features.income_bracket
c.labels.add(['income_bracket'])

# Configure your evaluation
# native_country is removed from the features and used as the evaluation slice
del c.features.native_country
c.evaluator.slices = [['native_country']]
c.evaluator.metrics = ['binary_accuracy']

#### We can define arbitrary custom code here for preprocessing

In [None]:
from cengine import Method

from transform import identity

# Custom function: wrap `identity` from the local transform module as a Method,
# then attach it to the 'hours_per_week' feature
identity_method = Method.from_callable(
    client=client,
    fn=identity,
    params={'param_1': 2, 'param_2': 'temp'},
)
c.features['hours_per_week'].transform.add_methods([identity_method])

print(c.features)

#### Define our model function here

In [None]:
# Set up training around our own model function
from cengine import Trainer
from model import custom_model

# Parameters handed to the trainer together with the model function
trainer_params = {
    'input_units': 13,
    'output_units': 1,
    'batch_size': 16,
    'loss': 'binary_crossentropy',
    'metrics': ['binary_accuracy'],
    'lr': 0.0005,
    'epochs': 10,
}

c.trainer = Trainer.from_callable(
    client=client,
    fn=custom_model,
    params=trainer_params,
)

print(c.trainer)

#### Final config

In [None]:
# Print the complete pipeline configuration.
# It is intended as an immutable artifact that lets you reproduce precisely this experiment.
print(c)

#### Register and train a pipeline

In [None]:
# Register the configuration as a named pipeline in the active workspace
first_pipeline = client.push_pipeline(
    name='census_run_01',
    config=c,
    workspace_id=active_workspace.id,
)

In [None]:
# Start a training run of the first pipeline on the selected datasource commit
first_pipeline_run = client.train_pipeline(
    pipeline_id=first_pipeline.id,
    datasource_commit_id=new_datasource_commit.id,
)

In [None]:
# As the above takes a few minutes, let's explore the options we have in the train_pipeline method
# (the trailing `?` is IPython help syntax; it only works in a notebook / IPython session)

client.train_pipeline?

#### Check pipeline status

In [None]:
# Re-run this cell and wait here until STATUS turns to 'SUCCEEDED'

workspace_statuses = client.get_pipeline_status(workspace_id=active_workspace.id)
workspace_statuses[first_pipeline.id]

In [None]:
# Fetch the logs of the first pipeline's training run
client.get_pipeline_run_logs(
    pipeline_id=first_pipeline.id,
    pipeline_run_id=first_pipeline_run.id,
)

#### Check the statistics

In [None]:
# Note: run this cell a second time if nothing shows up on the first attempt
client.get_statistics(
    pipeline_id=first_pipeline.id,
    pipeline_run_id=first_pipeline_run.id,
    magic=True,
)

#### See the results

#### The evaluator key let us define metrics and slices. Now we can see the results across our 'native_country' slice.

In [None]:
# Note: run the slicing graphic a second time if it does not appear at first
# Note: in the slicing code block, set slicing_column to native_country

client.evaluate_single_pipeline(
    pipeline_id=first_pipeline.id,
    pipeline_run_id=first_pipeline_run.id,
    magic=True,
)

#### Download the model: You can extract the model to deploy it on your own or deploy it automatically on a serving_backend and logic of your choice. We take the former approach here and download the model

In [None]:
# Before running this cell, make sure the 'model' directory has been deleted

import os

# The model is written to a 'model' folder inside the current working directory
model_dir = os.path.join(os.getcwd(), 'model')

client.download_model(
    pipeline_id=first_pipeline.id,
    pipeline_run_id=first_pipeline_run.id,
    output_path=model_dir,
)

In [None]:
# IPython shell escape: list everything under the 'model' directory
!find model

#### Iterate: Now comes the most powerful aspect of all this. Not only is this experiment fully persisted forever, anyone in the team can now pull this experiment pipeline, improve it, and compare results. All interim results are cached.

In [None]:
# Pull the first pipeline by id to use as the starting point for a second experiment
second_config = client.pull_pipeline(pipeline_id=first_pipeline.id)

# Let's double the batch size (the first run's trainer params used 16)
second_config.trainer.params['batch_size'] = 32

In [None]:
# Register the modified configuration as a second pipeline in the same workspace
second_pipeline = client.push_pipeline(
    name='census_run_02',
    config=second_config,
    workspace_id=active_workspace.id,
)

In [None]:
# Start a training run of the second pipeline on the same datasource commit
second_pipeline_run = client.train_pipeline(pipeline_id=second_pipeline.id,
                                            datasource_commit_id=new_datasource_commit.id)

In [None]:
# Re-run this cell and wait here until STATUS turns to 'SUCCEEDED'

workspace_statuses = client.get_pipeline_status(workspace_id=active_workspace.id)
workspace_statuses[second_pipeline.id]

In [None]:
# Evaluate the second pipeline to see how we did by increasing batch size

# Note: run the slicing graphic a second time if it does not appear at first
client.evaluate_single_pipeline(
    pipeline_id=second_pipeline.id,
    pipeline_run_id=second_pipeline_run.id,
    magic=True,
)

#### Compare multiple pipelines

In [None]:
# Compare pipelines for the active workspace (its id is the only argument passed)
client.compare_multiple_pipelines(active_workspace.id)