In [23]:
import sys
!{sys.executable} -m pip install kfp --upgrade



In [24]:
import json
import kfp
import kfp.dsl as dsl
import kfp.components as comp
from kfp.components import OutputPath

#### Pipeline Configurations

In [1]:
#The Google Cloud project that this pipeline runs in.
project_id = "your project id"
# The region that this pipeline runs in
region = "us-west1"
# Specify a Cloud Storage URI that your pipelines service account can access. The artifacts of your pipeline runs are stored within the pipeline root.
pipeline_root_path = "gs://dejads_temp"

#### Create Pipeline Components

We can create a component from Python functions (inline) and from a container. We will first try inline python functions. 

Step 1: Define the python function

Step 2:  Use **kfp.components.create_component_from_func** build the component. This function takes four parameters.

**1.func**: The Python function to convert.

**2.base_image**: (Optional.) Specify the Docker container image to run this function in. 

**3.output_component_file**: (Optional.) Writes your component definition to a file. 

**4.packages_to_install**: (Optional.) A list of versioned Python packages to install before running your function.

Another thing we need to consider is passing parameters between components. We can pass simple parameters such as integer, string, tuple, dict, and list by values. To pass the large datasets or complex configurations, we can use files. We can annotate the Python function’s parameters to indicate input or output files for the component. 

Refer to  https://www.kubeflow.org/docs/components/pipelines/sdk/python-function-components/ for more information.

#### Pipeline Component : Data Ingestion

In [110]:
from typing import Dict

def download_data(project_id: str, bucket: str, file_name: str) -> Dict:
    '''download data'''
    from google.cloud import storage
    import pandas as pd
    import logging 
    import sys
    
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    
    client = storage.Client(project=project_id)
    bucket = client.get_bucket(bucket)
    blob = bucket.blob(file_name)
    local_path = '/tmp/'+ file_name
    blob.download_to_filename(local_path)
    logging.info('Downloaded Data!')
    dict_from_csv = pd.read_csv(local_path, index_col=None, squeeze=True).to_dict()
    logging.info('Returning Data as Dictionary Object!')
    return dict_from_csv

In [111]:
# create a KFP component for data ingestion
data_ingestion_comp = comp.create_component_from_func(
    download_data, output_component_file='data_ingestion.yaml', packages_to_install=['google-cloud-storage', 'pandas'])

#### Pipeline Component : Training-MLP 

In [122]:
from typing import NamedTuple, Dict
def train_mlp (features: Dict, project_id: str, model_repo: str) -> NamedTuple(
    'EvalResults',
    [
      ('mlmodel_metrics', 'Metrics')
    ]):
    '''train a MLP with default parameters'''
    import pandas as pd
    from google.cloud import storage
    from keras.layers import Dense
    from keras.models import Sequential
    import json
    import logging 
    import sys
    import os
    
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    
    df = pd.DataFrame.from_dict(features)  
    
    logging.info(df.columns)
        
    # split into input (X) and output (Y) variables
    X = df.loc[:, ['ntp','age', 'bmi', 'dbp', 'dpf', 'pgc', 'si', 'tsft']].values
    Y = df.loc[:, ['class']].values
    # define model
    model = Sequential()
    model.add(Dense(12, input_dim=8, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # Fit the model
    model.fit(X, Y, epochs=150, batch_size=10, verbose=0)
    # evaluate the model
    scores = model.evaluate(X, Y, verbose=0)
    logging.info(model.metrics_names)
    metrics = {
        "accuracy:": scores[1],
        "loss": scores[0],
    }
   
    # Save the model localy
    local_file = '/tmp/local_model.h5'
    model.save(local_file)
    # Save to GCS as model.h5
    client = storage.Client(project=project_id)
    bucket = client.get_bucket(model_repo)
    blob = bucket.blob('model.h5')
        # Upload the locally saved model
    blob.upload_from_filename(local_file)
        # Clean up
    os.remove(local_file)
    print("Saved the model to GCP bucket : " + model_repo)
    
    from collections import namedtuple
    fun_output = namedtuple('EvalResults',
        ['mlmodel_metrics'])
    return fun_output(json.dumps(metrics))

In [123]:
# create a KFP component for training 
train_mlp_com = comp.create_component_from_func(
    train_mlp, output_component_file='training_mlp.yaml', packages_to_install=['google-cloud-storage', 'pandas', 'keras', 'tensorflow', 'h5py', 'scikit-learn'])

#### Define the Pipeline

In [124]:
# Define the workflow of the pipeline.
@kfp.dsl.pipeline(
    name="lab9-v1",
    pipeline_root=pipeline_root_path)
def pipeline(project_id: str, data_bucket: str, testset_filename: str, model_repo: str):
    
    # The first step    
    di_op = data_ingestion_comp(
        project_id=project_id,
        bucket=data_bucket,
        file_name=testset_filename
    )

    # The second step 
    training_job_run_op = train_mlp_com(
        project_id=project_id,
        model_repo=model_repo,       
        features=di_op.output
    )

#### Compile the pipeline into a JSON file

In [125]:
from kfp.v2 import compiler
compiler.Compiler().compile(pipeline_func=pipeline,
        package_path='diabetes_predictor_pipeline.json')

In [46]:
!{sys.executable} -m pip install google-cloud-aiplatform



#### Submit the pipeline run

In [126]:
import google.cloud.aiplatform as aip

job = aip.PipelineJob(
    display_name="lab9-v1-diabetes-predictor",
    template_path="diabetes_predictor_pipeline.json",
    pipeline_root=pipeline_root_path,
    parameter_values={
        'project_id': project_id,
        'data_bucket': 'dejads_input',
        'testset_filename': 'training_set.csv',
        'model_repo':'model_repo_de2021'
    }
)

job.run()

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/411957191814/locations/us-central1/pipelineJobs/lab9-v1-20211116143131
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/411957191814/locations/us-central1/pipelineJobs/lab9-v1-20211116143131')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/lab9-v1-20211116143131?project=411957191814
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/411957191814/locations/us-central1/pipelineJobs/lab9-v1-20211116143131 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/411957191814/locations/us-central1/pipelineJobs/lab9-v1-20211116143131 current state: