In [18]:
import sys
!{sys.executable} -m pip install kfp --upgrade
import json
import kfp
import kfp.dsl as dsl
import kfp.components as comp
from kfp.components import OutputPath



In [19]:
# The Google Cloud project that this pipeline runs in.
project_id = "jads-de-2021"
# The region that this pipeline runs in.
# NOTE(review): "us-west1-b" has the form of a zone rather than a region, and no
# visible cell reads this variable -- confirm the intended value before using it.
region = "us-west1-b"
# Cloud Storage URI that the pipeline's service account can access. It is used as
# the pipeline root, i.e. the location where artifacts of pipeline runs are stored.
pipeline_root_path = "gs://dejads_temp_yk"

In [20]:
from typing import Dict

def download_data(project_id: str, bucket: str, file_name: str) -> Dict:
    '''Download a CSV file from Cloud Storage and return it as a dictionary.

    Every import lives inside the function body because the function is
    packaged as a self-contained component with ``create_component_from_func``;
    nothing defined outside the function is available when it runs.

    Args:
        project_id: Google Cloud project used to create the storage client.
        bucket: Name of the Cloud Storage bucket that holds the file.
        file_name: Name of the CSV object inside the bucket.

    Returns:
        The parsed CSV converted with ``to_dict()``: ``{column: {row: value}}``
        for a multi-column file, or ``{row: value}`` when the file has exactly
        one column.
    '''
    from google.cloud import storage
    import pandas as pd
    import logging
    import os
    import sys

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    client = storage.Client(project=project_id)
    # Use a separate name for the bucket handle instead of rebinding the
    # `bucket` string parameter to an object of a different type.
    source_bucket = client.get_bucket(bucket)
    blob = source_bucket.blob(file_name)
    # Object names may contain '/' (pseudo-folders). Keep only the last path
    # component so the download never targets a missing /tmp sub-directory.
    local_path = os.path.join('/tmp', os.path.basename(file_name))
    blob.download_to_filename(local_path)
    logging.info('#######Downloaded Data!')
    # `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in
    # 2.0. `.squeeze('columns')` is the documented replacement and keeps the
    # same result: a Series only when the file has a single column.
    dict_from_csv = pd.read_csv(local_path, index_col=None).squeeze('columns').to_dict()
    logging.info('#######Returning Data as Dictionary Object!')
    return dict_from_csv

In [21]:
# Package download_data as a KFP component. The component definition is also
# written to data_ingestion.yaml, and the listed packages are installed in the
# component's container before the function runs.
data_ingestion_comp = comp.create_component_from_func(
    download_data,
    output_component_file='data_ingestion.yaml',
    packages_to_install=[
        'google-cloud-storage',
        'pandas',
    ],
)

In [37]:
from typing import NamedTuple, Dict
def train_mlp (features: Dict, project_id: str, model_repo: str) -> NamedTuple(
    'EvalResults_trainset',
    [
      ('mlmodel_metrics', 'Metrics')
    ]):
    '''Train an MLP classifier, upload it to Cloud Storage and report train-set metrics.

    Every import lives inside the function body because the function is
    packaged as a self-contained component with ``create_component_from_func``.

    Args:
        features: Column-oriented dictionary accepted by ``DataFrame.from_dict``.
            It must provide the columns 'ntp', 'age', 'bmi', 'dbp', 'dpf',
            'pgc', 'si', 'tsft' (inputs) and 'class' (target).
        project_id: Google Cloud project used to create the storage client.
        model_repo: Name of the Cloud Storage bucket the trained model is
            uploaded to (as 'model_lab9_1_ex_1.h5').

    Returns:
        A one-field named tuple whose ``mlmodel_metrics`` is a JSON string with
        the keys "accuracy" and "loss", both measured on the training data.
    '''
    from collections import namedtuple
    import json
    import logging
    import os
    import sys

    import pandas as pd
    from google.cloud import storage
    from keras.layers import Dense
    from keras.models import Sequential

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    df = pd.DataFrame.from_dict(features)

    logging.info(df.columns)

    # Split into input (X) and output (Y) variables.
    X = df.loc[:, ['ntp','age', 'bmi', 'dbp', 'dpf', 'pgc', 'si', 'tsft']].values
    Y = df.loc[:, ['class']].values
    # Define the model: 8 inputs -> 12 -> 8 -> 1 sigmoid output.
    model = Sequential()
    model.add(Dense(12, input_dim=8, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile the model.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # Fit the model.
    model.fit(X, Y, epochs=150, batch_size=10, verbose=0)
    # Evaluate on the same data the model was fitted on (train-set metrics).
    scores = model.evaluate(X, Y, verbose=0)
    logging.info(model.metrics_names)
    # scores[0] is the loss and scores[1] the accuracy requested in compile().
    # float() guarantees plain Python numbers so json.dumps cannot fail on
    # framework scalar types.
    metrics = {
        "accuracy": float(scores[1]),
        "loss": float(scores[0]),
    }

    # Save the model locally, then upload it to GCS.
    local_file = '/tmp/local_model.h5'
    model.save(local_file)
    try:
        client = storage.Client(project=project_id)
        bucket = client.get_bucket(model_repo)
        blob = bucket.blob('model_lab9_1_ex_1.h5')
        # Upload the locally saved model.
        blob.upload_from_filename(local_file)
    finally:
        # Clean up the temporary file even when the upload fails.
        os.remove(local_file)
    logging.info("Saved the model to GCP bucket : " + model_repo)

    fun_output = namedtuple('EvalResults_trainset',
        ['mlmodel_metrics'])

    logging.info(f'Evaluate on train set: {metrics}')
    return fun_output(json.dumps(metrics))

In [38]:
# Package train_mlp as a KFP component. The component definition is also
# written to training_mlp_ex.yaml, and the listed packages are installed in the
# component's container before the function runs.
train_mlp_com = comp.create_component_from_func(
    train_mlp,
    output_component_file='training_mlp_ex.yaml',
    packages_to_install=[
        'google-cloud-storage',
        'pandas',
        'keras',
        'tensorflow',
        'h5py',
        'scikit-learn',
    ],
)

In [39]:
from typing import NamedTuple, Dict
def prediction (features: Dict, project_id: str, model_repo: str) -> NamedTuple(
    'EvalResults_testset',
    [
      ('mlmodel_metrics', 'Metrics')
    ]):
    '''Evaluate the previously trained MLP on a labelled test set.

    The model is downloaded from Cloud Storage (object 'model_lab9_1_ex_1.h5'
    in ``model_repo``), so that object must exist before this function runs.
    Every import lives inside the function body because the function is
    packaged as a self-contained component with ``create_component_from_func``.

    Args:
        features: Column-oriented dictionary accepted by ``DataFrame.from_dict``.
            It must provide the columns 'ntp', 'age', 'bmi', 'dbp', 'dpf',
            'pgc', 'si', 'tsft' (inputs) and 'class' (target).
        project_id: Google Cloud project used to create the storage client.
        model_repo: Name of the Cloud Storage bucket that holds the model.

    Returns:
        A one-field named tuple whose ``mlmodel_metrics`` is a JSON string with
        the keys "accuracy" and "loss", both measured on ``features``.
    '''
    from collections import namedtuple
    import json
    import logging
    import os
    import sys

    import pandas as pd
    from google.cloud import storage
    from keras.models import load_model

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    df = pd.DataFrame.from_dict(features)

    logging.info(df.columns)

    # Split into input (X) and output (Y) variables.
    X = df.loc[:, ['ntp','age', 'bmi', 'dbp', 'dpf', 'pgc', 'si', 'tsft']].values
    Y = df.loc[:, ['class']].values
    # Load the model. It is downloaded under /tmp (as the training step does for
    # its temporary file) rather than into the current working directory.
    local_file = '/tmp/local_model_1.h5'
    client = storage.Client(project=project_id)
    bucket = client.get_bucket(model_repo)
    blob = bucket.blob('model_lab9_1_ex_1.h5')
    blob.download_to_filename(local_file)
    try:
        model = load_model(local_file)
    finally:
        # The file is only needed while loading; remove it afterwards.
        if os.path.exists(local_file):
            os.remove(local_file)

    # Evaluate the model on the provided data.
    scores = model.evaluate(X, Y, verbose=0)
    logging.info(model.metrics_names)
    # scores[0] is the loss, scores[1] the accuracy. float() guarantees plain
    # Python numbers so json.dumps cannot fail on framework scalar types.
    metrics = {
        "accuracy": float(scores[1]),
        "loss": float(scores[0]),
    }

    fun_output = namedtuple('EvalResults_testset',
        ['mlmodel_metrics'])

    logging.info(f'Evaluate on test set: {metrics}')
    return fun_output(json.dumps(metrics))

In [40]:
# Package prediction as a KFP component. The component definition is also
# written to testing_mlp_ex.yaml, and the listed packages are installed in the
# component's container before the function runs.
test_mlp_com = comp.create_component_from_func(
    prediction,
    output_component_file='testing_mlp_ex.yaml',
    packages_to_install=[
        'google-cloud-storage',
        'pandas',
        'keras',
        'tensorflow',
        'h5py',
        'scikit-learn',
    ],
)

In [41]:
# Define the workflow of the pipeline.
@kfp.dsl.pipeline(
    name="lab9-v2-ex",
    pipeline_root=pipeline_root_path)
def pipeline(project_id: str, data_bucket: str, trainset_filename: str, model_repo: str, testset_filename: str):
    '''Ingest the train and test CSVs, train the MLP, then evaluate it on the test set.

    Args:
        project_id: Google Cloud project passed to every component.
        data_bucket: Cloud Storage bucket that holds both CSV files.
        trainset_filename: Object name of the training CSV in ``data_bucket``.
        model_repo: Cloud Storage bucket the model is written to and read from.
        testset_filename: Object name of the test CSV in ``data_bucket``.
    '''

    # Step 1: download the training data.
    di_train_op = data_ingestion_comp(
        project_id=project_id,
        bucket=data_bucket,
        file_name=trainset_filename
    )

    # Step 2: download the test data (independent of step 1).
    di_predict_op = data_ingestion_comp(
        project_id=project_id,
        bucket=data_bucket,
        file_name=testset_filename
    )

    # Step 3: train the model on the training data and upload it to model_repo.
    training_job_run_op = train_mlp_com(
        project_id=project_id,
        model_repo=model_repo,
        features=di_train_op.output
    )

    # Step 4: evaluate the uploaded model on the test data.
    testing_job_run_op = test_mlp_com(
        project_id=project_id,
        model_repo=model_repo,
        features=di_predict_op.output
    )
    # The test step consumes no output of the training step; it reads the model
    # from the bucket instead. Without an explicit ordering the two tasks are
    # scheduled in parallel and the test step can run before the model exists,
    # so make it wait for the training task to finish.
    testing_job_run_op.after(training_job_run_op)

In [42]:
from kfp.v2 import compiler

# Compile the pipeline function into a JSON pipeline definition on local disk;
# the submission cell passes this file as the job's template_path.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path='diabetes_predictor_pipeline_ex.json',
)

In [30]:
!{sys.executable} -m pip install google-cloud-aiplatform



In [43]:
import google.cloud.aiplatform as aip

# Create a Vertex AI pipeline job from the compiled pipeline definition.
# No project or location is passed to PipelineJob here, so whatever defaults
# the aiplatform SDK resolves are used.
job = aip.PipelineJob(
    display_name="lab9-v2-diabetes-predictor-ex",
    template_path="diabetes_predictor_pipeline_ex.json",
    pipeline_root=pipeline_root_path,
    # Values for the parameters declared by the pipeline function.
    parameter_values=dict(
        project_id=project_id,
        data_bucket='dejads_input_yk',
        trainset_filename='training_set.csv',
        testset_filename='prediction_set.csv',
        model_repo='models-de-lab9',
    ),
)

# Submit the job and run it.
job.run()

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/668001595021/locations/us-central1/pipelineJobs/lab9-v2-ex-20211119084814
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/668001595021/locations/us-central1/pipelineJobs/lab9-v2-ex-20211119084814')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/lab9-v2-ex-20211119084814?project=668001595021
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/668001595021/locations/us-central1/pipelineJobs/lab9-v2-ex-20211119084814 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/668001595021/locations/us-central1/pipelineJobs/lab9-v2-ex-20211119084814

RuntimeError: Job failed with:
code: 9
message: "The DAG failed because some tasks failed. The failed tasks are: [prediction].; Job (project_id = jads-de-2021, job_id = 560935648119226368) is failed due to the above error.; Failed to handle the job: {project_number = 668001595021, job_id = 560935648119226368}"
