In [64]:
import sys
!{sys.executable} -m pip install kfp --upgrade



In [65]:
import json
import kfp
import kfp.dsl as dsl
import kfp.components as comp
from kfp.components import OutputPath

#### Pipeline Configurations

In [110]:
#The Google Cloud project that this pipeline runs in.
project_id = "de2021-324520"
# The region that this pipeline runs in
region = "us-west1"
# Specify a Cloud Storage URI that your pipelines service account can access. The artifacts of your pipeline runs are stored within the pipeline root.
pipeline_root_path = "gs://dejads_temp"

#### Create Pipeline Components

We can create a component from Python functions (inline) and from a container. We will first try inline python functions. 

Step 1: Define the python function

Step 2:  Use **kfp.components.create_component_from_func** build the component. This function takes four parameters.

**1.func**: The Python function to convert.

**2.base_image**: (Optional.) Specify the Docker container image to run this function in. 

**3.output_component_file**: (Optional.) Writes your component definition to a file. 

**4.packages_to_install**: (Optional.) A list of versioned Python packages to install before running your function.

Another thing we need to consider is passing parameters between components. We can pass simple parameters such as integer, string, tuple, dict, and list by values. To pass the large datasets or complex configurations, we can use files. We can annotate the Python function’s parameters to indicate input or output files for the component. 

Refer to  https://www.kubeflow.org/docs/components/pipelines/sdk/python-function-components/ for more information.

#### Pipeline Component : Data Ingestion

In [67]:
from typing import Dict

def download_data(project_id: str, bucket: str, file_name: str) -> Dict:
    '''download data'''
    from google.cloud import storage
    import pandas as pd
    import logging 
    import sys
    
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    
    client = storage.Client(project=project_id)
    bucket = client.get_bucket(bucket)
    blob = bucket.blob(file_name)
    local_path = '/tmp/'+ file_name
    blob.download_to_filename(local_path)
    logging.info('Downloaded Data!')
    dict_from_csv = pd.read_csv(local_path, index_col=None, squeeze=True).to_dict()
    logging.info('Returning Data as Dictionary Object!')
    return dict_from_csv

In [69]:
# create a KFP component for data ingestion
data_ingestion_comp = comp.create_component_from_func(
    download_data, output_component_file='data_ingestion.yaml', packages_to_install=['google-cloud-storage', 'pandas'])

#### Pipeline Component : Training-MLP 

In [90]:
from typing import NamedTuple, Dict
def train_mlp (features: Dict, project_id: str, model_repo: str) -> Dict:
    '''train a MLP with default parameters'''
    import pandas as pd
    from google.cloud import storage
    from keras.layers import Dense
    from keras.models import Sequential
    import json
    import logging 
    import sys
    import os
    
    
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    
    df = pd.DataFrame.from_dict(features)  
    
    logging.info(df.columns)
        
    # split into input (X) and output (Y) variables
    X = df.loc[:, ['ntp','age', 'bmi', 'dbp', 'dpf', 'pgc', 'si', 'tsft']].values
    Y = df.loc[:, ['class']].values
    # define model
    model = Sequential()
    model.add(Dense(12, input_dim=8, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # Fit the model
    model.fit(X, Y, epochs=150, batch_size=10, verbose=0)
    # evaluate the model
    scores = model.evaluate(X, Y, verbose=0)
    logging.info(model.metrics_names)
    metrics = {
        "accuracy": scores[1],
        "loss": scores[0],
    }
   
    # Save the model localy
    local_file = '/tmp/local_model.h5'
    model.save(local_file)
     # write out output
  
    # Save to GCS as model.h5
    client = storage.Client(project=project_id)
    bucket = client.get_bucket(model_repo)
    blob = bucket.blob('model.h5')
        # Upload the locally saved model
    blob.upload_from_filename(local_file)
        # Clean up
    # os.remove(local_file)
    print("Saved the model to GCP bucket : " + model_repo)
    return metrics

In [91]:
# create a KFP component for training 
train_mlp_com = comp.create_component_from_func(
    train_mlp, output_component_file='training_mlp.yaml', packages_to_install=['google-cloud-storage', 'pandas', 'keras', 'tensorflow', 'h5py', 'scikit-learn'])

#### Pipeline Component : Training LogisticRegression

In [88]:
from typing import NamedTuple, Dict
def train_lr (features: Dict, project_id: str, model_repo: str) -> Dict:
    '''train a LogisticRegression with default parameters'''
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn import metrics
    from sklearn.model_selection import train_test_split
    from google.cloud import storage
    import json
    import logging 
    import sys
    import os
    import joblib
        
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    
    df = pd.DataFrame.from_dict(features)  
    
    logging.info(df.columns)        
    
    x_train, x_test, y_train, y_test = train_test_split(df.drop('class',axis=1), 
                                                    df['class'], test_size=0.30, 
                                                    random_state=101)
    model = LogisticRegression()
    model.fit(x_train,y_train)

    metrics = {
        "accuracy": model.score(x_test, y_test)
    }
    logging.info(metrics)
   
    # Save the model localy
    local_file = '/tmp/local_model.pkl'
    joblib.dump(model, local_file)
     # write out output
  
    # Save to GCS as model.h5
    client = storage.Client(project=project_id)
    bucket = client.get_bucket(model_repo)
    blob = bucket.blob('lrmodel.pkl')
        # Upload the locally saved model
    blob.upload_from_filename(local_file)
        # Clean up
    # os.remove(local_file)
    print("Saved the model to GCP bucket : " + model_repo)
    return metrics

In [89]:
# create a KFP component for training lr model
trail_lr_com = comp.create_component_from_func(
    train_lr, output_component_file='train_lr_model.yaml', packages_to_install=['google-cloud-storage', 'pandas', 'joblib', 'scikit-learn'])

#### Pipeline Component : Prediction-MLP

In [74]:
def predict(project_id: str, model_repo: str, features: Dict) -> Dict:
    import pandas as pd
    from keras.models import load_model
    from google.cloud import storage
    import json
    import logging
    import sys
    import os
    
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    
    df = pd.DataFrame.from_dict(features)
    
    xNew = df.loc[:, ['ntp', 'age', 'bmi', 'dbp', 'dpf', 'pgc', 'si', 'tsft']].values
    client = storage.Client(project=project_id)
    bucket = client.get_bucket(model_repo)
    blob = bucket.blob('model.h5')
    blob.download_to_filename('/tmp/local_model.h5')
    model = load_model('/tmp/local_model.h5')
    dfcp = df.copy()
    result = model.predict(xNew)   
    y_classes = result.argmax(axis=-1)
    logging.info(y_classes)
    dfcp['pclass'] = y_classes.tolist()
    dic = dfcp.to_dict(orient='records')   
    return dic

In [75]:
# create a KFP component for prediction 
prediction_mpl_com = comp.create_component_from_func(
    predict, output_component_file='prediction_mlp_com.yaml', packages_to_install=['google-cloud-storage', 'pandas', 'keras', 'tensorflow', 'h5py', 'scikit-learn'])

#### Pipeline Component : Prediction-LR

In [76]:
def predict_lr(project_id: str, model_repo: str, features: Dict) -> Dict:
    import pandas as pd
    import joblib
    from google.cloud import storage
    import json
    import logging
    import sys
    import os
    
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    
    df = pd.DataFrame.from_dict(features)    
    
    client = storage.Client(project=project_id)
    bucket = client.get_bucket(model_repo)
    blob = bucket.blob('lrmodel.pkl')
    filename = '/tmp/local_model.pkl'
    blob.download_to_filename(filename)
        
    #Loading the saved model with joblib
    model = joblib.load(filename)

    xNew = df[['ntp', 'age', 'bmi', 'dbp', 'dpf', 'pgc', 'si', 'tsft']]

    dfcp = df.copy()   
    y_classes = model.predict(xNew)
    logging.info(y_classes)
    dfcp['pclass'] = y_classes.tolist()
    dic = dfcp.to_dict(orient='records') 
    return dic

In [77]:
# create a KFP component for prediction LR 
prediction_lr_com = comp.create_component_from_func(
    predict_lr, output_component_file='prediction_lr_com.yaml', packages_to_install=['google-cloud-storage', 'pandas', 'joblib', 'scikit-learn'])

#### Pipeline Component : Algorithm Selection 

In [101]:
from typing import NamedTuple, Dict

def compare_model(mlp_metrics: Dict, lr_metrics: Dict) -> str:
    import logging
    import json
    import sys
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    logging.info(mlp_metrics)
    logging.info(lr_metrics)
    if mlp_metrics.get("accuracy") > lr_metrics.get("accuracy"):
        return "MLP"
    else :
        return "LR"

In [102]:
# create a KFP component for selecting between MLP and LR
compare_model_com = comp.create_component_from_func(
    compare_model, output_component_file='alogo_selecion_com.yaml')

#### Define the Pipeline

In [111]:
# Define the workflow of the pipeline.
@kfp.dsl.pipeline(
    name="lab10-v1",
    pipeline_root=pipeline_root_path)
def pipeline(project_id: str, data_bucket: str, trainset_filename: str, model_repo: str, testset_filename: str, ):
    
    
    di_op = data_ingestion_comp(
        project_id=project_id,
        bucket=data_bucket,
        file_name=trainset_filename
    )

 
    training_mlp_job_run_op = train_mlp_com(
        project_id=project_id,
        model_repo=model_repo,       
        features=di_op.output
    )
    
     
    training_lr_job_run_op = trail_lr_com(
        project_id=project_id,
        model_repo=model_repo,       
        features=di_op.output
    )
    
    pre_di_op = data_ingestion_comp(
        project_id=project_id,
        bucket=data_bucket,
        file_name=testset_filename
    ).after(training_mlp_job_run_op, training_lr_job_run_op)
        
        
    comp_model__op = compare_model_com(training_mlp_job_run_op.output,
                                       training_lr_job_run_op.output).after(training_mlp_job_run_op, training_lr_job_run_op)  
    
    # defining the branching condition
    with dsl.Condition(comp_model__op.output=="MLP"):
        predict_mlp_job_run_op = prediction_mpl_com(
        project_id=project_id,
        model_repo=model_repo,       
        features=pre_di_op.output
        )
    with dsl.Condition(comp_model__op.output=="LR"):
        predict_lr_job_run_op = prediction_lr_com(
        project_id=project_id,
        model_repo=model_repo,       
        features=pre_di_op.output
       )

#### Compile the pipeline into a JSON file

In [112]:
from kfp.v2 import compiler
compiler.Compiler().compile(pipeline_func=pipeline,
        package_path='diabetes_predictor_pipeline3.json')

In [82]:
!{sys.executable} -m pip install google-cloud-aiplatform



#### Submit the pipeline run

In [None]:
import google.cloud.aiplatform as aip

job = aip.PipelineJob(
    display_name="lab10-v1-diabetes-predictor",
    template_path="diabetes_predictor_pipeline3.json",
    pipeline_root=pipeline_root_path,
    parameter_values={
        'project_id': project_id,
        'data_bucket': 'dejads_input',
        'trainset_filename': 'training_set.csv',
        'testset_filename': 'prediction_set.csv',
        'model_repo':'model_repo_de2021'
    }
)

job.run()

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/411957191814/locations/us-central1/pipelineJobs/lab9-v1-20211122162553
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/411957191814/locations/us-central1/pipelineJobs/lab9-v1-20211122162553')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/lab9-v1-20211122162553?project=411957191814
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob run completed. Resource name: projects/411957191814/locations/us-central1/pipelineJobs/lab9-v1-20211122162553
