## Environment Setup

In [1]:
# Enable the GCP service APIs listed below if they are not already enabled:
# Compute Engine, Container Registry, Vertex AI (aiplatform), Cloud Build and Cloud Functions.
!gcloud services enable compute.googleapis.com         \
                       containerregistry.googleapis.com  \
                       aiplatform.googleapis.com  \
                       cloudbuild.googleapis.com \
                       cloudfunctions.googleapis.com

Operation "operations/acat.p2-451072423342-2847879f-55fe-4bb7-9541-27dc275e909e" finished successfully.


In [2]:
# Install the Kubeflow and GCP AI platform libraries and components if not installed.
USER_FLAG = "--user"
# Install ai platform and kfp
!pip3 install {USER_FLAG} google-cloud-aiplatform --upgrade
!pip3 install {USER_FLAG} kfp --upgrade
!pip3 install google_cloud_pipeline_components



In [3]:
# Global configuration shared by every cell below.
import os

# GCP project that owns the pipeline resources.
PROJECT_ID = "mathco-engineering-lab"

# Region in which the Vertex AI pipeline services are deployed.
REGION = "us-central1"

# Cloud Storage bucket (bare name, then its gs:// URI).
BUCKET_NAME = "mathco-engineering-lab"
GCS_BUCKET_NAME = f"gs://{BUCKET_NAME}"

# Folder inside the bucket used as the pipeline root; displayed as the cell output.
PIPELINE_ROOT = f"{GCS_BUCKET_NAME}/pipeline_root/"
PIPELINE_ROOT

'gs://mathco-engineering-lab/pipeline_root/'

## Import Packages

In [4]:
# Import-Packages
from typing import NamedTuple
from kfp.v2 import dsl
from kfp.v2.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics,
                        component, 
                        OutputPath, 
                        InputPath)

from kfp.v2 import compiler
from google.cloud.aiplatform import pipeline_jobs

## Load Data

In [6]:
@component(
    # "scikit-learn" is the actual PyPI distribution; the deprecated "sklearn"
    # alias errors out on install.
    packages_to_install=["pandas", "pyarrow", "scikit-learn"],
    base_image="python:3.9",
    output_component_file="get_wine_data.yaml"
)
def get_wine_data(
    url: str,
    dataset_train: Output[Dataset],
    dataset_test: Output[Dataset],
    kpi_ouput: Output[Metrics]
)-> NamedTuple("output", [("train", int),("test",int)]):
    """Download the wine-quality CSV, derive a binary target and split it.

    Args:
        url: Location of a ';'-delimited CSV that contains a ``quality`` column
            and a ``total sulfur dioxide`` column.
        dataset_train: Output artifact; the training split is written to
            ``dataset_train.path + ".csv"``.
        dataset_test: Output artifact; the test split is written to
            ``dataset_test.path + ".csv"``.
        kpi_ouput: Output metrics artifact that receives the two split sizes.

    Returns:
        A ``(train, test)`` tuple holding the number of rows in each split.
    """
    import pandas as pd
    from sklearn.model_selection import train_test_split as tts

    df_wine = pd.read_csv(url, delimiter=";")

    # Binary label: 1 when quality >= 7, otherwise 0.
    df_wine["target"] = (df_wine["quality"] >= 7).astype(int)
    df_wine = df_wine.drop(["quality", "total sulfur dioxide"], axis=1)

    # 70/30 split. No random_state is set, so the split differs between runs.
    train, test = tts(df_wine, test_size=0.3)

    # Downstream components read these files back by appending ".csv" to the artifact path.
    train.to_csv(dataset_train.path + ".csv", index=False, encoding='utf-8-sig')
    test.to_csv(dataset_test.path + ".csv", index=False, encoding='utf-8-sig')

    n_train = int(train.shape[0])
    n_test = int(test.shape[0])
    kpi_ouput.log_metric("train_data_size", n_train)
    kpi_ouput.log_metric("test_data_size", n_test)
    return (n_train, n_test)

## Train Model

In [7]:
@component(
    # "scikit-learn" is the actual PyPI distribution; the deprecated "sklearn"
    # alias errors out on install.
    packages_to_install = [
        "pandas",
        "scikit-learn",
    ], base_image="python:3.9",
)
def train_wine_quality(
    dataset:  Input[Dataset],
    model: Output[Model], 
):
    """Fit a random-forest classifier on the training split and pickle it.

    Args:
        dataset: Input artifact; the CSV is read from ``dataset.path + ".csv"``
            and must contain a ``target`` column.
        model: Output artifact; the fitted estimator is pickled to
            ``model.path + ".pkl"`` and ``metadata["framework"]`` is set to "RF".
    """
    from sklearn.ensemble import RandomForestClassifier
    import pandas as pd
    import pickle

    data = pd.read_csv(dataset.path + ".csv")

    # Every column except "target" is used as a feature.
    model_rf = RandomForestClassifier(n_estimators=10)
    model_rf.fit(
        data.drop(columns=["target"]),
        data.target,
    )

    model.metadata["framework"] = "RF"

    # The evaluation component loads the model back from this exact "<path>.pkl" name.
    file_name = model.path + ".pkl"
    with open(file_name, 'wb') as file:
        pickle.dump(model_rf, file)

## Evaluate Model

In [8]:
@component(
    # "scikit-learn" is the actual PyPI distribution; the deprecated "sklearn"
    # alias errors out on install.
    packages_to_install = [
        "pandas",
        "scikit-learn"
    ], base_image="python:3.9",
)
def wine_quality_evaluation(
    test_set:  Input[Dataset],
    rf_winequality_model: Input[Model],
    thresholds_dict_str: str,
    metrics: Output[ClassificationMetrics],
    kpi: Output[Metrics]
) -> NamedTuple("output", [("deploy", str)]):
    """Score the trained model on the test split and decide whether to deploy.

    Args:
        test_set: Input artifact; the CSV is read from ``test_set.path + ".csv"``
            and must contain a ``target`` column.
        rf_winequality_model: Input artifact; the pickled estimator is read from
            ``rf_winequality_model.path + ".pkl"``.
        thresholds_dict_str: JSON object whose ``"roc"`` entry is the minimum
            value the accuracy must reach for deployment.
        metrics: Output artifact that receives the ROC curve and confusion matrix.
        kpi: Output artifact that receives the accuracy and the threshold.

    Returns:
        A one-element tuple ``(deploy,)`` where ``deploy`` is the string
        ``"true"`` when accuracy >= threshold and ``"false"`` otherwise.
    """
    import json
    import pickle

    import numpy as np
    import pandas as pd
    from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score

    data = pd.read_csv(test_set.path + ".csv")
    features = data.drop(columns=["target"])
    labels = data["target"]

    # NOTE: pickle executes code on load; this file is expected to come from the
    # upstream training component of the same pipeline.
    with open(rf_winequality_model.path + ".pkl", 'rb') as file:
        model = pickle.load(file)

    y_pred = model.predict(features)
    y_scores = model.predict_proba(features)[:, 1]

    fpr, tpr, thresholds = roc_curve(
        y_true=labels.to_numpy(), y_score=y_scores, pos_label=True
    )
    # Recent scikit-learn releases return inf as the first threshold; replace
    # non-finite values with finite ones so they can be stored as metadata.
    thresholds = np.nan_to_num(thresholds)
    metrics.log_roc_curve(fpr.tolist(), tpr.tolist(), thresholds.tolist())

    metrics.log_confusion_matrix(
        ["False", "True"],
        confusion_matrix(labels, y_pred).tolist(),
    )

    accuracy = float(accuracy_score(labels, y_pred))
    # The threshold is stored under the "roc" key but is compared against accuracy.
    threshold = float(json.loads(thresholds_dict_str)['roc'])

    rf_winequality_model.metadata["accuracy"] = accuracy
    kpi.log_metric("accuracy", accuracy)
    kpi.log_metric("thresholds", threshold)

    # Returned as a string because the pipeline condition compares it to "true".
    deploy = "true" if accuracy >= threshold else "false"
    return (deploy,)

## Deploy Model

In [9]:
@component(
    packages_to_install=["google-cloud-aiplatform", "scikit-learn==1.0.0",  "kfp"],
    base_image="python:3.9",
    output_component_file="model_wine_quality_component.yml"
)
def deploy_winequality(
    model: Input[Model],
    project: str,
    region: str,
    serving_container_image_uri : str, 
    vertex_endpoint: Output[Artifact],
    vertex_model: Output[Model]
):
    """Upload the trained model to Vertex AI and deploy it to an endpoint.

    Args:
        model: Input artifact produced by the training component; the folder
            that contains it is used as the model artifact directory.
        project: GCP project id.
        region: Vertex AI location.
        serving_container_image_uri: Container image used to serve predictions.
        vertex_endpoint: Output artifact; its ``uri`` is set to the endpoint's
            resource name.
        vertex_model: Output artifact; its ``uri`` is set to the uploaded
            model's resource name.
    """
    from google.cloud import aiplatform
    aiplatform.init(project=project, location=region)

    DISPLAY_NAME  = "wine-quality"
    MODEL_NAME = "wine-quality-rf"
    ENDPOINT_NAME = "wine_quality_endpoint"

    def create_endpoint():
        """Return the newest endpoint named ENDPOINT_NAME, creating one if none exists."""
        endpoints = aiplatform.Endpoint.list(
            filter='display_name="{}"'.format(ENDPOINT_NAME),
            order_by='create_time desc',
            project=project,
            location=region,
        )
        if len(endpoints) > 0:
            return endpoints[0]  # most recently created
        return aiplatform.Endpoint.create(
            display_name=ENDPOINT_NAME, project=project, location=region
        )

    endpoint = create_endpoint()

    # Directory that holds the pickled model: drop only the final path segment
    # of the artifact URI. (A blanket str.replace("model", "") would also strip
    # "model" from bucket or folder names earlier in the URI.)
    artifact_dir = model.uri.rsplit("/", 1)[0] + "/"

    # Import the model programmatically
    model_upload = aiplatform.Model.upload(
        display_name = DISPLAY_NAME, 
        artifact_uri = artifact_dir,
        serving_container_image_uri =  serving_container_image_uri,
        serving_container_health_route=f"/v1/models/{MODEL_NAME}",
        serving_container_predict_route=f"/v1/models/{MODEL_NAME}:predict",
        serving_container_environment_variables={
            "MODEL_NAME": MODEL_NAME,
        },
    )

    # Model.deploy returns the Endpoint the model was deployed to, not a Model.
    deployed_endpoint = model_upload.deploy(
        machine_type="n1-standard-4", 
        endpoint=endpoint,
        traffic_split={"0": 100},
        deployed_model_display_name=DISPLAY_NAME,
    )

    # Save data to the output params
    vertex_endpoint.uri = deployed_endpoint.resource_name
    vertex_model.uri = model_upload.resource_name

## Create Pipeline

In [10]:
from datetime import datetime

# Timestamped display name, used as the default of the pipeline's `display_name` parameter.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
DISPLAY_NAME = 'pipeline-wine-quality-job-{}'.format(TIMESTAMP)

@dsl.pipeline(
    # Default pipeline root. You can override it when submitting the pipeline.
    pipeline_root=PIPELINE_ROOT,
    # A name for the pipeline. Use to determine the pipeline Context.
    name="pipeline-wine-quality-job",
    
)
def pipeline(
    url: str = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",
    project: str = PROJECT_ID,
    region: str = REGION, 
    display_name: str = DISPLAY_NAME,
    api_endpoint: str = REGION+"-aiplatform.googleapis.com",
    thresholds_dict_str: str = '{"roc":0.95}',
    serving_container_image_uri: str = "asia-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.0-24:latest"
    ):
    """Load data -> train -> evaluate -> conditionally deploy.

    Args:
        url: CSV location passed to the data-loading component.
        project: GCP project id passed to the deploy component.
        region: Vertex AI location passed to the deploy component.
        display_name: Not used by any step; kept so existing callers still work.
        api_endpoint: Not used by any step; kept so existing callers still work.
        thresholds_dict_str: JSON string with the minimum score required to deploy.
        serving_container_image_uri: Serving image passed to the deploy component.
    """
    # Components are called with keyword arguments only: this works on the
    # SDK version used here and on newer KFP SDKs that reject positional arguments.
    data_op = get_wine_data(url=url)
    train_model_op = train_wine_quality(dataset=data_op.outputs["dataset_train"])
    model_evaluation_op = wine_quality_evaluation(
        test_set=data_op.outputs["dataset_test"],
        rf_winequality_model=train_model_op.outputs["model"],
        thresholds_dict_str=thresholds_dict_str,  # deploy the model only if its performance is above the threshold
    )

    # The evaluation step emits the string "true"/"false"; deploy only on "true".
    with dsl.Condition(
        model_evaluation_op.outputs["deploy"]=="true",
        name="deploy-wine-quality",
    ):
        deploy_model_op = deploy_winequality(
            model=train_model_op.outputs["model"],
            project=project,
            region=region,
            serving_container_image_uri=serving_container_image_uri,
        )

In [11]:
# Compile the pipeline function into a JSON job spec in the working directory.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path='pipeline-wine-quality-job.json',
)



In [12]:
# Create the Vertex AI pipeline job from the compiled spec, then submit it.
start_pipeline = pipeline_jobs.PipelineJob(
    project=PROJECT_ID,
    location=REGION,
    display_name="pipeline-wine-quality-job",
    template_path="pipeline-wine-quality-job.json",
    pipeline_root=PIPELINE_ROOT,
    enable_caching=True,
)
start_pipeline.submit()

Creating PipelineJob
PipelineJob created. Resource name: projects/451072423342/locations/us-central1/pipelineJobs/pipeline-wine-quality-job-20221001135302
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/451072423342/locations/us-central1/pipelineJobs/pipeline-wine-quality-job-20221001135302')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/pipeline-wine-quality-job-20221001135302?project=451072423342
