## A simple Vertex AI pipeline demo
- Continued from demo2
- Component-based solution

### Import packages

In [None]:
import os
from pathlib import Path
import requests
from datetime import datetime

import kfp

from kfp.v2.dsl import pipeline
from kfp.v2 import compiler

import google.cloud.aiplatform as aiplatform


### Parameters for GCP and Kubeflow

In [None]:
# Placeholders to fill in before running the notebook.
PROJECT_ID = ""  # GCP project id
SERVICE_ACCOUNT = ""  # service account e-mail, e.g. XXXX@YYYYY.iam.gserviceaccount.com
CSV_FILE_BUCKET_NAME = ""  # bucket name (passed to the pipeline as original_bucket_id)
# NOTE: setting the credentials path from inside the notebook is not best practice.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS="")  # path to your service credentials (JSON assumed)

In [None]:
# --- Pipeline run settings ---
# Name given to the pipeline definition and used as the job's display name.
PIPELINE_NAME = "my-kfp-on-gcp-demo-container-based"
# Local file the compiled pipeline is written to, and later submitted from.
TEMPLATE_PATH = "ml_pipeline_from_containerised_components.json"
# GCS location for pipeline artefacts (also passed as the SDK staging bucket).
PIPELINE_ROOT = "gs://kfp-demo-bucket-{}".format(PROJECT_ID)
# Set to True to cache pipeline results so that identical components are not
# recomputed across runs.
ENABLE_CACHING = False

# --- Model settings ---
# Target column.
MODEL_LABEL = "SalePrice"
# Feature columns, in the order the model receives them.
MODEL_FEATURE_LS = [
    "OverallQual", "GrLivArea", "GarageCars",
    "GarageArea", "TotalBsmtSF", "1stFlrSF",
    "FullBath", "TotRmsAbvGrd", "YearBuilt",
    # presumably an engineered ratio column rather than a raw CSV column —
    # confirm against the preprocess component
    "BsmtUnfSF_TotalBsmtSF_ratio",
]
# Candidate values per hyper-parameter name (passed to the pipeline as
# my_hyper_params_dict).
MODEL_HYPER_PARAM = dict(
    alpha=[0.9, 0.95, 1],
    l1_ratio=[0.45, 0.5, 0.55],
)

# --- Model serving settings ---
ENDPOINT_MODEL_NAME = "my_kf_elasticnet_model_from_component"
ENDPOINT_MACHINE_TYPE = "n1-standard-4"
# Container image used to serve predictions.
BASE_CONTAINER_IMG = "us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-0:latest"

# Argument values for one pipeline run.
# NB: every key must correspond to a parameter of the pipeline function.
PIPELINE_PARAMS = dict(
    # where the input data lives
    project_id=PROJECT_ID,
    original_bucket_id=CSV_FILE_BUCKET_NAME,
    target_train_filename="train.csv",
    target_test_filename="test.csv",
    # training configuration
    datasplit_seed=10,
    selected_features=MODEL_FEATURE_LS,
    selected_label=MODEL_LABEL,
    my_hyper_params_dict=MODEL_HYPER_PARAM,
    # serving configuration
    serving_container_image_uri=BASE_CONTAINER_IMG,
    endpoint_machine_type=ENDPOINT_MACHINE_TYPE,
    endpoint_model_name=ENDPOINT_MODEL_NAME,
)

### 1. Load components

In [None]:
# Load one component per pipeline step from its component.yaml under
# component_based/<step>/. Each loaded object is called inside the pipeline
# function to create that step's task.
preprocess_op, train_op, eval_op, deploy_op = (
    kfp.components.load_component_from_file(f"component_based/{step}/component.yaml")
    for step in ("preprocess", "train", "eval", "deploy")
)

### 2. Create a pipeline

In [None]:
# Pipeline definition: preprocess the train and test files, train a model on
# the train data, evaluate it on the test data, then run the deploy step with
# the evaluation's deployment flag.
#
# Parameters (supplied at submission time through PIPELINE_PARAMS):
#   project_id, original_bucket_id  - forwarded to both preprocess steps;
#                                     project_id also goes to the deploy step
#   target_train_filename,
#   target_test_filename            - file handled by each preprocess step
#   datasplit_seed, selected_label,
#   my_hyper_params_dict            - forwarded to the train step
#   selected_features               - forwarded to the train and eval steps
#   serving_container_image_uri,
#   endpoint_model_name,
#   endpoint_machine_type           - forwarded to the deploy step
#
# NOTE(review): documented with comments rather than a docstring because KFP
# may read a pipeline function's docstring as metadata — confirm before
# converting this to a docstring.
@pipeline(name=PIPELINE_NAME, pipeline_root=PIPELINE_ROOT)
def tmp_pipe(
    project_id: str,
    original_bucket_id: str,
    target_train_filename: str,
    target_test_filename: str,
    datasplit_seed: int,
    selected_features: list,
    selected_label: str,
    my_hyper_params_dict: dict,
    serving_container_image_uri: str,
    endpoint_model_name: str,
    endpoint_machine_type: str,
):
    # Both preprocess steps read from the same project and bucket.
    data_source = dict(project_id=project_id, original_bucket_id=original_bucket_id)

    train_data_task = preprocess_op(
        **data_source, target_filename=target_train_filename
    ).set_display_name("Preprocess train data")

    test_data_task = preprocess_op(
        **data_source, target_filename=target_test_filename
    ).set_display_name("Preprocess test data")

    # Fit the model on the preprocessed train data.
    training_task = train_op(
        input_dataset=train_data_task.outputs["output_dataset"],
        datasplit_seed=datasplit_seed,
        selected_features=selected_features,
        selected_label=selected_label,
        my_hyper_params_dict=my_hyper_params_dict,
    ).set_display_name("Train model")

    # Score the trained model on the preprocessed test data.
    evaluation_task = eval_op(
        input_dataset=test_data_task.outputs["output_dataset"],
        selected_features=selected_features,
        model=training_task.outputs["model"],
    ).set_display_name("Model evaluation")

    # The deploy step receives the evaluation's flag; how the flag is used is
    # decided inside the deploy component.
    deploy_op(
        project_id=project_id,
        model=training_task.outputs["model"],
        serving_container_image_uri=serving_container_image_uri,
        endpoint_model_name=endpoint_model_name,
        endpoint_machine_type=endpoint_machine_type,
        model_deployment_flag=evaluation_task.outputs["model_deployment_flag"],
    ).set_display_name("Deploy model to endpoint")

In [None]:
# Compile the pipeline function into a pipeline definition file written to
# TEMPLATE_PATH; the PipelineJob created further down loads it via template_path.
compiler.Compiler().compile(
    pipeline_func=tmp_pipe, package_path=TEMPLATE_PATH
)

### 3. Submit the pipeline to Vertex AI (trains the model and deploys it to an endpoint)

In [None]:
# Initialise the Vertex AI SDK with the project and staging bucket used by the
# aiplatform calls that follow.
aiplatform.init(project=PROJECT_ID, staging_bucket=PIPELINE_ROOT)

In [None]:
# Job ids carry the current local time (to the second) as a suffix so that
# each submission gets its own id.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"
JOBID = "training-pipeline-" + TIMESTAMP

# Describe the run: the compiled template plus the argument values for it.
pipeline_ = aiplatform.pipeline_jobs.PipelineJob(
    display_name=PIPELINE_NAME,
    job_id=JOBID,
    template_path=TEMPLATE_PATH,
    parameter_values=PIPELINE_PARAMS,
    enable_caching=ENABLE_CACHING,
)

# Hand the job to Vertex AI, to be run as SERVICE_ACCOUNT.
pipeline_.submit(service_account=SERVICE_ACCOUNT)

In [None]:
# Deliberate stop (presumably to keep "Run all" from reaching the prediction
# cells before the endpoint exists). An explicit RuntimeError replaces the bare
# `raise`, which produced the cryptic "No active exception to reraise" error;
# the exception type is unchanged.
raise RuntimeError(
    "Stopping here on purpose: wait until the endpoint is ready, update the "
    "endpoint location in the next cell, then run the prediction cells manually."
)

### 4. Prediction (request to the endpoint)
- Wait 10-15 minutes until the endpoint is ready

In [None]:
# Resource name of the deployed endpoint. Replace the YOUR_PROJECT_NUMBER and
# ENDPOINT_NUMBER placeholders with your own values (and the location, if the
# endpoint is not in us-central1) before running this cell.
vertex_ai_model_endpoint = "projects/YOUR_PROJECT_NUMBER/locations/us-central1/endpoints/ENDPOINT_NUMBER"
# Handle used by the prediction cell to send requests to that endpoint.
endpoint = aiplatform.Endpoint(vertex_ai_model_endpoint)

In [None]:
# Test the endpoint: look up a few real rows from the training CSV to use as
# sample prediction inputs.
import pandas as pd
import numpy as np

df = pd.read_csv("house-prices-advanced-regression-techniques/train.csv")

# Keep the nine raw feature columns plus the label; the engineered ratio
# feature is not a column selected here.
df_mod = df.loc[
    :,
    [
        "OverallQual",
        "GrLivArea",
        "GarageCars",
        "GarageArea",
        "TotalBsmtSF",
        "1stFlrSF",
        "FullBath",
        "TotRmsAbvGrd",
        "YearBuilt",
        "SalePrice",
    ],
]

# Display the rows with index labels 0, 4 and 100.
df_mod.loc[[0, 4, 100], :]

In [None]:
# Sample inputs: nine feature values per row, each followed by a fixed 0.5 that
# stands in for the feature-engineered value (the last model feature).
input_instance = [
    [*raw_features, 0.5]
    for raw_features in (
        (7, 1710, 2, 548, 856, 856, 2, 8, 2003),
        (8, 2198, 3, 836, 1145, 1145, 2, 9, 2000),
        (6, 1610, 2, 480, 1610, 1610, 2, 6, 1977),
    )
]

# Send the sample rows to the endpoint; as the cell's last expression, the
# returned prediction is shown as the cell output.
endpoint.predict(instances=input_instance)