## A simple Vertex AI pipeline demo
- Apply the same transform to train and test data
- Train model with hyperparam tuning
- Evaluate model with test data
- Deploy the model to a Vertex AI endpoint
- Request predictions from the endpoint

### 0. Save the data to a GCS bucket
- Download the data to your local machine from Kaggle: https://www.kaggle.com/c/house-prices-advanced-regression-techniques
- Upload the data to GCS

In [None]:
import os
import google.cloud.storage as storage

In [None]:
# Project-specific settings: fill every value in before running the notebook.
PROJECT_ID = ""  # Your project id
SERVICE_ACCOUNT = ""  # assume XXXX@YYYYY.iam.gserviceaccount.com
CSV_FILE_BUCKET_NAME = ""  # bucket name

# NOTE: pointing at a credentials file from inside the notebook is not the
# best practice; it is done here only to keep the demo self-contained.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ""  # Your service credentials (assume json)

In [None]:
# Create a Cloud Storage client (uses the credentials configured above)
storage_client = storage.Client()

# Handle to the destination bucket. The bucket is expected to exist already;
# if it does not, create it once with
# storage_client.create_bucket(CSV_FILE_BUCKET_NAME).
bucket = storage_client.bucket(CSV_FILE_BUCKET_NAME)

# Local directory holding the files downloaded from Kaggle
FILE_DIR = "house-prices-advanced-regression-techniques"

# Keep only entries whose extension is exactly ".csv". splitext is used so
# that a file merely named "csv" (no extension) is not picked up by mistake.
csv_files_in_dir = [
    name for name in os.listdir(FILE_DIR) if os.path.splitext(name)[1] == ".csv"
]

# Upload every CSV under its own file name (e.g. train.csv, test.csv)
for csv_name in csv_files_in_dir:
    blob = bucket.blob(blob_name=csv_name)
    blob.upload_from_filename(filename=os.path.join(FILE_DIR, csv_name))

### Import packages

In [None]:
import os
from typing import NamedTuple
from datetime import datetime

import google.cloud.aiplatform as aiplatform

from kfp.v2.dsl import pipeline
from kfp.v2.dsl import component
from kfp.v2.dsl import OutputPath
from kfp.v2.dsl import InputPath
from kfp.v2.dsl import Model
from kfp.v2.dsl import Input
from kfp.v2.dsl import Artifact
from kfp.v2.dsl import Output
from kfp.v2.dsl import Metrics
from kfp.v2.dsl import Dataset
from kfp.v2 import compiler
from kfp.v2.google.client import AIPlatformClient

### Parameters for GCP and Kubeflow

In [None]:
# Reuse cached component results so identical steps are not recomputed
# on later runs
ENABLE_CACHING = True

PIPELINE_NAME = "my-kfp-on-gcp-demo-notebook"
# Local file the compiled pipeline definition is written to
TEMPLATE_PATH = "ml_pipeline_from_notebook.json"
# GCS bucket used as the pipeline root (where artefacts are stored)
PIPELINE_ROOT = "gs://kfp-demo-bucket-" + PROJECT_ID

# Run parameters (both are recomputed in the cell that submits the job)
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"
JOBID = "training-pipeline-" + TIMESTAMP

# Params for ML
# Feature columns fed to the model; the last one is created by the
# preprocessing component.
MODEL_FEATURE_LS = [
    "OverallQual",
    "GrLivArea",
    "GarageCars",
    "GarageArea",
    "TotalBsmtSF",
    "1stFlrSF",
    "FullBath",
    "TotRmsAbvGrd",
    "YearBuilt",
    "BsmtUnfSF_TotalBsmtSF_ratio",
]
MODEL_LABEL = "SalePrice"
# Candidate values explored by the hyper-parameter search
MODEL_HYPER_PARAM = dict(
    alpha=[0.9, 0.95, 1],
    l1_ratio=[0.45, 0.5, 0.55],
)

# Model serving param
# NOTE(review): this is the scikit-learn 1.0 serving image while SKLEARN below
# pins scikit-learn 1.2.2 for training - confirm the pickled model loads there.
BASE_CONTAINER_IMG = "us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-0:latest"
ENDPOINT_MACHINE_TYPE = "n1-standard-4"
ENDPOINT_MODEL_NAME = "my_kf_elasticnet_model_from_notebook"

# NB: every key here must match an argument name of the pipeline function
PIPELINE_PARAMS = dict(
    project_id=PROJECT_ID,
    original_bucket_id=CSV_FILE_BUCKET_NAME,
    target_train_filename="train.csv",
    target_test_filename="test.csv",
    datasplit_seed=10,
    selected_features=MODEL_FEATURE_LS,
    selected_label=MODEL_LABEL,
    my_hyper_params_dict=MODEL_HYPER_PARAM,
    serving_container_image_uri=BASE_CONTAINER_IMG,
    endpoint_machine_type=ENDPOINT_MACHINE_TYPE,
    endpoint_model_name=ENDPOINT_MODEL_NAME,
)

# Base image and pinned package versions installed inside each component
PYTHON_BASE = "python:3.10"
GCP_AI_PLATFORM = "google-cloud-aiplatform==1.18.1"
GCP_BUCKET = "google-cloud-storage==1.43.0"
PANDAS = "pandas==1.5.3"
SKLEARN = "scikit-learn==1.2.2"
NUMPY = "numpy==1.23.5"

### 1. Preprocess the data

In [None]:
@component(base_image=PYTHON_BASE, packages_to_install=[PANDAS, GCP_BUCKET])
def preprocess_my_data(
    project_id: str,
    original_bucket_id: str,
    target_filename: str,
    output_dataset: Output[Dataset],
) -> None:
    """Download one CSV from GCS, add the basement ratio feature and save it.

    The same transform is applied to whichever file is requested, so the
    pipeline can call this component once for the train set and once for the
    test set.

    Args:
        project_id: GCP project id. Currently not used by the body; it is
            kept so the component interface stays unchanged.
        original_bucket_id: Name of the bucket holding the raw CSV files.
        target_filename: Blob name to download (train.csv or test.csv).
        output_dataset: Output artifact; the transformed table is written as
            CSV (with header, without index) to ``output_dataset.path``.
    """
    # Imports live inside the function because the body runs in its own
    # container.
    import pandas as pd
    import io
    import google.cloud.storage as storage

    # ===============================
    #  Collect data from storage
    # ===============================
    storage_client = storage.Client()
    bucket = storage_client.bucket(original_bucket_id)

    # Download the csv file from GCS.
    # download_as_bytes replaces the deprecated download_as_string alias.
    blob = bucket.blob(target_filename)  # train.csv or test.csv
    data = blob.download_as_bytes()
    df = pd.read_csv(io.BytesIO(data))

    # ===============================
    #  Some feature engineering
    #  1. Mean fill + creating ratio
    # ===============================
    # Replace 0 with the column mean so the ratio below never divides by a
    # zero basement area. The mean is taken over the file being processed
    # (zeros included).
    total_bsmt_mean = df["TotalBsmtSF"].mean()
    df["TotalBsmtSF_fillmean"] = df["TotalBsmtSF"].replace(0, total_bsmt_mean)

    unf_bsmt_mean = df["BsmtUnfSF"].mean()
    df["BsmtUnfSF_fillmean"] = df["BsmtUnfSF"].replace(0, unf_bsmt_mean)

    # Share of the basement that is unfinished
    df["BsmtUnfSF_TotalBsmtSF_ratio"] = (
        df["BsmtUnfSF_fillmean"] / df["TotalBsmtSF_fillmean"]
    )

    # Create an output
    df.to_csv(output_dataset.path, index=False, header=True)

### 2. Train the model

In [None]:
@component(
    base_image=PYTHON_BASE,
    packages_to_install=[
        PANDAS,
        SKLEARN,
        NUMPY,
    ],
)
def train_my_ml_model(
    input_dataset: Input[Dataset],
    datasplit_seed: int,
    selected_features: list,
    selected_label: str,
    my_hyper_params_dict: dict,
    eval_metrics: Output[Metrics],
    model: Output[Model],
) -> NamedTuple("Outputs", [("val_mse", float), ("val_mae", float), ('best_hyperparam', dict)]):
    """Tune and train an ElasticNet regressor, then report validation errors.

    Args:
        input_dataset: Preprocessed training table (CSV) from upstream.
        datasplit_seed: Random seed for the train/validation split.
        selected_features: Column names used as model inputs.
        selected_label: Column name of the regression target.
        my_hyper_params_dict: Hyper-parameter name -> list of candidate
            values, passed to the randomized search.
        eval_metrics: Output artifact; the validation metrics and the best
            hyper-parameters are written to its path as JSON.
        model: Output artifact; the best estimator is pickled to
            ``f"{model.path}.pkl"``.

    Returns:
        A tuple ``(val_mse, val_mae, best_hyperparam)``: mean squared error
        and mean absolute error on the validation split, and the best
        hyper-parameters found by the search.

    Raises:
        AssertionError: If any selected feature contains a missing value.
    """
    from sklearn.model_selection import train_test_split, RandomizedSearchCV
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    from sklearn.linear_model import ElasticNet
    import pandas as pd
    import json
    import pickle

    # Read table from upstream components
    df = pd.read_csv(input_dataset.path)

    # Split Features and Labels
    X = df[selected_features]
    assert X.isna().sum().sum() == 0  # NB: not the best practice
    y = df[selected_label]

    # Split the data for train (80%) and validation (20%)
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=datasplit_seed
    )

    # Hyper-param tuning: randomized search with 5-fold cross-validation
    elastic_net_reg = ElasticNet(random_state=42)
    random_search = RandomizedSearchCV(
        elastic_net_reg,
        param_distributions=my_hyper_params_dict,
        n_iter=8,
        scoring="neg_mean_squared_error",
        n_jobs=4,
        cv=5,
        verbose=3,
        random_state=1001,
    )
    random_search.fit(X_train, y_train)
    # Get the best
    elastic_net_reg_best = random_search.best_estimator_
    best_hyperparam = random_search.best_params_

    # Get the trained model performance with validation data.
    # Each name is paired with its matching metric (they used to be swapped).
    y_val_pred = elastic_net_reg_best.predict(X_val)
    val_mse = mean_squared_error(y_val, y_val_pred)
    val_mae = mean_absolute_error(y_val, y_val_pred)
    metrics_dict = {"val_mse": val_mse, "val_mae": val_mae, "best_hyper_param": best_hyperparam}

    # dumping metrics_dict
    with open(eval_metrics.path, "w") as f:
        json.dump(metrics_dict, f)

    # Save the model
    # NB: Apr-2023, joblib file had problem to deploy on endpoint
    # NB: Model path should end model.pkl or model.joblib (i.e., model.path)
    # https://cloud.google.com/vertex-ai/docs/training/exporting-model-artifacts#pickle_1
    with open(f"{model.path}.pkl", "wb") as f:
        pickle.dump(elastic_net_reg_best, f)

    return (val_mse, val_mae, best_hyperparam)

### 3. Evaluate the model

In [None]:
@component(
    base_image=PYTHON_BASE,
    packages_to_install=[
        PANDAS,
        SKLEARN,
        NUMPY,
    ],
)
def eval_my_ml_model(
    input_dataset: Input[Dataset],
    selected_features: list,
    model: Input[Model],
    pred_output_csv: Output[Dataset],
    model_deployment_flag: Output[Artifact],
) -> None:
    """Predict on the test data and emit a deployment flag.

    Args:
        input_dataset: Preprocessed test table (CSV) from upstream.
        selected_features: Column names used as model inputs.
        model: Trained model artifact; the estimator is unpickled from
            ``f"{model.path}.pkl"``.
        pred_output_csv: Output artifact; predictions rounded to one decimal
            are written to its path.
        model_deployment_flag: Output artifact; a JSON document with the
            evaluation result is written to its path. For this demo the
            result is hard-coded to "passed" with a placeholder score.
    """
    import numpy as np
    import pandas as pd
    import json
    import pickle

    # load the test data
    df = pd.read_csv(input_dataset.path)

    # Keep the feature columns and drop rows with missing values.
    # NB: not the best practice. dropna() returns a new frame, which avoids
    # mutating a slice of df in place (pandas warns about that). After this
    # call no missing values can remain, so no further check is needed.
    # NOTE(review): dropped rows mean the predictions no longer line up
    # one-to-one with the rows of the test file - confirm this is acceptable.
    X = df[selected_features].dropna()

    # load model
    with open(f"{model.path}.pkl", "rb") as f:
        elastic_net_reg = pickle.load(f)

    # Prediction towards to test data
    y_test_pred = elastic_net_reg.predict(X)

    # Create an output
    np.savetxt(pred_output_csv.path, y_test_pred.round(1), delimiter=",")

    # dumping metrics_dict
    score = 4678  # NB: RANDOM number for demo
    metrics_dict = {"model_eval_passed": True, "threshould_or_score": score}
    with open(model_deployment_flag.path, "w") as f:
        json.dump(metrics_dict, f)

### 4. Deploy the model to endpoint

In [None]:
@component(
    base_image=PYTHON_BASE,
    packages_to_install=[PANDAS, NUMPY, GCP_AI_PLATFORM],
)
def deploy_my_ml_model(
    project_id: str,
    model: Input[Model],
    serving_container_image_uri: str,
    endpoint_machine_type: str,
    endpoint_model_name: str,
    model_deployment_flag: Input[Artifact],
    vertex_endpoint: Output[Artifact],
    vertex_model: Output[Model],
) -> None:
    """Upload the trained model to Vertex AI and deploy it to an endpoint.

    Deployment only happens when the upstream evaluation flag says the model
    passed; otherwise the artifact URIs are set to "N/A".

    Args:
        project_id: GCP project used to initialise the Vertex AI SDK.
        model: Trained model artifact; the directory of its URI is uploaded.
        serving_container_image_uri: Serving container image for the model.
        endpoint_machine_type: Machine type used for the deployment.
        endpoint_model_name: Display name of the uploaded model.
        model_deployment_flag: Artifact holding the JSON evaluation result.
        vertex_endpoint: Output artifact; its URI is set to the endpoint
            resource name.
        vertex_model: Output artifact; its URI is set to the model resource
            name.
    """
    import json
    import os
    from google.cloud import aiplatform

    # Read the evaluation result written by the upstream component
    with open(model_deployment_flag.path, "r") as flag_file:
        first_line = flag_file.readline()
    eval_result = json.loads(first_line)

    # Guard clause: nothing is deployed when the evaluation did not pass
    if not eval_result["model_eval_passed"]:
        print("Model failed to pass the threshold")
        model_deployment_flag.uri = "N/A"
        vertex_endpoint.uri = "N/A"
        vertex_model.uri = "N/A"
        return

    aiplatform.init(project=project_id)

    # List of pre-build docker images:
    # https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers
    # https://aiinpractice.com/gcp-mlops-vertex-ai-pipeline-scikit-learn/
    # NOTE: using the base image, and this must be model.joblib or model.pkl,
    # so the directory that contains the model file is passed as artifact_uri.
    model_dir_uri = os.path.dirname(model.uri)
    uploaded_model = aiplatform.Model.upload(
        display_name=endpoint_model_name,
        artifact_uri=model_dir_uri,
        serving_container_image_uri=serving_container_image_uri,
    )
    serving_endpoint = uploaded_model.deploy(machine_type=endpoint_machine_type)

    # Save data to the output params
    vertex_endpoint.uri = serving_endpoint.resource_name
    vertex_model.uri = uploaded_model.resource_name

### 5. Create a pipeline

In [None]:
# Wire the components above into one pipeline: each call creates a task, and
# passing one task's outputs to another defines the execution order.
@pipeline(name=PIPELINE_NAME, pipeline_root=PIPELINE_ROOT)
def tmp_pipe(
    project_id: str,
    original_bucket_id: str,
    target_train_filename: str,
    target_test_filename: str,
    datasplit_seed: int,
    selected_features: list,
    selected_label: str,
    my_hyper_params_dict: dict,
    serving_container_image_uri: str,
    endpoint_model_name: str,
    endpoint_machine_type: str,
):
    # Same preprocessing component, applied once to the train file ...
    train_prep_task = preprocess_my_data(
        project_id=project_id,
        original_bucket_id=original_bucket_id,
        target_filename=target_train_filename,
    )
    train_prep_task.set_display_name("Preprocess train data")

    # ... and once to the test file
    test_prep_task = preprocess_my_data(
        project_id=project_id,
        original_bucket_id=original_bucket_id,
        target_filename=target_test_filename,
    )
    test_prep_task.set_display_name("Preprocess test data")

    # Training consumes the preprocessed train table
    training_task = train_my_ml_model(
        input_dataset=train_prep_task.outputs["output_dataset"],
        datasplit_seed=datasplit_seed,
        selected_features=selected_features,
        selected_label=selected_label,
        my_hyper_params_dict=my_hyper_params_dict,
    )
    training_task.set_display_name("Train model")

    # Evaluation consumes the preprocessed test table and the trained model
    evaluation_task = eval_my_ml_model(
        input_dataset=test_prep_task.outputs["output_dataset"],
        selected_features=selected_features,
        model=training_task.outputs["model"],
    )
    evaluation_task.set_display_name("Model evaluation")

    # Deployment consumes the trained model and the evaluation flag
    deployment_task = deploy_my_ml_model(
        project_id=project_id,
        model=training_task.outputs["model"],
        serving_container_image_uri=serving_container_image_uri,
        endpoint_model_name=endpoint_model_name,
        endpoint_machine_type=endpoint_machine_type,
        model_deployment_flag=evaluation_task.outputs["model_deployment_flag"],
    )
    deployment_task.set_display_name("Deploy model to endpoint")

In [None]:
# Compile the pipeline function into the definition file at TEMPLATE_PATH
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(pipeline_func=tmp_pipe, package_path=TEMPLATE_PATH)

### 6. Submit the pipeline job to Vertex AI

In [None]:
# Initialise the Vertex AI SDK with the project and the staging bucket
aiplatform.init(
    project=PROJECT_ID,
    staging_bucket=PIPELINE_ROOT,
)

In [None]:
# Fresh timestamp so that re-running this cell creates a new job id
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"
JOBID = "training-pipeline-" + TIMESTAMP

# Pipeline job built from the compiled definition and the run parameters
job_options = dict(
    display_name=PIPELINE_NAME,
    template_path=TEMPLATE_PATH,
    job_id=JOBID,
    parameter_values=PIPELINE_PARAMS,
    enable_caching=ENABLE_CACHING,
)
pipeline_ = aiplatform.pipeline_jobs.PipelineJob(**job_options)

# Submit the job, running it as the configured service account
pipeline_.submit(service_account=SERVICE_ACCOUNT)

In [None]:
# Deliberate stop so "Run All" does not continue into the prediction cells
# before the endpoint exists. A bare `raise` with no active exception also
# ends in RuntimeError, but without any explanation.
raise RuntimeError(
    "Stopping here on purpose: wait until the pipeline has deployed the "
    "endpoint, then run the prediction cells below manually."
)

### 7. Prediction (request to the endpoint)
- Wait 10-15 minutes until the endpoint is ready

In [None]:
# Update the endpoint location
vertex_ai_model_endpoint = (
    "projects/YOUR_PROEJCT_NUMBER"
    "/locations/us-central1"
    "/endpoints/ENDPOINT_NUMBER"
)
# Handle to the deployed endpoint, used for the prediction request below
endpoint = aiplatform.Endpoint(vertex_ai_model_endpoint)

In [None]:
# Test the endpoint
import pandas as pd
import numpy as np

# Load the local training file to pick a few real rows as sample inputs
df = pd.read_csv("house-prices-advanced-regression-techniques/train.csv")

# Raw model features plus the label (the engineered ratio column is not in
# the raw file)
preview_columns = [
    "OverallQual",
    "GrLivArea",
    "GarageCars",
    "GarageArea",
    "TotalBsmtSF",
    "1stFlrSF",
    "FullBath",
    "TotRmsAbvGrd",
    "YearBuilt",
    "SalePrice",
]
df_mod = df[preview_columns]

# Last expression of the cell: displays the rows labelled 0, 4 and 100
df_mod.loc[[0, 4, 100]]

In [None]:
# Sample input: nine raw feature values per row, in the model's feature order
raw_feature_rows = [
    [7, 1710, 2, 548, 856, 856, 2, 8, 2003],
    [8, 2198, 3, 836, 1145, 1145, 2, 9, 2000],
    [6, 1610, 2, 480, 1610, 1610, 2, 6, 1977],
]
# NB: 0.5 is appended to every row as the feature engineered value
input_instance = [row + [0.5] for row in raw_feature_rows]

# Last expression of the cell: sends the request and displays the response
endpoint.predict(instances=input_instance)