## A simple Vertex AI pipeline demo
- Continued from demo2
- Component-based solution

In [None]:
import os
from pathlib import Path
import requests
from datetime import datetime

import kfp

from kfp.v2.dsl import pipeline
from kfp.v2 import compiler

import google.cloud.aiplatform as aiplatform


In [None]:
# NOTE: keeping credentials in a notebook is not best practice.
# Placeholder settings -- fill these in for your own environment.
PROJECT_ID = ""  # Your project id
SERVICE_ACCOUNT = ""  # Service account e-mail, e.g. XXXX@YYYYY.iam.gserviceaccount.com
CSV_FILE_BUCKET_NAME = ""  # Bucket name (forwarded to the pipeline as original_bucket_id)
# Path to your service credentials file (JSON assumed)
os.environ.update({"GOOGLE_APPLICATION_CREDENTIALS": ""})

In [None]:
# NOTE: hard-coding a local key path like this is not best practice.
# Concrete settings used by this demo.
PROJECT_ID = "ml-pipe-gcp-456"
SERVICE_ACCOUNT = "admin-all@ml-pipe-gcp-456.iam.gserviceaccount.com"
CSV_FILE_BUCKET_NAME = f"my-csv-file-{PROJECT_ID}"

# Export the location of the service-account key file (JSON).
os.environ.update(
    {
        "GOOGLE_APPLICATION_CREDENTIALS": "/Users/takashinakamura/Library/CloudStorage/OneDrive-Deloitte(O365D)/Documents/GitHub/ml-pipe-gcp-456-6c3b79fe7c95.json"
    }
)

In [None]:
# Pipeline identity and storage locations.
PIPELINE_NAME = "my-kfp-on-gcp-demo2-container-based"
# File the compiled pipeline definition is written to and later submitted from
TEMPLATE_PATH = "ml_pipeline_2.json"
# GCS bucket used to store pipeline artefacts
PIPELINE_ROOT = f"gs://kfp-demo-bucket-{PROJECT_ID}"

# Cache pipeline results so the same components are not recomputed across runs
ENABLE_CACHING = True

# Parameters for the ML step.
# Label column, forwarded to the pipeline as `selected_label`.
MODEL_LABEL = "SalePrice"

# Feature columns, forwarded to the pipeline as `selected_features`.
MODEL_FEATURE_LS = [
    "OverallQual", "GrLivArea", "GarageCars", "GarageArea", "TotalBsmtSF",
    "1stFlrSF", "FullBath", "TotRmsAbvGrd", "YearBuilt",
    "BsmtUnfSF_TotalBsmtSF_ratio",
]

# Candidate values per hyper-parameter, forwarded as `my_hyper_params_dict`.
# NOTE(review): alpha / l1_ratio look like elastic-net settings -- confirm
# against the train component.
MODEL_HYPER_PARAM = dict(
    alpha=[0.9, 0.95, 1],
    l1_ratio=[0.45, 0.5, 0.55],
)

# Run parameters: the job id embeds the current local time (YYYYmmddHHMMSS).
# Both values are computed again in the cell that creates the pipeline job.
TIMESTAMP = datetime.strftime(datetime.now(), "%Y%m%d%H%M%S")
JOBID = f"training-pipeline-{TIMESTAMP}"

# Argument values for one pipeline run; each key matches a parameter of the
# pipeline function `tmp_pipe`.
PIPELINE_PARAMS = dict(
    # Where the input data lives
    project_id=PROJECT_ID,
    original_bucket_id=CSV_FILE_BUCKET_NAME,
    target_train_filename="train.csv",
    target_test_filename="test.csv",
    # Training configuration
    datasplit_seed=10,
    selected_features=MODEL_FEATURE_LS,
    selected_label=MODEL_LABEL,
    my_hyper_params_dict=MODEL_HYPER_PARAM,
)

In [None]:
# Load the preprocess and train components from their YAML definitions.
preprocess_op, train_op = (
    kfp.components.load_component_from_file(spec_path)
    for spec_path in ("preprocess/component.yaml", "train/component.yaml")
)

In [None]:
# Pipeline definition: two preprocessing tasks and one training task, each
# created from a loaded component.
@pipeline(name=PIPELINE_NAME, pipeline_root=PIPELINE_ROOT)
def tmp_pipe(
    project_id: str,
    original_bucket_id: str,
    target_train_filename: str,
    target_test_filename: str,
    datasplit_seed: int,
    selected_features: list,
    selected_label: str,
    my_hyper_params_dict: dict,
):
    # Parameters:
    #   project_id, original_bucket_id
    #       forwarded to both preprocessing tasks
    #   target_train_filename, target_test_filename
    #       the file each preprocessing task is pointed at
    #   datasplit_seed, selected_features, selected_label, my_hyper_params_dict
    #       forwarded unchanged to the training task

    # Arguments shared by both preprocessing tasks.
    data_source = dict(
        project_id=project_id,
        original_bucket_id=original_bucket_id,
    )

    train_data_task = preprocess_op(
        target_filename=target_train_filename,
        **data_source,
    ).set_display_name("Preprocess train data")

    # The test-data task's output is not consumed by any later task here.
    preprocess_op(
        target_filename=target_test_filename,
        **data_source,
    ).set_display_name("Preprocess test data")

    # Training reads only the preprocessed train dataset.
    train_op(
        input_dataset=train_data_task.outputs["output_dataset"],
        datasplit_seed=datasplit_seed,
        selected_features=selected_features,
        selected_label=selected_label,
        my_hyper_params_dict=my_hyper_params_dict,
    ).set_display_name("Train model")

In [None]:
# Compile `tmp_pipe` into the pipeline definition file at TEMPLATE_PATH.
compiler.Compiler().compile(package_path=TEMPLATE_PATH, pipeline_func=tmp_pipe)

In [None]:
# Initialise the Vertex AI SDK for this project; PIPELINE_ROOT is also used
# as the staging bucket.
aiplatform.init(
    staging_bucket=PIPELINE_ROOT,
    project=PROJECT_ID,
)

In [None]:
# Take a fresh timestamp so the job id reflects when this cell is run.
TIMESTAMP = datetime.strftime(datetime.now(), "%Y%m%d%H%M%S")
JOBID = f"training-pipeline-{TIMESTAMP}"

# Build the pipeline job from the compiled template and the run arguments.
pipeline_ = aiplatform.pipeline_jobs.PipelineJob(
    display_name=PIPELINE_NAME,
    job_id=JOBID,
    template_path=TEMPLATE_PATH,
    parameter_values=PIPELINE_PARAMS,
    enable_caching=ENABLE_CACHING,
)
# Submit the job, passing SERVICE_ACCOUNT as the service account to use.
pipeline_.submit(service_account=SERVICE_ACCOUNT)