In [1]:
import os
import json
from typing import NamedTuple
from datetime import datetime
import google.cloud.aiplatform as aiplatform

In [3]:
from kfp.v2.dsl import pipeline
from kfp.v2.dsl import component
from kfp.v2.dsl import OutputPath
from kfp.v2.dsl import InputPath
from kfp.v2.dsl import Model
from kfp.v2.dsl import Input
from kfp.v2.dsl import Artifact


from kfp.v2.dsl import Output
from kfp.v2.dsl import Metrics

from kfp.v2 import compiler
from kfp.v2.google.client import AIPlatformClient

In [None]:
# Path to a service-account key file. Leave it empty to rely on whatever
# credentials the environment already provides: exporting an empty string
# would point the Google client libraries at a non-existent key file.
CREDENTIALS_PATH = ""
if CREDENTIALS_PATH:
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = CREDENTIALS_PATH

# GCP project the pipeline runs in; also used to build PIPELINE_ROOT.
PROJECT_ID = ""
# Service account passed to the pipeline job when it is submitted.
SERVICE_ACCOUNT = ""

In [4]:
# Forwarded to PipelineJob(enable_caching=...).
ENABLE_CACHING = False

# Pipeline identity, compiled template location and artifact root.
PIPELINE_NAME = "my-kfp-on-gcp-demo"
TEMPLATE_PATH = "ml_pipeline_w_mlf_demo.json"
PIPELINE_ROOT = f"gs://kfp-demo-bucket-{PROJECT_ID}"

# The job id is suffixed with the wall-clock time at which this cell ran.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
JOBID = f"training-pipeline-{TIMESTAMP}"

# Data descriptors handed to the pipeline as runtime parameters.
DATASET_ID = "sklearn_default_housing"
TABLE_ID = "NA"
COL_LABEL = "MedHouseVal"
COL_TRAINING = ["some_list"]

# Keys must match the parameter names of my_ml_pipeline_with_kfp.
PIPELINE_PARAMS = {
    "project_id": PROJECT_ID,
    "table_id": TABLE_ID,
    "dataset_id": DATASET_ID,
    "col_label": COL_LABEL,
    "col_training": COL_TRAINING,
    "datasplit_seed": 10,
}

### 1. Load data

#### 1.a: Traditional way to write

In [5]:
from sklearn.datasets import fetch_california_housing

# Function to return the original data
def load_my_data_local():
    """
    Fetch the California housing dataset and return its frame.

    Returns:
        The ``frame`` attribute of the object returned by
        ``fetch_california_housing(as_frame=True)``.
    """
    # A real project would typically pull from an API or a data
    # warehouse here instead, e.g.:
    # housing_frame = pd.read_csv(f"{loc}/{tablename}")
    return fetch_california_housing(as_frame=True).frame

# Load the dataset once into `df` and preview its first rows.
df = load_my_data_local()
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


#### 1.b: Kubeflow

In [6]:
"""
Load data from target resources
"""
# Function to return the original data
@component(base_image='python:3.10', packages_to_install=['pandas==1.5.3', 'scikit-learn==1.2.2'])
def load_my_data(output_csv: OutputPath('CSV')) -> None:
    """
    Pipeline step: fetch the California housing data and write it as CSV.

    Args:
        output_csv: Destination path for the CSV output; the file is
            written with a header row and without the index column.
    """
    # Imports live inside the function body, as in the original component.
    # pandas is not referenced by name below; the import is kept as it was.
    import pandas as pd
    from sklearn.datasets import fetch_california_housing

    # A real pipeline would typically pull from an API or a data
    # warehouse here instead, e.g.:
    # housing_frame = pd.read_csv(f"{loc}/{tablename}")
    housing_frame = fetch_california_housing(as_frame=True).frame

    # Hand the data to the next step through the output path.
    housing_frame.to_csv(output_csv, header=True, index=False)


### 2. Preprocess the data

#### 2.a: Traditional way to write

In [7]:
"""
Preprocess data for ml model
"""
import pandas as pd

def preprocess_my_data_local(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add the binned room-count feature used by the model.

    The input frame is left untouched; a new frame is returned so that
    re-running this cell (or earlier cells that display ``df``) always
    yields the same result.

    Args:
        df: Frame containing an ``AveRooms`` column.

    Returns:
        A new frame with every original column plus ``AveRooms_bin``, the
        numeric label (1-5) of the bin that ``AveRooms`` falls into, using
        the edges 0, 3, 5, 7, 10 and 300.
    """
    # pd.cut yields a categorical; to_numeric turns the labels into numbers.
    room_bins = pd.cut(x=df['AveRooms'],
                       bins=[0, 3, 5, 7, 10, 300],
                       labels=[1, 2, 3, 4, 5])

    # assign() returns a new frame instead of mutating the caller's object.
    return df.assign(AveRooms_bin=pd.to_numeric(room_bins))

df_processed = preprocess_my_data_local(df)
# Preview the processed frame (not the raw input) so the displayed table
# is the result of the preprocessing step.
df_processed.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal,AveRooms_bin
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526,3
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585,3
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521,4
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413,3
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422,3


#### 2.b: Kubeflow pipeline

In [8]:
@component(base_image='python:3.10', packages_to_install=['pandas==1.5.3'])
def preprocess_my_data(input_csv: InputPath('CSV'), output_csv: OutputPath('CSV')):
    """
    Pipeline step: read a CSV, add the binned room feature, write a CSV.

    Args:
        input_csv: Path of the CSV to read; must contain ``AveRooms``.
        output_csv: Path the processed CSV is written to (header row,
            no index column).
    """
    # Imports live inside the function body, as in the original component.
    import pandas as pd

    housing = pd.read_csv(input_csv)

    # Bucket AveRooms on the edges 0/3/5/7/10/300 and store the numeric
    # bin label (1-5) as a new feature.
    room_buckets = pd.cut(x=housing['AveRooms'],
                          bins=[0, 3, 5, 7, 10, 300],
                          labels=[1, 2, 3, 4, 5])
    housing['AveRooms_bin'] = pd.to_numeric(room_buckets)

    # Hand the processed data to the next step.
    housing.to_csv(output_csv, index=False, header=True)


### 3. Train model

In [9]:
@component(base_image='python:3.10', packages_to_install=['pandas==1.5.3', 'scikit-learn==1.2.2', 'numpy==1.23.5', 'mlflow==2.2.2'])
def train_ml_model_with_mlflow(input_csv: InputPath('CSV'), datasplit_seed: int) -> NamedTuple('Outputs', [('val_mse', float), ('val_mae', float), ('model_location', str)]):
    """
    Pipeline step: fit an ElasticNet regressor and log it with MLflow.

    Args:
        input_csv: Path of the preprocessed CSV; must contain the
            ``AveRooms`` and ``MedHouseVal`` columns.
        datasplit_seed: ``random_state`` for the 80/20 train/validation split.

    Returns:
        A tuple ``(val_mse, val_mae, model_location)``:
        the mean squared error and mean absolute error on the validation
        split, and the artifact URI under which the model was logged.
    """
    # Imports live inside the function body, as in the original component.
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import (
        mean_absolute_error,
        mean_squared_error
    )
    from sklearn.linear_model import ElasticNet
    import pandas as pd
    import mlflow

    df = pd.read_csv(input_csv)

    # Split features and label; the raw AveRooms column is dropped from
    # the features along with the label.
    X = df.drop(['AveRooms', 'MedHouseVal'], axis=1)
    y = df['MedHouseVal']

    # Split the data for train (80%) and validation (20%)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=datasplit_seed)

    with mlflow.start_run(run_name='my_kfp_house_reg'):
        elastic_net_reg = ElasticNet(alpha=0.5, l1_ratio=0.5, random_state=42)
        elastic_net_reg.fit(X_train, y_train)
        result = mlflow.sklearn.log_model(elastic_net_reg, "model")
        # Resolve the location while the run that logged the model is still
        # active; once the `with` block exits, the fluent API no longer
        # refers to this run.
        model_location = f"{mlflow.get_artifact_uri()}/{result.artifact_path}"

    y_val_pred = elastic_net_reg.predict(X_val)

    # Each output name now carries its matching metric (they were swapped).
    val_mse = float(mean_squared_error(y_val, y_val_pred))
    val_mae = float(mean_absolute_error(y_val, y_val_pred))

    return (val_mse, val_mae, model_location)


In [10]:
# Define a pipeline and create a task from a component:
@pipeline(name=PIPELINE_NAME, pipeline_root=PIPELINE_ROOT)
def my_ml_pipeline_with_kfp(project_id: str,
                            table_id: str,
                            dataset_id: str,
                            col_label: str,
                            col_training: list,
                            datasplit_seed: int):
    # Three steps chained through their CSV outputs: load -> preprocess -> train.
    # Only `datasplit_seed` is consumed below; the other parameters are
    # accepted but not forwarded to any step.
    raw_data_step = load_my_data()
    cleaned_data_step = preprocess_my_data(
        input_csv=raw_data_step.outputs['output_csv'],
    )
    training_step = train_ml_model_with_mlflow(
        input_csv=cleaned_data_step.outputs['output_csv'],
        datasplit_seed=datasplit_seed,
    )

In [11]:
# Compile the pipeline function into the template file at TEMPLATE_PATH.
compiler.Compiler().compile(package_path=TEMPLATE_PATH,
                            pipeline_func=my_ml_pipeline_with_kfp)



In [12]:
# Point the Vertex AI SDK at the project and staging bucket configured above.
aiplatform.init(
    staging_bucket=PIPELINE_ROOT,
    project=PROJECT_ID,
)

In [13]:
# Describe the run: compiled template, job id and runtime parameters.
pipeline_ = aiplatform.pipeline_jobs.PipelineJob(
    display_name=PIPELINE_NAME,
    job_id=JOBID,
    template_path=TEMPLATE_PATH,
    parameter_values=PIPELINE_PARAMS,
    enable_caching=ENABLE_CACHING,
)

In [17]:
# Submit the job defined above, passing SERVICE_ACCOUNT as its service account.
pipeline_.submit(service_account=SERVICE_ACCOUNT)

Creating PipelineJob
PipelineJob created. Resource name: projects/909974886605/locations/us-central1/pipelineJobs/training-pipeline-20230418171242
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/909974886605/locations/us-central1/pipelineJobs/training-pipeline-20230418171242')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/training-pipeline-20230418171242?project=909974886605
