## A simple ML pipeline demo

### 0. Save the data to a GCS bucket

In [1]:
import os
import google.cloud.storage as storage

In [2]:
# NOTE: setting credentials from inside a notebook is not best practice;
# prefer exporting GOOGLE_APPLICATION_CREDENTIALS in the shell before
# starting Jupyter so that no path ends up in the saved notebook.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ""  # Your service credentials (assume json)
PROJECT_ID = ""  # Your project id
SERVICE_ACCOUNT = ""  # assume XXXX@YYYYY.iam.gserviceaccount.com
# Bucket holding the raw CSV files. The upload, download and pipeline-parameter
# cells all read this name, so it has to be defined before they run.
CSV_FILE_BUCKET_NAME = ""  # Your bucket name

In [7]:
# Instantiates a client
storage_client = storage.Client()

# Handle to the destination bucket. This call does not create anything: if the
# bucket does not exist yet, create it once beforehand with
# storage_client.create_bucket(CSV_FILE_BUCKET_NAME).
bucket = storage_client.bucket(CSV_FILE_BUCKET_NAME)

FILE_DIR = "house-prices-advanced-regression-techniques"
# Compare the real extension so that a file literally named "csv" (no dot)
# is not mistaken for a CSV file.
csv_files_in_dir = [
    x for x in os.listdir(FILE_DIR) if os.path.splitext(x)[1] == ".csv"
]

# Upload each CSV, using its file name as the blob name
for file in csv_files_in_dir:
    blob = bucket.blob(blob_name=file)
    blob.upload_from_filename(filename=os.path.join(FILE_DIR, file))

### Import packages

In [90]:
import os
from typing import NamedTuple
from datetime import datetime

import google.cloud.aiplatform as aiplatform

from kfp.v2.dsl import pipeline
from kfp.v2.dsl import component
from kfp.v2.dsl import OutputPath
from kfp.v2.dsl import InputPath
from kfp.v2.dsl import Model
from kfp.v2.dsl import Input
from kfp.v2.dsl import Artifact
from kfp.v2.dsl import Output
from kfp.v2.dsl import Metrics
from kfp.v2.dsl import Dataset
from kfp.v2 import compiler
from kfp.v2.google.client import AIPlatformClient

### Parameters for GCP and Kubeflow

In [40]:
# --- Pipeline run settings -------------------------------------------------
ENABLE_CACHING = False
PIPELINE_NAME = "my-kfp-on-gcp-demo2"
# File the compiled pipeline definition is written to / read from
TEMPLATE_PATH = "ml_pipeline_2.json"
# GCS location used as the pipeline root (artefacts are stored here)
PIPELINE_ROOT = "gs://kfp-demo-bucket-" + PROJECT_ID

# Second-resolution timestamp used to build the job id
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"
JOBID = "training-pipeline-" + TIMESTAMP

# --- Parameters for ML -----------------------------------------------------
DATASET_ID = "sklearn_default_housing"
TABLE_ID = "NA"
COL_LABEL = "MedHouseVal"
COL_TRAINING = ["some_list"]

# --- Container image and pinned packages for the pipeline components -------
PYTHON_BASE = "python:3.10"
GCP_BUCKET = "google-cloud-storage==1.43.0"
PANDAS = "pandas==1.5.3"
SKLEARN = "scikit-learn==1.2.2"
NUMPY = "numpy==1.23.5"

### 1. Preprocess the data

In [17]:
import pandas as pd
import io 

original_bucket_id = CSV_FILE_BUCKET_NAME
target_filename = "train.csv"

# Instantiates a client
storage_client = storage.Client()
bucket = storage_client.bucket(original_bucket_id)

# Download the csv file into memory and parse it.
# download_as_bytes() replaces the deprecated download_as_string() alias;
# both hand back the raw bytes of the object.
blob = bucket.blob(target_filename)
data = blob.download_as_bytes()
df = pd.read_csv(io.BytesIO(data))
    

In [18]:
# Preview the first rows of the downloaded frame
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [60]:
# List the column labels available in the frame
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [95]:
# Column the model is trained to predict
TARGET_LABEL = "SalePrice"

# Columns read straight from the CSV
_RAW_TARGET_FEATURES = [
    "OverallQual",
    "GrLivArea",
    "GarageCars",
    "GarageArea",
    "TotalBsmtSF",
    "1stFlrSF",
    "FullBath",
    "TotRmsAbvGrd",
    "YearBuilt",
]

# Full model input: the raw columns followed by the ratio column that the
# preprocessing component derives.
TARGET_FEATURES = [*_RAW_TARGET_FEATURES, "BsmtUnfSF_TotalBsmtSF_ratio"]

In [96]:
# TARGET_FEATURES contains "BsmtUnfSF_TotalBsmtSF_ratio", which the raw CSV does
# not carry: it is derived in preprocess_my_data. Derive it here the same way
# (zeros replaced by the column mean, then the ratio) when it is missing, so
# this cell also works on a freshly downloaded frame. `df` itself is left
# untouched so re-running the cell is safe.
df_features = df
if "BsmtUnfSF_TotalBsmtSF_ratio" not in df.columns:
    total_bsmt_filled = df["TotalBsmtSF"].replace(0, df["TotalBsmtSF"].mean())
    unf_bsmt_filled = df["BsmtUnfSF"].replace(0, df["BsmtUnfSF"].mean())
    df_features = df.assign(
        BsmtUnfSF_TotalBsmtSF_ratio=unf_bsmt_filled / total_bsmt_filled
    )

df_mod = df_features[TARGET_FEATURES]
# Sanity check: the selected features must not contain missing values
assert df_mod.isna().sum().sum() == 0  # not the best practice

In [97]:
@component(base_image=PYTHON_BASE, packages_to_install=[PANDAS, GCP_BUCKET])
def preprocess_my_data(
    project_id: str,
    original_bucket_id: str,
    ml_bucket_id: str,
    target_filename: str,
    output_dataset: Output[Dataset]) -> None:
    """
    Download one CSV from GCS, add the basement ratio feature and write it out.

    Args:
        project_id: GCP project id. Currently not used by this component.
        original_bucket_id: Name of the bucket the CSV is read from.
        ml_bucket_id: Currently not used by this component.
        target_filename: Blob name of the CSV to process (train.csv or test.csv).
        output_dataset: Output artifact; the processed frame is written to its
            path as CSV, with header and without the index.
    """
    import pandas as pd
    import io
    import google.cloud.storage as storage

    # ===============================
    #  Collect data from storage
    # ===============================
    # Instantiates a client
    storage_client = storage.Client()
    bucket = storage_client.bucket(original_bucket_id)

    # Download csv file from GCS.
    # download_as_bytes() replaces the deprecated download_as_string() alias.
    blob = bucket.blob(target_filename)  # train.csv or test.csv
    data = blob.download_as_bytes()
    df = pd.read_csv(io.BytesIO(data))

    # ===============================
    #  Some feature engineering
    #  1. Mean fill + creating ratio
    # ===============================
    # For each basement area column, replace 0 with the column mean (the mean
    # is taken over all rows, zeros included) and store the result in a new
    # "<column>_fillmean" column.
    for col in ("TotalBsmtSF", "BsmtUnfSF"):
        df[f"{col}_fillmean"] = df[col].replace(0, df[col].mean())

    # Unfinished basement area relative to total basement area
    df["BsmtUnfSF_TotalBsmtSF_ratio"] = df["BsmtUnfSF_fillmean"] / df["TotalBsmtSF_fillmean"]

    # Create an output
    df.to_csv(output_dataset.path, index=False, header=True)

In [98]:
@component(
    base_image=PYTHON_BASE,
    packages_to_install=[
        PANDAS,
        SKLEARN,
        NUMPY,
    ],
)
def train_my_ml_model(
    input_dataset: Input[Dataset],
    datasplit_seed: int,
    selected_features: list,
    selected_label: str,
    eval_metrics: Output[Metrics],
) -> NamedTuple(
    "Outputs", [("val_mse", float), ("val_mae", float)]
):
    """
    Fit an ElasticNet regressor and report its validation errors.

    Args:
        input_dataset: Input artifact whose path points to the preprocessed CSV.
        datasplit_seed: Random state for the train/validation split.
        selected_features: Column names used as model inputs.
        selected_label: Column name of the regression target.
        eval_metrics: Output artifact; the metrics are written to its path as JSON.

    Returns:
        (val_mse, val_mae): mean squared error and mean absolute error on the
        20% validation split.

    Raises:
        AssertionError: if any selected feature contains a missing value.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    from sklearn.linear_model import ElasticNet
    import pandas as pd
    import json

    df = pd.read_csv(input_dataset.path)

    # Split Features and Labels
    X = df[selected_features]
    assert X.isna().sum().sum() == 0  # not the best practice
    y = df[selected_label]

    # Split the data for train (80%) and validation (20%)
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=datasplit_seed
    )

    # Train the model
    elastic_net_reg = ElasticNet(alpha=0.5, l1_ratio=0.5, random_state=42)
    elastic_net_reg.fit(X_train, y_train)

    # Get the trained model performance with validation data.
    # Each name is bound to its matching metric: previously the two calls were
    # swapped, so "val_mse" held the MAE and "val_mae" held the MSE.
    y_val_pred = elastic_net_reg.predict(X_val)
    val_mse = mean_squared_error(y_val, y_val_pred)
    val_mae = mean_absolute_error(y_val, y_val_pred)
    metrics_dict = {"val_mse": val_mse, "val_mae": val_mae}

    # dumping metrics_dict
    with open(eval_metrics.path, "w") as f:
        json.dump(metrics_dict, f)

    # NOTE: the fitted model is not persisted by this component.

    return (val_mse, val_mae)

In [101]:
# Pipeline definition: preprocess the train and test files with the same
# component, then train the model on the preprocessed train data.
@pipeline(name=PIPELINE_NAME, pipeline_root=PIPELINE_ROOT)
def tmp_pipe(
    project_id: str,
    original_bucket_id: str,
    ml_bucket_id: str,
    target_train_filename: str,
    target_test_filename: str,
    datasplit_seed: int,
    selected_features: list,
    selected_label: str,
):
    # Task 1: preprocessing of the training file
    train_prep_task = preprocess_my_data(
        project_id=project_id,
        original_bucket_id=original_bucket_id,
        ml_bucket_id=ml_bucket_id,
        target_filename=target_train_filename,
    )
    train_prep_task.set_display_name("Preprocess train data")

    # Task 2: preprocessing of the test file (its output is not consumed below)
    test_prep_task = preprocess_my_data(
        project_id=project_id,
        original_bucket_id=original_bucket_id,
        ml_bucket_id=ml_bucket_id,
        target_filename=target_test_filename,
    )
    test_prep_task.set_display_name("Preprocess test data")

    # Task 3: training, fed by the dataset produced in task 1
    training_task = train_my_ml_model(
        input_dataset=train_prep_task.outputs['output_dataset'],
        datasplit_seed=datasplit_seed,
        selected_features=selected_features,
        selected_label=selected_label,
    )

In [102]:
# Write the pipeline definition of tmp_pipe to TEMPLATE_PATH
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(package_path=TEMPLATE_PATH, pipeline_func=tmp_pipe)

In [103]:
# Initialise the aiplatform SDK with the project and use the pipeline root as staging bucket
aiplatform.init(project=PROJECT_ID, staging_bucket=PIPELINE_ROOT)

In [104]:
# Recompute the timestamp in this cell so that re-running it yields a new job id
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
JOBID = f"training-pipeline-{TIMESTAMP}"

# NB: the keys must match the argument names of the pipeline function (tmp_pipe)
PIPELINE_PARAMS = {
    "project_id": PROJECT_ID,
    "original_bucket_id": CSV_FILE_BUCKET_NAME,
    "ml_bucket_id": PIPELINE_ROOT,
    "target_train_filename": "train.csv",
    "target_test_filename": "test.csv",
    "datasplit_seed": 10,
    # Reuse the notebook-level constants rather than repeating the literals,
    # so the feature list cannot drift from the one inspected above.
    "selected_features": list(TARGET_FEATURES),
    "selected_label": TARGET_LABEL,
}


pipeline_ = aiplatform.pipeline_jobs.PipelineJob(
    enable_caching=ENABLE_CACHING,
    display_name=PIPELINE_NAME,
    template_path=TEMPLATE_PATH,
    job_id=JOBID,
    parameter_values=PIPELINE_PARAMS,
)

In [105]:
# Submit the pipeline job, passing SERVICE_ACCOUNT as the service account to use
pipeline_.submit(service_account=SERVICE_ACCOUNT)

Creating PipelineJob
PipelineJob created. Resource name: projects/909974886605/locations/us-central1/pipelineJobs/training-pipeline-20230419131111
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/909974886605/locations/us-central1/pipelineJobs/training-pipeline-20230419131111')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/training-pipeline-20230419131111?project=909974886605
