# Vertex AI Pipeline: train with TPU, register trained model, deploy registered model to endpoint 


## config

In [1]:
import sys
import default_config as default

# default.get_config()

In [None]:
# Google Cloud project and region used by every cell below.
PROJECT_ID = "hybrid-vertex"
LOCATION = "us-central1"

# Cloud Storage bucket URI, derived from the project ID.
BUCKET_URI = "gs://tpu-pipeline-" + PROJECT_ID

# Addresses handed to the pipeline's notification-email task.
EMAIL_RECIPIENTS = ["jordantotten@google.com"]

In [2]:
SERVICE_ACCOUNT = "[your-service-account]"

IS_COLAB = "google.colab" in sys.modules
if (
    SERVICE_ACCOUNT == ""
    or SERVICE_ACCOUNT is None
    or SERVICE_ACCOUNT == "[your-service-account]"
):
    # Get your service account from gcloud
    if not IS_COLAB:
        shell_output = !gcloud auth list 2>/dev/null
        SERVICE_ACCOUNT = shell_output[2].replace("*", "").strip()
    else:  # IS_COLAB:
        shell_output = ! gcloud projects describe  $PROJECT_ID
        project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
        SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
        
print("Service Account:", SERVICE_ACCOUNT)

Service Account: 934903580331-compute@developer.gserviceaccount.com


> If your bucket doesn't already exist: Run the following cell to create your Cloud Storage bucket.

In [3]:
# ! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

> Set service account access for Vertex AI Pipelines

In [4]:
# Grant the service account the objectCreator role on the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI

# Grant the same service account the objectViewer role on the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI

No changes made to gs://tpu-pipeline-hybrid-vertex/
No changes made to gs://tpu-pipeline-hybrid-vertex/


In [5]:
# Start from the defaults in default_config.py, then record this notebook's
# project settings on the config object.
config = default.get_config()
config.project_id = PROJECT_ID
config.location = LOCATION
config.bucket_uri = BUCKET_URI
config.service_account = SERVICE_ACCOUNT

## Import libraries and define constants

In [6]:
import os
import time
from typing import Any, Dict, List
from pprint import pprint

import kfp
from google.cloud import aiplatform
from google.cloud.aiplatform import gapic
from google_cloud_pipeline_components.types import artifact_types
from google_cloud_pipeline_components.v1.custom_job.component import \
    custom_training_job as CustomTrainingJobOp
from google_cloud_pipeline_components.v1.endpoint import (EndpointCreateOp,
                                                          ModelDeployOp)
from google_cloud_pipeline_components.v1.model import ModelUploadOp
from kfp import compiler
from kfp.dsl import importer_node
from kfp import dsl

# Initialise the Vertex AI SDK with this notebook's project, region and
# staging bucket.
aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=BUCKET_URI,
)

## Set train and deploy compute 

You can set hardware accelerators for both training and prediction:

* Set the variables `TRAIN_TPU/TRAIN_NTPU` to use a training container image supporting a TPU and the number of TPUs allocated 
* Set `DEPLOY_GPU/DEPLOY_NGPU` to use a deployment container image supporting a GPU and the number of GPUs allocated to the virtual machine (VM) instance (otherwise specify `(None, None)` to use a container image to run on a CPU)

> *note: TPU VMs don't require VCPU definition*

**accelerators**

In [7]:
# accelerators: type and count for training (TPU) and for serving (GPU)
TRAIN_TPU = gapic.AcceleratorType.TPU_V2
TRAIN_NTPU = 8
DEPLOY_GPU = gapic.AcceleratorType.NVIDIA_TESLA_T4
DEPLOY_NGPU = 1

# compute: the training machine type is used as-is; the serving machine type
# is "<family>-<vCPU count>"
TRAIN_MACHINE_TYPE = "cloud-tpu"
TRAIN_COMPUTE = TRAIN_MACHINE_TYPE
DEPLOY_MACHINE_TYPE = "n1-standard"
VCPU = "4"
DEPLOY_COMPUTE = "-".join([DEPLOY_MACHINE_TYPE, VCPU])

# Artifact Registry repository holding the training image
REPOSITORY = "my-tpu-repo"

# containers: custom training image and prebuilt serving image
IMAGE_NAME = "tpu-train"
TRAIN_IMAGE = f"{LOCATION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{IMAGE_NAME}:latest"
DEPLOY_VERSION = "tf2-gpu.2-13"
DEPLOY_IMAGE = f"us-docker.pkg.dev/cloud-aiplatform/prediction/{DEPLOY_VERSION}:latest"

# Summarise both compute profiles; each summary ends with a blank line.
print(
    "Train compute...\n"
    f"  accelerator_type  : {TRAIN_TPU.name}\n"
    f"  accelerator_count : {TRAIN_NTPU}\n"
    f"  machine_type      : {TRAIN_COMPUTE}\n"
    f"  image             : {TRAIN_IMAGE}\n"
)

print(
    "Deploy compute...\n"
    f"  accelerator_type  : {DEPLOY_GPU.name}\n"
    f"  accelerator_count : {DEPLOY_NGPU}\n"
    f"  machine_type      : {DEPLOY_COMPUTE}\n"
    f"  image             : {DEPLOY_IMAGE}\n"
)

Train compute...
  accelerator_type  : TPU_V2
  accelerator_count : 8
  machine_type      : cloud-tpu
  image             : us-central1-docker.pkg.dev/hybrid-vertex/my-tpu-repo/tpu-train:latest

Deploy compute...
  accelerator_type  : NVIDIA_TESLA_T4
  accelerator_count : 1
  machine_type      : n1-standard-4
  image             : us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-gpu.2-13:latest



In [8]:
# update config
# Record the Artifact Registry repository name.
config.repository = REPOSITORY

# Training compute: accelerator enum name, count, machine type and image.
config.train_tpu = TRAIN_TPU.name
config.train_tpu_count = TRAIN_NTPU
config.train_compute = TRAIN_COMPUTE
config.train_image = TRAIN_IMAGE

# Serving compute: accelerator enum name, count, machine type and image.
config.deploy_gpu = DEPLOY_GPU.name
config.deploy_gpu_count = DEPLOY_NGPU
config.deploy_compute = DEPLOY_COMPUTE
config.deploy_image = DEPLOY_IMAGE

## Training container build artifacts

#### custom docker image

*note: create a directory for writing the container build artifacts*

In [9]:
# Local directory that receives the Dockerfile and train.py written below.
CONTAINER_ARTIFACTS_DIR = "tpu-container-artifacts"

# Recreate the directory from scratch so stale build artifacts never leak
# into a new image build.
! rm -rf {CONTAINER_ARTIFACTS_DIR}
! mkdir {CONTAINER_ARTIFACTS_DIR}

In [10]:
%%writefile $CONTAINER_ARTIFACTS_DIR/Dockerfile
# Training image: Python 3.10 with the TPU-VM build of TensorFlow 2.13.0.
FROM python:3.10

WORKDIR /root

# Download and install `tensorflow`.
# --no-cache-dir keeps the pip download cache out of the image layers.
RUN pip install --no-cache-dir https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/tensorflow/tf-2.13.0/tensorflow-2.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

# Download and install `libtpu`.
# You must save `libtpu.so` in the '/lib' directory of the container image.
# -f makes curl fail the build on an HTTP error instead of silently saving the
# error page as libtpu.so.
RUN curl -fL https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/libtpu/1.7.0/libtpu.so -o /lib/libtpu.so

# Download and install tensorflow-datasets and the TensorBoard plugins.
RUN pip3 install --no-cache-dir tensorflow-datasets tensorboard tensorboard-plugin-profile tensorboard-plugin-wit tensorboard-data-server tensorflow-io
# Quote the requirement so the shell never treats "[...]" as a glob pattern.
RUN pip3 install --no-cache-dir "google-cloud-aiplatform[cloud_profiler]"

# Copies the trainer code to the docker image.
COPY train.py /root/train.py

ENTRYPOINT ["python3", "train.py"]

Writing tpu-container-artifacts/Dockerfile


#### training script

> In the next cell, write the contents of the training script to *train.py*. 

In summary, your training script does the following:

- Gets the directory where to save the model artifacts from the environment variable `AIP_MODEL_DIR`. This variable is set by the training service.
- Loads CIFAR10 dataset from TF Datasets (tfds).
- Builds a model using TF.Keras model API.
- Compiles the model.
- Sets a training distribution strategy according to the argument `args.distribute`.
- Trains the model with epochs and steps according to the arguments `args.epochs` and `args.steps`
- Saves the trained model to the specified model directory.
- Runs TPU specific tasks:
    - Finds the TPU cluster, connects to the cluster, and sets the training strategy to TPUStrategy.
    - Saves the trained TPU model to the local device so that the model can be saved to the location in `AIP_MODEL_DIR`.

In [11]:
%%writefile {CONTAINER_ARTIFACTS_DIR}/train.py
# Single, Mirror and Multi-Machine Distributed Training for CIFAR-10

import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow.python.client import device_lib
import argparse
import os
import sys, traceback

# gcp
from google.cloud import aiplatform
from google.cloud.aiplatform.training_utils import cloud_profiler

tfds.disable_progress_bar()

parser = argparse.ArgumentParser()
parser.add_argument('--lr', dest='lr',
                    default=0.01, type=float,
                    help='Learning rate.')
parser.add_argument('--epochs', dest='epochs',
                    default=10, type=int,
                    help='Number of epochs.')
parser.add_argument('--steps', dest='steps',
                    default=200, type=int,
                    help='Number of steps per epoch.')
parser.add_argument('--distribute', dest='distribute', type=str, default='single',
                    help='distributed training strategy')
parser.add_argument('--batch_size',default=128, 
                    type=int, help='non-global')
parser.add_argument('--project', type=str)
parser.add_argument('--location', dest='location',
                    default="us-central1", type=str,)
parser.add_argument('--tb_instance', type=str)
parser.add_argument('--experiment_name', type=str)
parser.add_argument('--experiment_run', type=str)

args = parser.parse_args()

# Log the runtime environment to make job logs self-describing.
print('Python Version = {}'.format(sys.version))
print('TensorFlow Version = {}'.format(tf.__version__))
print('TF_CONFIG = {}'.format(os.environ.get('TF_CONFIG', 'Not found')))
print('DEVICES', device_lib.list_local_devices())

# Single Machine, single compute device
if args.distribute == 'single':
    # tf.test.is_gpu_available() is deprecated; listing the physical GPU
    # devices is the supported replacement.
    if tf.config.list_physical_devices('GPU'):
        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
    else:
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
# Single Machine, multiple TPU devices
elif args.distribute == 'tpu':
    # Connect to the TPU and initialise it before creating the strategy.
    cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="local")
    tf.config.experimental_connect_to_cluster(cluster_resolver)
    tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
    strategy = tf.distribute.TPUStrategy(cluster_resolver)
    print("All devices: ", tf.config.list_logical_devices('TPU'))
# Single Machine, multiple compute device
elif args.distribute == 'mirror':
    strategy = tf.distribute.MirroredStrategy()
# Multiple Machine, multiple compute device
elif args.distribute == 'multi':
    # The non-experimental endpoint replaces the deprecated
    # tf.distribute.experimental.MultiWorkerMirroredStrategy.
    strategy = tf.distribute.MultiWorkerMirroredStrategy()
else:
    # Fail with a clear message rather than a NameError on `strategy` below.
    raise ValueError(
        "Unsupported --distribute value {!r}; expected one of "
        "'single', 'tpu', 'mirror', 'multi'.".format(args.distribute)
    )

# Multi-worker configuration
print('num_replicas_in_sync = {}'.format(strategy.num_replicas_in_sync))

# Initialize the profiler.
# Profiling is best-effort: a failure is logged and training continues.
print('Initialize the profiler ...')
try:
    cloud_profiler.init()
except Exception as profiler_error:
    # Catch Exception rather than using a bare `except`, so SystemExit and
    # KeyboardInterrupt still terminate the job.
    print("*** Unexpected:", type(profiler_error).__name__, profiler_error)
    traceback.print_tb(profiler_error.__traceback__, limit=10, file=sys.stdout)
    print('The profiler was NOT initiated; continuing without profiling.')
else:
    # Only report success when init() actually succeeded.
    print('The profiler initiated.')

# initialize Vertex AI sdk with the experiment and TensorBoard instance
# passed on the command line
aiplatform.init(
    experiment_tensorboard=args.tb_instance,
    experiment=args.experiment_name,
    location=args.location,
    project=args.project,
)

# set job directories
# Model output location comes from AIP_MODEL_DIR (None when the variable is absent).
MODEL_DIR = os.environ.get("AIP_MODEL_DIR")
print(f"MODEL_DIR = {MODEL_DIR}")

# TensorBoard logs go to AIP_TENSORBOARD_LOG_DIR when set, else to ./logs.
log_dir = os.environ.get('AIP_TENSORBOARD_LOG_DIR', "logs")
print(f"log_dir = {log_dir}")

print('Setting up the TensorBoard callback ...')
# Other options previously tried here: embeddings_freq=1, profile_batch=20,
# write_graph=True.
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    histogram_freq=1,
    update_freq="epoch",
    log_dir=log_dir,
)

# Preparing dataset
BUFFER_SIZE = 10000  # shuffle buffer size, in examples


def make_datasets_unbatched():
    """Return the CIFAR-10 training split as an endless, shuffled, unbatched dataset.

    Images are cast to float32 and divided by 255, which maps pixel values
    from [0, 255] to [0., 1.]; labels pass through unchanged. The scaled
    examples are cached, shuffled with a buffer of ``BUFFER_SIZE`` and
    repeated indefinitely. Batching is left to the caller.
    """

    def scale(image, label):
        # Scale CIFAR10 pixels from [0, 255] to [0., 1.].
        image = tf.cast(image, tf.float32) / 255.0
        return image, label

    # The dataset metadata was never used, so `with_info` is not requested.
    datasets = tfds.load(name='cifar10', as_supervised=True)

    return datasets['train'].map(scale).cache().shuffle(BUFFER_SIZE).repeat()


# Build the Keras model
def build_and_compile_cnn_model():
    """Create and compile the CNN classifier for 32x32x3 inputs and 10 classes.

    Architecture: two Conv2D(32, 3, relu) + MaxPooling2D stages, Flatten, then
    a 10-way softmax Dense layer. Compiled with sparse categorical
    cross-entropy, SGD at the learning rate given by ``--lr``, and accuracy.
    """
    layers = tf.keras.layers
    cnn_layers = [
        layers.Conv2D(32, 3, activation='relu', input_shape=(32, 32, 3)),
        layers.MaxPooling2D(),
        layers.Conv2D(32, 3, activation='relu'),
        layers.MaxPooling2D(),
        layers.Flatten(),
        layers.Dense(10, activation='softmax'),
    ]
    model = tf.keras.Sequential(cnn_layers)

    sgd = tf.keras.optimizers.SGD(learning_rate=args.lr)
    model.compile(
        optimizer=sgd,
        loss=tf.keras.losses.sparse_categorical_crossentropy,
        metrics=['accuracy'],
    )
    return model

# Train the model
# `tf.data.Dataset.batch` expects the global batch size, so the per-replica
# batch size is multiplied by the number of replicas in sync.
NUM_WORKERS = strategy.num_replicas_in_sync
GLOBAL_BATCH_SIZE = NUM_WORKERS * args.batch_size

print(f"NUM_WORKERS = {NUM_WORKERS}")
print(f"BATCH_SIZE  = {args.batch_size}")
print(f"GLOBAL_BATCH_SIZE = {GLOBAL_BATCH_SIZE}")

# The input pipeline is built outside the strategy scope; only the model is
# built and compiled inside it.
train_dataset = make_datasets_unbatched().batch(GLOBAL_BATCH_SIZE)

with strategy.scope():
    model = build_and_compile_cnn_model()

model.fit(
    train_dataset,
    steps_per_epoch=args.steps,
    epochs=args.epochs,
    callbacks=[tensorboard_callback],
)

# For TPU runs the model is saved with the I/O device pinned to
# '/job:localhost'; every other strategy saves with default options.
save_kwargs = {}
if args.distribute == "tpu":
    save_kwargs["options"] = tf.saved_model.SaveOptions(
        experimental_io_device='/job:localhost'
    )
model.save(MODEL_DIR, **save_kwargs)

# print('uploading TB logs ...')
# aiplatform.upload_tb_log(
#     tensorboard_experiment_name=args.experiment_name,
#     logdir=log_dir,
#     run_name_prefix=f"{args.experiment_run}-",
#     allowed_plugins=["profile"],
# )

Writing tpu-container-artifacts/train.py


In [12]:
# Show the working directory layout, including the files written above.
!tree

[01;34m.[00m
├── 01_train_deploy_pipeline.ipynb
├── README.md
├── default_config.py
├── [01;34mtpu-container-artifacts[00m
│   ├── Dockerfile
│   └── train.py
└── tpu_train_cifar10_pipeline.json

1 directory, 6 files


### Build the training container image

Now, build and push the training container image to Artifact Registry using the Dockerfile.

In this section, you run the following steps:

1. Enable the Artifact Registry API.
2. Create a private repository in Artifact Registry.
3. Configure authentication to Artifact Registry.
4. Submit the training container image using Cloud Build.

**Enable Artifact Registry API**

> You must enable the Artifact Registry API service for your project.

<a href="https://cloud.google.com/artifact-registry/docs/enable-service">Learn more about Enabling service</a>.

In [31]:
# ! gcloud config set project {PROJECT_ID}
# ! gcloud services enable artifactregistry.googleapis.com

# if os.getenv("IS_TESTING"):
#     ! sudo apt-get update --yes && sudo apt-get --only-upgrade --yes install google-cloud-sdk-cloud-run-proxy google-cloud-sdk-harbourbridge google-cloud-sdk-cbt google-cloud-sdk-gke-gcloud-auth-plugin google-cloud-sdk-kpt google-cloud-sdk-local-extract google-cloud-sdk-minikube google-cloud-sdk-app-engine-java google-cloud-sdk-app-engine-go google-cloud-sdk-app-engine-python google-cloud-sdk-spanner-emulator google-cloud-sdk-bigtable-emulator google-cloud-sdk-nomos google-cloud-sdk-package-go-module google-cloud-sdk-firestore-emulator kubectl google-cloud-sdk-datastore-emulator google-cloud-sdk-app-engine-python-extras google-cloud-sdk-cloud-build-local google-cloud-sdk-kubectl-oidc google-cloud-sdk-anthos-auth google-cloud-sdk-app-engine-grpc google-cloud-sdk-pubsub-emulator google-cloud-sdk-datalab google-cloud-sdk-skaffold google-cloud-sdk google-cloud-sdk-terraform-tools google-cloud-sdk-config-connector
#     ! gcloud components update --quiet

# # Create the repository (only run once)
# ! gcloud artifacts repositories create $REPOSITORY --repository-format=docker --location=$LOCATION

Updated property [core/project].
Create request issued for: [my-tpu-repo]
Waiting for operation [projects/hybrid-vertex/locations/us-central1/operations/
7ede907a-43c3-4b44-925e-3a7d07ac7d0a] to complete...done.                      
Created repository [my-tpu-repo].


In [11]:
# ! gcloud auth configure-docker $LOCATION-docker.pkg.dev --quiet

## Submit container to Cloud Build

> Submit the training container image using Cloud Build. The image gets saved to the repository path that is provided in the tag.

In [13]:
# Build the image from the artifacts directory with Cloud Build and tag it as
# TRAIN_IMAGE.
!gcloud builds submit {CONTAINER_ARTIFACTS_DIR} --region={LOCATION} --tag={TRAIN_IMAGE}

Creating temporary archive of 2 file(s) totalling 6.5 KiB before compression.
Uploading tarball of [tpu-container-artifacts] to [gs://hybrid-vertex_cloudbuild/source/1736824521.534143-89c23a59fee74575bcb7284c35e10d55.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/hybrid-vertex/locations/us-central1/builds/100a219e-e3d7-4bd5-938f-8bf65a2da9b6].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds;region=us-central1/100a219e-e3d7-4bd5-938f-8bf65a2da9b6?project=934903580331 ].
Waiting for build to complete. Polling interval: 1 second(s).
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "100a219e-e3d7-4bd5-938f-8bf65a2da9b6"

FETCHSOURCE
Fetching storage object: gs://hybrid-vertex_cloudbuild/source/1736824521.534143-89c23a59fee74575bcb7284c35e10d55.tgz#1736824521814929
Copying gs://hybrid-vertex_cloudbuild/source/1736824521.534143-89c23a59fee74575bcb7284c35e10d55.tgz#1736824521814929...
/ [1 files][  2.7 KiB

# Build pipeline

## Set Vertex Experiment

In [16]:
# Version suffix used in the experiment and pipeline names.
EXP_VERSION = "v11"

# Label for this run; when empty, the run name falls back to a timestamp.
# Values used so far: b128 | b256 | b512 | b1024
RUN_TAG = "b128"

In [17]:
EXPERIMENT_NAME = f'tpu-train-{EXP_VERSION}'
invoke_time = time.strftime("%Y%m%d-%H%M%S")

# new experiment run: named after RUN_TAG when set, otherwise after the
# invocation timestamp
RUN_NAME = f'run-{RUN_TAG}' if RUN_TAG else f'run-{invoke_time}'

# output dirs
# gs:// URIs always use "/" as separator, so they are built with f-strings
# (as PIPELINE_ROOT is) rather than os.path.join, whose separator depends on
# the local operating system.
EXPERIMENT_DIR = f"{BUCKET_URI}/{EXPERIMENT_NAME}"
CHECKPT_DIR = f"{EXPERIMENT_DIR}/chkpoint"
BASE_OUTPUT_DIR = f"{EXPERIMENT_DIR}/{RUN_NAME}"
LOG_DIR = f"{BASE_OUTPUT_DIR}/logs"
DATA_DIR = f"{BASE_OUTPUT_DIR}/data"
ARTIFACTS_DIR = f"{BASE_OUTPUT_DIR}/model"

# Re-initialise the SDK with the experiment attached.
aiplatform.init(
    project=PROJECT_ID,
    # staging_bucket=BUCKET_URI,
    location=LOCATION,
    experiment=EXPERIMENT_NAME,
    experiment_tensorboard=True,
)

# Resource name of the TensorBoard instance backing the experiment.
tensorboard = aiplatform.Experiment(EXPERIMENT_NAME).get_backing_tensorboard_resource()
TB_INSTANCE = tensorboard.resource_name

print(f"EXPERIMENT_NAME : {EXPERIMENT_NAME}")
print(f"RUN_NAME        : {RUN_NAME}\n")
print(f"CHECKPT_DIR     : {CHECKPT_DIR}")
print(f"BASE_OUTPUT_DIR : {BASE_OUTPUT_DIR}")
print(f"LOG_DIR         : {LOG_DIR}")
print(f"DATA_DIR        : {DATA_DIR}")
print(f"ARTIFACTS_DIR   : {ARTIFACTS_DIR}\n")
print(f"TB_INSTANCE     : {TB_INSTANCE}")

EXPERIMENT_NAME : tpu-train-v11
RUN_NAME        : run-b128

CHECKPT_DIR     : gs://tpu-pipeline-hybrid-vertex/tpu-train-v11/chkpoint
BASE_OUTPUT_DIR : gs://tpu-pipeline-hybrid-vertex/tpu-train-v11/run-b128
LOG_DIR         : gs://tpu-pipeline-hybrid-vertex/tpu-train-v11/run-b128/logs
DATA_DIR        : gs://tpu-pipeline-hybrid-vertex/tpu-train-v11/run-b128/data
ARTIFACTS_DIR   : gs://tpu-pipeline-hybrid-vertex/tpu-train-v11/run-b128/model

TB_INSTANCE     : projects/934903580331/locations/us-central1/tensorboards/257140585364717568


In [18]:
# Run-specific values are deliberately left out of the config for now; only
# the TensorBoard resource name is recorded.
# config.experiment_name = EXPERIMENT_NAME
# config.run_name = RUN_NAME
# config.experiment_dir = EXPERIMENT_DIR
# config.checkpoint_dir = CHECKPT_DIR
# config.base_output_dir = BASE_OUTPUT_DIR
# config.log_dir = LOG_DIR
# config.data_dir = DATA_DIR
# config.artifacts_dir = ARTIFACTS_DIR
config.tb_instance = TB_INSTANCE

## Define pipeline

> The components required for the key tasks of the pipeline are defined using  [`google_cloud_pipeline_components`](https://github.com/kubeflow/pipelines/tree/master/components/google-cloud). These tasks involve: upload the model, create an endpoint, and deploy the model to the endpoint.

The pipeline has four main steps:

1) The `CustomTrainingJobOp` runs the docker container image which executes the training task using TPU environment.
2) The `ModelUploadOp` uploads the trained model to Vertex AI Model Registry.
3) The `EndpointCreateOp` creates a Vertex AI endpoint resource; when `existing_endpoint` is true, the existing endpoint is imported instead.
4) Finally, the `ModelDeployOp` deploys the model to the endpoint.

**Note:** The `ModelDeployOp` component creates an endpoint if one isn't provided.

In [19]:
# Pipeline artifacts live under the experiment directory; the pipeline name is
# versioned with EXP_VERSION and the model display name is tied to the run.
PIPELINE_NAME = "train-endpoint-deploy-" + EXP_VERSION
PIPELINE_ROOT = EXPERIMENT_DIR + "/pipeline_root/tpu_cifar10_pipeline"
MODEL_DISPLAY_NAME = "cifar10-model-" + RUN_NAME

# update config
config.pipeline_root = PIPELINE_ROOT
config.pipeline_name = PIPELINE_NAME
config.model_display_name = MODEL_DISPLAY_NAME

print(
    f"PIPELINE_ROOT      : {PIPELINE_ROOT}\n"
    f"PIPELINE_NAME      : {PIPELINE_NAME}\n"
    f"MODEL_DISPLAY_NAME : {MODEL_DISPLAY_NAME}"
)

PIPELINE_ROOT      : gs://tpu-pipeline-hybrid-vertex/tpu-train-v11/pipeline_root/tpu_cifar10_pipeline
PIPELINE_NAME      : train-endpoint-deploy-v11
MODEL_DISPLAY_NAME : cifar10-model-run-b128


In [20]:
from kfp import dsl
from kfp.dsl import importer
# from kfp.dsl import OneOf
from google_cloud_pipeline_components.types import artifact_types
# The notification component is used below but was never imported.
from google_cloud_pipeline_components.v1.vertex_notification_email import (
    VertexNotificationEmailOp,
)


# Define the pipeline
@kfp.dsl.pipeline(name=PIPELINE_NAME)
def pipeline(
    worker_pool_specs: list,
    model_display_name: str,
    serving_container_image_uri: str,
    model_artifact_uri: str,
    deployment_machine_type: str,
    deployment_accelerator_type: str,
    deployment_accelerator_count: int,
    project: str,
    tensorboard_resource: str,
    service_account: str,
    existing_endpoint: bool = False,
    endpoint_resource_uri: str = None,
    endpoint_resource_name: str = None,
    base_output_directory: str = None,
    # experiment_name: str = None,
    # experiment_run: str = None,
):
    """Train with a custom job, upload the model, and deploy it to an endpoint.

    Args:
        worker_pool_specs: Worker pool specs passed to the custom training job.
        model_display_name: Display name used for the training job, the
            uploaded model and the deployed model.
        serving_container_image_uri: Serving image recorded in the imported
            model's container spec.
        model_artifact_uri: URI of the trained model artifacts to import.
        deployment_machine_type: Machine type for the deployed model.
        deployment_accelerator_type: Accelerator type for the deployed model.
        deployment_accelerator_count: Accelerator count for the deployed model.
        project: Project used for the model upload and endpoint creation.
        tensorboard_resource: TensorBoard resource attached to the training job.
        service_account: Service account the training job runs as.
        existing_endpoint: When true, deploy to the endpoint described by
            ``endpoint_resource_uri`` / ``endpoint_resource_name``; otherwise
            create a new endpoint.
        endpoint_resource_uri: Artifact URI of the existing endpoint.
        endpoint_resource_name: Resource name of the existing endpoint.
        base_output_directory: Base output directory of the training job.
    """
    # Notification task, registered as the exit task of the handler below.
    notify_task = VertexNotificationEmailOp(recipients=EMAIL_RECIPIENTS)

    # Every pipeline step must be created INSIDE the exit handler so the
    # notification task is tied to them; the body was previously not indented
    # under the `with`, which is a syntax error.
    with dsl.ExitHandler(notify_task, name='TPU Train Pipeline'):

        # Run the custom training job
        custom_job_task = CustomTrainingJobOp(
            display_name=model_display_name,
            worker_pool_specs=worker_pool_specs,
            base_output_directory=base_output_directory,
            tensorboard=tensorboard_resource,
            service_account=service_account,
            enable_web_access=True,
        )

        # Import the trained model. The importer has no data dependency on
        # the training task, so ordering is enforced explicitly with .after().
        import_unmanaged_model_task = importer(
            artifact_uri=model_artifact_uri,
            artifact_class=artifact_types.UnmanagedContainerModel,
            metadata={
                "containerSpec": {"imageUri": serving_container_image_uri},
            },
        ).after(custom_job_task)

        # Upload the model
        model_upload_op = ModelUploadOp(
            project=project,
            display_name=model_display_name,
            unmanaged_container_model=import_unmanaged_model_task.outputs["artifact"],
        )

        # Deployment settings shared by both endpoint branches.
        deploy_kwargs = dict(
            model=model_upload_op.outputs["model"],
            deployed_model_display_name=model_display_name,
            dedicated_resources_machine_type=deployment_machine_type,
            dedicated_resources_min_replica_count=1,
            dedicated_resources_max_replica_count=1,
            dedicated_resources_accelerator_type=deployment_accelerator_type,
            dedicated_resources_accelerator_count=deployment_accelerator_count,
        )

        # `existing_endpoint` is a pipeline parameter placeholder at compile
        # time, so the explicit `== True` comparison is required here.
        with dsl.If(existing_endpoint == True):  # noqa: E712

            # Import existing endpoint
            endpoint = importer(
                artifact_uri=endpoint_resource_uri,
                artifact_class=artifact_types.VertexEndpoint,
                metadata={"resourceName": endpoint_resource_name},
            )
            # Deploy model to existing endpoint
            _ = ModelDeployOp(
                endpoint=endpoint.output,
                traffic_split={"0": 100},
                **deploy_kwargs,
            )

        with dsl.Else():
            # Create an endpoint
            endpoint_create_op = EndpointCreateOp(
                project=project,
                display_name="tpu-pipeline-created-endpoint",
            )

            # Deploy the model to new endpoint
            _ = ModelDeployOp(
                endpoint=endpoint_create_op.outputs["endpoint"],
                **deploy_kwargs,
            )

## Compile the pipeline

Next, compile the pipeline to a JSON file.

In [21]:
# Set the file name
PIPELINE_PACKAGE_FILE = "tpu_train_cifar10_pipeline.json"

# Compile the pipeline
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path=PIPELINE_PACKAGE_FILE,
)

config.pipeline_local_json = PIPELINE_PACKAGE_FILE

## Run the pipeline

Before you run the pipeline, define worker pool specs and other parameters required for running the pipeline.

The above defined pipeline takes the following parameters:

- `worker_pool_specs`: The worker pool specs required for running the training job.
- `model_display_name`: A display name for the uploaded model.
- `serving_container_image_uri`: Container image used for model deployment.
- `model_artifact_uri`: Artifact location of the trained model.
- `deployment_machine_type`: Machine type for model deployment.
- `deployment_accelerator_type`: Accelerator type for model deployment.
- `deployment_accelerator_count`: Number of accelerators required for model deployment.
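- `project`: ID of the Google Cloud project where the pipeline runs.
- `tensorboard_resource`: Resource name of the Vertex AI TensorBoard instance attached to the training job.
- `service_account`: Service account used to run the training job.
- `existing_endpoint`: Whether to deploy to an existing endpoint (`True`) or create a new one (`False`).
- `endpoint_resource_uri`: Artifact URI of the existing endpoint (used only when `existing_endpoint` is `True`).
- `endpoint_resource_name`: Resource name of the existing endpoint (used only when `existing_endpoint` is `True`).
- `base_output_directory`: Base output directory passed to the training job.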

### Set train args

In [22]:
EPOCHS = 10
STEPS = 10_000
BATCH_SIZE = 128  # 128 256 512 1024

# With fewer than two TPU accelerators (or none configured) the trainer falls
# back to its single-device strategy; otherwise it uses the TPU strategy.
TRAIN_STRATEGY = "tpu" if TRAIN_NTPU and TRAIN_NTPU >= 2 else "single"

# Command-line flags passed to train.py inside the training container.
TRAINER_ARGS = [
    f"--epochs={EPOCHS}",
    f"--steps={STEPS}",
    f"--distribute={TRAIN_STRATEGY}",
    f"--project={PROJECT_ID}",
    f"--location={LOCATION}",
    f"--tb_instance={TB_INSTANCE}",
    f"--experiment_name={EXPERIMENT_NAME}",
    f"--experiment_run={RUN_NAME}",
    f"--batch_size={BATCH_SIZE}",
]

# Define the worker pool specs required for custom training job
WORKER_POOL_SPECS = [
    {
        "container_spec": {
            "args": TRAINER_ARGS,
            "env": [
                {"name": "AIP_MODEL_DIR", "value": ARTIFACTS_DIR},
                {"name": "AIP_TENSORBOARD_LOG_DIR", "value": LOG_DIR},
            ],
            "image_uri": TRAIN_IMAGE,
        },
        "replica_count": 1,
        "machine_spec": {
            "machine_type": TRAIN_COMPUTE,
            # Pass the enum's string name (e.g. "TPU_V2"), not the enum
            # object: the specs are serialised as a pipeline parameter, and
            # every other accelerator setting in this notebook uses `.name`.
            "accelerator_type": TRAIN_TPU.name,
            "accelerator_count": TRAIN_NTPU,
        },
    }
]
pprint(WORKER_POOL_SPECS)

[{'container_spec': {'args': ['--epochs=10',
                              '--steps=10000',
                              '--distribute=tpu',
                              '--project=hybrid-vertex',
                              '--tb_instance=projects/934903580331/locations/us-central1/tensorboards/257140585364717568',
                              '--experiment_name=tpu-train-v11',
                              '--experiment_run=run-b128',
                              '--batch_size=128'],
                     'env': [{'name': 'AIP_MODEL_DIR',
                              'value': 'gs://tpu-pipeline-hybrid-vertex/tpu-train-v11/run-b128/model'},
                             {'name': 'AIP_TENSORBOARD_LOG_DIR',
                              'value': 'gs://tpu-pipeline-hybrid-vertex/tpu-train-v11/run-b128/logs'}],
                     'image_uri': 'us-central1-docker.pkg.dev/hybrid-vertex/my-tpu-repo/tpu-train:latest'},
  'machine_spec': {'accelerator_count': 8,
                   'acce

### set deploy args

In [23]:
# existing endpoint 
EXISTING_ENDPOINT_BOOL = True
ENDPOINT_ID = "4267284891747483648"
ENDPOINT_RESOURCE_NAME=f"projects/934903580331/locations/us-central1/endpoints/{ENDPOINT_ID}"
ENDPOINT_RESOURCE_URI = f"https://us-central1-aiplatform.googleapis.com/v1/{ENDPOINT_RESOURCE_NAME}"

if not EXISTING_ENDPOINT_BOOL:
    ENDPOINT_RESOURCE_NAME = None
    ENDPOINT_RESOURCE_URI = None

print(f"EXISTING_ENDPOINT_BOOL : {EXISTING_ENDPOINT_BOOL}")
print(f"ENDPOINT_RESOURCE_NAME : {ENDPOINT_RESOURCE_NAME}")
print(f"ENDPOINT_RESOURCE_URI  : {ENDPOINT_RESOURCE_URI}")

EXISTING_ENDPOINT_BOOL : True
ENDPOINT_RESOURCE_NAME : projects/934903580331/locations/us-central1/endpoints/4267284891747483648
ENDPOINT_RESOURCE_URI  : https://us-central1-aiplatform.googleapis.com/v1/projects/934903580331/locations/us-central1/endpoints/4267284891747483648


### update config

In [24]:
# Record the training arguments and the target endpoint ID on the config.
config.train_strategy = TRAIN_STRATEGY
config.epochs = EPOCHS
config.steps = STEPS
config.trainer_args = TRAINER_ARGS
config.endpoint_id = ENDPOINT_ID

## submit pipeline job

In [25]:
# Create the pipeline job
job = aiplatform.PipelineJob(
    display_name=PIPELINE_NAME,
    template_path=PIPELINE_PACKAGE_FILE,
    pipeline_root=PIPELINE_ROOT,
    parameter_values={
        "worker_pool_specs": WORKER_POOL_SPECS,
        "model_display_name": MODEL_DISPLAY_NAME,
        "serving_container_image_uri": DEPLOY_IMAGE,
        "model_artifact_uri": ARTIFACTS_DIR,
        "deployment_machine_type": DEPLOY_COMPUTE,
        "deployment_accelerator_type": DEPLOY_GPU.name,
        "deployment_accelerator_count": DEPLOY_NGPU,
        "project": PROJECT_ID,
        "tensorboard_resource": TB_INSTANCE,
        "service_account": SERVICE_ACCOUNT,
        "existing_endpoint": EXISTING_ENDPOINT_BOOL,
        "endpoint_resource_uri": ENDPOINT_RESOURCE_URI,
        "endpoint_resource_name": ENDPOINT_RESOURCE_NAME,
        "base_output_directory": BASE_OUTPUT_DIR,
        # "experiment_name": EXPERIMENT_NAME,
        # "experiment_run": RUN_NAME,
    },
)

# # Run the pipeline job
# job.run(sync=False)

job.submit(
    experiment=EXPERIMENT_NAME,
    service_account=SERVICE_ACCOUNT
)

Creating PipelineJob
PipelineJob created. Resource name: projects/934903580331/locations/us-central1/pipelineJobs/train-endpoint-deploy-v11-20250109173831
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/934903580331/locations/us-central1/pipelineJobs/train-endpoint-deploy-v11-20250109173831')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/train-endpoint-deploy-v11-20250109173831?project=934903580331
Associating projects/934903580331/locations/us-central1/pipelineJobs/train-endpoint-deploy-v11-20250109173831 to Experiment: tpu-train-v11


### inspect config

In [65]:
config

bucket_uri: gs://tpu-pipeline-hybrid-vertex
deploy_compute: n1-standard-4
deploy_gpu: NVIDIA_TESLA_T4
deploy_gpu_count: 1
deploy_image: us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-gpu.2-13:latest
endpoint_id: '4267284891747483648'
epochs: 15
location: us-central1
model_display_name: cifar10-class-model
pipeline_local_json: tpu_train_cifar10_pipeline.json
pipeline_name: train-endpoint-deploy-v10
pipeline_root: gs://tpu-pipeline-hybrid-vertex/pipeline_root/tpu_cifar10_pipeline
project_id: hybrid-vertex
repository: my-tpu-repo
seed: null
service_account: 934903580331-compute@developer.gserviceaccount.com
steps: 10000
tb_instance: projects/934903580331/locations/us-central1/tensorboards/257140585364717568
train_compute: cloud-tpu
train_image: us-central1-docker.pkg.dev/hybrid-vertex/my-tpu-repo/tpu-train:latest
train_strategy: tpu
train_tpu: TPU_V2
train_tpu_count: 8
trainer_args:
- --epochs=15
- --steps=10000
- --distribute=tpu
- --project=hybrid-vertex
- --tb_instance=projects/9349

## Get pipeline task details

In [None]:
def get_task_detail(
    task_details: List[Any], task_name: str
) -> Any:
    """Return the first task detail whose ``task_name`` equals ``task_name``.

    Args:
        task_details: Iterable of task-detail objects; each one must expose a
            ``task_name`` attribute.
        task_name: Name of the pipeline task to look up.

    Returns:
        The first matching task-detail object, or ``None`` when no task has
        that name (including when ``task_details`` is empty).
    """
    return next(
        (detail for detail in task_details if detail.task_name == task_name),
        None,
    )

In [None]:
# Get the pipeline task details
pipeline_task_details = (
    job.gca_resource.job_detail.task_details
)  # fetch pipeline task details

pipeline_task_details

In [None]:
# # Fetch endpoint from pipeline
# endpoint_task = get_task_detail(pipeline_task_details, "endpoint-create")
# endpoint_resourceName = (
#     endpoint_task.outputs["endpoint"].artifacts[0].metadata["resourceName"]
# )
# endpoint = aiplatform.Endpoint(endpoint_resourceName)
# endpoint

In [26]:
# # Fetch model from pipeline
# model_task = get_task_detail(pipeline_task_details, "model-upload")
# model_resourceName = model_task.outputs["model"].artifacts[0].metadata["resourceName"]
# model = aiplatform.Model(model_resourceName)

# model

### delete if necessary 

In [None]:
# # Undeploy model from endpoint
# endpoint.undeploy_all()

# # Delete the endpoint
# endpoint.delete()

In [None]:
# # Delete the model
# model.delete()

In [None]:
# # Delete the pipeline job
# job.delete()

**Finished**