## CircularNet Vertex AI Retraining Pipeline

The goal is to train the CircularNet model on Vertex AI using the published checkpoints and configuration file.

The CircularNet team has already open-sourced the model on GitHub. If you want to train or fine-tune the model with your own training images, you can run this notebook to launch a training job on Vertex AI. After training completes, the checkpoints are written to the Google Cloud Storage bucket. You can then export the checkpoints to a TensorFlow SavedModel.

## Import Libraries and Set Up Environment

In [1]:
# Install the dependencies only when the notebook is running inside Google
# Colab (detected from the repr of the active IPython shell).
# NOTE(review): both installs are unpinned, and the recorded output of this
# cell shows a pip dependency-resolver error - consider pinning versions.
if "google.colab" in str(get_ipython()):
    # Vertex AI SDK with the "tensorboard" extra; later cells upload
    # TensorBoard logs through this SDK.
    ! pip3 install -q --upgrade google-cloud-aiplatform[tensorboard]
    # TensorFlow Model Garden "official" package.
    # NOTE(review): no visible cell imports it - confirm it is still needed.
    ! pip3 install -q tf-models-official


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/5.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/5.7 MB[0m [31m50.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m5.7/5.7 MB[0m [31m77.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m5.7/5.7 MB[0m [31m77.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.7/5.7 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.2/289.2 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source 

In [2]:
import os
from datetime import datetime
from google.cloud import aiplatform
from google.cloud.aiplatform import hyperparameter_tuning as hpt

In [3]:
# Sign the user in when the notebook is hosted on Google Colab; outside Colab
# this cell does nothing.
running_in_colab = "google.colab" in str(get_ipython())
if running_in_colab:
    # Imported lazily because the module is only needed on this branch.
    from google.colab import auth as google_auth
    google_auth.authenticate_user()

## Configure and Launch the Training Job on Vertex AI

In [4]:
# Prefix of the Vertex AI training job name; editable through the Colab form field.
TRAINING_JOB_PREFIX = "training_material_model" # @param {type:"string"}
# Objective tag: appended to the job name here and later forwarded to the
# training container as the --objective flag.
# NOTE(review): "iod" presumably stands for image object detection - confirm.
OBJECTIVE = "iod"

def get_job_name_with_datetime(prefix: str):
  """Build a job name by stamping a prefix with the current time.

  The timestamp has one-second resolution, so two calls with the same prefix
  within the same second return the same name.

  Args:
    prefix: leading text of the job name.

  Returns:
    `prefix` followed by the current local time formatted as
    `_YYYYMMDD_HHMMSS`.
  """
  timestamp_suffix = datetime.now().strftime("_%Y%m%d_%H%M%S")
  return prefix + timestamp_suffix

# Final job name has the shape <prefix>_<objective>_<YYYYMMDD_HHMMSS>.
train_job_name = get_job_name_with_datetime(
    "_".join((TRAINING_JOB_PREFIX, OBJECTIVE))
)

In [5]:
# Display the generated job name to check the prefix, objective and timestamp.
train_job_name

'training_material_model_iod_20241218_190038'

In [6]:
# Google Cloud settings - replace these with your own project, bucket and region.
PROJECT_ID = "waste-identification-ml-330916"  # @param {type:"string"}
BUCKET_URI = "gs://circularnet"  # @param {type:"string"}
REGION = "us-central1"  # @param {type:"string"}

# Cloud Storage object names always use "/" as the separator, so the URIs are
# built with plain string formatting rather than os.path.join, which uses the
# host OS separator (a backslash on Windows). A trailing "/" on BUCKET_URI is
# dropped first so the result never contains "//".
bucket_root = BUCKET_URI.rstrip("/")

# Root output location of this training run.
model_dir = f"{bucket_root}/{train_job_name}"

# Staging area handed to the Vertex AI SDK, and the checkpoint folder.
STAGING_BUCKET = f"{bucket_root}/temporal"
CHECKPOINT_BUCKET = f"{bucket_root}/ckpt"

tensorboard_name = get_job_name_with_datetime("tensorboard")

# NOTE(review): every execution of this cell creates a new Vertex AI
# TensorBoard instance with a freshly timestamped display name; reuse an
# existing one with aiplatform.Tensorboard(<resource name>) if that is not
# intended.
tensorboard = aiplatform.Tensorboard.create(
    display_name=tensorboard_name,
    project=PROJECT_ID,
    location=REGION,
)

# Set the SDK-wide defaults used by the job objects created in later cells.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=STAGING_BUCKET,
    experiment_tensorboard=tensorboard,
)

INFO:google.cloud.aiplatform.tensorboard.tensorboard_resource:Creating Tensorboard
INFO:google.cloud.aiplatform.tensorboard.tensorboard_resource:Create Tensorboard backing LRO: projects/372354466754/locations/us-central1/tensorboards/3436752291032465408/operations/7568047832309956608
INFO:google.cloud.aiplatform.tensorboard.tensorboard_resource:Tensorboard created. Resource name: projects/372354466754/locations/us-central1/tensorboards/3436752291032465408
INFO:google.cloud.aiplatform.tensorboard.tensorboard_resource:To use this Tensorboard in another session:
INFO:google.cloud.aiplatform.tensorboard.tensorboard_resource:tb = aiplatform.Tensorboard('projects/372354466754/locations/us-central1/tensorboards/3436752291032465408')


In [7]:
# Display the GCS location that will receive this run's outputs.
model_dir

'gs://circularnet/training_material_model_iod_20241218_190038'

In [8]:
# Display the full resource name of the TensorBoard instance created above.
tensorboard.resource_name

'projects/372354466754/locations/us-central1/tensorboards/3436752291032465408'

In [9]:
# NOTE(review): OBJECTIVE was already assigned this same value in the
# job-name cell; the assignment is repeated here.
OBJECTIVE = "iod"

# Only the leading component of the region (e.g. "us" from "us-central1") is
# used; it becomes the host prefix of the training container image URI below.
REGION_PREFIX = REGION.partition("-")[0]
assert REGION_PREFIX in {"us", "europe", "asia"}, (
    f'{REGION} is not supported. It must be prefixed by "us", "asia", or "europe".'
)

# Training constants.
# NOTE(review): this overwrites the form value set in the job-name cell.
# train_job_name was already built from the earlier value, and no later
# visible cell reads TRAINING_JOB_PREFIX again.
TRAINING_JOB_PREFIX = "train"
TRAIN_CONTAINER_URI = f"{REGION_PREFIX}-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/tfvision-oss-v2"
# Worker VM shape: machine type plus the GPU model and how many to attach.
TRAIN_MACHINE_TYPE = "n1-highmem-16"
TRAIN_ACCELERATOR_TYPE = "NVIDIA_TESLA_V100"
TRAIN_NUM_GPU = 4

# Evaluation constants.
# NOTE(review): EVALUATION_METRIC is not referenced by any visible cell; the
# tuning job's metric_spec uses the key "model_performance" instead - confirm
# which one is intended.
EVALUATION_METRIC = "mean_iou"


In [10]:
# Locations of the training inputs: TFRecord globs for the train and
# validation splits, the checkpoint to start from, and the experiment config
# YAML. All of them are forwarded to the training container as flags.
input_train_data_path = 'gs://circularnet/vertex_training/test0731/benjamin_30july2024/tfrecords_train/*.tfrecord' # @param {type:"string"}
input_validation_data_path = 'gs://circularnet/vertex_training/test0731/benjamin_30july2024/tfrecords_val/*.tfrecord' # @param {type:"string"}
init_checkpoint_path = 'gs://circularnet/ckpt/transfer-learning/material_form/ckpt-582000' # @param {type:"string"}
config_file_path = 'gs://circularnet/config/config_transferLearning_V5_7_31.yaml' # @param {type:"string"}
# Total number of labels of the material-form model; forwarded as --num_classes.
num_classes = 39 # @param {type:"integer"}

# Experiment selected for this run; it also keys the argument table below.
experiment = "maskrcnn_resnetfpn_coco"

# Per-experiment arguments for the training container. Each entry is later
# rendered as a "--key=value" command-line flag; all of them can be adjusted
# to your specific use case.
experiment_container_args_dict = {
    # Mask R-CNN experiment args.
    experiment: {
        "experiment": experiment,
        # Initial checkpoint and config YAML file.
        "init_checkpoint": init_checkpoint_path,
        "config_file": config_file_path,
        "input_train_data_path": input_train_data_path,
        "input_validation_data_path": input_validation_data_path,
        "objective": OBJECTIVE,
        # Training outputs go to a sub-folder of this run's model_dir.
        "model_dir": f"{model_dir}/trained_model",
        "num_classes": num_classes,
        "global_batch_size": 4,
        "prefetch_buffer_size": 12,
        "train_steps": 100,
    },
}

experiment_container_args = experiment_container_args_dict[experiment]

# Describe the single training worker: VM type, GPU type and count, and the
# container image with its command-line arguments.
worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": TRAIN_MACHINE_TYPE,
            "accelerator_type": TRAIN_ACCELERATOR_TYPE,
            "accelerator_count": TRAIN_NUM_GPU,
        },
        "replica_count": 1,
        "container_spec": {
            "image_uri": TRAIN_CONTAINER_URI,
            # The fixed mode flag comes first, followed by one "--key=value"
            # flag per experiment argument, in the dict's insertion order.
            "args": [
                "--mode=train_and_eval",
                *(
                    f"--{arg_name}={arg_value}"
                    for arg_name, arg_value in experiment_container_args.items()
                ),
            ],
        },
    },
]

# Metric name mapped to the optimization goal for the tuning job.
# NOTE(review): assumes the training container reports a metric called
# "model_performance" - confirm. EVALUATION_METRIC defined in an earlier cell
# is not used here.
metric_spec = {"model_performance": "maximize"}


# Custom job built from the worker pool spec; it is passed to the
# hyperparameter tuning job below as `custom_job`.
train_custom_job = aiplatform.CustomJob(
    display_name=train_job_name,
    project=PROJECT_ID,
    worker_pool_specs=worker_pool_specs,
    staging_bucket=STAGING_BUCKET,
)

# Candidate learning rates. With a single value the trial budget below is 1.
LEARNING_RATES = [0.001]

# Trial budget: one trial per candidate value.
MAX_TRIAL_COUNT = len(LEARNING_RATES)

# Search space handed to the tuning job: learning_rate takes its values from
# the discrete list above.
parameter_spec = {
    "learning_rate": hpt.DiscreteParameterSpec(values=LEARNING_RATES, scale="linear"),
}

# Create the Vertex AI hyperparameter tuning job. It reuses the training job's
# display name and wraps the custom job defined above.
train_hpt_job = aiplatform.HyperparameterTuningJob(
    display_name=train_job_name,
    custom_job=train_custom_job,
    metric_spec=metric_spec,
    parameter_spec=parameter_spec,
    max_trial_count=MAX_TRIAL_COUNT,
    # Trials are executed one at a time.
    parallel_trial_count=1,
    project=PROJECT_ID,
    # NOTE(review): None presumably leaves the choice of search algorithm to
    # the service default - confirm against the SDK documentation.
    search_algorithm=None,
)

# Service account the training job runs as. Set it to your own service
# account e-mail; leaving it empty passes None, which is the same as the
# previous behaviour of calling run() without a service account.
SERVICE_ACCOUNT = ""  # @param {type:"string"}

# Launch the tuning job. The recorded output shows run() polling the job state
# and raising RuntimeError when the job fails.
train_hpt_job.run(service_account=SERVICE_ACCOUNT or None)

INFO:google.cloud.aiplatform.jobs:Creating HyperparameterTuningJob
INFO:google.cloud.aiplatform.jobs:HyperparameterTuningJob created. Resource name: projects/372354466754/locations/us-central1/hyperparameterTuningJobs/5866830459098365952
INFO:google.cloud.aiplatform.jobs:To use this HyperparameterTuningJob in another session:
INFO:google.cloud.aiplatform.jobs:hpt_job = aiplatform.HyperparameterTuningJob.get('projects/372354466754/locations/us-central1/hyperparameterTuningJobs/5866830459098365952')
INFO:google.cloud.aiplatform.jobs:View HyperparameterTuningJob:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/5866830459098365952?project=372354466754
INFO:google.cloud.aiplatform.jobs:HyperparameterTuningJob projects/372354466754/locations/us-central1/hyperparameterTuningJobs/5866830459098365952 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:HyperparameterTuningJob projects/372354466754/locations/us-central1/hyperparameterTuningJobs/

RuntimeError: Job failed with:
code: 3
message: "Hyperparameter Tuning Trial #1 Failed before any other successful trials were completed. The failed trial had parameters: learning_rate=0.001, .  The trial\'s error message was: The replica workerpool0-0 exited with a non-zero status of 1. Termination reason: Error. To find out more about why your job exited please check the logs: https://console.cloud.google.com/logs/viewer?project=372354466754&resource=ml_job%2Fjob_id%2F5866830459098365952&advancedFilter=resource.type%3D%22ml_job%22%0Aresource.labels.job_id%3D%225866830459098365952%22"


In [None]:
# Upload the training TensorBoard logs to the Vertex AI TensorBoard instance
# created earlier.
# NOTE(review): the "trial_1" folder is hard-coded; adjust it if the tuning
# job runs more than one trial.

tensorboard_log_dir = f"{model_dir}/trained_model/trial_1/train"

# upload_tb_log() performs a blocking one-time upload of logs that already
# exist. The previous start_upload_tb_log()/end_upload_tb_log() pair is meant
# for continuous monitoring while logs are still being written, and stopping
# it right after starting gives no guarantee the logs were uploaded.
aiplatform.upload_tb_log(
    tensorboard_id=tensorboard.name,
    tensorboard_experiment_name=train_hpt_job.name,
    logdir=tensorboard_log_dir,
    description="train",
)

In [None]:
# Upload the validation TensorBoard logs to the Vertex AI TensorBoard instance
# created earlier.
# NOTE(review): the "trial_1" folder is hard-coded; adjust it if the tuning
# job runs more than one trial.

tensorboard_log_dir = f"{model_dir}/trained_model/trial_1/validation"

# upload_tb_log() performs a blocking one-time upload of logs that already
# exist. The previous start_upload_tb_log()/end_upload_tb_log() pair is meant
# for continuous monitoring while logs are still being written, and stopping
# it right after starting gives no guarantee the logs were uploaded.
aiplatform.upload_tb_log(
    tensorboard_id=tensorboard.name,
    tensorboard_experiment_name=train_hpt_job.name,
    logdir=tensorboard_log_dir,
    description="validation",
)