# BERT fine-tuning with Vertex AI

* Building example from [this notebook example](https://github.com/RajeshThallam/vertex-ai-labs/blob/main/03-distributed-training-text/03-distributed-training-vertex-ai-bert-finetuning.ipynb)

This notebook demonstrates how to configure Hyperparameter Tuning and Distributed training in a single training script. 

After tuning, examples for scaling training code across the following configurations:

* 1 replica, 1 GPU
* 1 replica, 2 GPUs each
* 2 replicas, 1 GPU each
* 2 replicas, 1 GPU each + Reduction Server
* 2 replicas, 2 GPUs each + Reduction Server

## Setup

### pips

In [142]:
# !pip install --user --upgrade google-cloud-aiplatform -q
# !pip install --user --upgrade kfp -q
# !pip install --user --upgrade google-cloud-pipeline-components -q
# !pip install --user --upgrade google-cloud-bigquery-datatransfer -q
# !pip install --user tf-models-official==2.11.0 tensorflow-text==2.11.0 -q
# pip install tensorflow_io

In [2]:
# import IPython
# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

In [2]:
!pwd

/home/jupyter/raj_sample/vertex-training


### imports

In [4]:
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import shutil
import sys
import pprint
import pandas as pd
import tensorflow as tf
import time
import csv
from datetime import datetime

import matplotlib.pyplot as plt

import google.auth
from google.protobuf.json_format import MessageToDict
from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value


from google.cloud import aiplatform as vertex_ai
from google.cloud.aiplatform import hyperparameter_tuning as hpt
from google.cloud.aiplatform_v1beta1 import types
from google.cloud import bigquery
from google.cloud import exceptions

from google.cloud.aiplatform.utils import JobClientWithOverride

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

from tensorflow_io import bigquery as tfio_bq

import logging
logging.disable(logging.WARNING)

### set vars

In [5]:
PREFIX = 'jtv9'

In [6]:
# creds, PROJECT_ID = google.auth.default()
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

PROJECT_NUM              = !gcloud projects describe $PROJECT_ID --format="value(projectNumber)"
PROJECT_NUM              = PROJECT_NUM[0]

VERTEX_SA = f'{PROJECT_NUM}-compute@developer.gserviceaccount.com' # 934903580331

REGION = 'us-central1'

STAGING_BUCKET = f'gs://{PREFIX}-{PROJECT_ID}-bucket'

print(f"PROJECT_ID     = {PROJECT_ID}")
print(f"PROJECT_NUM    = {PROJECT_NUM}")
print(f"STAGING_BUCKET = {STAGING_BUCKET}")
print(f"VERTEX_SA      = {VERTEX_SA}")

PROJECT_ID     = hybrid-vertex
PROJECT_NUM    = 934903580331
STAGING_BUCKET = gs://jtv9-hybrid-vertex-bucket
VERTEX_SA      = 934903580331-compute@developer.gserviceaccount.com


### create staging GCS bucket

In [7]:
! gsutil mb -l $REGION $STAGING_BUCKET

Creating gs://jtv9-hybrid-vertex-bucket/...


In [8]:
# TENSORBOARD_NAME = f"tb-bert-tune-{PREFIX}"  # @param {type:"string"}

# if (
#     TENSORBOARD_NAME == ""
#     or TENSORBOARD_NAME is None
#     or TENSORBOARD_NAME == "[your-tensorboard-name]"
# ):
#     TENSORBOARD_NAME = PROJECT_ID + "-tb-" #+ UUID

# tensorboard = vertex_ai.Tensorboard.create(
#     display_name=TENSORBOARD_NAME, project=PROJECT_ID, location=REGION
# )
# TENSORBOARD = tensorboard.gca_resource.name
# print("TENSORBOARD:", TENSORBOARD)

## Data prep

In [9]:
local_dir = os.path.expanduser('~')
local_dir = f'{local_dir}/distributed-training/datasets'

if tf.io.gfile.exists(local_dir):
    tf.io.gfile.rmtree(local_dir)
tf.io.gfile.makedirs(local_dir)

url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
local_path = f'{local_dir}/aclImdb_v1.tar.gz'
print(f'local_path: {local_path}')

dataset = tf.keras.utils.get_file(
    local_path
    , url
    , untar=True
    , cache_dir=local_dir
    , cache_subdir='.'
)
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
print(f'dataset_dir: {dataset_dir}')

train_dir = os.path.join(dataset_dir, 'train')
print(f'train_dir: {train_dir}')

# remove unused folders to make it easier to load the data
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)

local_path: /home/jupyter/distributed-training/datasets/aclImdb_v1.tar.gz
Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
dataset_dir: /home/jupyter/distributed-training/datasets/aclImdb
train_dir: /home/jupyter/distributed-training/datasets/aclImdb/train


### data splits

In [10]:
def create_splits(train_dir, test_dir, val_split, seed):
    
    train_ds = tf.keras.preprocessing.text_dataset_from_directory(
        train_dir,
        validation_split=val_split,
        subset='training',
        seed=seed)

    class_names = train_ds.class_names
    
    train_ds = train_ds.unbatch()

    val_ds = tf.keras.preprocessing.text_dataset_from_directory(
        train_dir,
        validation_split=val_split,
        subset='validation',
        seed=seed).unbatch()

    test_ds = tf.keras.preprocessing.text_dataset_from_directory(
        test_dir).unbatch()

    return train_ds, val_ds, test_ds, class_names

In [11]:
seed = 42
val_split = 0.2
test_dir = f'{dataset_dir}/test'

train_ds, val_ds, test_ds, class_names = (
    create_splits(train_dir, test_dir, val_split, seed)
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [12]:
for text, label in train_ds.take(2):
    print(f'Review: {text.numpy()}')
    label = label.numpy()
    print(f'Label : {label} ({class_names[label]})')

Review: b'"Pandemonium" is a horror movie spoof that comes off more stupid than funny. Believe me when I tell you, I love comedies. Especially comedy spoofs. "Airplane", "The Naked Gun" trilogy, "Blazing Saddles", "High Anxiety", and "Spaceballs" are some of my favorite comedies that spoof a particular genre. "Pandemonium" is not up there with those films. Most of the scenes in this movie had me sitting there in stunned silence because the movie wasn\'t all that funny. There are a few laughs in the film, but when you watch a comedy, you expect to laugh a lot more than a few times and that\'s all this film has going for it. Geez, "Scream" had more laughs than this film and that was more of a horror film. How bizarre is that?<br /><br />*1/2 (out of four)'
Label : 0 (neg)
Review: b"David Mamet is a very interesting and a very un-equal director. His first movie 'House of Games' was the one I liked best, and it set a series of films with characters whose perspective of life changes as they

### create TF Records

In [13]:
def serialize_example(text_fragment, label):
    """Serializes text fragment and label in tf.Example."""
    
    def _bytes_feature(value):
        """Returns a bytes_list from a string / byte."""
        if isinstance(value, type(tf.constant(0))):
            value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

    def _int64_feature(value):
        """Returns an int64_list from a bool / enum / int / uint."""
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
    
    feature = {
        'text_fragment': _bytes_feature(text_fragment),
        'label': _int64_feature(label)
    }
    
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()
    
def tf_serialize_example(text_fragment, label):
  tf_string = tf.py_function(
    serialize_example,
    (text_fragment, label), 
    tf.string)      
  return tf.reshape(tf_string, ()) 

In [14]:
tfrecords_folder = '{}/tfrecords'.format(os.path.expanduser('~'))
if tf.io.gfile.exists(tfrecords_folder):
    tf.io.gfile.rmtree(tfrecords_folder)
tf.io.gfile.makedirs(tfrecords_folder)

filenames = ['train.tfrecords', 'valid.tfrecords', 'test.tfrecords']
for file_name, dataset in zip(filenames, [train_ds, val_ds, test_ds]):
    writer = tf.data.experimental.TFRecordWriter(os.path.join(tfrecords_folder, file_name))
    writer.write(dataset.map(tf_serialize_example))

In [15]:
for record in tf.data.TFRecordDataset([os.path.join(tfrecords_folder, file_name)]).take(2):
    print(record)

tf.Tensor(b'\n\xbd\x07\n\x0e\n\x05label\x12\x05\x1a\x03\n\x01\x01\n\xaa\x07\n\rtext_fragment\x12\x98\x07\n\x95\x07\n\x92\x07This scene shows how Wallace\'s experiment by using his brain manipulation invention goes terribly wrong, creating the "Were Rabbit". His desire as a social entrepreneur is to improve society for the better, therefore, created a "Brain Manipulator" machine. He risked his own life to help solve Tottington\'s pests\' rabbit problem and more importantly to overcome the overcrowding of rabbits being collected and stored in his basement. Though he thought his experiment worked, however, it resulted in placing more pressure on him and Gromit to find a solution before the Annual Vegetable Competition again risking his life. Gromit, who is a silent faithful dog and a loyal helper finds himself continuously thinking of innovative ways to save his master, from his radical crazy inventions going terribly wrong. What is interesting in this movie, is trying to identify: who is

### copy to GCS bucket

In [16]:
gcs_paths = [f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/train',
             f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/valid',
             f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/test']

for filename, gcs_path in zip(filenames, gcs_paths):
    local_file_path = os.path.join(tfrecords_folder, filename)
    gcs_file_path = f'{gcs_path}/{filename}'
    !gsutil cp {local_file_path} {gcs_file_path}

Copying file:///home/jupyter/tfrecords/train.tfrecords [Content-Type=application/octet-stream]...
/ [1 files][ 26.5 MiB/ 26.5 MiB]                                                
Operation completed over 1 objects/26.5 MiB.                                     
Copying file:///home/jupyter/tfrecords/valid.tfrecords [Content-Type=application/octet-stream]...
/ [1 files][  6.6 MiB/  6.6 MiB]                                                
Operation completed over 1 objects/6.6 MiB.                                      
Copying file:///home/jupyter/tfrecords/test.tfrecords [Content-Type=application/octet-stream]...
/ [1 files][ 32.3 MiB/ 32.3 MiB]                                                
Operation completed over 1 objects/32.3 MiB.                                     


## Create Training package

### base image

In [17]:
TRAIN_BASE_IMAGE = 'us-docker.pkg.dev/vertex-ai/training/tf-gpu.2-11:latest'

### create train dir

In [18]:
! rm -rf scripts/trainer
! mkdir -p scripts/trainer
! touch scripts/trainer/__init__.py

### training script

In [19]:
%%writefile scripts/trainer/task.py
# Copyright 2021 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and

import os
import tensorflow as tf
from tensorflow.python.client import device_lib
import tensorflow_hub as hub
import tensorflow_text as text

import time

from absl import app
from absl import flags
from absl import logging
from official.nlp import optimization 

import random
import string

from google.cloud import aiplatform as vertex_ai
from hypertune import HyperTune


TFHUB_HANDLE_ENCODER = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3'
TFHUB_HANDLE_PREPROCESS = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
LOCAL_TB_FOLDER = '/tmp/logs'
LOCAL_SAVED_MODEL_DIR = '/tmp/saved_model'

# ====================================================
# training args
# ====================================================

FLAGS = flags.FLAGS
flags.DEFINE_integer('steps_per_epoch', 625, 'Steps per training epoch')
flags.DEFINE_integer('eval_steps', 150, 'Evaluation steps')
flags.DEFINE_integer('epochs', 2, 'Nubmer of epochs')
flags.DEFINE_integer('per_replica_batch_size', 32, 'Per replica batch size')
flags.DEFINE_integer('TRAIN_NGPU', 1, '')
flags.DEFINE_integer('replica_count', 1, '')
flags.DEFINE_integer('reduction_cnt', 0, '')

flags.DEFINE_float('learning_rate', 0.001, '')

flags.DEFINE_string('training_data_path', f'/bert-finetuning/imdb/tfrecords/train', 'Training data GCS path')
flags.DEFINE_string('validation_data_path', f'/bert-finetuning/imdb/tfrecords/valid', 'Validation data GCS path')
flags.DEFINE_string('testing_data_path', f'/bert-finetuning/imdb/tfrecords/test', 'Testing data GCS path')

flags.DEFINE_string('job_dir', f'/jobs', 'A base GCS path for jobs')
flags.DEFINE_string('job_id', 'default', 'unique_id for experiment runs')
flags.DEFINE_string('TRAIN_GPU', 'NA', '')
flags.DEFINE_string('experiment_run', 'NA', '')
flags.DEFINE_string('experiment_name', 'NA', '')
flags.DEFINE_string('tuning', 'False', 'Tune model hyper parameters?')


flags.DEFINE_enum('strategy', 'multiworker', ['single', 'mirrored', 'multiworker'], 'Distribution strategy')
flags.DEFINE_enum('auto_shard_policy', 'auto', ['auto', 'data', 'file', 'off'], 'Dataset sharing strategy')

auto_shard_policy = {
    'auto': tf.data.experimental.AutoShardPolicy.AUTO,
    'data': tf.data.experimental.AutoShardPolicy.DATA,
    'file': tf.data.experimental.AutoShardPolicy.FILE,
    'off': tf.data.experimental.AutoShardPolicy.OFF,
}

# ====================================================
# helper functions
# ====================================================

def create_unbatched_dataset(tfrecords_folder):
    """Creates an unbatched dataset in the format required by the 
       sentiment analysis model from the folder with TFrecords files."""
    
    feature_description = {
        'text_fragment': tf.io.FixedLenFeature([], tf.string, default_value=''),
        'label': tf.io.FixedLenFeature([], tf.int64, default_value=0),
    }

    def _parse_function(example_proto):
        parsed_example = tf.io.parse_single_example(example_proto, feature_description)
        return parsed_example['text_fragment'], parsed_example['label']
  
    file_paths = [f'{tfrecords_folder}/{file_path}' for file_path in tf.io.gfile.listdir(tfrecords_folder)]
    dataset = tf.data.TFRecordDataset(file_paths)
    dataset = dataset.map(_parse_function)
    
    return dataset


def configure_dataset(ds, auto_shard_policy):
    """
    Optimizes the performance of a dataset.
    """
    
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = (
        auto_shard_policy
    )
    
    ds = ds.repeat(-1).cache()
    ds = ds.prefetch(buffer_size=tf.data.AUTOTUNE)
    ds = ds.with_options(options)
    return ds


def create_input_pipelines(train_dir, valid_dir, test_dir, batch_size, auto_shard_policy):
    """Creates input pipelines from Imdb dataset."""
    
    train_ds = create_unbatched_dataset(train_dir)
    train_ds = train_ds.batch(batch_size)
    train_ds = configure_dataset(train_ds, auto_shard_policy)
    
    valid_ds = create_unbatched_dataset(valid_dir)
    valid_ds = valid_ds.batch(batch_size)
    valid_ds = configure_dataset(valid_ds, auto_shard_policy)
    
    test_ds = create_unbatched_dataset(test_dir)
    test_ds = test_ds.batch(batch_size)
    test_ds = configure_dataset(test_ds, auto_shard_policy)

    return train_ds, valid_ds, test_ds


def build_classifier_model(tfhub_handle_preprocess, tfhub_handle_encoder):
    """Builds a simple binary classification model with BERT trunk."""
    
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
    
    return tf.keras.Model(text_input, net)


def copy_tensorboard_logs(local_path: str, gcs_path: str):
    """Copies Tensorboard logs from a local dir to a GCS location.
    
    After training, batch copy Tensorboard logs locally to a GCS location. This can result
    in faster pipeline runtimes over streaming logs per batch to GCS that can get bottlenecked
    when streaming large volumes.
    
    Args:
      local_path: local filesystem directory uri.
      gcs_path: cloud filesystem directory uri.
    Returns:
      None.
    """
    pattern = '{}/*/events.out.tfevents.*'.format(local_path)
    local_files = tf.io.gfile.glob(pattern)
    gcs_log_files = [local_file.replace(local_path, gcs_path) for local_file in local_files]
    for local_file, gcs_file in zip(local_files, gcs_log_files):
        tf.io.gfile.copy(local_file, gcs_file)

# ====================================================
# training main
# ====================================================

def main(argv):
    del argv
    
    def _is_chief(task_type, task_id):
        return ((task_type == 'chief' or task_type == 'worker') and task_id == 0) or task_type is None
        
    # ====================================================
    # set args
    # ====================================================
    
    logging.info('Setting up training.')
    logging.info('   epochs: {}'.format(FLAGS.epochs))
    logging.info('   steps_per_epoch: {}'.format(FLAGS.steps_per_epoch))
    logging.info('   eval_steps: {}'.format(FLAGS.eval_steps))
    logging.info('   strategy: {}'.format(FLAGS.strategy))
    logging.info('   job_id: {}'.format(FLAGS.job_id))
    logging.info('   TRAIN_GPU: {}'.format(FLAGS.TRAIN_GPU))
    logging.info('   TRAIN_NGPU: {}'.format(FLAGS.TRAIN_NGPU))
    logging.info('   replica_count: {}'.format(FLAGS.replica_count))
    logging.info('   reduction_cnt: {}'.format(FLAGS.reduction_cnt))
    logging.info('   experiment_name: {}'.format(FLAGS.experiment_name))
    logging.info('   experiment_run: {}'.format(FLAGS.experiment_run))
    logging.info('   learning_rate: {}'.format(FLAGS.learning_rate))
    logging.info('   tuning: {}'.format(FLAGS.tuning))
    
    tb_dir = os.getenv('AIP_TENSORBOARD_LOG_DIR', LOCAL_TB_FOLDER)
    model_dir = os.getenv('AIP_MODEL_DIR', LOCAL_SAVED_MODEL_DIR)
    logging.info(f'AIP_TENSORBOARD_LOG_DIR = {tb_dir}')
    logging.info(f'AIP_MODEL_DIR = {model_dir}')
    
    project_number = os.environ["CLOUD_ML_PROJECT_ID"]
    
    vertex_ai.init(
        project=project_number,
        location='us-central1',
        experiment=FLAGS.experiment_name
    )

    # ====================================================
    # set distribution strategy (tensorflow) 
    # ====================================================
    logging.info('DEVICES'  + str(device_lib.list_local_devices()))
    
    # Single Machine, single compute device
    if FLAGS.strategy == 'single':
        if tf.test.is_gpu_available():
            strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
        else:
            strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
        logging.info("Single device training")
    
    # Single Machine, multiple compute device
    elif FLAGS.strategy == 'mirrored':
        strategy = tf.distribute.MirroredStrategy()
        logging.info("Mirrored Strategy distributed training")
    
    # Multi Machine, multiple compute device
    elif FLAGS.strategy == 'multiworker':
        strategy = tf.distribute.MultiWorkerMirroredStrategy()
        logging.info("Multi-worker Strategy distributed training")
        logging.info('TF_CONFIG = {}'.format(os.environ.get('TF_CONFIG', 'Not found')))
   
    # Single Machine, multiple TPU devices
    elif FLAGS.strategy == 'tpu':
        cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="local")
        tf.config.experimental_connect_to_cluster(cluster_resolver)
        tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
        strategy = tf.distribute.TPUStrategy(cluster_resolver)
        print("All devices: ", tf.config.list_logical_devices('TPU'))

    logging.info('num_replicas_in_sync = {}'.format(strategy.num_replicas_in_sync))
    
    if strategy.cluster_resolver:    
        task_type, task_id = (strategy.cluster_resolver.task_type,
                              strategy.cluster_resolver.task_id)
    else:
        task_type, task_id = (None, None)
        
    logging.info('task_type = {}'.format(task_type))
    logging.info('task_id = {}'.format(task_id))
    
    global_batch_size = (
        strategy.num_replicas_in_sync *
        FLAGS.per_replica_batch_size
    )
    
    # ====================================================
    # data input pipeline
    # ====================================================
    
    train_ds, valid_ds, test_ds = create_input_pipelines(
        FLAGS.training_data_path,
        FLAGS.validation_data_path,
        FLAGS.testing_data_path,
        global_batch_size,
        auto_shard_policy[FLAGS.auto_shard_policy]
    )
        
    num_train_steps = FLAGS.steps_per_epoch * FLAGS.epochs
    num_warmup_steps = int(0.1*num_train_steps)
    init_lr = FLAGS.learning_rate # FLAGS.learning_rate 3e-5
    
    # ====================================================
    # build & compile model
    # ====================================================
    
    with strategy.scope():
        
        model = build_classifier_model(TFHUB_HANDLE_PREPROCESS, TFHUB_HANDLE_ENCODER)
        
        loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
        
        metrics = tf.metrics.BinaryAccuracy()
        
        optimizer = optimization.create_optimizer(
            init_lr=init_lr
            , num_train_steps=num_train_steps
            , num_warmup_steps=num_warmup_steps
            , optimizer_type='adamw'
        )

        model.compile(
            optimizer=optimizer
            , loss=loss
            , metrics=metrics
        )
        
    # ====================================================
    # set callbacks
    # ====================================================
        
    # Configure BackupAndRestore callback
    if FLAGS.strategy == 'single':
        callbacks = []
        logging.info("No backup and restore")
    else:
        backup_dir = '{}/backupandrestore'.format(FLAGS.job_dir)
        callbacks = [tf.keras.callbacks.experimental.BackupAndRestore(backup_dir=backup_dir)]
        logging.info(f"saved backup and restore t0: {backup_dir}")
    
    # Configure TensorBoard callback on Chief
    if _is_chief(task_type, task_id):
        callbacks.append(
            tf.keras.callbacks.TensorBoard(
                log_dir=tb_dir
                , update_freq='batch'
                , histogram_freq=1
            )
        )
        
#     if FLAGS.tuning == "True":
#         # Instantiate the HyperTune reporting object
#         hpt = HyperTune()

#         # Reporting callback
#         class HPTCallback(tf.keras.callbacks.Callback):

#             def on_epoch_end(self, epoch, logs=None):
#                 hpt.report_hyperparameter_tuning_metric(
#                     hyperparameter_metric_tag='binary_accuracy',
#                     metric_value=logs['val_binary_accuracy'],
#                     global_step=epoch
#                 )

#         if not callbacks:
#             callbacks = []
#         callbacks.append(HPTCallback())
    
    # ====================================================
    # train model
    # ====================================================
    
    logging.info('Starting training ...')
    
    if _is_chief(task_type, task_id):
        start_time = time.time()
    
    history = model.fit(
        x=train_ds
        , validation_data=valid_ds
        , steps_per_epoch=FLAGS.steps_per_epoch
        , validation_steps=FLAGS.eval_steps
        , epochs=FLAGS.epochs
        , callbacks=callbacks
    )
    
    # ====================================================
    # log Vertex Experiments
    # ====================================================
    
    SESSION_id = "".join(random.choices(string.ascii_lowercase + string.digits, k=3))
    
    if _is_chief(task_type, task_id):
        end_time = time.time()
        # val metrics
        val_keys = [v for v in history.history.keys()]
        total_train_time = int((end_time - start_time) / 60)

        metrics_dict = {"total_train_time": total_train_time}
        logging.info(f"total_train_time: {total_train_time}")
        _ = [metrics_dict.update({key: history.history[key][-1]}) for key in val_keys]
    
        logging.info(f" task_type logging experiments: {task_type}")
        logging.info(f" task_id logging experiments: {task_id}")
        logging.info(f" logging data to experiment run: {FLAGS.experiment_run}-{SESSION_id}")
        
        with vertex_ai.start_run(
            f'{FLAGS.experiment_run}-{SESSION_id}', 
        ) as my_run:
            
            logging.info(f"logging metrics...")
            my_run.log_metrics(metrics_dict)

            logging.info(f"logging metaparams...")
            my_run.log_params(
                {
                    "epochs": FLAGS.epochs,
                    "strategy": FLAGS.strategy,
                    "per_replica_batch_size": FLAGS.per_replica_batch_size,
                    "TRAIN_GPU": FLAGS.TRAIN_GPU,
                    "TRAIN_NGPU": FLAGS.TRAIN_NGPU,
                    "replica_count": FLAGS.replica_count,
                    "reduction_cnt": FLAGS.reduction_cnt,
                    "global_batch_size": global_batch_size,
                }
            )

            vertex_ai.end_run()
            logging.info(f"EXPERIMENT RUN: '{FLAGS.experiment_run}-{SESSION_id}' has ended")
            
    # ====================================================
    # save model
    # ====================================================

    if FLAGS.strategy=="tpu":
        logging.info(f"Training completed. Saving TPU trained model...")
        save_locally = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')
        model.save(model_dir, options=save_locally)
    # single, mirrored or primary for multiworker
    elif _is_chief(task_type, task_id):
        logging.info('Training completed. Saving the trained model to: {}'.format(model_dir))
        model.save(model_dir)
    # non-primary workers for multi-workers
    else:
        # each worker saves their model instance to a unique temp location
        worker_dir = model_dir + '/workertemp_' + str(task_id)
        tf.io.gfile.makedirs(worker_dir)
        model.save(worker_dir)
        logging.info(f"worker saved to temp worker_dir: {worker_dir} ...")
        
        logging.info(f"recursively deleting everything under path: {worker_dir} ...")
        tf.io.gfile.rmtree(worker_dir)

    logging.info('Models saved!')
        
    # Save trained model
    # saved_model_dir = '{}/saved_model'.format(model_dir)
    # logging.info('Training completed. Saving the trained model to: {}'.format(saved_model_dir))
    # model.save(saved_model_dir)
    #tf.saved_model.save(model, saved_model_dir)
    
    
if __name__ == '__main__':
    logging.set_verbosity(logging.INFO)
    app.run(main)

Writing scripts/trainer/task.py


### Dockferfile

In [20]:
TRAIN_IMAGE = f'gcr.io/{PROJECT_ID}/imdb_bert'

In [21]:
dockerfile = f'''
FROM {TRAIN_BASE_IMAGE}

RUN pip install tf-models-official==2.11.0
RUN pip install tensorflow-text==2.11.0
RUN pip install cloudml-hypertune

WORKDIR /

# Copies the trainer code to the docker image.
COPY trainer /trainer

RUN apt update && apt -y install nvtop

# Sets up the entry point to invoke the trainer.
ENTRYPOINT ["python", "-m", "trainer.task"]
'''

with open('scripts/Dockerfile', 'w') as f:
    f.write(dockerfile)

In [22]:
! docker build -t {TRAIN_IMAGE} scripts/

Sending build context to Docker daemon  22.02kB
Step 1/8 : FROM us-docker.pkg.dev/vertex-ai/training/tf-gpu.2-11:latest
 ---> 1fc7736bd93e
Step 2/8 : RUN pip install tf-models-official==2.11.0
 ---> Using cache
 ---> f4b45eeb250d
Step 3/8 : RUN pip install tensorflow-text==2.11.0
 ---> Using cache
 ---> 2a27018a1165
Step 4/8 : RUN pip install cloudml-hypertune
 ---> Using cache
 ---> 71a3be94b595
Step 5/8 : WORKDIR /
 ---> Using cache
 ---> fb7eb9e35db6
Step 6/8 : COPY trainer /trainer
 ---> 907ff3eec1b8
Step 7/8 : RUN apt update && apt -y install nvtop
 ---> Running in 6a75d8981f99
[91m

[0mGet:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1581 B]
Get:3 http://archive.ubuntu.com/ubuntu jammy InRelease [270 kB]
Get:4 http://packages.cloud.google.com/apt cloud-sdk InRelease [6361 B]
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packag

In [23]:
! docker push {TRAIN_IMAGE}

Using default tag: latest
The push refers to repository [gcr.io/hybrid-vertex/imdb_bert]

[1B441a2243: Preparing 
[1B79440985: Preparing 
[1B13c176db: Preparing 
[1B4e30591d: Preparing 
[1Ba416ba09: Preparing 
[1B95c7b436: Preparing 
[1Bf663075e: Preparing 
[1B7fbea69f: Preparing 
[1Bdcb6c992: Preparing 
[1B57d3600a: Preparing 
[1B98aef500: Preparing 
[1B937e6451: Preparing 
[1Bf16690a3: Preparing 
[1B0ba8f8e0: Preparing 
[1B061c8df0: Preparing 
[1B20bdc5a8: Preparing 
[1B9623cc67: Preparing 
[1Bf5f47ef7: Preparing 
[1Be3a670db: Preparing 
[1Bab0a5210: Preparing 
[1B27b973c2: Preparing 
[1B003a8778: Preparing 
[1B6eb0eac1: Preparing 
[1Ba842d5cf: Preparing 
[1B7c820400: Preparing 
[1B2eabba29: Preparing 
[1B9e83e652: Preparing 
[1B8f4121e3: Preparing 
[1Bac953428: Preparing 
[1B89f48870: Preparing 
[1Bd7cd1026: Preparing 
[1Bf5c5948a: Preparing 
[1B06a133b8: Preparing 
[1B1c199f2f: Preparing 
[1Bd1f80fca: Preparing 
[1Bf0edb23d: Preparing 
[37B41a22

# Submitting training jobs

In [24]:
vertex_ai.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=STAGING_BUCKET
)

## Hyperarameter Tuning

### set Experiment

In [31]:
import time

EXPERIMENT_PREFIX = 'bert-hptune'
EXPERIMENT_NAME=f'{PREFIX}-{EXPERIMENT_PREFIX}'
RUN_NAME = f'run-{time.strftime("%Y%m%d-%H%M%S")}'

print(f"EXPERIMENT_NAME: {EXPERIMENT_NAME}")
print(f"RUN_NAME: {RUN_NAME}")

EXPERIMENT_NAME: jtv8-bert-hptune
RUN_NAME: run-20230621-024011


In [47]:
MACHINE_TYPE = 'n1-standard-16'
TRAIN_GPU, TRAIN_NGPU = ('NVIDIA_TESLA_T4', 1) # NVIDIA_TESLA_T4 NVIDIA_TESLA_V100

REPLICA_COUNT = 1
DISTRIBUTION_STRATEGY = "single" # single, mirrored, multiworker, tpu

HP_TUNING="True"

print(f"MACHINE_TYPE          : {MACHINE_TYPE}")
print(f"TRAIN_GPU             : {TRAIN_GPU}")
print(f"TRAIN_NGPU            : {TRAIN_NGPU}")
print(f"REPLICA_COUNT         : {REPLICA_COUNT}")
print(f"DISTRIBUTION_STRATEGY : {DISTRIBUTION_STRATEGY}")
print(f"HP_TUNING             : {HP_TUNING}")

MACHINE_TYPE          : n1-standard-16
TRAIN_GPU             : NVIDIA_TESLA_T4
TRAIN_NGPU            : 1
REPLICA_COUNT         : 1
DISTRIBUTION_STRATEGY : single
HP_TUNING             : True


In [48]:
epochs = 3
steps_per_epoch = 200
eval_steps = 50

PER_REPLICA_BATCH_SIZE = 32
LEARNING_RATE= 0.001

REDUCTION_SERVER_COUNT = 0
REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"

training_data_path = f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/train'
validation_data_path = f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/valid'
testing_data_path = f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/test'

job_id = f'job-{datetime.now().strftime("%Y%m%d%H%M%S")}'.format()
job_dir = f'{STAGING_BUCKET}/jobs/{job_id}'

WORKER_ARGS = [
    "--epochs=" + str(epochs)
    , "--steps_per_epoch=" + str(steps_per_epoch)
    , "--eval_steps=" + str(eval_steps)
    , "--per_replica_batch_size=" + str(PER_REPLICA_BATCH_SIZE)
    , "--training_data_path=" + training_data_path
    , "--validation_data_path=" + validation_data_path
    , "--testing_data_path=" + testing_data_path
    , "--job_dir=" + job_dir
    , f"--strategy={DISTRIBUTION_STRATEGY}"
    , "--auto_shard_policy=data" #data
    , f"--job_id={job_id}"
    , f"--TRAIN_GPU={TRAIN_GPU}"
    , f"--TRAIN_NGPU={TRAIN_NGPU}"
    , f"--reduction_cnt={REDUCTION_SERVER_COUNT}"
    , f"--replica_count={REPLICA_COUNT}"
    , f"--experiment_name={EXPERIMENT_NAME}"
    , f"--experiment_run={RUN_NAME}"
    , f"--learning_rate={LEARNING_RATE}"
    , f"--tuning={HP_TUNING}"
]

from utils import workerpool_specs

WORKER_POOL_SPECS = workerpool_specs.prepare_worker_pool_specs(
    image_uri=TRAIN_IMAGE,
    args=WORKER_ARGS,
    replica_count=REPLICA_COUNT,
    machine_type=MACHINE_TYPE,
    accelerator_count=TRAIN_NGPU,
    accelerator_type=TRAIN_GPU,
    reduction_server_count=REDUCTION_SERVER_COUNT,
    reduction_server_machine_type=REDUCTION_SERVER_MACHINE_TYPE,
)

from pprint import pprint
pprint(WORKER_POOL_SPECS)

[{'container_spec': {'args': ['--epochs=3',
                              '--steps_per_epoch=200',
                              '--eval_steps=50',
                              '--per_replica_batch_size=32',
                              '--training_data_path=gs://jtv8-hybrid-vertex-bucket/bert-finetuning/imdb/tfrecords/train',
                              '--validation_data_path=gs://jtv8-hybrid-vertex-bucket/bert-finetuning/imdb/tfrecords/valid',
                              '--testing_data_path=gs://jtv8-hybrid-vertex-bucket/bert-finetuning/imdb/tfrecords/test',
                              '--job_dir=gs://jtv8-hybrid-vertex-bucket/jobs/job-20230621031158',
                              '--strategy=single',
                              '--auto_shard_policy=data',
                              '--job_id=job-20230621031158',
                              '--TRAIN_GPU=NVIDIA_TESLA_T4',
                              '--TRAIN_NGPU=1',
                              '--reduction_cnt=0

Create a `CustomJob`.

In [49]:
# Create a CustomJob

JOB_NAME = f'hptune-bert-{RUN_NAME}-{DISTRIBUTION_STRATEGY}-{REPLICA_COUNT}-{TRAIN_NGPU}'

my_custom_hpt_job = vertex_ai.CustomJob(
    display_name=JOB_NAME,
    project=PROJECT_ID,
    worker_pool_specs=WORKER_POOL_SPECS,
    staging_bucket=f'{STAGING_BUCKET}/{EXPERIMENT_NAME}/{RUN_NAME}',
)

Once your container is pushed to Google Container Registry, you use the Vertex SDK to create and run the hyperparameter tuning job.

You define the following specifications:

* `parameter_spec`: Dictionary specifying the parameters to optimize. The dictionary key is the string assigned to the command line argument for each hyperparameter in your training application code, and the dictionary value is the parameter specification. The parameter specification includes the type, min/max values, and scale for the hyperparameter.

* `metric_spec`: Dictionary specifying the metric to optimize. The dictionary key is the `hyperparameter_metric_tag` that you set in your training application code, and the value is the optimization goal.

In [44]:
from google.cloud.aiplatform import hyperparameter_tuning as hpt

metric_spec = {"binary_accuracy": "maximize"}

parameter_spec = {
    "learning_rate": hpt.DoubleParameterSpec(min=0.001, max=1, scale="log"),
    "per_replica_batch_size": hpt.DiscreteParameterSpec(values=[32, 64, 128], scale=None),
}

Then, create and run a HyperparameterTuningJob.

There are a few arguments to note:

* `max_trial_count`: Sets an upper bound on the number of trials the service will run. The recommended practice is to start with a smaller number of trials and get a sense of how impactful your chosen hyperparameters are before scaling up.

* `parallel_trial_count`: If you use parallel trials, the service provisions multiple training processing clusters. The worker pool spec that you specify when creating the job is used for each individual training cluster. Increasing the number of parallel trials reduces the amount of time the hyperparameter tuning job takes to run; however, it can reduce the effectiveness of the job overall. This is because the default tuning strategy uses results of previous trials to inform the assignment of values in subsequent trials.

* `search_algorithm`: The available search algorithms are grid, random, or default (None). The default option applies Bayesian optimization to search the space of possible hyperparameter values and is the recommended algorithm.

In [45]:
# Create and run HyperparameterTuningJob

hp_job = vertex_ai.HyperparameterTuningJob(
    display_name=JOB_NAME,
    custom_job=my_custom_hpt_job,
    metric_spec=metric_spec,
    parameter_spec=parameter_spec,
    max_trial_count=6,
    parallel_trial_count=3,
    project=PROJECT_ID,
    search_algorithm=None,
)

hp_job.run(
    sync=False
    , service_account=VERTEX_SA
)

In [46]:
print(f"Job Name: {hp_job.display_name}")
print(f"Job Resource Name: {hp_job.resource_name}\n")
# print(f"Check training progress at {custom_job._dashboard_uri()}")

Job Name: hptune-bert-run-20230621-024011-single-1-1
Job Resource Name: projects/934903580331/locations/us-central1/hyperparameterTuningJobs/1775640772675108864



### best trial

In [None]:
best = (None, None, None, 0.0)
for trial in hpt_job.trials:
    # Keep track of the best outcome
    if float(trial.final_measurement.metrics[0].value) > best[3]:
        try:
            best = (
                trial.id,
                float(trial.parameters[0].value),
                float(trial.parameters[1].value),
                float(trial.final_measurement.metrics[0].value),
            )
        except:
            best = (
                trial.id,
                float(trial.parameters[0].value),
                None,
                float(trial.final_measurement.metrics[0].value),
            )

print(best)

In [None]:
# LR = best[2]
# BATCH_SIZE = int(best[1])

## 1 Replica, 1 GPU

### set Experiment

In [25]:
import time

EXPERIMENT_PREFIX = 'bert'
EXPERIMENT_NAME=f'{PREFIX}-bert-tune'
RUN_NAME = f'run-{time.strftime("%Y%m%d-%H%M%S")}'

print(f"EXPERIMENT_NAME: {EXPERIMENT_NAME}")
print(f"RUN_NAME: {RUN_NAME}")

EXPERIMENT_NAME: jtv9-bert-tune
RUN_NAME: run-20230621-141557


### config compute

In [26]:
# MACHINE_TYPE = 'n1-standard-16'
# TRAIN_GPU, TRAIN_NGPU = ('NVIDIA_TESLA_T4', 1)

# MACHINE_TYPE = 'n1-standard-16'
# TRAIN_GPU, TRAIN_NGPU = ('NVIDIA_TESLA_V100', 1)

MACHINE_TYPE = 'a2-highgpu-1g'
TRAIN_GPU, TRAIN_NGPU = ('NVIDIA_TESLA_A100', 1)

# MACHINE_TYPE = 'a2-ultragpu-1g'
# TRAIN_GPU, TRAIN_NGPU = ('NVIDIA_A100_80GB', 1)

REPLICA_COUNT = 1
DISTRIBUTION_STRATEGY = "single" # single, mirrored, multiworker, tpu

HP_TUNING="False"

print(f"MACHINE_TYPE          : {MACHINE_TYPE}")
print(f"TRAIN_GPU             : {TRAIN_GPU}")
print(f"TRAIN_NGPU            : {TRAIN_NGPU}")
print(f"REPLICA_COUNT         : {REPLICA_COUNT}")
print(f"DISTRIBUTION_STRATEGY : {DISTRIBUTION_STRATEGY}")
print(f"HP_TUNING             : {HP_TUNING}")

MACHINE_TYPE          : a2-highgpu-1g
TRAIN_GPU             : NVIDIA_TESLA_A100
TRAIN_NGPU            : 1
REPLICA_COUNT         : 1
DISTRIBUTION_STRATEGY : single
HP_TUNING             : False


### worker args

In [27]:
epochs = 100
steps_per_epoch = 200
eval_steps = 50

PER_REPLICA_BATCH_SIZE = 32
LEARNING_RATE= 0.001

REDUCTION_SERVER_COUNT = 0
REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"

training_data_path = f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/train'
validation_data_path = f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/valid'
testing_data_path = f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/test'

job_id = f'job-{datetime.now().strftime("%Y%m%d%H%M%S")}'.format()
job_dir = f'{STAGING_BUCKET}/jobs/{job_id}'

WORKER_ARGS = [
    "--epochs=" + str(epochs)
    , "--steps_per_epoch=" + str(steps_per_epoch)
    , "--eval_steps=" + str(eval_steps)
    , "--per_replica_batch_size=" + str(PER_REPLICA_BATCH_SIZE)
    , "--training_data_path=" + training_data_path
    , "--validation_data_path=" + validation_data_path
    , "--testing_data_path=" + testing_data_path
    , "--job_dir=" + job_dir
    , f"--strategy={DISTRIBUTION_STRATEGY}"
    , "--auto_shard_policy=data" # data | auto
    , f"--job_id={job_id}"
    , f"--TRAIN_GPU={TRAIN_GPU}"
    , f"--TRAIN_NGPU={TRAIN_NGPU}"
    , f"--reduction_cnt={REDUCTION_SERVER_COUNT}"
    , f"--replica_count={REPLICA_COUNT}"
    , f"--experiment_name={EXPERIMENT_NAME}"
    , f"--experiment_run={RUN_NAME}"
    , f"--learning_rate={LEARNING_RATE}"
    , f"--tuning={HP_TUNING}"
]

from utils import workerpool_specs

WORKER_POOL_SPECS = workerpool_specs.prepare_worker_pool_specs(
    image_uri=TRAIN_IMAGE,
    args=WORKER_ARGS,
    replica_count=REPLICA_COUNT,
    machine_type=MACHINE_TYPE,
    accelerator_count=TRAIN_NGPU,
    accelerator_type=TRAIN_GPU,
    reduction_server_count=REDUCTION_SERVER_COUNT,
    reduction_server_machine_type=REDUCTION_SERVER_MACHINE_TYPE,
)

from pprint import pprint
pprint(WORKER_POOL_SPECS)

[{'container_spec': {'args': ['--epochs=100',
                              '--steps_per_epoch=200',
                              '--eval_steps=50',
                              '--per_replica_batch_size=32',
                              '--training_data_path=gs://jtv9-hybrid-vertex-bucket/bert-finetuning/imdb/tfrecords/train',
                              '--validation_data_path=gs://jtv9-hybrid-vertex-bucket/bert-finetuning/imdb/tfrecords/valid',
                              '--testing_data_path=gs://jtv9-hybrid-vertex-bucket/bert-finetuning/imdb/tfrecords/test',
                              '--job_dir=gs://jtv9-hybrid-vertex-bucket/jobs/job-20230621141630',
                              '--strategy=single',
                              '--auto_shard_policy=data',
                              '--job_id=job-20230621141630',
                              '--TRAIN_GPU=NVIDIA_TESLA_A100',
                              '--TRAIN_NGPU=1',
                              '--reduction_c

### create tensorboard

In [28]:
vertex_ai_tb = vertex_ai.Tensorboard.create()
TENSORBOARD = vertex_ai_tb.gca_resource.name

# use existing
# TENSORBOARD="projects/934903580331/locations/us-central1/tensorboards/949934065933352960"

print(TENSORBOARD)

projects/934903580331/locations/us-central1/tensorboards/8961837803025465344


In [29]:
vertex_ai.init(
    experiment=EXPERIMENT_NAME
    # , experiment_tensorboard=vertex_ai_tb
)

### submit train job

In [30]:
ACCELERATOR = TRAIN_GPU.lower().replace("nvidia_","").replace("_","-")
print(ACCELERATOR)

tesla-a100


In [31]:
custom_job = vertex_ai.CustomJob(
    display_name=f'imdb-bert-{DISTRIBUTION_STRATEGY}-{TRAIN_NGPU}-{ACCELERATOR}'
    , worker_pool_specs=WORKER_POOL_SPECS
    , staging_bucket=f'{STAGING_BUCKET}/{EXPERIMENT_NAME}/{RUN_NAME}'
    # , location=REGION
)

In [32]:
custom_job.run(
    sync=False
    , service_account=VERTEX_SA
    , tensorboard=TENSORBOARD
    , restart_job_on_worker_restart=False
    , enable_web_access=True
)

In [33]:
print(f"Job Name: {custom_job.display_name}")
print(f"Job Resource Name: {custom_job.resource_name}\n")
# print(f"Check training progress at {custom_job._dashboard_uri()}")

Job Name: imdb-bert-single-1-tesla-a100
Job Resource Name: projects/934903580331/locations/us-central1/customJobs/8584063062468722688



## 1 Replica, 2 GPUs

### set Experiment Run

In [34]:
RUN_NAME = f'run-{time.strftime("%Y%m%d-%H%M%S")}'

print(f"EXPERIMENT_NAME: {EXPERIMENT_NAME}")
print(f"RUN_NAME: {RUN_NAME}")

EXPERIMENT_NAME: jtv9-bert-tune
RUN_NAME: run-20230621-155304


### config compute

In [36]:
# MACHINE_TYPE = 'n1-standard-32'
# TRAIN_GPU, TRAIN_NGPU = ('NVIDIA_TESLA_T4', 2)

# MACHINE_TYPE = 'n1-standard-16'
# TRAIN_GPU, TRAIN_NGPU = ('NVIDIA_TESLA_V100', 2)

MACHINE_TYPE = 'a2-highgpu-2g'
TRAIN_GPU, TRAIN_NGPU = ('NVIDIA_TESLA_A100', 2)

# MACHINE_TYPE = 'a2-ultragpu-2g'
# TRAIN_GPU, TRAIN_NGPU = ('NVIDIA_A100_80GB', 2)

REPLICA_COUNT = 1
DISTRIBUTION_STRATEGY = "mirrored" # single, mirrored, multiworker, tpu

HP_TUNING="False"

print(f"MACHINE_TYPE          : {MACHINE_TYPE}")
print(f"TRAIN_GPU             : {TRAIN_GPU}")
print(f"TRAIN_NGPU            : {TRAIN_NGPU}")
print(f"REPLICA_COUNT         : {REPLICA_COUNT}")
print(f"DISTRIBUTION_STRATEGY : {DISTRIBUTION_STRATEGY}")
print(f"HP_TUNING             : {HP_TUNING}")

MACHINE_TYPE          : a2-highgpu-2g
TRAIN_GPU             : NVIDIA_TESLA_A100
TRAIN_NGPU            : 2
REPLICA_COUNT         : 1
DISTRIBUTION_STRATEGY : mirrored
HP_TUNING             : False


### worker args

In [37]:
epochs = 10
steps_per_epoch = 200
eval_steps = 50

PER_REPLICA_BATCH_SIZE = 32
LEARNING_RATE= 0.001

REDUCTION_SERVER_COUNT = 0
REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"

training_data_path = f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/train'
validation_data_path = f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/valid'
testing_data_path = f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/test'

job_id = f'job-{datetime.now().strftime("%Y%m%d%H%M%S")}'.format()
job_dir = f'{STAGING_BUCKET}/jobs/{job_id}'

WORKER_ARGS = [
    "--epochs=" + str(epochs)
    , "--steps_per_epoch=" + str(steps_per_epoch)
    , "--eval_steps=" + str(eval_steps)
    , "--per_replica_batch_size=" + str(PER_REPLICA_BATCH_SIZE)
    , "--training_data_path=" + training_data_path
    , "--validation_data_path=" + validation_data_path
    , "--testing_data_path=" + testing_data_path
    , "--job_dir=" + job_dir
    , f"--strategy={DISTRIBUTION_STRATEGY}"
    , "--auto_shard_policy=auto" #data
    , f"--job_id={job_id}"
    , f"--TRAIN_GPU={TRAIN_GPU}"
    , f"--TRAIN_NGPU={TRAIN_NGPU}"
    , f"--reduction_cnt={REDUCTION_SERVER_COUNT}"
    , f"--replica_count={REPLICA_COUNT}"
    , f"--experiment_name={EXPERIMENT_NAME}"
    , f"--experiment_run={RUN_NAME}"
    , f"--learning_rate={LEARNING_RATE}"
    , f"--tuning={HP_TUNING}"
]

from utils import workerpool_specs

WORKER_POOL_SPECS = workerpool_specs.prepare_worker_pool_specs(
    image_uri=TRAIN_IMAGE,
    args=WORKER_ARGS,
    replica_count=REPLICA_COUNT,
    machine_type=MACHINE_TYPE,
    accelerator_count=TRAIN_NGPU,
    accelerator_type=TRAIN_GPU,
    reduction_server_count=REDUCTION_SERVER_COUNT,
    reduction_server_machine_type=REDUCTION_SERVER_MACHINE_TYPE,
)

from pprint import pprint
pprint(WORKER_POOL_SPECS)

[{'container_spec': {'args': ['--epochs=10',
                              '--steps_per_epoch=200',
                              '--eval_steps=50',
                              '--per_replica_batch_size=32',
                              '--training_data_path=gs://jtv9-hybrid-vertex-bucket/bert-finetuning/imdb/tfrecords/train',
                              '--validation_data_path=gs://jtv9-hybrid-vertex-bucket/bert-finetuning/imdb/tfrecords/valid',
                              '--testing_data_path=gs://jtv9-hybrid-vertex-bucket/bert-finetuning/imdb/tfrecords/test',
                              '--job_dir=gs://jtv9-hybrid-vertex-bucket/jobs/job-20230621155321',
                              '--strategy=mirrored',
                              '--auto_shard_policy=auto',
                              '--job_id=job-20230621155321',
                              '--TRAIN_GPU=NVIDIA_TESLA_A100',
                              '--TRAIN_NGPU=2',
                              '--reduction_

### create tensorboard

In [38]:
vertex_ai_tb = vertex_ai.Tensorboard.create()
TENSORBOARD = vertex_ai_tb.gca_resource.name

# TENSORBOARD="projects/934903580331/locations/us-central1/tensorboards/6822627980024479744"

print(TENSORBOARD)

projects/934903580331/locations/us-central1/tensorboards/9128470989238173696


In [39]:
vertex_ai.init(
    experiment=EXPERIMENT_NAME
    # , experiment_tensorboard=vertex_ai_tb
)

### submit train job

In [40]:
ACCELERATOR = TRAIN_GPU.lower().replace("nvidia_","").replace("_","-")
print(ACCELERATOR)

tesla-a100


In [41]:
custom_m_job = vertex_ai.CustomJob(
    display_name=f'imdb-bert-{DISTRIBUTION_STRATEGY}-{TRAIN_NGPU}-{ACCELERATOR}',
    worker_pool_specs=WORKER_POOL_SPECS,
    staging_bucket=f'{STAGING_BUCKET}/{EXPERIMENT_NAME}/{RUN_NAME}'
)

In [42]:
custom_m_job.run(
    sync=False
    , service_account=VERTEX_SA
    , tensorboard=TENSORBOARD
    , restart_job_on_worker_restart=False
    , enable_web_access=True
)

In [43]:
print(f"Job Name: {custom_m_job.display_name}")
print(f"Job Resource Name: {custom_m_job.resource_name}\n")
# print(f"Check training progress at {custom_job._dashboard_uri()}")

Job Name: imdb-bert-mirrored-2-tesla-a100
Job Resource Name: projects/934903580331/locations/us-central1/customJobs/5587198985430368256



## 2 Replicas, 1 GPU each

> Now increase `replica_count` from 1 to 2

### set Experiment Run

In [68]:
RUN_NAME = f'run-{time.strftime("%Y%m%d-%H%M%S")}'

print(f"EXPERIMENT_NAME: {EXPERIMENT_NAME}")
print(f"RUN_NAME: {RUN_NAME}")

EXPERIMENT_NAME: jtv7-bert-tune
RUN_NAME: run-20230621-012310


### config compute

In [69]:
MACHINE_TYPE = 'n1-standard-16'
TRAIN_GPU, TRAIN_NGPU = ('NVIDIA_TESLA_T4', 1) # NVIDIA_TESLA_T4 NVIDIA_TESLA_V100

REPLICA_COUNT = 2
DISTRIBUTION_STRATEGY = "multiworker" # single, mirrored, multiworker, tpu

HP_TUNING="False"

print(f"MACHINE_TYPE          : {MACHINE_TYPE}")
print(f"TRAIN_GPU             : {TRAIN_GPU}")
print(f"TRAIN_NGPU            : {TRAIN_NGPU}")
print(f"REPLICA_COUNT         : {REPLICA_COUNT}")
print(f"DISTRIBUTION_STRATEGY : {DISTRIBUTION_STRATEGY}")
print(f"HP_TUNING             : {HP_TUNING}")

MACHINE_TYPE          : n1-standard-16
TRAIN_GPU             : NVIDIA_TESLA_T4
TRAIN_NGPU            : 1
REPLICA_COUNT         : 2
DISTRIBUTION_STRATEGY : multiworker


### worker args

In [70]:
epochs = 10
steps_per_epoch = 200
eval_steps = 50

PER_REPLICA_BATCH_SIZE = 32
LEARNING_RATE= 0.001

REDUCTION_SERVER_COUNT = 0
REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"

training_data_path = f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/train'
validation_data_path = f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/valid'
testing_data_path = f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/test'
job_id = 'job-{}'.format(datetime.now().strftime("%Y%m%d%H%M%S"))
job_dir = f'{STAGING_BUCKET}/jobs/{job_id}'

WORKER_ARGS = [
    "--epochs=" + str(epochs)
    , "--steps_per_epoch=" + str(steps_per_epoch)
    , "--eval_steps=" + str(eval_steps)
    , "--per_replica_batch_size=" + str(PER_REPLICA_BATCH_SIZE)
    , "--training_data_path=" + training_data_path
    , "--validation_data_path=" + validation_data_path
    , "--testing_data_path=" + testing_data_path
    , "--job_dir=" + job_dir
    , f"--strategy={DISTRIBUTION_STRATEGY}"
    , "--auto_shard_policy=data" # data auto
    , f"--job_id={job_id}"
    , f"--TRAIN_GPU={TRAIN_GPU}"
    , f"--TRAIN_NGPU={TRAIN_NGPU}"
    , f"--reduction_cnt={REDUCTION_SERVER_COUNT}"
    , f"--replica_count={REPLICA_COUNT}"
    , f"--experiment_name={EXPERIMENT_NAME}"
    , f"--experiment_run={RUN_NAME}"
    , f"--learning_rate={LEARNING_RATE}"
    , f"--tuning={HP_TUNING}"
]

from utils import workerpool_specs

WORKER_POOL_SPECS = workerpool_specs.prepare_worker_pool_specs(
    image_uri=TRAIN_IMAGE,
    args=WORKER_ARGS,
    replica_count=REPLICA_COUNT,
    machine_type=MACHINE_TYPE,
    accelerator_count=TRAIN_NGPU,
    accelerator_type=TRAIN_GPU,
    reduction_server_count=REDUCTION_SERVER_COUNT,
    reduction_server_machine_type=REDUCTION_SERVER_MACHINE_TYPE,
)

from pprint import pprint
pprint(WORKER_POOL_SPECS)

[{'container_spec': {'args': ['--epochs=10',
                              '--steps_per_epoch=200',
                              '--eval_steps=50',
                              '--per_replica_batch_size=32',
                              '--training_data_path=gs://jtv7-hybrid-vertex-bucket/bert-finetuning/imdb/tfrecords/train',
                              '--validation_data_path=gs://jtv7-hybrid-vertex-bucket/bert-finetuning/imdb/tfrecords/valid',
                              '--testing_data_path=gs://jtv7-hybrid-vertex-bucket/bert-finetuning/imdb/tfrecords/test',
                              '--job_dir=gs://jtv7-hybrid-vertex-bucket/jobs/job-20230621012317',
                              '--strategy=multiworker',
                              '--auto_shard_policy=data',
                              '--job_id=job-20230621012317',
                              '--TRAIN_GPU=NVIDIA_TESLA_T4',
                              '--TRAIN_NGPU=1',
                              '--reduction

### create tensorboard

In [71]:
vertex_ai_tb = vertex_ai.Tensorboard.create()
TENSORBOARD = vertex_ai_tb.gca_resource.name

# TENSORBOARD="projects/934903580331/locations/us-central1/tensorboards/4467245374909710336"

print(TENSORBOARD)

projects/934903580331/locations/us-central1/tensorboards/4467245374909710336


In [72]:
vertex_ai.init(
    experiment=EXPERIMENT_NAME
    # , experiment_tensorboard=vertex_ai_tb
)

### submit train job

In [None]:
ACCELERATOR = TRAIN_GPU.lower().replace("nvidia_","").replace("_","-")
print(ACCELERATOR)

In [73]:
custom_mm_job = vertex_ai.CustomJob(
    display_name=f'imdb-bert-{DISTRIBUTION_STRATEGY}-{TRAIN_NGPU}-{ACCELERATOR}',
    worker_pool_specs=WORKER_POOL_SPECS,
    staging_bucket=f'{STAGING_BUCKET}/{EXPERIMENT_NAME}/{RUN_NAME}'
)

In [74]:
custom_mm_job.run(
    sync=False
    , service_account=VERTEX_SA
    , tensorboard=TENSORBOARD
    , restart_job_on_worker_restart=False
    , enable_web_access=True
)

In [75]:
print(f"Job Name: {custom_mm_job.display_name}")
print(f"Job Resource Name: {custom_mm_job.resource_name}\n")
# print(f"Check training progress at {custom_m_job._dashboard_uri()}")

Job Name: imdb-bert-run-20230621-012310-multiworker-2-1
Job Resource Name: projects/934903580331/locations/us-central1/customJobs/3847648444986425344



## 2 Replicas, 1 GPU each + Reduction Server

### set Experiment Run

In [76]:
RUN_NAME = f'run-{time.strftime("%Y%m%d-%H%M%S")}'

print(f"EXPERIMENT_NAME: {EXPERIMENT_NAME}")
print(f"RUN_NAME: {RUN_NAME}")

EXPERIMENT_NAME: jtv7-bert-tune
RUN_NAME: run-20230621-012602


### config compute

In [77]:
MACHINE_TYPE = 'n1-standard-16'
TRAIN_GPU, TRAIN_NGPU = ('NVIDIA_TESLA_T4', 1) # NVIDIA_TESLA_T4 NVIDIA_TESLA_V100

REPLICA_COUNT = 2
DISTRIBUTION_STRATEGY = "multiworker" # single, mirrored, multiworker, tpu

HP_TUNING="False"

print(f"MACHINE_TYPE          : {MACHINE_TYPE}")
print(f"TRAIN_GPU             : {TRAIN_GPU}")
print(f"TRAIN_NGPU            : {TRAIN_NGPU}")
print(f"REPLICA_COUNT         : {REPLICA_COUNT}")
print(f"DISTRIBUTION_STRATEGY : {DISTRIBUTION_STRATEGY}")
print(f"HP_TUNING             : {HP_TUNING}")

MACHINE_TYPE          : n1-standard-16
TRAIN_GPU             : NVIDIA_TESLA_T4
TRAIN_NGPU            : 1
REPLICA_COUNT         : 2
DISTRIBUTION_STRATEGY : multiworker


### worker args

In [78]:
epochs = 10
steps_per_epoch = 200
eval_steps = 50

PER_REPLICA_BATCH_SIZE = 32
LEARNING_RATE= 0.001

REDUCTION_SERVER_COUNT = 2
REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"

training_data_path = f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/train'
validation_data_path = f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/valid'
testing_data_path = f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/test'
job_id = 'job-{}'.format(datetime.now().strftime("%Y%m%d%H%M%S"))
job_dir = f'{STAGING_BUCKET}/jobs/{job_id}'

WORKER_ARGS = [
    "--epochs=" + str(epochs)
    , "--steps_per_epoch=" + str(steps_per_epoch)
    , "--eval_steps=" + str(eval_steps)
    , "--per_replica_batch_size=" + str(PER_REPLICA_BATCH_SIZE)
    , "--training_data_path=" + training_data_path
    , "--validation_data_path=" + validation_data_path
    , "--testing_data_path=" + testing_data_path
    , "--job_dir=" + job_dir
    , f"--strategy={DISTRIBUTION_STRATEGY}"
    , "--auto_shard_policy=data" # data auto
    , f"--job_id={job_id}"
    , f"--TRAIN_GPU={TRAIN_GPU}"
    , f"--TRAIN_NGPU={TRAIN_NGPU}"
    , f"--reduction_cnt={REDUCTION_SERVER_COUNT}"
    , f"--replica_count={REPLICA_COUNT}"
    , f"--experiment_name={EXPERIMENT_NAME}"
    , f"--experiment_run={RUN_NAME}"
    , f"--learning_rate={LEARNING_RATE}"
    , f"--tuning={HP_TUNING}"
]

from utils import workerpool_specs

WORKER_POOL_SPECS = workerpool_specs.prepare_worker_pool_specs(
    image_uri=TRAIN_IMAGE,
    args=WORKER_ARGS,
    replica_count=REPLICA_COUNT,
    machine_type=MACHINE_TYPE,
    accelerator_count=TRAIN_NGPU,
    accelerator_type=TRAIN_GPU,
    reduction_server_count=REDUCTION_SERVER_COUNT,
    reduction_server_machine_type=REDUCTION_SERVER_MACHINE_TYPE,
)

from pprint import pprint
pprint(WORKER_POOL_SPECS)

[{'container_spec': {'args': ['--epochs=10',
                              '--steps_per_epoch=200',
                              '--eval_steps=50',
                              '--per_replica_batch_size=32',
                              '--training_data_path=gs://jtv7-hybrid-vertex-bucket/bert-finetuning/imdb/tfrecords/train',
                              '--validation_data_path=gs://jtv7-hybrid-vertex-bucket/bert-finetuning/imdb/tfrecords/valid',
                              '--testing_data_path=gs://jtv7-hybrid-vertex-bucket/bert-finetuning/imdb/tfrecords/test',
                              '--job_dir=gs://jtv7-hybrid-vertex-bucket/jobs/job-20230621012604',
                              '--strategy=multiworker',
                              '--auto_shard_policy=data',
                              '--job_id=job-20230621012604',
                              '--TRAIN_GPU=NVIDIA_TESLA_T4',
                              '--TRAIN_NGPU=1',
                              '--reduction

### create tensorboard

In [79]:
vertex_ai_tb = vertex_ai.Tensorboard.create()
TENSORBOARD = vertex_ai_tb.gca_resource.name

# TENSORBOARD="projects/934903580331/locations/us-central1/tensorboards/5728253270573449216"

print(TENSORBOARD)

projects/934903580331/locations/us-central1/tensorboards/5728253270573449216


In [80]:
vertex_ai.init(
    experiment=EXPERIMENT_NAME
    # , experiment_tensorboard=vertex_ai_tb
)

### submit train job

In [None]:
ACCELERATOR = TRAIN_GPU.lower().replace("nvidia_","").replace("_","-")
print(ACCELERATOR)

In [81]:
custom_mmr_job = vertex_ai.CustomJob(
    display_name=f'imdb-bert-{DISTRIBUTION_STRATEGY}-{TRAIN_NGPU}-{ACCELERATOR}',
    worker_pool_specs=WORKER_POOL_SPECS,
    staging_bucket=f'{STAGING_BUCKET}/{EXPERIMENT_NAME}/{RUN_NAME}'
)

In [82]:
custom_mmr_job.run(
    sync=False
    , service_account=VERTEX_SA
    , tensorboard=TENSORBOARD
    , restart_job_on_worker_restart=False
    , enable_web_access=True
)

In [83]:
print(f"Job Name: {custom_mmr_job.display_name}")
print(f"Job Resource Name: {custom_mmr_job.resource_name}\n")
# print(f"Check training progress at {custom_m_job._dashboard_uri()}")

Job Name: imdb-bert-run-20230621-012602-multiworker-2-1-r
Job Resource Name: projects/934903580331/locations/us-central1/customJobs/2179064783045656576



## 2 Replicas, 2 GPUs each + Reduction Server

### set Experiment Run

In [84]:
RUN_NAME = f'run-{time.strftime("%Y%m%d-%H%M%S")}'

print(f"EXPERIMENT_NAME: {EXPERIMENT_NAME}")
print(f"RUN_NAME: {RUN_NAME}")

EXPERIMENT_NAME: jtv7-bert-tune
RUN_NAME: run-20230621-012707


### config compute

In [85]:
# MACHINE_TYPE = 'a2-highgpu-2g'
# TRAIN_GPU, TRAIN_NGPU = ('NVIDIA_TESLA_A100', 2)

# MACHINE_TYPE = 'n1-standard-16'
# TRAIN_GPU, TRAIN_NGPU = ('NVIDIA_TESLA_T4', 2)

# MACHINE_TYPE = 'a2-highgpu-2g'
# TRAIN_GPU, TRAIN_NGPU = ('NVIDIA_TESLA_A100', 2)

MACHINE_TYPE = 'a2-ultragpu-2g'
TRAIN_GPU, TRAIN_NGPU = ('NVIDIA_A100_80GB', 2)

REPLICA_COUNT = 2
DISTRIBUTION_STRATEGY = "multiworker" # single, mirrored, multiworker, tpu

HP_TUNING="False"

print(f"MACHINE_TYPE          : {MACHINE_TYPE}")
print(f"TRAIN_GPU             : {TRAIN_GPU}")
print(f"TRAIN_NGPU            : {TRAIN_NGPU}")
print(f"REPLICA_COUNT         : {REPLICA_COUNT}")
print(f"DISTRIBUTION_STRATEGY : {DISTRIBUTION_STRATEGY}")
print(f"HP_TUNING             : {HP_TUNING}")

MACHINE_TYPE          : n1-standard-16
TRAIN_GPU             : NVIDIA_TESLA_T4
TRAIN_NGPU            : 2
REPLICA_COUNT         : 2
DISTRIBUTION_STRATEGY : multiworker


### worker args

In [86]:
epochs = 10
steps_per_epoch = 200
eval_steps = 50

PER_REPLICA_BATCH_SIZE = 32
LEARNING_RATE= 0.001

REDUCTION_SERVER_COUNT = 2
REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"

training_data_path = f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/train'
validation_data_path = f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/valid'
testing_data_path = f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/test'
job_id = 'job-{}'.format(datetime.now().strftime("%Y%m%d%H%M%S"))
job_dir = f'{STAGING_BUCKET}/jobs/{job_id}'

WORKER_ARGS = [
    "--epochs=" + str(epochs)
    , "--steps_per_epoch=" + str(steps_per_epoch)
    , "--eval_steps=" + str(eval_steps)
    , "--per_replica_batch_size=" + str(PER_REPLICA_BATCH_SIZE)
    , "--training_data_path=" + training_data_path
    , "--validation_data_path=" + validation_data_path
    , "--testing_data_path=" + testing_data_path
    , "--job_dir=" + job_dir
    , f"--strategy={DISTRIBUTION_STRATEGY}"
    , "--auto_shard_policy=data" # data auto
    , f"--job_id={job_id}"
    , f"--TRAIN_GPU={TRAIN_GPU}"
    , f"--TRAIN_NGPU={TRAIN_NGPU}"
    , f"--reduction_cnt={REDUCTION_SERVER_COUNT}"
    , f"--replica_count={REPLICA_COUNT}"
    , f"--experiment_name={EXPERIMENT_NAME}"
    , f"--experiment_run={RUN_NAME}"
    , f"--learning_rate={LEARNING_RATE}"
    , f"--tuning={HP_TUNING}"
]

from utils import workerpool_specs

WORKER_POOL_SPECS = workerpool_specs.prepare_worker_pool_specs(
    image_uri=TRAIN_IMAGE,
    args=WORKER_ARGS,
    replica_count=REPLICA_COUNT,
    machine_type=MACHINE_TYPE,
    accelerator_count=TRAIN_NGPU,
    accelerator_type=TRAIN_GPU,
    reduction_server_count=REDUCTION_SERVER_COUNT,
    reduction_server_machine_type=REDUCTION_SERVER_MACHINE_TYPE,
)

from pprint import pprint
pprint(WORKER_POOL_SPECS)

[{'container_spec': {'args': ['--epochs=10',
                              '--steps_per_epoch=200',
                              '--eval_steps=50',
                              '--per_replica_batch_size=32',
                              '--training_data_path=gs://jtv7-hybrid-vertex-bucket/bert-finetuning/imdb/tfrecords/train',
                              '--validation_data_path=gs://jtv7-hybrid-vertex-bucket/bert-finetuning/imdb/tfrecords/valid',
                              '--testing_data_path=gs://jtv7-hybrid-vertex-bucket/bert-finetuning/imdb/tfrecords/test',
                              '--job_dir=gs://jtv7-hybrid-vertex-bucket/jobs/job-20230621012708',
                              '--strategy=multiworker',
                              '--auto_shard_policy=data',
                              '--job_id=job-20230621012708',
                              '--TRAIN_GPU=NVIDIA_TESLA_T4',
                              '--TRAIN_NGPU=2',
                              '--reduction

### create tensorboard

In [87]:
vertex_ai_tb = vertex_ai.Tensorboard.create()
TENSORBOARD = vertex_ai_tb.gca_resource.name

# TENSORBOARD="projects/934903580331/locations/us-central1/tensorboards/3422410261359755264"

print(TENSORBOARD)

projects/934903580331/locations/us-central1/tensorboards/6655994793811771392


In [88]:
vertex_ai.init(
    experiment=EXPERIMENT_NAME
    # , experiment_tensorboard=vertex_ai_tb
)

### submit train job

In [None]:
ACCELERATOR = TRAIN_GPU.lower().replace("nvidia_","").replace("_","-")
print(ACCELERATOR)

In [89]:
custom_r4_job = vertex_ai.CustomJob(
    display_name=f'imdb-bert-{DISTRIBUTION_STRATEGY}-{TRAIN_NGPU}-{ACCELERATOR}',
    worker_pool_specs=WORKER_POOL_SPECS,
    staging_bucket=f'{STAGING_BUCKET}/{EXPERIMENT_NAME}/{RUN_NAME}'
)

In [90]:
custom_r4_job.run(
    sync=False
    , service_account=VERTEX_SA
    , tensorboard=TENSORBOARD
    , restart_job_on_worker_restart=False
    , enable_web_access=True
)

In [91]:
print(f"Job Name: {custom_r4_job.display_name}")
print(f"Job Resource Name: {custom_r4_job.resource_name}\n")
# print(f"Check training progress at {custom_m_job._dashboard_uri()}")

Job Name: imdb-bert-run-20230621-012707-multiworker-2-2-r
Job Resource Name: projects/934903580331/locations/us-central1/customJobs/1688172423662272512



## Cloud TPU 

* To use [Tensor Processing Units (TPUs)](https://cloud.google.com/tpu/docs/tpus) for custom training on Vertex AI, you can configure a worker pool to use a [TPU VM](https://cloud.google.com/tpu/docs/system-architecture-tpu-vm#tpu-vm).

When you use a TPU VM in Vertex AI, you must only use a single worker pool for custom training, and you must configure this worker pool to use only one replica.

#### TPU VMs worker pool configurations:

**To configure a TPU VM with [TPU V2](https://cloud.google.com/tpu/docs/system-architecture-tpu-vm#tpu_v2), specify the following fields in the WorkerPoolSpec:**
* Set `machineSpec.machineType` to `cloud-tpu`.
* Set `machineSpec.acceleratorType` to `TPU_V2`.
* Set `machineSpec.acceleratorCount` to 8 for single TPU or `32 or multiple of 32` for TPU Pods.
* Set `replicaCount` to 1.

**To configure a TPU VM with [TPU V3](https://cloud.google.com/tpu/docs/system-architecture-tpu-vm#tpu_v3), specify the following fields in the WorkerPoolSpec:**
* Set `machineSpec.machineType` to `cloud-tpu`.
* Set `machineSpec.acceleratorType` to `TPU_V3`.
* Set `machineSpec.acceleratorCount` to `8` for single TPU or `32+` for TPU Pods.
* Set `replicaCount` to 1.

In [147]:
! rm -rf scripts_tpu/trainer
! mkdir -p scripts_tpu/trainer
! touch scripts_tpu/trainer/__init__.py

In [148]:
! cp -R scripts/trainer scripts_tpu/
! ls scripts_tpu

Dockerfile  trainer


#### write Dockefile for TPU training

In [149]:
# TRAIN_BASE_IMAGE = 'us-docker.pkg.dev/vertex-ai/training/tf-gpu.2-11:latest'
# FROM us-docker.pkg.dev/vertex-ai/training/tf-tpu-pod-base-cp38:latest

# TRAIN_IMAGE_TPU = gcr.io/hybrid-vertex/imdb_bert_tpu:latest
DOCKER_TPU = 'Dockerfile'

In [150]:
# dockerfile = f'''
# FROM python:3.8

# RUN pip install tf-models-official==2.12.0
# RUN pip install tensorflow-text==2.12.0

# WORKDIR /

# # Copies the trainer code to the docker image.
# COPY trainer /trainer

# # Install TPU Tensorflow and dependencies.
# # libtpu.so must be under the '/lib' directory.
# RUN wget https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/libtpu/20221214/libtpu.so -O /lib/libtpu.so
# RUN chmod 777 /lib/libtpu.so

# RUN wget https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/tensorflow/20221214/tf_nightly-2.12.0-cp38-cp38-linux_x86_64.whl
# RUN pip3 install tf_nightly-2.12.0-cp38-cp38-linux_x86_64.whl
# RUN rm tf_nightly-2.12.0-cp38-cp38-linux_x86_64.whl

# # Sets up the entry point to invoke the trainer.
# ENTRYPOINT ["python", "-m", "trainer.task"]
# '''

# with open(f'scripts_tpu/{DOCKER_TPU}', 'w') as f:
#     f.write(dockerfile)

In [151]:
dockerfile = f'''
FROM python:3.8

WORKDIR /

RUN wget https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/libtpu/1.5.0/libtpu.so -O /lib/libtpu.so
RUN chmod 777 /lib/libtpu.so

RUN wget https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/tensorflow/tf-2.11.0/tensorflow-2.11.0-cp38-cp38-linux_x86_64.whl
RUN pip3 install tensorflow-2.11.0-cp38-cp38-linux_x86_64.whl
RUN rm tensorflow-2.11.0-cp38-cp38-linux_x86_64.whl

RUN pip install tf-models-official==2.11.0
RUN pip install tensorflow-text==2.11.0
RUN pip install cloudml-hypertune
RUN pip install --upgrade tensorflow-hub
RUN pip install --upgrade google-cloud-aiplatform

# Copies the trainer code to the docker image.
COPY trainer /trainer

# Sets up the entry point to invoke the trainer.
ENTRYPOINT ["python", "-m", "trainer.task"]
'''

with open(f'scripts_tpu/{DOCKER_TPU}', 'w') as f:
     f.write(dockerfile)

#### run docker commands in terminal

In [None]:
# ! docker build -t $TRAIN_IMAGE_TPU scripts_tpu/

In [None]:
# ! docker push $TRAIN_IMAGE_TPU

### set Experiment Run

In [152]:
RUN_NAME = f'run-{time.strftime("%Y%m%d-%H%M%S")}'

print(f"EXPERIMENT_NAME: {EXPERIMENT_NAME}")
print(f"RUN_NAME: {RUN_NAME}")

EXPERIMENT_NAME: jtv8-bert-tune
RUN_NAME: run-20230621-122648


### config compute

In [180]:
# Use TPU Accelerators. Temporarily using numeric codes, until types are added to the SDK
#   6 = TPU_V2
#   7 = TPU_V3
TRAIN_TPU, TRAIN_NTPU = (6, 8)
MACHINE_TYPE = "cloud-tpu"

# VM count
REPLICA_COUNT = 1

# distribution strategy
if not TRAIN_NTPU or TRAIN_NTPU < 2:
    DISTRIBUTION_STRATEGY = "single"
else:
    DISTRIBUTION_STRATEGY = "tpu"
print(DISTRIBUTION_STRATEGY)

# hptuning
HP_TUNING="False"

print(f"MACHINE_TYPE          : {MACHINE_TYPE}")
print(f"TRAIN_TPU             : {TRAIN_TPU}")
print(f"TRAIN_NTPU            : {TRAIN_NTPU}")
print(f"REPLICA_COUNT         : {REPLICA_COUNT}")
print(f"DISTRIBUTION_STRATEGY : {DISTRIBUTION_STRATEGY}")
print(f"HP_TUNING             : {HP_TUNING}")

tpu
MACHINE_TYPE          : cloud-tpu
TRAIN_TPU             : 6
TRAIN_NTPU            : 8
REPLICA_COUNT         : 1
DISTRIBUTION_STRATEGY : tpu
HP_TUNING             : False


### worker args

In [181]:
epochs = 10
steps_per_epoch = 200
eval_steps = 50

PER_REPLICA_BATCH_SIZE = 32
LEARNING_RATE= 0.001

REDUCTION_SERVER_COUNT = 0
REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"

training_data_path = f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/train'
validation_data_path = f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/valid'
testing_data_path = f'{STAGING_BUCKET}/bert-finetuning/imdb/tfrecords/test'
job_id = 'job-{}'.format(datetime.now().strftime("%Y%m%d%H%M%S"))
job_dir = f'{STAGING_BUCKET}/jobs/{job_id}'

WORKER_ARGS = [
    "--epochs=" + str(epochs)
    , "--steps_per_epoch=" + str(steps_per_epoch)
    , "--eval_steps=" + str(eval_steps)
    , "--per_replica_batch_size=" + str(PER_REPLICA_BATCH_SIZE)
    , "--training_data_path=" + training_data_path
    , "--validation_data_path=" + validation_data_path
    , "--testing_data_path=" + testing_data_path
    , "--job_dir=" + job_dir
    , f"--strategy={DISTRIBUTION_STRATEGY}"
    , "--auto_shard_policy=data" # data auto
    , f"--job_id={job_id}"
    , f"--TRAIN_GPU={TRAIN_TPU}"
    , f"--TRAIN_NGPU={TRAIN_NTPU}"
    , f"--reduction_cnt={REDUCTION_SERVER_COUNT}"
    , f"--replica_count={REPLICA_COUNT}"
    , f"--experiment_name={EXPERIMENT_NAME}"
    , f"--experiment_run={RUN_NAME}"
    , f"--learning_rate={LEARNING_RATE}"
    , f"--tuning={HP_TUNING}"
]

from utils import workerpool_specs

WORKER_POOL_SPECS = workerpool_specs.prepare_worker_pool_specs(
    image_uri=TRAIN_IMAGE_TPU,
    args=WORKER_ARGS,
    replica_count=REPLICA_COUNT,
    machine_type=MACHINE_TYPE,
    accelerator_count=TRAIN_NTPU,
    accelerator_type=TRAIN_TPU,
    reduction_server_count=REDUCTION_SERVER_COUNT,
    reduction_server_machine_type=REDUCTION_SERVER_MACHINE_TYPE,
)

from pprint import pprint
pprint(WORKER_POOL_SPECS)

[{'container_spec': {'args': ['--epochs=10',
                              '--steps_per_epoch=200',
                              '--eval_steps=50',
                              '--per_replica_batch_size=32',
                              '--training_data_path=gs://jtv8-hybrid-vertex-bucket/bert-finetuning/imdb/tfrecords/train',
                              '--validation_data_path=gs://jtv8-hybrid-vertex-bucket/bert-finetuning/imdb/tfrecords/valid',
                              '--testing_data_path=gs://jtv8-hybrid-vertex-bucket/bert-finetuning/imdb/tfrecords/test',
                              '--job_dir=gs://jtv8-hybrid-vertex-bucket/jobs/job-20230621124323',
                              '--strategy=tpu',
                              '--auto_shard_policy=data',
                              '--job_id=job-20230621124323',
                              '--TRAIN_GPU=6',
                              '--TRAIN_NGPU=8',
                              '--reduction_cnt=0',
             

### create tensorboard

In [155]:
vertex_ai_tb = vertex_ai.Tensorboard.create()
TENSORBOARD = vertex_ai_tb.gca_resource.name

# TENSORBOARD="projects/934903580331/locations/us-central1/tensorboards/5728253270573449216"

print(TENSORBOARD)

projects/934903580331/locations/us-central1/tensorboards/4318626587206483968


In [182]:
vertex_ai.init(
    experiment=EXPERIMENT_NAME
    # , experiment_tensorboard=vertex_ai_tb
)

### submit train job

In [183]:
ACCELERATOR = "tpu-v2"
print(ACCELERATOR)

tpu-v2


In [184]:
custom_tpu_job = vertex_ai.CustomJob(
    display_name=f'imdb-bert-{DISTRIBUTION_STRATEGY}-{TRAIN_NTPU}-{ACCELERATOR}',
    worker_pool_specs=WORKER_POOL_SPECS,
    staging_bucket=f'{STAGING_BUCKET}/{EXPERIMENT_NAME}/{RUN_NAME}'
)

In [185]:
custom_tpu_job.run(
    sync=False
    # , service_account=VERTEX_SA
    # , tensorboard=TENSORBOARD
    , restart_job_on_worker_restart=False
    , enable_web_access=True
)

In [186]:
print(f"Job Name: {custom_tpu_job.display_name}")
print(f"Job Resource Name: {custom_tpu_job.resource_name}\n")
# print(f"Check training progress at {custom_tpu_job._dashboard_uri()}")

Job Name: imdb-bert-tpu-8-tpu-v2
Job Resource Name: projects/934903580331/locations/us-central1/customJobs/6248242968235343872

