##### Copyright 2021 The TensorFlow Cloud Authors.


In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Running custom tasks experiments from TF Model Garden on GCP with TF Cloud

<table class="tfo-notebook-buttons" align="left">
  <td>
        <a target="_blank" href="https://colab.research.google.com/github/tensorflow/cloud/blob/master/src/python/tensorflow_cloud/core/experimental/tests/examples/running_custom_task_experiment_from_tf_model_garden_on_gcp_with_tf_cloud.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
        <a target="_blank" href="https://github.com/tensorflow/cloud/blob/master/src/python/tensorflow_cloud/core/experimental/tests/examples/running_custom_task_experiment_from_tf_model_garden_on_gcp_with_tf_cloud.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View on GitHub</a>
  </td>
  <td>
        <a href="https://kaggle.com/kernels/welcome?src=https://github.com/tensorflow/cloud/blob/master/src/python/tensorflow_cloud/core/experimental/tests/examples/running_custom_task_experiment_from_tf_model_garden_on_gcp_with_tf_cloud.ipynb" target="blank"> <img width="90" src="https://www.kaggle.com/static/images/site-logo.png" alt="Kaggle logo" />Run in Kaggle</a>
  </td>
</table>

In this example we will use [run_experiment_cloud](https://github.com/tensorflow/cloud/blob/690c3eee65dadee8af260a19341ff23f42f1f070/src/python/tensorflow_cloud/core/experimental/models.py#L230) from the experimental module of TF Cloud to train a ResNet model from [TF Model Garden](https://github.com/tensorflow/models/tree/master/official) on an image classification task with the cifar 10 dataset from TFDS. We will also be showing the different distribution strategies that this method supports.

## Install Packages

We need tensorflow-cloud and the nightly build of the TF Model Garden package (tf-models-nightly); the cell below installs both with pip.

In [None]:
# Install the tensorflow-cloud and tf-models-nightly packages (-q keeps pip output quiet).
!pip install -q tensorflow-cloud tf-models-nightly

In [2]:
import tensorflow_cloud as tfc
# Print the installed tensorflow_cloud version to confirm which release this notebook is using.
print(tfc.__version__)

0.1.16


## Import required modules

In [None]:
import copy
import os
import sys

from tensorflow_cloud.core.experimental.models import run_experiment_cloud

from official.core import config_definitions as cfg
from official.core import exp_factory
from official.core import task_factory
from official.modeling import optimization
from official.vision.beta.configs.backbones import Backbone
from official.vision.beta.configs.backbones import ResNet
from official.vision.beta.configs import common
from official.vision.beta.configs import image_classification

## Project Configurations
Setting project parameters. For more details on Google Cloud Specific parameters please refer to [Google Cloud Project Setup Instructions](https://www.kaggle.com/nitric/google-cloud-project-setup-instructions/).

In [None]:
# Set Google Cloud Specific parameters

# TODO: Please set GCP_PROJECT_ID to your own Google Cloud project ID.
GCP_PROJECT_ID = 'YOUR_PROJECT_ID' #@param {type:"string"}

# TODO: set GCS_BUCKET to your own Google Cloud Storage (GCS) bucket.
GCS_BUCKET = 'YOUR_BUCKET_NAME' #@param {type:"string"}

# DO NOT CHANGE: Currently only the 'us-central1' region is supported.
REGION = 'us-central1'

# OPTIONAL: You can change the job name to any string.
JOB_NAME = 'cifar10_resnet' #@param {type:"string"}

# Location where training logs and checkpoints will be stored.
GCS_BASE_PATH = f'gs://{GCS_BUCKET}/{JOB_NAME}'
# GCS object paths always use '/' as the separator, so the path is built with
# an f-string instead of os.path.join, which would insert '\' on Windows and
# produce an invalid gs:// URI.
MODEL_DIR = f'{GCS_BASE_PATH}/model'

## Authenticating the notebook to use your Google Cloud Project

This code authenticates the notebook, checking your valid Google Cloud credentials and identity. It is inside the `if not tfc.remote()` block to ensure that it is only run in the notebook, and will not be run when the notebook code is sent to Google Cloud.

Note: For Kaggle Notebooks click on "Add-ons"->"Google Cloud SDK" before running the cell below.

In [None]:
# Skip authentication whenever tfc.remote() is true (presumably when this code
# is executing on Google Cloud rather than interactively in the notebook).
if not tfc.remote():

    # Authentication for Kaggle Notebooks
    # Runs only if the kaggle_secrets module has already been loaded in this session.
    if "kaggle_secrets" in sys.modules:
        from kaggle_secrets import UserSecretsClient
        UserSecretsClient().set_gcloud_credentials(project=GCP_PROJECT_ID)

    # Authentication for Colab Notebooks
    # Runs only if the google.colab module has already been loaded in this session.
    if "google.colab" in sys.modules:
        from google.colab import auth
        auth.authenticate_user()
        # Expose the project ID through the GOOGLE_CLOUD_PROJECT environment variable.
        os.environ["GOOGLE_CLOUD_PROJECT"] = GCP_PROJECT_ID

## Set Up TF Model Garden Experiment

We are going to set up the experiment from TF Model Garden that we want to run. In this case, we are going to be training a ResNet model on an image classification task with the cifar 10 dataset from TFDS.

In [None]:
@exp_factory.register_config_factory('resnet_cifar10')
def image_classification_cifar10() -> cfg.ExperimentConfig:
  """Builds the experiment config for ResNet-50 classification on cifar10.

  The factory is registered under the name 'resnet_cifar10'. Data comes from
  the TFDS 'cifar10' dataset ('train' split for training, 'test' split for
  validation) with a global batch size of 256 for both. The trainer runs for
  90 * steps_per_epoch steps using SGD with momentum 0.9, a stepwise learning
  rate whose boundaries sit at 30, 60 and 80 times steps_per_epoch, and a
  linear warmup over the first 5 * steps_per_epoch steps.

  Returns:
    The populated `cfg.ExperimentConfig`.
  """
  dataset = 'cifar10'
  num_train_examples = 50000
  num_val_examples = 10000
  train_bs = 256
  eval_bs = 256
  # Integer division: a partial final batch is not counted as a step.
  steps_per_epoch = num_train_examples // train_bs

  # Model: ResNet-50 backbone with a 10-way classification output.
  model = image_classification.ImageClassificationModel(
      num_classes=10,
      input_size=[224, 224, 3],
      backbone=Backbone(type='resnet', resnet=ResNet(model_id=50)),
      norm_activation=common.NormActivation(
          norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False))
  losses = image_classification.Losses(l2_weight_decay=1e-4)

  # Input pipelines for training and validation.
  train_data = image_classification.DataConfig(
      tfds_name=dataset,
      tfds_split='train',
      is_training=True,
      global_batch_size=train_bs)
  validation_data = image_classification.DataConfig(
      tfds_name=dataset,
      tfds_split='test',
      is_training=False,
      global_batch_size=eval_bs)

  task = image_classification.ImageClassificationTask(
      model=model,
      losses=losses,
      train_data=train_data,
      validation_data=validation_data)

  # Stepwise schedule: one more value than boundaries, each value scaled by
  # the training batch size relative to 256.
  lr_boundaries = [epochs * steps_per_epoch for epochs in (30, 60, 80)]
  lr_values = [
      base_lr * train_bs / 256 for base_lr in (0.1, 0.01, 0.001, 0.0001)
  ]
  optimizer_config = optimization.OptimizationConfig({
      'optimizer': {
          'type': 'sgd',
          'sgd': {'momentum': 0.9},
      },
      'learning_rate': {
          'type': 'stepwise',
          'stepwise': {
              'boundaries': lr_boundaries,
              'values': lr_values,
          },
      },
      'warmup': {
          'type': 'linear',
          'linear': {
              'warmup_steps': 5 * steps_per_epoch,
              'warmup_learning_rate': 0,
          },
      },
  })

  # Summaries, checkpoints and validation all happen once per steps_per_epoch.
  trainer = cfg.TrainerConfig(
      steps_per_loop=steps_per_epoch,
      summary_interval=steps_per_epoch,
      checkpoint_interval=steps_per_epoch,
      train_steps=90 * steps_per_epoch,
      validation_steps=num_val_examples // eval_bs,
      validation_interval=steps_per_epoch,
      optimizer_config=optimizer_config)

  restrictions = [
      'task.train_data.is_training != None',
      'task.validation_data.is_training != None',
  ]
  return cfg.ExperimentConfig(
      task=task, trainer=trainer, restrictions=restrictions)



Once the experiment config is ready, we can store all of the parameters in a dictionary. For more details, refer to [run_experiment on GitHub](https://github.com/tensorflow/models/blob/7c2ff1afc4423266223bcd50cba0ed55aca826c8/official/core/train_lib.py#L35).

Note: run_experiment requires a distribution_strategy parameter. However, run_experiment_cloud selects the distribution strategy based on the cloud configuration. Therefore, you should not pass this parameter as part of run_experiment_kwargs. For more information on distribution strategies check [Running model experiments from TF Model Garden on GCP with TF Cloud](https://github.com/tensorflow/cloud/blob/master/src/python/tensorflow_cloud/core/experimental/tests/examples/running_model_experiments_from_tf_model_garden_on_gcp_with_tf_cloud.ipynb).

In [None]:
# NOTE(review): ImageClassificationTask is not referenced by name in this cell;
# presumably the import is kept for its side effect of registering the task
# class with task_factory -- confirm before removing.
from official.vision.beta.tasks.image_classification import ImageClassificationTask

# Fetch the experiment config registered under the name 'resnet_cifar10'.
config = exp_factory.get_exp_config('resnet_cifar10')

# Keyword arguments for run_experiment. No distribution_strategy entry is
# included here.
run_experiment_kwargs = {
    "params": config,
    "task": task_factory.get_task(config.task),
    "mode": "train_and_eval",
    "model_dir": MODEL_DIR,
}

## Set up TensorFlowCloud run

Setting up parameters for tfc.run(). The chief_config, worker_count and worker_config will be set up individually for each distribution strategy. For more details refer to [TensorFlow Cloud overview tutorial](https://colab.research.google.com/github/tensorflow/cloud/blob/master/g3doc/tutorials/overview.ipynb)

In [None]:
# Write the single extra pip requirement to a local requirements.txt file.
with open('requirements.txt', 'w') as f:
    f.write('tf-models-nightly\n')

# Keyword arguments for the TensorFlow Cloud run: the requirements file, the
# Docker parent image and build bucket, the chief machine config, and a job
# label carrying JOB_NAME.
run_kwargs = {
    'requirements_txt': 'requirements.txt',
    'docker_config': tfc.DockerConfig(
        parent_image="gcr.io/deeplearning-platform-release/tf2-gpu.2-5",
        image_build_bucket=GCS_BUCKET,
    ),
    'chief_config': tfc.COMMON_MACHINE_CONFIGS["T4_4X"],
    'job_labels': {'job': JOB_NAME},
}

## Run remote experiment

With run_experiment_kwargs and run_kwargs complete, we can now call run_experiment_cloud to run the experiment in GCP.

In [None]:
# Launch the experiment with the two keyword-argument dictionaries assembled above.
run_experiment_cloud(run_experiment_kwargs, run_kwargs)

# Training Results
## Reconnect your Colab instance
Most remote training jobs are long running. If you are using Colab, it may time out before the training results are available. In that case, rerun the following sections to reconnect and configure your Colab instance to access the training results. Run the following sections in order:

1.   Import required modules
2.   Project Configurations
3.   Authenticating the notebook to use your Google Cloud Project

## Load your trained model

Once training is complete, you can retrieve your model from the GCS bucket you specified above.

In [None]:
import tensorflow as tf

# Load the Keras model stored under MODEL_DIR and print its layer summary.
# NOTE(review): this assumes the remote job exported a loadable Keras model to
# MODEL_DIR -- confirm against run_experiment_cloud.
trained_model = tf.keras.models.load_model(MODEL_DIR)
trained_model.summary()