##### Copyright 2021 The TensorFlow Cloud Authors.


In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Running model experiments from TF Model Garden on GCP with TF Cloud

<table class="tfo-notebook-buttons" align="left">
  <td>
        <a target="_blank" href="https://colab.research.google.com/github/tensorflow/cloud/blob/master/src/python/tensorflow_cloud/core/experimental/tests/examples/running_model_experiments_from_tf_model_garden_on_gcp_with_tf_cloud.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
        <a target="_blank" href="https://github.com/tensorflow/cloud/blob/master/src/python/tensorflow_cloud/core/experimental/tests/examples/running_model_experiments_from_tf_model_garden_on_gcp_with_tf_cloud.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View on GitHub</a>
  </td>
  <td>
        <a target="_blank" href="https://kaggle.com/kernels/welcome?src=https://github.com/tensorflow/cloud/blob/master/src/python/tensorflow_cloud/core/experimental/tests/examples/running_model_experiments_from_tf_model_garden_on_gcp_with_tf_cloud.ipynb"> <img width="90" src="https://www.kaggle.com/static/images/site-logo.png" alt="Kaggle logo" />Run in Kaggle</a>
  </td>
</table>

In this example we will use [run_experiment_cloud](https://github.com/tensorflow/cloud/blob/690c3eee65dadee8af260a19341ff23f42f1f070/src/python/tensorflow_cloud/core/experimental/models.py#L230) from the experimental module of TF Cloud to run a mock experiment from [TF Model Garden](https://github.com/tensorflow/models/tree/master/official). We will also be showing the different distribution strategies that this method supports.

## Install Packages

We need tensorflow-cloud and the official release of tf-models-official.

In [None]:
!pip install -q tensorflow-cloud tf-models-official

## Import required modules

In [5]:
import copy
import os
import sys

import tensorflow_cloud as tfc
from tensorflow_cloud.core.experimental.models import run_experiment_cloud

from official.core import task_factory
from official.utils.testing import mock_task

# Print the version string of the imported tensorflow_cloud package.
print(tfc.__version__)

0.1.17.dev


## Project Configurations
Set the project parameters. For more details on the Google Cloud specific parameters, please refer to [Google Cloud Project Setup Instructions](https://www.kaggle.com/nitric/google-cloud-project-setup-instructions/).

In [None]:
# Set Google Cloud Specific parameters

# TODO: Please set GCP_PROJECT_ID to your own Google Cloud project ID.
# The authentication cell passes this value to the Kaggle credential helper
# and, on Colab, exports it as the GOOGLE_CLOUD_PROJECT environment variable.
GCP_PROJECT_ID = 'YOUR_PROJECT_ID' #@param {type:"string"}

# TODO: set GCS_BUCKET to your own Google Cloud Storage (GCS) bucket.
# Give the bare bucket name without a "gs://" prefix: later cells build
# f'gs://{GCS_BUCKET}/...' paths from it and also pass it as the Docker
# image build bucket.
GCS_BUCKET = 'YOUR_BUCKET_NAME' #@param {type:"string"}

# DO NOT CHANGE: Currently only the 'us-central1' region is supported.
# NOTE(review): REGION is not referenced by any other cell visible in this
# notebook - confirm whether it is still needed.
REGION = 'us-central1'

## Authenticating the notebook to use your Google Cloud Project

This code authenticates the notebook, checking your valid Google Cloud credentials and identity. It is inside the `if not tfc.remote()` block to ensure that it is only run in the notebook, and will not be run when the notebook code is sent to Google Cloud.

Note: For Kaggle Notebooks click on "Add-ons"->"Google Cloud SDK" before running the cell below.

In [None]:
# Credentials are only set up in the interactive notebook; when this code
# runs remotely (tfc.remote() is true) the whole block is skipped.
if not tfc.remote():

    # Kaggle: hand the project ID to the Kaggle secrets client.
    if "kaggle_secrets" in sys.modules:
        from kaggle_secrets import UserSecretsClient
        kaggle_secrets_client = UserSecretsClient()
        kaggle_secrets_client.set_gcloud_credentials(project=GCP_PROJECT_ID)

    # Colab: run the interactive login, then record the target project in
    # the environment.
    if "google.colab" in sys.modules:
        from google.colab import auth
        auth.authenticate_user()
        os.environ["GOOGLE_CLOUD_PROJECT"] = GCP_PROJECT_ID

## Set Up TF Model Garden Experiment

We are going to set up the experiment from TF Model Garden that we want to run. In this case, we are going to be running a mock experiment. However, you can choose any experiment from TF Model Garden. Also, we are going to be overriding some of the params from the original experiment to include a trainer.

After having the experiment config ready, we can store all of the params in a dictionary. The only param we are missing is model_dir, however we will be setting this one up later on.

For more details refer to [run_experiment GitHub](https://github.com/tensorflow/models/blob/7c2ff1afc4423266223bcd50cba0ed55aca826c8/official/core/train_lib.py#L35).

Note: run_experiment requires a distribution_strategy parameter. However, run_experiment_cloud selects the distribution strategy based on the cloud configuration. Therefore, you should not pass this parameter as part of run_experiment_kwargs.

In [None]:
# Start from the Model Garden mock experiment configuration.
config = mock_task.mock_experiment()

# Optimizer settings used by the trainer override below.
optimizer_overrides = {
    "optimizer": {"type": "sgd"},
    "learning_rate": {"type": "constant"},
}

# Short trainer schedule layered on top of the mock experiment.
trainer_overrides = {
    "checkpoint_interval": 10,
    "steps_per_loop": 10,
    "summary_interval": 10,
    "train_steps": 10,
    "validation_steps": 5,
    "validation_interval": 10,
    "continuous_eval_timeout": 1,
    "validation_summary_subdir": "validation",
    "optimizer_config": optimizer_overrides,
}

overrides = {"trainer": trainer_overrides}
config.override(overrides, is_strict=False)

# Keyword arguments shared by every strategy; model_dir is added per
# strategy in later cells.
run_experiment_kwargs = {
    "params": config,
    "task": task_factory.get_task(config.task),
    "mode": "train_and_eval",
}

## Set up TensorFlowCloud run

Set up parameters for tfc.run(). The chief_config, worker_count and worker_config will be set up individually for each distribution strategy. For more details refer to [TensorFlow Cloud overview tutorial](https://colab.research.google.com/github/tensorflow/cloud/blob/master/g3doc/tutorials/overview.ipynb)

In [None]:
# Write the requirements file that run_kwargs points at.
with open('requirements.txt', 'w') as requirements_file:
    requirements_file.write('tf-models-official\n')

# Base tfc.run() arguments shared by all strategies; machine configuration
# and job labels are added per strategy in later cells.
base_docker_config = tfc.DockerConfig(
    parent_image="gcr.io/deeplearning-platform-release/tf2-gpu.2-5",
    image_build_bucket=GCS_BUCKET,
)
run_kwargs = {
    "requirements_txt": 'requirements.txt',
    "docker_config": base_docker_config,
}

## Set up distribution strategies

Currently run_experiment_cloud supports 4 different distribution strategies:

1.   One Device
2.   Mirror
3.   Multi Worker Mirror
4.   TPU

However, unlike run_experiment from TF Model Garden, the user does not specify the distribution strategy. Instead, it is selected based on the machine configuration provided in run_kwargs. The following sections show how to set up the machine config to use the different distribution strategies.



### One device strategy

Using default values for config (One accelerator (T4_1X) and 0 workers).

In [None]:
JOB_NAME = 'one_device' #@param {type:"string"}

# GCS location where training logs and checkpoints will be stored.
GCS_BASE_PATH = f'gs://{GCS_BUCKET}/{JOB_NAME}'
one_device_model_dir = os.path.join(GCS_BASE_PATH, "saved_model")

# Work on a private copy so the shared experiment kwargs stay untouched.
one_device_run_experiment_kwargs = copy.deepcopy(run_experiment_kwargs)
one_device_run_experiment_kwargs["model_dir"] = one_device_model_dir

# No chief/worker configuration is set here; only the job label is added.
one_device_run_kwargs = copy.deepcopy(run_kwargs)
one_device_run_kwargs["job_labels"] = {'job': JOB_NAME}

### Mirror strategy

Requires at least two accelerators in the chief_config and 0 workers.

In [None]:
JOB_NAME = 'mirror' #@param {type:"string"}

# GCS location where training logs and checkpoints will be stored.
GCS_BASE_PATH = f'gs://{GCS_BUCKET}/{JOB_NAME}'
mirror_model_dir = os.path.join(GCS_BASE_PATH, "saved_model")

# Work on a private copy so the shared experiment kwargs stay untouched.
mirror_run_experiment_kwargs = copy.deepcopy(run_experiment_kwargs)
mirror_run_experiment_kwargs["model_dir"] = mirror_model_dir

# Multi-accelerator chief with no workers.
mirror_run_kwargs = copy.deepcopy(run_kwargs)
mirror_run_kwargs.update(
    chief_config=tfc.COMMON_MACHINE_CONFIGS["P100_4X"],
    job_labels={'job': JOB_NAME},
)

### Multi mirror strategy

Requires at least one worker.

In [None]:
JOB_NAME = 'multi_mirror' #@param {type:"string"}

# GCS location where training logs and checkpoints will be stored.
GCS_BASE_PATH = f'gs://{GCS_BUCKET}/{JOB_NAME}'
multi_mirror_model_dir = os.path.join(GCS_BASE_PATH, "saved_model")

# Work on a private copy so the shared experiment kwargs stay untouched.
multi_mirror_run_experiment_kwargs = copy.deepcopy(run_experiment_kwargs)
multi_mirror_run_experiment_kwargs["model_dir"] = multi_mirror_model_dir

# One chief plus one worker, each using the same single-GPU machine config.
multi_mirror_run_kwargs = copy.deepcopy(run_kwargs)
multi_mirror_run_kwargs.update(
    chief_config=tfc.COMMON_MACHINE_CONFIGS["P100_1X"],
    worker_count=1,
    worker_config=tfc.COMMON_MACHINE_CONFIGS["P100_1X"],
    job_labels={'job': JOB_NAME},
)

### TPU strategy

Requires a TPU as worker_config.

In [None]:
JOB_NAME = 'tpu' #@param {type:"string"}

# GCS location where training logs and checkpoints will be stored.
GCS_BASE_PATH = f'gs://{GCS_BUCKET}/{JOB_NAME}'
tpu_model_dir = os.path.join(GCS_BASE_PATH, "saved_model")

# Work on a private copy so the shared experiment kwargs stay untouched.
tpu_run_experiment_kwargs = copy.deepcopy(run_experiment_kwargs)
tpu_run_experiment_kwargs["model_dir"] = tpu_model_dir

# CPU chief with a single TPU worker.
tpu_run_kwargs = copy.deepcopy(run_kwargs)
tpu_run_kwargs.update(
    chief_config=tfc.COMMON_MACHINE_CONFIGS["CPU"],
    worker_count=1,
    worker_config=tfc.COMMON_MACHINE_CONFIGS["TPU"],
    job_labels={'job': JOB_NAME},
)

## Run remote experiment

Select the distribution strategy to use and then run the remote experiment by calling run_experiment_cloud with the specified configs.

In [None]:
# Experiment kwargs for each strategy, keyed by the names offered in the
# selector at the bottom of this cell.
run_experiment_configs = {
    "one_device": one_device_run_experiment_kwargs,
    "mirror": mirror_run_experiment_kwargs,
    "multi_mirror": multi_mirror_run_experiment_kwargs,
    "tpu": tpu_run_experiment_kwargs,
}

# Matching tfc.run() kwargs, using the same keys.
run_configs = {
    "one_device": one_device_run_kwargs,
    "mirror": mirror_run_kwargs,
    "multi_mirror": multi_mirror_run_kwargs,
    "tpu": tpu_run_kwargs,
}

distribution_strategy = 'one_device' #@param ["one_device", "mirror", "multi_mirror", "tpu"]

In [None]:
# Look up both kwargs dictionaries for the selected strategy, then hand
# them to run_experiment_cloud.
selected_run_experiment_kwargs = run_experiment_configs[distribution_strategy]
selected_run_kwargs = run_configs[distribution_strategy]
run_experiment_cloud(selected_run_experiment_kwargs, selected_run_kwargs)

## Training Results
### Reconnect your Colab instance
Most remote training jobs are long-running. If you are using Colab, it may time out before the training results are available. In that case, rerun the following sections to reconnect and configure your Colab instance to access the training results. Run the following sections in order:

1.   Import required modules
2.   Project Configurations
3.   Authenticating the notebook to use your Google Cloud Project

### Load your trained model

Once training is complete, you can retrieve your model from the GCS Bucket you specified above.

In [None]:
import tensorflow as tf

# The model is loaded from the model_dir configured for the selected
# strategy.
selected_experiment_kwargs = run_experiment_configs[distribution_strategy]
saved_model_dir = selected_experiment_kwargs['model_dir']

trained_model = tf.keras.models.load_model(saved_model_dir)
trained_model.summary()