In [1]:
# !pip3 install google-cloud-aiplatform --upgrade --user

In [2]:
from google.cloud import aiplatform
from google.cloud.aiplatform import hyperparameter_tuning as hpt

Build the container image (assumption: the Dockerfile for the training code has already been written).

In [3]:
# Google Cloud project whose Container Registry hosts the training image.
PROJECT_ID = "kubeflow-1-0-2"

# Full registry tag used when building, pushing, and running the image.
IMAGE_URI = "gcr.io/{}/fire-detection:hypertune".format(PROJECT_ID)

In [4]:
# Build the docker image
! docker build -f Dockerfile -t $IMAGE_URI ./

Sending build context to Docker daemon  53.76kB
Step 1/5 : FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-5
 ---> 307b41b1aec7
Step 2/5 : WORKDIR /
 ---> Using cache
 ---> 7d124911c0d6
Step 3/5 : RUN pip install cloudml-hypertune
 ---> Using cache
 ---> 01464536d004
Step 4/5 : COPY trainer /trainer
 ---> d41331a3dbbe
Step 5/5 : ENTRYPOINT ["python", "-m", "trainer.task"]
 ---> Running in f48f2b4b1702
Removing intermediate container f48f2b4b1702
 ---> 0eb3ad196607
Successfully built 0eb3ad196607
Successfully tagged gcr.io/kubeflow-1-0-2/fire-detection:hypertune


Push to Container Registry

In [5]:
! docker push $IMAGE_URI

The push refers to repository [gcr.io/kubeflow-1-0-2/fire-detection]

[1Bf60161bd: Preparing 
[1B63dc2d20: Preparing 
[1Baa5df10d: Preparing 
[1B010939aa: Preparing 
[1Bc4ea3a81: Preparing 
[1B08c5711b: Preparing 
[1Bb564e194: Preparing 
[1B6808a3d1: Preparing 
[1Bbdf9b557: Preparing 
[1Bdbc2b748: Preparing 
[1Bb8f29c2e: Preparing 
[1B7b2f7486: Preparing 
[1B97a3e6e4: Preparing 
[1Ba5e8117f: Preparing 
[1B8124ed57: Preparing 
[1B4704bb3d: Preparing 
[1B6ef24b4b: Preparing 
[1B113f67c8: Preparing 
[1B857a1d48: Preparing 
[1B97864c52: Preparing 
[1Bbaac3e32: Preparing 
[1Ba1af4c10: Preparing 
[1Ba468ca49: Preparing 
[1B205798d1: Preparing 
[1Bcd6d4269: Preparing 
[1B55c89c2a: Preparing 
[1Bb9034da6: Preparing 
[1B4fbfce85: Preparing 
[1B9ca3db46: Preparing 
[1B1a1930ab: Preparing 
[1Bf5a43f1f: Preparing 
[32B60161bd: Pushed lready exists kB[29A[2K[30A[2K[24A[2K[21A[2K[16A[2K[12A[2K[9A[2K[4A[2K[1A[2K[32A[2Khypertune: digest: sha256:65791

Worker pool, metric, and parameter specs for the hyperparameter tuning job

In [6]:
# Display the image URI to confirm which container the worker pool spec will reference.
IMAGE_URI

'gcr.io/kubeflow-1-0-2/fire-detection:hypertune'

In [7]:
# Worker pool layout used by every trial: one replica on an n1-standard-4
# machine running the training container referenced by IMAGE_URI (the image
# pushed to GCR earlier in this notebook).
# To attach a GPU, add "accelerator_type": "NVIDIA_TESLA_T4" and
# "accelerator_count": 1 to the machine spec.
worker_pool_specs = [
    dict(
        machine_spec=dict(machine_type="n1-standard-4"),
        replica_count=1,
        container_spec=dict(image_uri=IMAGE_URI),
    ),
]

# Metrics to optimize.
# Each key is a metric_id reported by the training job; each value is the
# optimization goal for that metric.
metric_spec = dict(accuracy="maximize")

# Parameters to optimize.
# Each key is a parameter_id, which is passed to the training job as a
# command-line argument; each value is the specification of that parameter.
# A discrete "batch_size" over [16, 32] could be added in the same way as
# "num_hidden" below.
parameter_spec = dict(
    learning_rate=hpt.DoubleParameterSpec(min=0.01, max=0.015, scale="log"),    # float
    momentum=hpt.DoubleParameterSpec(min=0.0, max=0.01, scale="linear"),        # float
    num_hidden=hpt.DiscreteParameterSpec(values=[16, 32], scale=None),          # int
)

In [8]:
# Cloud Storage path handed to the job as its staging bucket.
# Point this at a bucket you own before running.
staging_bucket = "gs://fire_detection_anurag/temp_dir/hptune"

# Custom job built from the worker pool spec; used as the template that the
# hyperparameter tuning job runs.
my_custom_job = aiplatform.CustomJob(
    display_name="fire-detector-hptune",
    worker_pool_specs=worker_pool_specs,
    staging_bucket=staging_bucket,
)

In [9]:
# Tuning job wrapping the custom job with the metric and parameter specs.
# max_trial_count is preferably high; parallel_trial_count is preferably low.
hp_job = aiplatform.HyperparameterTuningJob(
    display_name="fire-detector-hptune",
    custom_job=my_custom_job,
    metric_spec=metric_spec,
    parameter_spec=parameter_spec,
    max_trial_count=8,
    parallel_trial_count=2,
)

# Launch the job; creation details and job state are logged in the cell output.
hp_job.run()

INFO:google.cloud.aiplatform.jobs:Creating HyperparameterTuningJob
INFO:google.cloud.aiplatform.jobs:HyperparameterTuningJob created. Resource name: projects/9118975290/locations/us-central1/hyperparameterTuningJobs/345877770796007424
INFO:google.cloud.aiplatform.jobs:To use this HyperparameterTuningJob in another session:
INFO:google.cloud.aiplatform.jobs:hpt_job = aiplatform.HyperparameterTuningJob.get('projects/9118975290/locations/us-central1/hyperparameterTuningJobs/345877770796007424')
INFO:google.cloud.aiplatform.jobs:View HyperparameterTuningJob:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/345877770796007424?project=9118975290
INFO:google.cloud.aiplatform.jobs:HyperparameterTuningJob projects/9118975290/locations/us-central1/hyperparameterTuningJobs/345877770796007424 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:HyperparameterTuningJob projects/9118975290/locations/us-central1/hyperparameterTuningJobs/34587777079600