# Train an object detection model using Tensorflow on SageMaker

## Setup environment

In [2]:
import os
import sagemaker
from sagemaker.estimator import Framework, Estimator

role = sagemaker.get_execution_role()

inputs = {'train': 's3://sagemaker-gauge-detection-training-070122/data/'} # define s3 training data inputs, this is the output of the processing job
tensorboard_s3_prefix = 's3://sagemaker-gauge-detection-training-070122/output/' # s3 path for tensorboard events, up to you where to save events 

## Build and push container

In [3]:
%%bash
git clone https://github.com/tensorflow/models.git docker/models
# get model_main and exporter_main files from TF2 Object Detection GitHub repository
cp docker/models/research/object_detection/exporter_main_v2.py source_dir 
cp docker/models/research/object_detection/model_main_tf2.py source_dir

Cloning into 'docker/models'...


In [4]:
image_name = 'tf2-object-detection'

In [8]:
!sh ./docker/build_and_push.sh $image_name

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
Building image with name tf2-object-detection
Sending build context to Docker daemon    676MB
Step 1/16 : FROM tensorflow/tensorflow:2.9.0-gpu
2.9.0-gpu: Pulling from tensorflow/tensorflow

[1B17ec1767: Pulling fs layer 
[1B9ecd2bff: Pulling fs layer 
[1B4ae53552: Pulling fs layer 
[1B2d09b8c4: Pulling fs layer 
[1B0d530989: Pulling fs layer 
[1B81af025b: Pulling fs layer 
[1Bc129f45e: Pulling fs layer 
[1B8fcb70c6: Pulling fs layer 
[1B9aa4a247: Pulling fs layer 
[1B3100c8d1: Pulling fs layer 
[1B3a6b487b: Pulling fs layer 
[1Be8773234: Pulling fs layer 
[1B36c9476c: Pulling fs layer 
[1B1b420cea: Pull complete 087kB/1.087kBB[13A[2K[14A[2K[14A[2K[12A[2K[11A[2K[7A[2K[8A[2K[6A[2K[8A[2K[14A[2K[6A[2K[6A[2K[6A[2K[9A[2K[6A[2K[9A[2K[9A[2K[14A[2K[6A[2K[8A[2K[8A[2K[6A[2K[14A[2K[9A[2K[6A[2K[14A[2K[9A[2K[14A[2K[5A[2K[5A[2K[9A[2

In [9]:
with open (os.path.join('docker', 'ecr_image_fullname.txt'), 'r') as f:
    container = f.readlines()[0][:-1]

print(container)

720148074160.dkr.ecr.us-east-1.amazonaws.com/tf2-object-detection:20220707201336


## Get pre-trained model from model zoo

Download the base model and extract locally

In [10]:
%%bash
mkdir /tmp/checkpoint
mkdir source_dir/checkpoint
wget -O /tmp/efficientdet.tar.gz http://download.tensorflow.org/models/object_detection/tf2/20200711/efficientdet_d1_coco17_tpu-32.tar.gz
tar -zxvf /tmp/efficientdet.tar.gz --strip-components 2 --directory source_dir/checkpoint efficientdet_d1_coco17_tpu-32/checkpoint

efficientdet_d1_coco17_tpu-32/checkpoint/ckpt-0.data-00000-of-00001
efficientdet_d1_coco17_tpu-32/checkpoint/checkpoint
efficientdet_d1_coco17_tpu-32/checkpoint/ckpt-0.index


--2022-07-07 20:25:04--  http://download.tensorflow.org/models/object_detection/tf2/20200711/efficientdet_d1_coco17_tpu-32.tar.gz
Resolving download.tensorflow.org (download.tensorflow.org)... 172.253.122.128, 2607:f8b0:4004:835::2010
Connecting to download.tensorflow.org (download.tensorflow.org)|172.253.122.128|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 51839363 (49M) [application/x-tar]
Saving to: ‘/tmp/efficientdet.tar.gz’

     0K .......... .......... .......... .......... ..........  0% 13.6M 4s
    50K .......... .......... .......... .......... ..........  0% 8.86M 5s
   100K .......... .......... .......... .......... ..........  0% 20.5M 4s
   150K .......... .......... .......... .......... ..........  0% 21.8M 3s
   200K .......... .......... .......... .......... ..........  0% 24.7M 3s
   250K .......... .......... .......... .......... ..........  0% 16.7M 3s
   300K .......... .......... .......... .......... ..........  0% 32.1M 3s
   350

## Create SageMaker Custom Framework and Launch Training job

Here we define a custom framework estimator using the Amazon SageMaker Python SDK and run training with that class, which will take care of managing these tasks.

In [11]:
class CustomFramework(Framework):
    def __init__(
        self,
        entry_point,
        framework_version=None,
        py_version=None,
        source_dir=None,
        hyperparameters=None,
        image_uri=None,
        distribution=None,
        **kwargs
    ):
        super(CustomFramework, self).__init__(
            entry_point, source_dir, hyperparameters, image_uri=image_uri, **kwargs
        )
        self.framework_version = framework_version
        self.py_version = None
        
    def _configure_distribution(self, distributions):
        return None

    def create_model(
        self,
        model_server_workers=None,
        role=None,
        vpc_config_override=None,
        entry_point=None,
        source_dir=None,
        dependencies=None,
        image_uri=None,
        **kwargs
    ):
        return None

In [12]:
tensorboard_output_config = sagemaker.debugger.TensorBoardOutputConfig(
    s3_output_path=tensorboard_s3_prefix,
    container_local_output_path='/opt/training/'
)

estimator = CustomFramework(
    role=role,
    image_uri=container,
    entry_point='run_training.sh',
    source_dir='source_dir/',
    hyperparameters={
        "model_dir":"s3://sagemaker-gauge-detection-training-070122/test_train_eval/model-base-070722/",        
        "pipeline_config_path": "pipeline.config",
        "num_train_steps": "40000",    
        "sample_1_of_n_eval_examples": "1"
    },
    instance_count=1,
    instance_type='ml.p3.2xlarge',
    tensorboard_output_config=tensorboard_output_config,
    disable_profiler=True,
    base_job_name='tf2-object-detection')

In [None]:
estimator.fit(inputs)

2022-07-07 20:32:44 Starting - Starting the training job...
2022-07-07 20:33:11 Starting - Preparing the instances for training..

## Visualize training metrics with Tensorboard

In [None]:
#Due to this issue: https://github.com/ipython/ipykernel/issues/395#issuecomment-479787997
#If you're using a custom conda env, there is a change that the tensorboard executable isn't in the Python path.
#uncomment the following lines

#bin_env_path = "/home/ec2-user/anaconda3/envs/myenv/bin/"
#os.environ["PATH"] += os.pathsep + bin_env_path

In [None]:
job_artifacts_path = estimator.latest_job_tensorboard_artifacts_path()
job_artifacts_path

### Visualize training outputs

In [None]:
#Careful notebook would be stuck until you stop tensorboard, you can also launch this from a terminal
tensorboard_s3_output_path = f'{job_artifacts_path}/train'
!F_CPP_MIN_LOG_LEVEL=3 AWS_REGION=eu-west-1 tensorboard --logdir=$tensorboard_s3_output_path

### Visualize evaluation outputs


In [None]:
#Careful notebook would be stuck until you stop tensorboard, you can also launch this from a terminal
tensorboard_s3_output_path = f'{job_artifacts_path}/eval'
!F_CPP_MIN_LOG_LEVEL=3 AWS_REGION=eu-west-1 tensorboard --logdir=$tensorboard_s3_output_path