# Train an object detection model using TensorFlow on SageMaker

## Setup environment

In [1]:
import os
import sagemaker
from sagemaker.estimator import Framework, Estimator

# Execution role of the current notebook environment, passed to the estimator below.
role = sagemaker.get_execution_role()

# S3 training data inputs; this is the output of the processing job.
inputs = dict(train='s3://sagemaker-gauge-detection-training-070122/data/')

# S3 path for TensorBoard events; where to save the events is up to you.
tensorboard_s3_prefix = 's3://sagemaker-gauge-detection-training-070122/output/'

In [2]:
# Name of the custom training image; the same string is used as the training job's base name.
image_name = "tf2-object-detection"

In [3]:
# Read the full ECR image URI from the text file in the docker/ folder
# (presumably written by the image build step -- confirm).
# Only the first line is used. strip() removes the trailing newline without
# chopping a real character when the file has no final newline, which a blind
# `[:-1]` slice would do.
image_uri_file = os.path.join('docker', 'ecr_image_fullname.txt')
with open(image_uri_file, 'r') as f:
    container = f.readline().strip()

# Fail here with a clear message rather than later inside the estimator.
if not container:
    raise ValueError(f'{image_uri_file} does not contain an image URI')

print(container)

720148074160.dkr.ecr.us-east-1.amazonaws.com/tf2-object-detection:20220707201336


## Create SageMaker Custom Framework and Launch Training job

Here we define a custom framework estimator using the Amazon SageMaker Python SDK and run training with that class. The SDK takes care of uploading the training code, provisioning the training instances, and running the custom container.

In [5]:
class CustomFramework(Framework):
    """Minimal ``Framework`` estimator for training with a custom container image.

    The class forwards the entry point, source directory, hyperparameters and
    image URI to ``Framework`` and stubs out the hooks this notebook does not
    need (distribution configuration and model creation).
    """

    def __init__(
        self,
        entry_point,
        framework_version=None,
        py_version=None,
        source_dir=None,
        hyperparameters=None,
        image_uri=None,
        distribution=None,
        **kwargs
    ):
        """Initialise the estimator.

        Args:
            entry_point: Script executed as the training entry point.
            framework_version: Stored on the instance as ``framework_version``.
            py_version: Stored on the instance as ``py_version``.
            source_dir: Directory holding the entry point and its helpers.
            hyperparameters: Hyperparameters forwarded to the training job.
            image_uri: URI of the training container image.
            distribution: Accepted for signature compatibility; not used here.
            **kwargs: Any further arguments, forwarded to ``Framework``.
        """
        super().__init__(
            entry_point, source_dir, hyperparameters, image_uri=image_uri, **kwargs
        )
        self.framework_version = framework_version
        # Keep the caller-supplied value instead of unconditionally discarding it;
        # the default is still None, so existing callers see no change.
        self.py_version = py_version

    def _configure_distribution(self, distributions):
        """Return ``None``: this estimator applies no distribution configuration."""
        return None

    def create_model(
        self,
        model_server_workers=None,
        role=None,
        vpc_config_override=None,
        entry_point=None,
        source_dir=None,
        dependencies=None,
        image_uri=None,
        **kwargs
    ):
        """Return ``None``: this training-only estimator builds no model object.

        All arguments are accepted and ignored.
        """
        return None

In [6]:
# TensorBoard output configuration: local path inside the container and the
# S3 prefix configured earlier in the notebook.
tensorboard_output_config = sagemaker.debugger.TensorBoardOutputConfig(
    s3_output_path=tensorboard_s3_prefix,
    container_local_output_path='/opt/training/',
)

# Hyperparameters handed to the entry point script; all values are strings.
training_hyperparameters = {
    "model_dir": "s3://sagemaker-gauge-detection-training-070122/test_train_eval/model-base-070722/",
    "pipeline_config_path": "pipeline.config",
    "num_train_steps": "40000",
    "sample_1_of_n_eval_examples": "1",
}

estimator = CustomFramework(
    # What to run and where
    entry_point='run_simultaneous_evaluation.sh',
    source_dir='source_dir/',
    image_uri=container,
    role=role,
    # Compute
    instance_type='ml.g4dn.2xlarge',
    instance_count=1,
    # Job configuration
    hyperparameters=training_hyperparameters,
    tensorboard_output_config=tensorboard_output_config,
    disable_profiler=True,
    base_job_name='tf2-object-detection',
)

In [None]:
# Start the training job with the channel mapping defined in `inputs`.
estimator.fit(inputs=inputs)

2022-07-07 20:32:46 Starting - Starting the training job...
2022-07-07 20:33:11 Starting - Preparing the instances for training.........
2022-07-07 20:34:28 Downloading - Downloading input data.........
2022-07-07 20:36:08 Training - Downloading the training image...
2022-07-07 20:36:39 Training - Training image download completed. Training in progress..[34m2022-07-07 20:36:42,303 sagemaker-training-toolkit INFO     Invoking user script[0m
[34mTraining Env:[0m
[34m{
    "additional_framework_parameters": {},
    "channel_input_dirs": {
        "train": "/opt/ml/input/data/train"
    },
    "current_host": "algo-1",
    "framework_module": null,
    "hosts": [
        "algo-1"
    ],
    "hyperparameters": {
        "model_dir": "s3://sagemaker-gauge-detection-training-070122/test_train_eval/model-base-070722/",
        "num_train_steps": "40000",
        "pipeline_config_path": "pipeline.config",
        "sample_1_of_n_eval_examples": "1"
    },
    "input_config_dir": "/opt/ml/in

## Visualize training metrics with TensorBoard

In [None]:
# Due to this issue: https://github.com/ipython/ipykernel/issues/395#issuecomment-479787997
# if you're using a custom conda env, there is a chance that the tensorboard executable isn't on the PATH.
# In that case, uncomment the following lines and point bin_env_path at your environment's bin directory.

#bin_env_path = "/home/ec2-user/anaconda3/envs/myenv/bin/"
#os.environ["PATH"] += os.pathsep + bin_env_path

In [None]:
# Path of the TensorBoard artifacts for the estimator's latest training job
# (presumably an S3 URI under tensorboard_s3_prefix -- confirm).
# Later cells append '/train' and '/eval' to it to build the --logdir value.
job_artifacts_path = estimator.latest_job_tensorboard_artifacts_path()
job_artifacts_path

### Visualize training outputs

In [None]:
#Careful notebook would be stuck until you stop tensorboard, you can also launch this from a terminal
tensorboard_s3_output_path = f'{job_artifacts_path}/train'
!F_CPP_MIN_LOG_LEVEL=3 AWS_REGION=eu-west-1 tensorboard --logdir=$tensorboard_s3_output_path

### Visualize evaluation outputs


In [None]:
#Careful notebook would be stuck until you stop tensorboard, you can also launch this from a terminal
tensorboard_s3_output_path = f'{job_artifacts_path}/eval'
!F_CPP_MIN_LOG_LEVEL=3 AWS_REGION=eu-west-1 tensorboard --logdir=$tensorboard_s3_output_path