# Hyperparameter Tuning in SageMaker

In [17]:
import boto3
import sagemaker

# Low-level SageMaker API client.
sm = boto3.client('sagemaker')

# Session-derived defaults: default bucket, region and execution role.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
region = sagemaker_session.boto_region_name
role = sagemaker.get_execution_role()

# Key prefix inside the bucket under which the processed dataset CSVs are
# looked up.
prefix = 'sagemaker-scikit-learn-2023-01-06-17-33-49-347/output'

In [5]:
# S3 locations of the processed splits, all rooted at s3://<bucket>/<prefix>/.
processed_train_data_s3_uri = 's3://{}/{}/sentiment-train/train.csv'.format(
    bucket, prefix
)
processed_validation_data_s3_uri = 's3://{}/{}/sentiment-validation/validation.csv'.format(
    bucket, prefix
)
processed_test_data_s3_uri = 's3://{}/{}/sentiment-test/test.csv'.format(
    bucket, prefix
)

## Create Data Channels

In [6]:
# One TrainingInput per split, each pointing at its processed CSV in S3.
s3_input_train_data = sagemaker.inputs.TrainingInput(s3_data=processed_train_data_s3_uri)
s3_input_validation_data = sagemaker.inputs.TrainingInput(s3_data=processed_validation_data_s3_uri)

# Channel name -> input; this mapping is what gets passed to the tuner's fit().
data_channels = dict(
    train=s3_input_train_data,
    validation=s3_input_validation_data,
)

## Configure Hyperparameter Tuning Job

In [7]:
# --- Settings forwarded to the training script as static hyperparameters ---
seed = 42
epochs = 3
max_seq_length = 128
freeze_bert_layer = False
train_steps_per_epoch = 50
run_validation = True
validation_batch_size = 64
validation_steps_per_epoch = 50

# --- Training job infrastructure (consumed when the estimator is built) ---
instance_type = 'ml.c5.9xlarge'
instance_count = 1
train_volume_size = 256
input_mode = 'File'

In [8]:
# Hyperparameters that stay fixed across every training job of the tuning run;
# each key carries the value of the same-named configuration variable.
hyperparameters_static = dict(
    max_seq_length=max_seq_length,
    freeze_bert_layer=freeze_bert_layer,
    epochs=epochs,
    train_steps_per_epoch=train_steps_per_epoch,
    validation_batch_size=validation_batch_size,
    validation_steps_per_epoch=validation_steps_per_epoch,
    seed=seed,
    run_validation=run_validation,
)

In [9]:
from sagemaker.tuner import IntegerParameter
from sagemaker.tuner import ContinuousParameter
from sagemaker.tuner import CategoricalParameter

# Search space for the tuner: a linearly-scaled continuous learning rate and a
# categorical choice of training batch size.
hyperparameter_ranges = dict(
    learning_rate=ContinuousParameter(1e-5, 5e-5, scaling_type='Linear'),
    train_batch_size=CategoricalParameter([128, 256]),
)

## Evaluation Metrics

In [10]:
# Metric name -> regex with one capture group for the numeric value.
metric_definitions = [
    dict(Name='validation:loss', Regex='val_loss: ([0-9.]+)'),
    dict(Name='validation:accuracy', Regex='val_acc: ([0-9.]+)'),
]

## Run Hyperparameter Tuning Job

In [11]:
from sagemaker.pytorch import PyTorch as PyTorchEstimator

# Estimator describing a single training job; the tuner uses it as the template
# for every job it launches.
estimator = PyTorchEstimator(
    # Training script and the directory it lives in.
    source_dir = 'src',
    entry_point = 'train.py',
    # Framework / interpreter versions.
    framework_version = '1.6.0',
    py_version = 'py3',
    # Infrastructure and permissions.
    role = role,
    instance_type = instance_type,
    instance_count = instance_count,
    volume_size = train_volume_size,
    input_mode = input_mode,
    # Fixed hyperparameters and the regex-based metric definitions.
    hyperparameters = hyperparameters_static,
    metric_definitions = metric_definitions,
)

In [12]:
from sagemaker.tuner import HyperparameterTuner

# Random search over `hyperparameter_ranges`, maximizing validation accuracy.
tuner = HyperparameterTuner(
    estimator = estimator,
    # What to search and how.
    hyperparameter_ranges = hyperparameter_ranges,
    strategy = 'Random',
    # Objective: the metric name must match an entry in `metric_definitions`.
    objective_metric_name = 'validation:accuracy',
    objective_type = 'Maximize',
    metric_definitions = metric_definitions,
    # Job budget: two jobs in total, both allowed to run at the same time.
    max_jobs = 2,
    max_parallel_jobs = 2,
    early_stopping_type = 'Auto',
)

In [13]:
# Start the tuning job without blocking (wait=False); a later cell waits on it.
tuner.fit(inputs=data_channels, include_cls_metadata=False, wait=False)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


In [14]:
# Record the name of the tuning job that was just launched.
latest_tuning_job = tuner.latest_tuning_job
tuning_job_name = latest_tuning_job.job_name
print(tuning_job_name)

pytorch-training-230113-1550


In [15]:
%%time

# The tuning job was started with wait=False, so block here until it finishes;
# %%time reports how long the wait took.
tuner.wait()

........................................................................................................................................................................................................................................................................................!
CPU times: user 1.41 s, sys: 101 ms, total: 1.51 s
Wall time: 23min 33s


## Evaluate Results

In [16]:
# Pull the per-training-job results of the tuning run into a DataFrame
# and display it.
tuner_analytics = tuner.analytics()
df_results = tuner_analytics.dataframe()
df_results

Unnamed: 0,learning_rate,train_batch_size,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,3.2e-05,"""128""",pytorch-training-230113-1550-002-81886cc1,Stopped,32.029999,2023-01-13 15:51:17+00:00,2023-01-13 15:53:21+00:00,124.0
1,3.1e-05,"""128""",pytorch-training-230113-1550-001-621d2e2b,Completed,69.139999,2023-01-13 15:51:19+00:00,2023-01-13 16:14:26+00:00,1387.0


In [18]:
# Find the S3 location of the model artifact produced by the best training job.
# Rank the jobs by objective value, highest first, and take the top row.
ranked_results = df_results.sort_values(by='FinalObjectiveValue', ascending=False)
best_candidate = ranked_results.iloc[0]

training_job_top_model = best_candidate['TrainingJobName']

# Ask SageMaker where that training job wrote its model archive.
training_job_description = sm.describe_training_job(TrainingJobName=training_job_top_model)
model_tar_s3_uri = training_job_description['ModelArtifacts']['S3ModelArtifacts']

In [19]:
# Show the S3 URI of the best job's model archive.
print(model_tar_s3_uri)

s3://sagemaker-us-east-2-003294323742/pytorch-training-230113-1550-001-621d2e2b/output/model.tar.gz


In [28]:
from sagemaker.sklearn.processing import SKLearnProcessor

# Sizing for the evaluation processing job. These use their own names so that
# the training settings `instance_type` / `instance_count` (read by the
# estimator cell) are not overwritten when this cell runs; otherwise re-running
# the estimator cell afterwards would silently pick up the processing values.
processing_instance_type = 'ml.c5.2xlarge'
processing_instance_count = 1

# scikit-learn processing job runner used to execute the evaluation script.
processor = SKLearnProcessor(
    framework_version = '0.23-1',
    role = role,
    instance_type = processing_instance_type,
    instance_count = processing_instance_count,
    max_runtime_in_seconds = 7200,  # 2 hours
)

In [29]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Inputs for the processing job: the best model archive and the test split,
# each with the container path given as its destination.
model_input = ProcessingInput(
    input_name = 'model-tar-s3-uri',
    source = model_tar_s3_uri,
    destination = '/opt/ml/processing/input/model/',
)
evaluation_data_input = ProcessingInput(
    input_name = 'evaluation-data-s3-uri',
    source = processed_test_data_s3_uri,
    destination = '/opt/ml/processing/input/data',
)

# The "metrics" output is looked up by this name once the job has finished.
metrics_output = ProcessingOutput(
    output_name = 'metrics',
    source = '/opt/ml/processing/output/metrics',
    s3_upload_mode = 'EndOfJob',
)

# Launch the evaluation script without blocking (wait=False).
processor.run(
    code = 'src/evaluate_model_metrics.py',
    inputs = [model_input, evaluation_data_input],
    outputs = [metrics_output],
    arguments = ['--max-seq-length', str(max_seq_length)],
    logs = True,
    wait = False,
)


Job Name:  sagemaker-scikit-learn-2023-01-13-16-48-59-243
Inputs:  [{'InputName': 'model-tar-s3-uri', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-003294323742/pytorch-training-230113-1550-001-621d2e2b/output/model.tar.gz', 'LocalPath': '/opt/ml/processing/input/model/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'evaluation-data-s3-uri', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-003294323742/sagemaker-scikit-learn-2023-01-06-17-33-49-347/output/sentiment-test/test.csv', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-003294323742/sagemaker-scikit-learn-2023-01-13-16-48-59-243/input/code/evaluate_model_metrics.py', 'LocalPath': '/opt

In [30]:
# Take the most recently started processing job and read its name from the
# job description.
latest_processing_job = processor.jobs[-1]
scikit_processing_job_name = latest_processing_job.describe()["ProcessingJobName"]
print(scikit_processing_job_name)

sagemaker-scikit-learn-2023-01-13-16-48-59-243


In [31]:
# Build a ProcessingJob handle for the job from its name.
running_processor = sagemaker.processing.ProcessingJob.from_processing_name(
    sagemaker_session=sagemaker_session,
    processing_job_name=scikit_processing_job_name,
)

In [32]:
%%time

# The processing job was started with wait=False, so block here until it is
# done (log streaming disabled); %%time reports how long the wait took.
running_processor.wait(logs=False)

.........................................................................................................!CPU times: user 507 ms, sys: 35.3 ms, total: 542 ms
Wall time: 8min 47s


In [34]:
processing_job_description = running_processor.describe()

output_config = processing_job_description["ProcessingOutputConfig"]

# Collect the S3 URI of every output named "metrics". Resolving the URI
# explicitly means a missing output fails right here with a clear message,
# rather than as a NameError on the print below -- or, when the cell is
# re-run, by silently reusing a stale URI still held by the kernel.
metrics_s3_uris = [
    output["S3Output"]["S3Uri"]
    for output in output_config["Outputs"]
    if output["OutputName"] == "metrics"
]
if not metrics_s3_uris:
    raise ValueError(
        "Processing job description contains no output named 'metrics'"
    )

# If several outputs share the name, the last one wins.
processed_metrics_s3_uri = metrics_s3_uris[-1]

print(processed_metrics_s3_uri)

s3://sagemaker-us-east-2-003294323742/sagemaker-scikit-learn-2023-01-13-16-48-59-243/output/metrics


In [36]:
import json

# Download the evaluation report written by the processing job.
metrics_json = sagemaker.s3.S3Downloader.read_file(
    "{}/evaluation.json".format(processed_metrics_s3_uri)
)
evaluation_report = json.loads(metrics_json)

# The report nests the score as {'metrics': {'accuracy': {'value': <float>}}};
# print the accuracy itself rather than the whole dictionary so the
# "Test accuracy" label matches what is shown.
test_accuracy = evaluation_report['metrics']['accuracy']['value']
print('Test accuracy: {}'.format(test_accuracy))

Test accuracy: {'metrics': {'accuracy': {'value': 0.6925566343042071}}}
