<a href="https://colab.research.google.com/github/victorywwong/aws-big-data-machine-learning/blob/master/AWS_Model_Training_through_Sagemaker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

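This notebook is a text classification example: it trains a BlazingText model on Amazon SageMaker, tunes its hyperparameters, and deploys the best model to an endpoint.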

In [0]:
import boto3
import pandas
import sagemaker
from sagemaker import get_execution_role
# SageMaker session and the execution role that the training job will run under.
sagemaker_session = sagemaker.Session()
role = get_execution_role()

# Keep only the text after the last '/' of the role string
# (the whole string when it contains no '/').
role_name = role.rpartition('/')[2]
print(role_name)

# Both the resource-style and the client-style S3 interfaces.
s3 = boto3.resource('s3')
s3_client = boto3.client('s3')

# Bucket and key prefix the training data is read from, plus the label
# used to name this run's outputs.
bucket = 'ml-datalake'
prefix = 'datalake/training'
job_run_id = 'id'  # NOTE(review): looks like a placeholder -- confirm the real run id

Data channels

In [0]:
# S3 key prefixes of the two input channels, both under the shared `prefix`.
train_channel = prefix + '/train/'
validation_channel = prefix + '/validation/'

# Full s3:// URIs for each channel.
s3_train_data = 's3://{}/{}'.format(bucket, train_channel)
s3_validation_data = 's3://{}/{}'.format(bucket, validation_channel)

# Channel configuration: plain-text objects found under an S3 prefix,
# fully replicated to the training instance(s).
train_data = sagemaker.session.s3_input(
    s3_train_data,
    distribution='FullyReplicated',
    content_type='text/plain',
    s3_data_type='S3Prefix',
)
# The validation channel uses the same settings as the train channel;
# only the S3 URI differs.
validation_data = sagemaker.session.s3_input(
    s3_validation_data,
    distribution='FullyReplicated',
    content_type='text/plain',
    s3_data_type='S3Prefix',
)

# Channel name -> input configuration, in the form passed to fit().
data_channels = {'train': train_data, 'validation': validation_data}

# Output is written under sagemaker/<job_run_id> in the same bucket.
s3_output_location = 's3://{}/sagemaker/{}'.format(bucket, job_run_id)


Training

In [0]:
# Region of the default boto3 session; the algorithm image is looked up by region.
session = boto3.session.Session()
region = session.region_name

# Classification example
container = sagemaker.amazon.amazon_estimator.get_image_url(region, "blazingtext", "latest")
# Format the message *before* printing: on Python 3 print() returns None,
# so calling .format() on its result raises AttributeError.
print('Using SageMaker container: {} ({})'.format(container, region))


In [0]:
# Estimator describing the training job: the algorithm image and role from the
# earlier cells, a single ml.c5.4xlarge instance with a 30 GB volume, 'File'
# input mode, and model artifacts written to s3_output_location.
model = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.c5.4xlarge',
    train_volume_size=30,
    train_max_run=360000,  # run-time cap; seconds per the SageMaker SDK (100 hours)
    input_mode='File',
    output_path=s3_output_location,
    sagemaker_session=sagemaker_session,
)

In [0]:
# Static hyperparameters for the training job. min_count, learning_rate,
# vector_dim and word_ngrams are also listed in hyperparameter_ranges in the
# tuning section of this notebook.
model.set_hyperparameters(mode='supervised',
                          min_epochs=5,
                          epochs=100,
                          min_count=2,  # comma was missing here, which made the cell a SyntaxError
                          learning_rate=0.01,
                          vector_dim=134,
                          early_stopping=True,
                          patience=50,
                          word_ngrams=5)

Hyperparameter Tuning

In [0]:
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
# Search space for the tuner: hyperparameter name -> range to explore.
hyperparameter_ranges = dict(
    min_count=IntegerParameter(2, 5),
    learning_rate=ContinuousParameter(0.01, 0.08),
    vector_dim=IntegerParameter(100, 200),
    word_ngrams=IntegerParameter(2, 5),
)

In [0]:
# Name of the metric the tuner optimizes; passed to HyperparameterTuner below.
objective_metric_name = 'validation:accuracy'

In [0]:
# Tuner that searches hyperparameter_ranges for the estimator `model`,
# optimizing objective_metric_name. It runs at most 4 training jobs in total,
# 2 at a time.
# NOTE(review): job_run_id[3:] drops the first three characters of the run id;
# with the 'id' value set in the setup cell this yields an empty suffix
# ('training-') -- confirm the intended id format.
tuner = HyperparameterTuner(model,
                            objective_metric_name,
                            hyperparameter_ranges,
                            max_jobs=4,  # the keyword is `max_jobs`; `max_job` raises TypeError
                            max_parallel_jobs=2,
                            base_tuning_job_name='training-'+job_run_id[3:])

In [0]:
# Start the hyperparameter tuning job on the train/validation channels.
tuner.fit(inputs=data_channels)

Deploy

In [0]:
# fit() only launches the tuning job; block until it has finished so that a
# best training job exists for deploy() to use (needed for Restart & Run All).
tuner.wait()
# Deploy the best model from the tuning job to one ml.m4.xlarge instance
# behind an endpoint named 'classifier'.
tuner.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge', endpoint_name='classifier')