In [None]:
import sagemaker
from pathlib import Path
from sagemaker.predictor import json_serializer
import json

In [None]:
# SageMaker session used for the S3 uploads and the training job below.
session = sagemaker.Session()

# IAM execution role that is handed to the estimator.
role = sagemaker.get_execution_role()

## Set up paths

In [None]:
# Local directory that holds train.csv, val.csv and labels.csv
DATA_PATH = Path("../data/")

# Directory (inside DATA_PATH) where training_config.json is written;
# created here if it is not already present.
CONFIG_PATH = DATA_PATH.joinpath('config')
CONFIG_PATH.mkdir(exist_ok=True)

# Name of the S3 bucket used for both input and output
bucket = 'sagemaker-deep-learning'

# S3 key prefixes: one for the uploaded input data, one for the training output
prefix, prefix_output = 'toxic_comments/input', 'toxic_comments/output'

## Hyperparameters & Training Config

In [None]:
# Hyperparameters handed to the estimator for the training job.
hyperparameters = dict(
    epochs=10,
    lr=8e-5,
    max_seq_length=512,
    train_batch_size=16,
    lr_schedule="warmup_cosine",
    warmup_steps=1000,
    optimizer_type="adamw",
)

In [None]:
# Training configuration; serialised below to training_config.json.
# Every value except `finetuned_model` is a string, and `label_col`
# is a JSON-encoded list of the label column names.
training_config = {
    # run description and optional starting checkpoint
    "run_text": "toxic comments",
    "finetuned_model": None,
    "do_lower_case": "True",

    # data files and the columns to read from them
    "train_file": "train.csv",
    "val_file": "val.csv",
    "label_file": "labels.csv",
    "text_col": "comment_text",
    "label_col": '["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]',
    "multi_label": "True",

    # optimisation / mixed-precision settings
    "grad_accumulation_steps": "1",
    "fp16_opt_level": "O1",
    "fp16": "True",

    # model selection and logging
    "model_type": "roberta",
    "model_name": "roberta-base",
    "logging_steps": "300",
}

# Persist the configuration next to the data so it is uploaded with it.
config_file = CONFIG_PATH / 'training_config.json'
with config_file.open('w') as config_handle:
    json.dump(training_config, config_handle)

## Upload Data

In [None]:
# Upload the local data directory to the S3 bucket under `prefix`.
#
# A directory upload covers every file beneath DATA_PATH (train.csv,
# val.csv, labels.csv and config/training_config.json), so no separate
# per-file uploads are needed afterwards.
#
# The directory is passed as a plain string rather than a Path object:
# the SDK compares the given path against the string directory names it
# walks, and a Path never compares equal to a str, which can place the
# top-level files under a stray "./" key segment.
#
# `s3_input` is the returned S3 location and is later passed to
# `estimator.fit` as the training input.
s3_input = session.upload_data(str(DATA_PATH), bucket=bucket, key_prefix=prefix)

## Create an Estimator and start training

In [None]:
# Region of the boto session backing the SageMaker session.
region = session.boto_session.region_name

# AWS account id of the caller, looked up through STS.
account = (
    session.boto_session
    .client('sts')
    .get_caller_identity()['Account']
)

# ECR URI of the training/serving image, built from account id and region.
image = "{}.dkr.ecr.{}.amazonaws.com/sagemaker-bert:1.0-gpu-py36".format(
    account,
    region,
)

In [None]:
# S3 location handed to the estimator as its output path.
output_path = "s3://{}/{}".format(
    bucket,
    prefix_output,
)

In [None]:
# Generic estimator around the custom container image: one
# ml.p3.8xlarge training instance, output written to `output_path`.
estimator = sagemaker.estimator.Estimator(
    image,
    role,
    sagemaker_session=session,
    base_job_name='toxic-comments',
    train_instance_type='ml.p3.8xlarge',
    train_instance_count=1,
    hyperparameters=hyperparameters,
    output_path=output_path,
)

In [None]:
# Start the training job, using the uploaded S3 location as its input.
estimator.fit(inputs=s3_input)

## Deploy the model to a hosting service

In [None]:
# Deploy the trained model to the 'bert-toxic-comments' endpoint on a
# single ml.m5.large instance; requests are serialised as JSON.
# NOTE(review): update_endpoint=True presumably expects the endpoint to
# exist already - confirm the behaviour for a first-time deployment.
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
    endpoint_name='bert-toxic-comments',
    update_endpoint=True,
    serializer=json_serializer,
)