# Prepare training container

In [None]:
# Log in to the SageMaker ECR registry that hosts the Deep Learning Containers (source of the base image)
!aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-2.amazonaws.com
# Log in to your private ECR registry (the account ID below is hard-coded; use your own)
!aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 553020858742.dkr.ecr.us-east-2.amazonaws.com

In [None]:
# Print the Dockerfile with syntax highlighting, using pygmentize's docker lexer
! pygmentize -l docker Dockerfile

In [22]:
# Build the training image from Dockerfile via the helper script.
# NOTE(review): arguments look like <image name> <tag> <Dockerfile>, and the script presumably pushes to ECR — confirm against build_and_push.sh
! ./build_and_push.sh hf-transformers-sm latest Dockerfile

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
Sending build context to Docker daemon  130.6kB
Step 1/10 : FROM 763104351884.dkr.ecr.us-east-2.amazonaws.com/pytorch-training:1.5.0-gpu-py36-cu101-ubuntu16.04
 ---> 47cd15520b75
Step 2/10 : LABEL author="vadimd@amazon.com"
 ---> Using cache
 ---> 08c09613bc5b
Step 3/10 : WORKDIR /opt/ml/code
 ---> Using cache
 ---> 20c4e4c866b7
Step 4/10 : RUN git clone https://github.com/huggingface/transformers
 ---> Using cache
 ---> b553ccffcb8d
Step 5/10 : RUN cd transformers/ &&     python3 -m pip install --no-cache-dir .
 ---> Using cache
 ---> 8ed5bc216ab3
Step 6/10 : COPY container_training /opt/ml/code
 ---> dd2103473b54
Step 7/10 : ENV SAGEMAKER_SUBMIT_DIRECTORY /opt/ml/code
 ---> Running in b59dc0184504
Removing intermediate container b59dc0184504
 ---> 5782444dd643
Step 8/10 : ENV SAGEMAKER_PROGRAM train_transformer.py
 ---> Running in eecd6ea6cb76
Removing intermediate container eecd6ea6cb76
 -

# Training Language Model

In [8]:
import boto3
import re

import os
import numpy as np
import pandas as pd
from sagemaker import get_execution_role

# Define IAM role: the execution role returned here is passed as `role` to the estimators below.
role = get_execution_role()

In [20]:
import sagemaker
from time import gmtime, strftime

# SageMaker session shared by every training job in this notebook.
# To run the container locally instead, swap in: sess = sagemaker.LocalSession()
sess = sagemaker.Session()

# Fixed region and S3 key prefixes.
# NOTE(review): the prefixes still say "detectron2" and "ouput" is misspelled; they are kept
# unchanged because prefix_output determines where the estimators write their output.
region = "us-east-2"
prefix_input = "detectron2-input"
prefix_output = "detectron2-ouput"

# Values looked up through the session: the default bucket and the caller's AWS account ID.
bucket = sess.default_bucket()
account = sess.boto_session.client("sts").get_caller_identity()["Account"]

In [16]:
# ECR repository name and tag of the training image.
container = "hf-transformers-sm"
tag = "latest"
# Full image URI: <account>.dkr.ecr.<region>.amazonaws.com/<repository>:<tag>
image = f"{account}.dkr.ecr.{region}.amazonaws.com/{container}:{tag}"

In [17]:
# No custom metrics are defined yet; this empty list is what the estimators receive.
metric_definitions = []


In [23]:
# Hyperparameters handed to the training container for the language-modeling job.
hyperparameters = {
    # Problem and dataset selection
    "nlp-problem": "language-modeling",
    "dataset": "wiki",
    # Algorithm hyperparameters (the boolean flags are given as strings)
    "model_type": "gpt2",
    "model_name_or_path": "gpt2",
    "do_train": "true",
    "do_eval": "true",
}

In [24]:
# Estimator for the language-modeling job: runs the custom image on 4 x ml.p3.16xlarge
# and writes its output under the prefix_output key of the session's default bucket.
lng_est = sagemaker.estimator.Estimator(
    image,
    role=role,
    train_instance_count=4,
    train_instance_type="ml.p3.16xlarge",
    # train_instance_type="local_gpu",  # use local_gpu for quick troubleshooting
    # train_volume_size=100,
    output_path="s3://{}/{}".format(sess.default_bucket(), prefix_output),
    metric_definitions=metric_definitions,
    hyperparameters=hyperparameters,
    sagemaker_session=sess,
)

# Start the job without blocking the notebook (wait=False); "train" and "test" are the input channels.
lng_est.fit(
    {
        "train": "s3://vadimd-nlp-datasets/wikitext-2-raw/wiki.train.raw",
        "test": "s3://vadimd-nlp-datasets/wikitext-2-raw/wiki.test.raw",
    },
    wait=False,
)

# Train for GLUE benchmarks (WIP)

In [None]:
# Hyperparameters for the GLUE text-classification job.
glue_hp = {
    "nlp-problem": "text-classification",
    # Algorithm hyperparameters
    "task_name": "MRPC",
    "model_type": "bert",
    "model_name_or_path": "bert-base-cased",  # provide only the model name
    "max_seq_length": 128,
    "per_gpu_train_batch_size": 32,
    "learning_rate": 2e-5,
    "num_train_epochs": 3.0,
}

In [None]:
# Estimator for the GLUE text-classification job (work in progress): runs the custom image
# on 2 x ml.p3.2xlarge and writes its output under the prefix_output key of the default bucket.
cls_est = sagemaker.estimator.Estimator(
    image,
    role=role,
    train_instance_count=2,
    train_instance_type="ml.p3.2xlarge",
    # train_instance_type="local_gpu",  # use local_gpu for quick troubleshooting
    # train_volume_size=100,
    output_path="s3://{}/{}".format(sess.default_bucket(), prefix_output),
    metric_definitions=metric_definitions,
    # Use the GLUE hyperparameters; the original passed `hyperparameters`, which is the
    # language-modeling configuration and left `glue_hp` unused.
    hyperparameters=glue_hp,
    sagemaker_session=sess,
)

# TODO: prepare data channels — the "TBD" values are placeholders, not S3 URIs,
# so this call is not expected to work until they are replaced.
cls_est.fit(
    {
        "train": "TBD",
        "test": "TBD",
    },
    wait=False,
)