# Prepare training container

In [None]:
# Log in to the ECR registry that hosts the SageMaker Deep Learning Containers
# (the Dockerfile's base image is pulled from this registry).
!aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-2.amazonaws.com
# Log in to your private ECR registry (the account ID below is specific to the original author).
!aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 553020858742.dkr.ecr.us-east-2.amazonaws.com

In [None]:
# Print the Dockerfile with syntax highlighting (Pygments "docker" lexer).
! pygmentize -l docker Dockerfile

In [22]:
# Build the training image with the project's helper script.
# NOTE(review): arguments look like <image name> <tag> <Dockerfile>, and the script
# presumably pushes the result to ECR -- confirm against build_and_push.sh.
! ./build_and_push.sh hf-transformers-sm latest Dockerfile

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
Sending build context to Docker daemon    300kB
Step 1/10 : FROM 763104351884.dkr.ecr.us-east-2.amazonaws.com/pytorch-training:1.5.0-gpu-py36-cu101-ubuntu16.04
 ---> 47cd15520b75
Step 2/10 : LABEL author="vadimd@amazon.com"
 ---> Using cache
 ---> 7e416477db2b
Step 3/10 : WORKDIR /opt/ml/code
 ---> Using cache
 ---> 56ad1caf23ce
Step 4/10 : RUN git clone https://github.com/huggingface/transformers
 ---> Using cache
 ---> 3724d048cf47
Step 5/10 : RUN cd transformers/ &&     python3 -m pip install --no-cache-dir .
 ---> Using cache
 ---> fc1706b5b03f
Step 6/10 : COPY container_training /opt/ml/code
 ---> e5104694ae4e
Step 7/10 : ENV SAGEMAKER_SUBMIT_DIRECTORY /opt/ml/code
 ---> Running in 7cebb473e3d6
Removing intermediate container 7cebb473e3d6
 ---> f603f12a941f
Step 8/10 : ENV SAGEMAKER_PROGRAM train_transformer.py
 ---> Running in 62fa60010b04
Removing intermediate container 62fa60010b04
 -

# Training Language Model

In [15]:
# NOTE(review): boto3, re, os, numpy and pandas are not referenced by any cell
# visible in this notebook -- confirm before pruning.
import boto3
import re

import os
import numpy as np
import pandas as pd
from sagemaker import get_execution_role

# IAM role passed to the estimators below as `role=`.
role = get_execution_role()

In [20]:
import sagemaker
from time import gmtime, strftime

# Static settings: target region and the S3 key prefixes for job input/output.
region = "us-east-2"
prefix_input = 'transformer-input'
prefix_output = 'transformer-ouput'

# Session-backed lookups (these call AWS): default bucket, then caller account ID.
# Swap in `sagemaker.LocalSession()` to run the container locally.
# sess = sagemaker.LocalSession()
sess = sagemaker.Session()
bucket = sess.default_bucket()
account = sess.boto_session.client('sts').get_caller_identity()['Account']

In [17]:
# Name and tag of the custom training image (change `container` to your own image name).
container, tag = "hf-transformers-sm", "latest"

# ECR image URI, laid out as <account>.dkr.ecr.<region>.amazonaws.com/<name>:<tag>.
image = '{}.dkr.ecr.{}.amazonaws.com/{}:{}'.format(
    account,
    region,
    container,
    tag,
)

In [5]:
# Placeholder: no metric definitions yet (passed as `metric_definitions` to the
# language-model estimator).
lng_model_metrics = list()


In [None]:
# Hyperparameters for the GPT-2 language-modeling job, assembled in two stages.

# Stage 1: job-level switches.
lng_model_hp = {
    "nlp-problem": "language-modeling",
    "dataset": "wiki",
    "do_train": "true",  # run the training phase
    "do_eval": "true",   # run the evaluation phase
    # "fp16": "true",    # left disabled: mixed precision with gpt2 hangs for an unknown
    #                      reason, though it should work according to
    #                      https://github.com/huggingface/transformers/pull/495
}

# Stage 2: algorithm hyperparameters.
lng_model_hp.update({
    "model_type": "gpt2",
    "model_name_or_path": "gpt2",
    "per_gpu_train_batch_size": "2",
    "per_gpu_eval_batch_size": "2",
    "gradient_accumulation_steps": "2",
})

In [None]:
# Estimator for the language-modeling job, running the custom container `image`.
# NOTE(review): train_instance_count / train_instance_type are the SageMaker Python
# SDK v1 argument names; v2 renamed them -- confirm the installed SDK version.
lng_est = sagemaker.estimator.Estimator(image,
                                   role=role,
                                   train_instance_count=2, 
                                   train_instance_type='ml.p3.2xlarge',
#                                    train_instance_type="local_gpu", # use local_gpu for quick troubleshooting
#                                    train_volume_size=100,
                                   output_path="s3://{}/{}".format(sess.default_bucket(), prefix_output),
                                   metric_definitions = lng_model_metrics,
                                   hyperparameters = lng_model_hp, 
                                   sagemaker_session=sess)

# Training job names must be unique within an account and region, so a fixed name
# makes this cell fail on every re-run. A UTC timestamp suffix keeps the readable
# prefix while making each submission unique.
lng_job_name = "hf-transformers-batch2-nofp16-v1-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# wait=False: submit the job and return immediately instead of streaming logs.
lng_est.fit({"train":"s3://vadimd-nlp-datasets/wikitext-2-raw/wiki.train.raw", 
             "test":"s3://vadimd-nlp-datasets/wikitext-2-raw/wiki.test.raw"},
             job_name = lng_job_name,
             wait=False
           ) 

# Train for GLUE benchmarks

In [None]:
# Fetch the GLUE download helper (download_glue_data.py) from a gist; the URL is
# pinned to a specific revision hash.
!wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py


In [None]:
# Run the helper fetched above.
# NOTE(review): presumably writes the datasets to a local `glue_data` directory
# (that is what the upload cell sends to S3) -- confirm against the script.
! python download_glue_data.py

In [None]:
# Upload the GLUE data with the project's helper script.
# NOTE(review): arguments look like <bucket> <key prefix> <local dir>, which matches
# the s3://vadimd-nlp-datasets/glue/MRPC training channel used later -- confirm
# against upload_data_to_s3.sh.
! ./upload_data_to_s3.sh vadimd-nlp-datasets glue glue_data

In [13]:
# Hyperparameters for the BERT text-classification (GLUE MRPC) job, assembled in two stages.

# Stage 1: job-level switches.
text_cls_hp = {
    "nlp-problem": "text-classification",
    "do_train": "true",  # run the training phase
    "do_eval": "true",   # run the evaluation phase
}

# Stage 2: algorithm hyperparameters.
text_cls_hp.update({
    "task_name": "MRPC",
    "model_type": "bert",
    "model_name_or_path": "bert-base-cased",  # model name only
    "max_seq_length": 128,
    "per_gpu_train_batch_size": 32,
    "learning_rate": 2e-5,
    "num_train_epochs": 3.0,
})

In [18]:
# Placeholder: no metric definitions yet (passed as `metric_definitions` to the
# text-classification estimator).
text_cls_metrics = list()

In [23]:
# Estimator for the GLUE text-classification job, running the custom container `image`.
# NOTE(review): train_instance_count / train_instance_type are the SageMaker Python
# SDK v1 argument names; v2 renamed them -- confirm the installed SDK version.
cls_est = sagemaker.estimator.Estimator(image,
                                   role=role,
                                   train_instance_count=2, 
                                   train_instance_type='ml.p3.2xlarge',
#                                   train_instance_type="local_gpu", # use local_gpu for quick troubleshooting
#                                   train_volume_size=100,
                                   output_path="s3://{}/{}".format(sess.default_bucket(), prefix_output),
                                   metric_definitions = text_cls_metrics,
                                   hyperparameters = text_cls_hp, 
                                   sagemaker_session=sess)

# Training job names must be unique within an account and region, so a fixed name
# makes this cell fail on every re-run. A UTC timestamp suffix keeps the readable
# prefix while making each submission unique.
cls_job_name = "hf-transformers-glue-v4-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# wait=False: submit the job and return immediately instead of streaming logs.
cls_est.fit({"train":"s3://vadimd-nlp-datasets/glue/MRPC"},
            job_name = cls_job_name,
            wait=False) 