# Prepare training container

In [None]:
# login to Sagemaker ECR with Deep Learning Containers
!aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-2.amazonaws.com
# login to your private ECR
!aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 553020858742.dkr.ecr.us-east-2.amazonaws.com

In [None]:
# Print the Dockerfile to the cell output with syntax highlighting (Pygments CLI, docker lexer)
! pygmentize -l docker Dockerfile

In [None]:
# Run the helper script with three arguments: hf-transformers-sm, latest and Dockerfile.
# The first two match the container name and tag used for the image URI further down.
# NOTE(review): presumably the script builds the image from Dockerfile and pushes it
# to ECR; the script itself is not shown here, so confirm.
! ./build_and_push.sh hf-transformers-sm latest Dockerfile

# Training Language Model

In [None]:
# Imports for the training section, grouped stdlib first, then third-party
import os
import re

import boto3
import numpy as np
import pandas as pd
from sagemaker import get_execution_role

# IAM role obtained from the SageMaker SDK; passed to the estimators below as `role`
role = get_execution_role()

In [55]:
import sagemaker
from time import gmtime, strftime

# SageMaker session used by the estimators; a LocalSession() can be used instead
# to run the container locally
sess = sagemaker.Session()

# Bucket returned by the session's default_bucket()
bucket = sess.default_bucket()
# Region used when composing the ECR image URI
region = "us-east-2"
# Account ID of the current caller, looked up through STS
account = sess.boto_session.client('sts').get_caller_identity()['Account']

# S3 key prefixes for job inputs and outputs
prefix_input = 'transformer-input'
# Spelling fixed (was 'transformer-ouput'); new artefacts are written under the
# corrected prefix, anything produced earlier stays under the old one.
prefix_output = 'transformer-output'

In [51]:
# Name and tag of the custom training image
container, tag = "hf-transformers-sm", "latest"

# Fully qualified ECR image URI: <account>.dkr.ecr.<region>.amazonaws.com/<name>:<tag>
image = '{}.dkr.ecr.{}.amazonaws.com/{}:{}'.format(
    account,
    region,
    container,
    tag,
)

In [52]:
# Placeholder: no metric definitions yet. The empty list is what the
# language-model estimator receives as `metric_definitions`.
lng_model_metrics = list()

In [57]:
# Hyperparameters for the language-modelling job. Every value is a string.
lng_model_hp = {
    # Task selection and run switches
    "nlp-problem": "language-modeling",
    "dataset": "wiki",
    "do_train": "true",  # whether to train the model
    "do_eval": "true",   # whether to run evaluation
    # Mixed precision. Author's note: for unclear reasons fp16 with gpt2 was seen
    # to hang, though it should work according to
    # https://github.com/huggingface/transformers/pull/495
    "fp16": "true",
}

# Algorithm hyperparameters
lng_model_hp.update(
    model_type="gpt2",
    model_name_or_path="gpt2",
    per_gpu_train_batch_size="2",
    per_gpu_eval_batch_size="2",
    gradient_accumulation_steps="2",
)

In [59]:
# Estimator for the GPT-2 language-modelling job, run from the custom image.
lng_est = sagemaker.estimator.Estimator(
    image,
    role=role,
    train_instance_count=1,
    train_instance_type='ml.p3.16xlarge',
    # For quick troubleshooting, train_instance_type="local_gpu" can be used instead
    output_path="s3://{}/{}".format(sess.default_bucket(), prefix_output),
    metric_definitions=lng_model_metrics,
    hyperparameters=lng_model_hp,
    sagemaker_session=sess,
)

# Training job names must be unique within an account and region, so a fixed name
# makes this cell fail when it is run a second time. Suffix a UTC timestamp to keep
# the cell re-runnable while preserving the recognisable base name.
lng_job_name = "hf-transformers-batch2-fp16-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

lng_est.fit(
    {
        "train": "s3://vadimd-nlp-datasets/wikitext-2-raw/wiki.train.raw",
        "test": "s3://vadimd-nlp-datasets/wikitext-2-raw/wiki.test.raw",
    },
    job_name=lng_job_name,
    wait=False,  # do not block the notebook while the job runs
)

# Train for GLUE benchmarks

In [None]:
# Fetch the GLUE data download helper script from a gist (the URL is pinned to a
# specific revision); wget stores it in the current directory as download_glue_data.py
!wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py


In [None]:
# Run the downloaded helper script with no arguments.
# NOTE(review): presumably it writes the GLUE datasets to ./glue_data, the directory
# uploaded in the next cell; confirm against the script's defaults.
! python download_glue_data.py

In [None]:
# Invoke the upload helper with three arguments: vadimd-nlp-datasets, glue and glue_data.
# NOTE(review): these look like <bucket> <S3 prefix> <local directory>, which would match
# the s3://vadimd-nlp-datasets/glue/... training input used for the GLUE job; the script
# is not shown here, so confirm.
! ./upload_data_to_s3.sh vadimd-nlp-datasets glue glue_data

In [None]:
# Hyperparameters for the GLUE text-classification job, built from ordered
# (name, value) pairs. Numeric values are kept as Python numbers.
text_cls_hp = dict([
    # Task selection and run switches
    ("nlp-problem", "text-classification"),
    ("do_train", "true"),  # whether to train the model
    ("do_eval", "true"),   # whether to run evaluation

    # Algorithm hyperparameters
    ("task_name", "MRPC"),
    ("model_type", "bert"),
    ("model_name_or_path", "bert-base-cased"),  # model name only
    ("max_seq_length", 128),
    ("per_gpu_train_batch_size", 32),
    ("learning_rate", 2e-5),
    ("num_train_epochs", 3.0),
])

In [None]:
# Placeholder: no metric definitions yet. The empty list is what the
# text-classification estimator receives as `metric_definitions`.
text_cls_metrics = list()

In [None]:
# Where the GLUE job runs. "local_gpu" is meant for quick troubleshooting;
# use e.g. 'ml.p3.16xlarge' for a managed training job.
cls_instance_type = "local_gpu"

# Local instance types have to be paired with a LocalSession. Passing the regular
# Session together with "local_gpu" does not start a local run, so pick the session
# from the instance type.
if cls_instance_type in ("local", "local_gpu"):
    cls_session = sagemaker.local.LocalSession()
else:
    cls_session = sess

# Estimator for the GLUE text-classification job, run from the custom image.
cls_est = sagemaker.estimator.Estimator(
    image,
    role=role,
    train_instance_count=1,
    train_instance_type=cls_instance_type,
    # train_volume_size=100,  # optional setting left disabled by the author
    output_path="s3://{}/{}".format(sess.default_bucket(), prefix_output),
    metric_definitions=text_cls_metrics,
    hyperparameters=text_cls_hp,
    sagemaker_session=cls_session,
)

# Managed training job names must be unique within an account and region; a UTC
# timestamp suffix keeps the cell re-runnable after switching to a managed instance type.
cls_job_name = "hf-transformers-glue-v5-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

cls_est.fit(
    {"train": "s3://vadimd-nlp-datasets/glue/MRPC"},
    job_name=cls_job_name,
    wait=False,
)