# Train DeBERTaV3 with HuggingFace Transformers using Distributed Data Parallel techniques on SageMaker

### Step 1: Install and import necessary dependencies

In [57]:
!pip install sagemaker botocore boto3 awscli --upgrade
!pip install transformers datasets --upgrade
#Required for tokenizer with deberta-v3.huggingface "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed in order to use this tokenizer."
!pip install sentencepiece    

Collecting sagemaker
  Downloading sagemaker-2.89.0.tar.gz (529 kB)
     |████████████████████████████████| 529 kB 5.9 MB/s            
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting botocore
  Downloading botocore-1.25.13-py3-none-any.whl (8.7 MB)
     |████████████████████████████████| 8.7 MB 91.4 MB/s            
Collecting boto3
  Downloading boto3-1.22.13-py3-none-any.whl (132 kB)
     |████████████████████████████████| 132 kB 126.2 MB/s            
Collecting awscli
  Downloading awscli-1.23.13-py3-none-any.whl (3.8 MB)
     |████████████████████████████████| 3.8 MB 120.6 MB/s            
Building wheels for collected packages: sagemaker
  Building wheel for sagemaker (setup.py) ... [?25ldone
[?25h  Created wheel for sagemaker: filename=sagemaker-2.89.0-py2.py3-none-any.whl size=736067 sha256=b7cd4bb01a34376c73b02ec58d89fcced30941d7275b74fca946a4ae31f3392b
  Stored in directory: /home/ec2-user/.cache/pip/wheels/3a/c4/96/07aaa977c8cff7917bc1356be22e44ffad1478c9d

In [None]:
# Display the metadata pip has recorded for the transformers package.
!pip show transformers

In [None]:
# The following is necessary if you want to use the fast tokenizer for deberta v2 or v3.
# This must be done before importing transformers, so the package directory is located
# with importlib.util.find_spec (which does not import the package) rather than through
# a hard-coded, environment-specific site-packages path.
import importlib.util
import shutil
from pathlib import Path

transformers_spec = importlib.util.find_spec("transformers")
if transformers_spec is None or transformers_spec.origin is None:
    raise ModuleNotFoundError("transformers is not installed in the environment of this kernel")

# spec.origin is the package's __init__.py; its parent is the package directory.
transformers_path = Path(transformers_spec.origin).parent

# Directory (relative to the notebook) holding the replacement tokenizer sources.
input_dir = Path("input")

convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path / convert_file.name

# Remove the installed copy first, then drop the replacement into the package root.
if conversion_path.exists():
    conversion_path.unlink()

shutil.copy(convert_file, transformers_path)
deberta_v2_path = transformers_path / "models" / "deberta_v2"

# Replace the slow and fast DeBERTa-v2 tokenizer modules with the versions from input/.
for filename in ["tokenization_deberta_v2.py", "tokenization_deberta_v2_fast.py"]:
    filepath = deberta_v2_path / filename
    if filepath.exists():
        filepath.unlink()

    shutil.copy(input_dir / filename, filepath)

In [59]:
import botocore
import boto3
import sagemaker
import transformers

# Report the versions of the libraries as imported by this kernel.
print(f"sagemaker: {sagemaker.__version__}")
print(f"transformers: {transformers.__version__}")

sagemaker: 2.88.2
transformers: 4.18.0


Run the following cell only if you need to upgrade ipywidgets for the datasets library; it restarts the kernel so that the upgrade takes effect. This is only needed when preprocessing is done in the notebook.

In [None]:
%%capture
import IPython
!conda install -c -v conda-forge ipywidgets -y
# has to restart kernel for the updates to be applied
IPython.Application.instance().kernel.do_shutdown(True)


### Setup SageMaker environment

In [60]:
# Bootstrap a session and look up the execution role.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# Bucket used for uploading data, models and logs.
# SageMaker will automatically create this bucket if it does not exist.
# Leave it as None to fall back to the session's default bucket.
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Recreate the session so that it is bound to the selected bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")
# Add instructions for local environment later, if needed

sagemaker role arn: arn:aws:iam::570106654206:role/Dev
sagemaker bucket: sagemaker-us-west-2-570106654206
sagemaker session region: us-west-2


### Preparing the dataset

1. HF Tutorial with SQUAD: https://huggingface.co/course/chapter7/7?fw=tf

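2. DeBERTa v2 is not listed as having a fast tokenizer, and v3 is assumed not to have one either, which is why Step 1 copies patched tokenizer files into the installed Transformers package: https://huggingface.co/docs/transformers/index#supported-frameworks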

In [None]:
from datasets import load_dataset

# Load the "squad" dataset through the datasets library.
raw_datasets = load_dataset("squad")

In [None]:
# Print the loaded dataset object to inspect it.
print(raw_datasets)

In [None]:
# Look at a single training record (index 12): its context, question and answers.
sample = raw_datasets["train"][12]
print("Context: ", sample["context"])
print("Question: ", sample["question"])
print("Answer: ", sample["answers"])

In [None]:
# Verify that training samples have only one possible answer, but validation and testing can have multiple.
# The filter keeps the training rows whose number of answer texts is not exactly 1.
print(raw_datasets["train"].filter(lambda x: len(x["answers"]["text"]) != 1))
# Print the answers of the first validation record for comparison.
print(raw_datasets["validation"][0]["answers"])

In [None]:
# Convert the text in the input into IDs the model can make sense of, using a tokenizer.

from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast

# Checkpoint name passed to from_pretrained to load the tokenizer.
model_checkpoint = "microsoft/deberta-v3-base"
tokenizer = DebertaV2TokenizerFast.from_pretrained(model_checkpoint)

# Check if deberta-v3-base has a fast tokenizer (the value is displayed as the cell output).
tokenizer.is_fast

In [None]:
# The maximum length of a feature (question and context).
max_length = 384
# The authorized overlap between two parts of the context when splitting it is needed.
stride = 128
# True when the tokenizer pads on the right-hand side.
pad_on_right = tokenizer.padding_side == "right"

In [None]:
def preprocess_training_examples(examples):
    """Tokenize question/context pairs and label each feature with answer token positions.

    Relies on the module-level ``tokenizer``, ``max_length`` and ``stride``.

    Args:
        examples: A batch with "question", "context" and "answers" columns. Each
            answers entry provides "answer_start" and "text" lists; only their
            first element is used.

    Returns:
        The tokenizer output, with "offset_mapping" and "overflow_to_sample_mapping"
        removed and "start_positions" / "end_positions" added (one entry per feature).
        A feature whose context window does not fully contain the answer is
        labelled (0, 0).
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions, end_positions = [], []

    for feature_idx, offsets in enumerate(offset_mapping):
        # Several features can originate from the same example; look up its answer.
        answer = answers[sample_map[feature_idx]]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(feature_idx)

        # Locate the first and last token whose sequence id is 1 (the context).
        context_start = 0
        while sequence_ids[context_start] != 1:
            context_start += 1
        cursor = context_start
        while sequence_ids[cursor] == 1:
            cursor += 1
        context_end = cursor - 1

        # Answer not fully inside this context window: label the feature (0, 0).
        answer_in_window = (
            offsets[context_start][0] <= start_char and offsets[context_end][1] >= end_char
        )
        if not answer_in_window:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Walk forward to the last token that starts at or before the answer start.
        token_idx = context_start
        while token_idx <= context_end and offsets[token_idx][0] <= start_char:
            token_idx += 1
        start_positions.append(token_idx - 1)

        # Walk backward to the first token that ends at or after the answer end.
        token_idx = context_end
        while token_idx >= context_start and offsets[token_idx][1] >= end_char:
            token_idx -= 1
        end_positions.append(token_idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Apply the preprocessing function in batches and drop the original columns
# (the column names are taken from the train split).
tokenized_datasets = raw_datasets.map(
    preprocess_training_examples, batched=True, remove_columns=raw_datasets["train"].column_names
)

In [None]:
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

# Both splits are switched to the "torch" format with the same four columns.
torch_columns = ["attention_mask", "end_positions", "input_ids", "start_positions"]
for split_dataset in (train_dataset, eval_dataset):
    split_dataset.set_format("torch", columns=list(torch_columns))

In [None]:
# Show the column names of the training split.
print(train_dataset.column_names)


In [None]:
import botocore
from datasets.filesystems import S3FileSystem

# Filesystem handle passed to save_to_disk below.
s3 = S3FileSystem()

# Key prefix used under the session's default bucket.
s3_prefix = "samples/datasets/squad"

# save train_dataset to s3
training_input_path = f"s3://{sess.default_bucket()}/{s3_prefix}/train"
print(training_input_path)
train_dataset.save_to_disk(training_input_path, fs=s3)

# save eval_dataset to s3
eval_input_path = f"s3://{sess.default_bucket()}/{s3_prefix}/eval"
eval_dataset.save_to_disk(eval_input_path, fs=s3)


### Build the docker container and push it to ECR

In [62]:
# By default, set the account ID used for most regions
dlc_account_id = 763104351884
region = "us-west-2"
# Image name. Example: pt-smdataparallel-efficientnet-sagemaker
image = "pt-hf-smdataparallel-deberta-sagemaker"
# Image tag. Example: latest
tag = "latest"

In [63]:
# Print the Dockerfile with syntax highlighting.
!pygmentize ./Dockerfile

[34mARG[39;49;00m region

[37m# Download base PT DLC. Note that this notebook requires a HF DLC with >= PT 1.10.2[39;49;00m
[34mFROM[39;49;00m [33m763104351884.dkr.ecr.${region}.amazonaws.com/huggingface-pytorch-training:1.10.2-transformers4.17.0-gpu-py38-cu113-ubuntu20.04[39;49;00m

[34mARG[39;49;00m [31mWORK_DIR[39;49;00m=[33m"transformers_build"[39;49;00m
[34mWORKDIR[39;49;00m[33m $WORK_DIR[39;49;00m
[34mRUN[39;49;00m pwd; pip install git+https://github.com/huggingface/transformers; [36mecho[39;49;00m [33m"installed tran"[39;49;00m; [36mcd[39;49;00m transformers; [33m\[39;49;00m
    python setup.py; [33m\[39;49;00m
    [36mcd[39;49;00m ../..; rm -rf [31m$WORK_DIR[39;49;00m;

[37m# ARG dlc_account_id[39;49;00m
[37m# ARG region[39;49;00m

[37m# # Download base PT DLC. Note that this notebook required a DLC with >= PT 1.10.2[39;49;00m
[37m# FROM ${dlc_account_id}.dkr.ecr.${region}.amazonaws.com/pytorch-training:1.10.2-gpu-py38-cu1

In [64]:
# Print the build-and-push script with syntax highlighting.
!pygmentize ./build_and_push.sh

[37m#!/usr/bin/env bash[39;49;00m
[37m# This script shows how to build the Docker image and push it to ECR to be ready for use[39;49;00m
[37m# by SageMaker.[39;49;00m
[37m# The argument to this script is the image name. This will be used as the image on the local[39;49;00m
[37m# machine and combined with the account and region to form the repository name for ECR.[39;49;00m
[37m# set region[39;49;00m

[31mDIR[39;49;00m=[33m"[39;49;00m[34m$([39;49;00m [36mcd[39;49;00m [33m"[39;49;00m[34m$([39;49;00m dirname [33m"[39;49;00m[33m${[39;49;00m[31mBASH_SOURCE[39;49;00m[0][33m}[39;49;00m[33m"[39;49;00m [34m)[39;49;00m[33m"[39;49;00m && [36mpwd[39;49;00m [34m)[39;49;00m[33m"[39;49;00m
[36mecho[39;49;00m [33m"[39;49;00m[33mDir: [39;49;00m[33m${[39;49;00m[31mDIR[39;49;00m[33m}[39;49;00m[33m"[39;49;00m
[36mcd[39;49;00m [33m${[39;49;00m[31mDIR[39;49;00m[33m}[39;49;00m/deberta

[34mif[39;49;00m [ [33m"[39;49;00m[31m$#[

In [66]:
# Make the script executable, then run it with the region, image name and tag
# interpolated from the notebook variables.
! chmod +x build_and_push.sh
! bash build_and_push.sh {region} {image} {tag}

Dir: /home/ec2-user/SageMaker
creating ECR repository : 570106654206.dkr.ecr.us-west-2.amazonaws.com/pt-hf-smdataparallel-deberta-sagemaker:latest 
https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
build_and_push.sh  DeBERTa_HF_SM_train.ipynb  hf_sm_script  lost+found
deberta		   Dockerfile		      input	    __pycache__
Sending build context to Docker daemon  7.727kB
Step 1/5 : ARG region
Step 2/5 : FROM 763104351884.dkr.ecr.${region}.amazonaws.com/huggingface-pytorch-training:1.10.2-transformers4.17.0-gpu-py38-cu113-ubuntu20.04
1.10.2-transformers4.17.0-gpu-py38-cu113-ubuntu20.04: Pulling from huggingface-pytorch-training
Digest: sha256:3d75e16fbada01c13bf1e4171fe67d88c5ebabc26f15e1e245d3844daae5e301
Status: Downloaded newer image for 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.10.2-transformers4.17.0-gpu-py38-cu113-ubuntu20.04
 ---> a05fce0f724f
Step 3/5 : ARG WORK_DIR="transformers_build"
 ---> Using cache
 

In [69]:
import boto3
from sagemaker.huggingface import HuggingFace

# S3 locations of the preprocessed datasets. They are derived from the session's default
# bucket instead of a hard-coded, account-specific bucket name, so the cell also works
# in other accounts and regions (and without re-running the preprocessing cells).
training_input_path = f"s3://{sess.default_bucket()}/samples/datasets/squad/train/"
eval_input_path = f"s3://{sess.default_bucket()}/samples/datasets/squad/eval/"

# hyperparameters, which are passed into the training script
hyperparameters = {
    "epochs": 20,
    "train_batch_size": 8,
    "acc": 1,
}

# URI of the custom image pushed to ECR by build_and_push.sh. The account ID is looked
# up from the caller's identity rather than hard-coded; region, image and tag are the
# values that were passed to the build script.
# refer https://github.com/aws/deep-learning-containers/blob/master/available_images.md#huggingface-training-containers to get the right uri's based on region
account_id = boto3.client("sts").get_caller_identity()["Account"]
image_uri = f"{account_id}.dkr.ecr.{region}.amazonaws.com/{image}:{tag}"

# configuration for running training on smdistributed Data Parallel
# this is the only line of code change required to leverage SageMaker Distributed Data Parallel
distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}
# Alternative (MPI): distribution = {"mpi": {"enabled": True, "num_of_processes_per_host": 8}}

# create the Estimator
huggingface_estimator = HuggingFace(
    entry_point="sm_hf_deberta_squad_train.py",
    source_dir="./deberta",
    instance_type="ml.p4d.24xlarge",
    instance_count=1,
    role=role,
    py_version="py38",
    image_uri=image_uri,
    hyperparameters=hyperparameters,
    distribution=distribution,
    max_retry_attempts=30,
)

# define a data input dictionary with our uploaded s3 uris
data = {
    "train": training_input_path,
    "eval": eval_input_path,
}

# starting the train job with our uploaded datasets as input;
# wait=False returns immediately instead of blocking until the job finishes
huggingface_estimator.fit(data, wait=False)

# The name of the training job. You might need to note this down in case you lose connection to your notebook.
print(huggingface_estimator.latest_training_job.name)

pt-hf-smdataparallel-deberta-sagemaker-2022-05-12-23-12-34-905
