# Getting started with Hugging Face and Amazon SageMaker

## Binary classification on movie reviews

* https://huggingface.co/distilbert-base-uncased
* https://huggingface.co/transformers/model_doc/distilbert.html
* https://huggingface.co/datasets/imdb

# Setup

In [None]:
!pip -q install "sagemaker>=2.31.0" "transformers>=4.4.2" "datasets[s3]==1.5.0" --upgrade

In [None]:
!pip -q install torch tensorflow --upgrade

In [None]:
import sagemaker

# Show the installed SDK version (the install cell above asks for sagemaker>=2.31.0).
print(sagemaker.__version__)

# Session handle, its default S3 bucket, and the execution role; `bucket` and
# `role` are reused by the upload and training cells further down.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Preprocessing

We are using the `datasets` library to download and preprocess the `imdb` dataset. After preprocessing, the dataset will be uploaded to the session's default S3 bucket (the `bucket` variable defined above) to be used within our training job. The [imdb](http://ai.stanford.edu/~amaas/data/sentiment/) dataset consists of 25,000 training and 25,000 testing highly polar movie reviews.

In [None]:
from datasets import load_dataset

# Request the 'train' and 'test' splits of imdb in a single call and unpack
# them in the order they were requested.
train_dataset, test_dataset = load_dataset('imdb', split=['train', 'test'])

# Dimensions of each split.
print(train_dataset.shape)
print(test_dataset.shape)

In [None]:
print(train_dataset[0])

In [None]:
from transformers import AutoTokenizer

# Tokenizer for the same checkpoint that is fine-tuned later in this notebook.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize(batch):
    """Tokenize the 'text' column of a batch of examples.

    Each text is truncated or padded to the tokenizer's maximum length, so the
    encoding of an example does not depend on which batch it is processed in.
    """
    return tokenizer(batch['text'], padding='max_length', truncation=True)

# Because padding is fixed-length, the batch size has no effect on the result.
# Use the default batch size of `map` instead of one dataset-sized batch, which
# would tokenize an entire split in a single call and hold it all in memory.
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

In [None]:
print(train_dataset[0])

In [None]:
# Rename the "label" column to "labels" in both splits
# (presumably the column name the training script / model expects — confirm against scripts/train.py)
train_dataset = train_dataset.rename_column("label", "labels")
test_dataset = test_dataset.rename_column("label", "labels")

In [None]:
print(train_dataset[0])

# Upload data to S3

In [None]:
from datasets.filesystems import S3FileSystem

# Key prefix under the session bucket, and the filesystem handle used for both uploads.
s3_prefix = 'hugging-face/demo'
s3 = S3FileSystem()

# Destination URIs; the training cells pass these to `fit` as the data channels.
training_input_path = f's3://{bucket}/{s3_prefix}/train'
test_input_path = f's3://{bucket}/{s3_prefix}/test'

# Write each preprocessed split to its S3 location.
train_dataset.save_to_disk(training_input_path, fs=s3)
test_dataset.save_to_disk(test_input_path, fs=s3)

In [None]:
print(training_input_path)
print(test_input_path)

# Fine-tuning & starting SageMaker Training Job

In [None]:
!pygmentize ./scripts/train.py

## Fine-tune the Hugging Face model on SageMaker

In [None]:
# Hyperparameters handed to the estimator for the single-instance run.
hyperparameters = dict(
    epochs=1,
    train_batch_size=32,
    model_name='distilbert-base-uncased',
)

In [None]:
import sagemaker.huggingface
from sagemaker.huggingface import HuggingFace

# Estimator for the fine-tuning job: a single ml.p3.2xlarge instance running
# ./scripts/train.py with the `hyperparameters` dict defined in the previous cell.
huggingface_estimator = HuggingFace(
    role=role,
    # Fine-tuning script
    entry_point='train.py',
    source_dir='./scripts',
    hyperparameters=hyperparameters,
    # Infrastructure
    transformers_version='4.4.2',
    pytorch_version='1.6.0',
    py_version='py36',
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    # Managed Spot Training
    # NOTE(review): max_wait and max_run are both 3600 seconds — confirm this
    # leaves enough slack for time spent waiting on Spot capacity.
    use_spot_instances=True,
    max_wait=3600,
    max_run=3600,
    # Disable profiling
    disable_profiler=True
)

In [None]:
huggingface_estimator.fit(
    {'train': training_input_path, 'test': test_input_path}
)

In [None]:
huggingface_estimator.sagemaker_session.logs_for_job(
    huggingface_estimator.latest_training_job.name)

## Retrieve model, load it and predict

In [None]:
%%sh -s $huggingface_estimator.model_data
# Stop at the first failing command, so a failed download can never lead to
# extracting a stale model.tar.gz left over from an earlier run.
set -e
# $1 is the model artifact URI passed in through `-s` on the cell magic line.
aws s3 cp "$1" .
mkdir -p model
tar -xvzf model.tar.gz -C model

In [None]:
from transformers import AutoModel, AutoConfig, DistilBertForSequenceClassification

# Rebuild the fine-tuned classifier from the files extracted into ./model:
# the configuration comes from config.json, the weights from pytorch_model.bin.
config = AutoConfig.from_pretrained('./model/config.json')
model = DistilBertForSequenceClassification.from_pretrained('./model/pytorch_model.bin', config=config)

# Display the loaded configuration and the model that was built from it.
print(config)
print(model)

In [None]:
# Two sample reviews; swap which line is commented out to try the negative one.
#inputs = tokenizer("The Phantom Menace was a really bad movie. What a waste of my life.", return_tensors='pt')
inputs = tokenizer("The Phantom Menace was an amazing movie. Jar Jar rocks!", return_tensors='pt')

# Token ids and attention mask produced for the sample (requested with return_tensors='pt').
print(inputs.input_ids)
print(inputs.attention_mask)

In [None]:
import torch

# Inference only: run the forward pass without autograd tracking, so no
# gradient graph is built or kept alive for the prediction.
with torch.no_grad():
    outputs = model(**inputs)

print(outputs.logits)

In [None]:
def top_class(logits):
    """Print the class probabilities for `logits` and return the top class ids.

    Args:
        logits: torch tensor of raw scores with at least two dimensions;
            softmax and argmax are both taken along dimension 1.

    Returns:
        NumPy array with the index of the highest probability along dimension 1.
    """
    # Imported locally so the helper is self-contained within its notebook cell.
    import torch
    import numpy as np

    # Compute the probabilities once and reuse them for display and for argmax.
    probabilities = torch.nn.Softmax(dim=1)(logits)
    print(probabilities)

    # detach() drops any autograd history so the tensor can be converted to NumPy.
    return np.argmax(probabilities.detach().numpy(), axis=1)

In [None]:
print(top_class(outputs.logits))

In [None]:
## Fine-tune the Hugging Face model on SageMaker with Distributed Training

In [None]:
# Hyperparameters for the multi-instance runs below: 16 epochs instead of 1.
hyperparameters = dict(
    epochs=16,
    train_batch_size=32,
    model_name='distilbert-base-uncased',
)

In [None]:
# Same training script as the single-instance run, now on two ml.p3.16xlarge instances.
# NOTE(review): no `distribution` argument is passed here, unlike the Data
# Parallelism cell further down — confirm scripts/train.py coordinates the two
# instances itself; otherwise each instance may simply run the same job.
huggingface_estimator = HuggingFace(
    role=role,
    # Fine-tuning script
    entry_point='train.py',
    source_dir='./scripts',
    hyperparameters=hyperparameters,
    # Infrastructure
    transformers_version='4.4.2',
    pytorch_version='1.6.0',
    py_version='py36',
    instance_type='ml.p3.16xlarge',
    instance_count=2,
    # Managed Spot Training
    # NOTE(review): `hyperparameters` now requests 16 epochs while max_run stays
    # at 3600 seconds — confirm the job finishes within that limit.
    use_spot_instances=True,
    max_wait=3600,
    max_run=3600,
    # Disable profiling
    disable_profiler=True
)

# Start the training job with the train/test S3 locations as its data channels.
huggingface_estimator.fit({'train': training_input_path, 'test': test_input_path})

## Fine-tune the Hugging Face model on SageMaker with Data Parallelism

In [None]:
# Two ml.p3.16xlarge instances again, this time with the `smdistributed`
# `dataparallel` option switched on through the `distribution` argument.
huggingface_estimator = HuggingFace(
    role=role,
    # Fine-tuning script
    entry_point='train.py',
    source_dir='./scripts',
    hyperparameters=hyperparameters,
    # Infrastructure
    transformers_version='4.4.2',
    pytorch_version='1.6.0',
    py_version='py36',
    instance_type='ml.p3.16xlarge',
    instance_count=2,
    # Managed Spot Training
    # NOTE(review): max_wait and max_run are both 3600 seconds — confirm the
    # configured number of epochs fits within that limit.
    use_spot_instances=True,
    max_wait=3600,
    max_run=3600,
    # Disable profiling
    disable_profiler=True,
    # Data Parallelism
    distribution={'smdistributed': {'dataparallel': {'enabled': True}}}
)

# Start the training job with the train/test S3 locations as its data channels.
huggingface_estimator.fit({'train': training_input_path, 'test': test_input_path})