# Getting started with Hugging Face and Amazon SageMaker

## Sentiment analysis on product reviews

* https://huggingface.co/distilbert-base-uncased
* https://huggingface.co/transformers/model_doc/distilbert.html
* https://huggingface.co/datasets/generated_reviews_enth

# Setup

In [None]:
!pip -q install awscli "sagemaker>=2.31.0" "transformers>=4.5.0" "datasets[s3]==1.5.0" --upgrade

In [None]:
!pip -q install widgetsnbextension ipywidgets

In [None]:
#!pip -q install torch --upgrade

In [None]:
import sagemaker

# Show the installed SDK version (the install cell asks for sagemaker>=2.31.0).
print(sagemaker.__version__)

# `bucket` is used later to build the S3 upload paths and `role` is passed to
# every HuggingFace estimator.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Preprocessing

We are using the `datasets` library to download and preprocess the `generated_reviews_enth` dataset. After preprocessing, the dataset will be uploaded to the session's default S3 bucket (`bucket`) to be used within our training job. The [generated_reviews_enth](https://huggingface.co/datasets/generated_reviews_enth) dataset contains product reviews as English/Thai translation pairs, each with a star rating (`review_star`); below we keep the English text and map the star rating to a binary sentiment label.

In [None]:
from datasets import load_dataset

# Fetch the three splits in one call; the result follows the order of `split`.
splits = load_dataset('generated_reviews_enth', split=['train', 'validation', 'test'])
train_dataset, valid_dataset, test_dataset = splits

# Report the shape of each split.
for split_dataset in splits:
    print(split_dataset.shape)

In [None]:
train_dataset[0]

In [None]:
def map_stars_to_sentiment(row):
    """Derive a binary sentiment label from a review's star rating.

    Returns a dict with the single key 'labels': 1 when row['review_star']
    is 4 or more, otherwise 0.
    """
    is_positive = row['review_star'] >= 4
    return {'labels': int(is_positive)}

In [None]:
# Add the binary 'labels' column to the training and validation splits.
train_dataset, valid_dataset = (
    split.map(map_stars_to_sentiment) for split in (train_dataset, valid_dataset)
)

In [None]:
train_dataset[0]

In [None]:
# Flatten nested fields so they become dotted top-level columns
# (e.g. 'translation.en') on both splits.
train_dataset, valid_dataset = (
    split.flatten() for split in (train_dataset, valid_dataset)
)

In [None]:
train_dataset[0]

In [None]:
# Columns that are not needed once the label has been derived.
unused_columns = ['correct', 'translation.th', 'review_star']
train_dataset = train_dataset.remove_columns(unused_columns)
valid_dataset = valid_dataset.remove_columns(unused_columns)

In [None]:
# The English review becomes the 'text' column that tokenize() reads.
train_dataset, valid_dataset = (
    split.rename_column('translation.en', 'text')
    for split in (train_dataset, valid_dataset)
)

In [None]:
train_dataset[0]

In [None]:
valid_dataset[0]

## Tokenize

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize(batch):
    """Tokenize the 'text' field of a batch with the module-level tokenizer.

    The tokenizer is called with padding='max_length' and truncation enabled;
    its output is returned unchanged.
    """
    texts = batch['text']
    encoded = tokenizer(texts, padding='max_length', truncation=True)
    return encoded

In [None]:
# batch_size=len(train_dataset) hands the whole split to tokenize() as a single batch.
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))

In [None]:
# batch_size=len(valid_dataset) hands the whole split to tokenize() as a single batch.
valid_dataset = valid_dataset.map(tokenize, batched=True, batch_size=len(valid_dataset))

In [None]:
import json

json.dumps(train_dataset[0])

# Upload data to S3

In [None]:
from datasets.filesystems import S3FileSystem

# Filesystem handle passed to save_to_disk so the splits are written to S3.
s3 = S3FileSystem()

s3_prefix = 'hugging-face/sentiment-analysis'

# Both splits live under the same prefix in the session's default bucket.
train_input_path = f's3://{bucket}/{s3_prefix}/training'
valid_input_path = f's3://{bucket}/{s3_prefix}/validation'

for split_dataset, destination in (
    (train_dataset, train_input_path),
    (valid_dataset, valid_input_path),
):
    split_dataset.save_to_disk(destination, fs=s3)

In [None]:
# If you're using the output from a SageMaker Processing job.
# NOTE: this cell rebinds train_input_path / valid_input_path, so any fit()
# call executed afterwards reads from this location instead of the paths set
# by the upload cell. Skip it to keep those paths.
processing_output = 's3://sagemaker-us-east-1-613904931467/sagemaker-scikit-learn-2021-04-12-17-18-37-118/output'
train_input_path = f'{processing_output}/training'
valid_input_path = f'{processing_output}/validation'


In [None]:
print(train_input_path)
print(valid_input_path)

# Fine-tuning & starting SageMaker Training Job

In [None]:
!pygmentize train.py

## Fine-tune the Hugging Face model on SageMaker

In [None]:
# Hyperparameters forwarded to the train.py entry point by the estimator.
hyperparameters = dict(
    epochs=1,
    train_batch_size=32,
    model_name='distilbert-base-uncased',
)

In [None]:
import sagemaker.huggingface
from sagemaker.huggingface import HuggingFace

# Infrastructure: container versions and hardware for the training job.
infrastructure = dict(
    transformers_version='4.4.2',
    pytorch_version='1.6.0',
    py_version='py36',
    instance_type='ml.p3.2xlarge',
    instance_count=1,
)

# Managed Spot Training
spot_training = dict(use_spot_instances=True, max_wait=3600, max_run=3600)

huggingface_estimator = HuggingFace(
    role=role,
    # Fine-tuning script and its hyperparameters
    entry_point='train.py',
    hyperparameters=hyperparameters,
    # Disable profiling
    disable_profiler=True,
    **infrastructure,
    **spot_training,
)

In [None]:
huggingface_estimator.fit({'train': train_input_path, 'valid': valid_input_path})

## Retrieve model, load it and predict

In [None]:
%%sh -s $huggingface_estimator.model_data
aws s3 cp $1 .
mkdir -p model
tar -xvzf model.tar.gz -C model

In [None]:
from transformers import AutoModel, AutoConfig, DistilBertForSequenceClassification

# Load from the extracted model directory instead of pointing at individual
# files: from_pretrained() resolves config.json and the weights file inside
# the directory, which is the documented way to load a saved model.
model_dir = './model'
config = AutoConfig.from_pretrained(model_dir)
model = DistilBertForSequenceClassification.from_pretrained(model_dir, config=config)

print(config)
print(model)

In [None]:
inputs = tokenizer('This is fantastic product, best purchase in a long time!', return_tensors='pt')
#inputs = tokenizer('What a rip-off, I want my money back', return_tensors='pt')

print(inputs.input_ids)
#print(inputs.attention_mask)

In [None]:
import torch

# Inference only: no_grad() avoids building an autograd graph for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

print(outputs.logits)

In [None]:
import torch
import numpy as np
    
def top_class(logits):
    """Print the class probabilities and return the most likely class per row.

    Args:
        logits: tensor of raw scores with the classes along dimension 1.

    Returns:
        NumPy array holding, for each row, the index of the class with the
        highest probability.
    """
    # Compute the probabilities once and reuse them for printing and argmax.
    probabilities = torch.softmax(logits, dim=1)
    print(probabilities)
    # detach() drops autograd tracking; cpu() lets the NumPy conversion also
    # work for tensors that live on an accelerator.
    return np.argmax(probabilities.detach().cpu().numpy(), axis=1)

In [None]:
print(top_class(outputs.logits))

## Fine-tune the Hugging Face model on SageMaker with Data Parallelism

In [None]:
# Hyperparameters for the data-parallel run, forwarded to train.py.
hyperparameters = dict(
    epochs=8,
    train_batch_size=32,
    model_name='distilbert-base-uncased',
)

In [None]:
# Data Parallelism
distribution = {'smdistributed': {'dataparallel': {'enabled': True}}}

huggingface_estimator = HuggingFace(
    role=role,
    # Fine-tuning script
    entry_point='train.py',
    hyperparameters=hyperparameters,
    # Infrastructure
    transformers_version='4.4.2',
    pytorch_version='1.6.0',
    py_version='py36',
    instance_type='ml.p3.16xlarge',
    instance_count=2,
    # Managed Spot Training
    use_spot_instances=True,
    max_wait=3600,
    max_run=3600,
    distribution=distribution,
)

# One input channel per split.
data_channels = {'train': train_input_path, 'valid': valid_input_path}
huggingface_estimator.fit(data_channels)

## Fine-tune the Hugging Face model on SageMaker with Model Parallelism

In [None]:
# Hyperparameters for the model-parallel run, forwarded to train.py.
hyperparameters = dict(
    epochs=1,
    train_batch_size=32,
    model_name='distilbert-base-uncased',
)

In [None]:
# MPI launcher settings used by the model-parallel estimator.
mpi_options = dict(enabled=True, processes_per_host=2)

# Settings for the 'modelparallel' entry of the estimator's distribution.
# NOTE(review): partitions=4 is combined with processes_per_host=2 above;
# confirm the model parallel library accepts fewer processes per host than
# partitions.
smp_options = dict(
    enabled=True,
    parameters=dict(
        microbatches=2,
        placement_strategy="spread",
        pipeline="interleaved",
        optimize="memory",
        partitions=4,
    ),
)

In [None]:
# Model Parallelism (launched through MPI)
distribution = {"smdistributed": {"modelparallel": smp_options}, "mpi": mpi_options}

huggingface_estimator = HuggingFace(
    role=role,
    # Fine-tuning script
    entry_point='train.py',
    hyperparameters=hyperparameters,
    # Infrastructure
    transformers_version='4.4.2',
    pytorch_version='1.6.0',
    py_version='py36',
    instance_type='ml.p3dn.24xlarge',    # 8 NVIDIA V100 GPUs with 32GB memory = 256GB
    instance_count=1,
    # Managed Spot Training
    use_spot_instances=True,
    max_wait=3600,
    max_run=3600,
    distribution=distribution,
)

# One input channel per split.
data_channels = {'train': train_input_path, 'valid': valid_input_path}
huggingface_estimator.fit(data_channels)