Using kernel `conda_pytorch_latest_p36`

In [1]:
import sys
# Add the directory three levels above the current working directory to the
# module search path (presumably the repository root that holds the `deep`
# package imported in a later cell — TODO confirm). The path is relative, so
# it only resolves correctly when the kernel is started from this notebook's folder.
sys.path.append('../../../')

In [2]:
from pathlib import Path
import os
import random
import json


In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
import torch
import boto3
from transformers import (
    AutoTokenizer, 
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    TrainingArguments, 
    Trainer
)
import sagemaker
from sagemaker import get_execution_role
from sagemaker.transformer import Transformer
from sagemaker.pytorch import PyTorchModel

In [4]:
from deep.constants import *

In [5]:
# Load IPython's autoreload extension; mode 2 re-imports all modules before each
# cell execution, so edits to local project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

## SageMaker Prep

In [6]:
# SageMaker session whose default bucket is the project's experiments bucket.
default_bucket = 'deep-experiments-sagemaker-bucket'
sess = sagemaker.Session(default_bucket=default_bucket)

# IAM role that SageMaker uses to access AWS resources (S3, CloudWatch) on your behalf.
role = 'AmazonSageMaker-ExecutionRole-20210519T102514'
print(role)

# Bucket and key prefix for stored data. SAGEMAKER_BUCKET is not defined in this
# notebook; presumably it is supplied by the star import from deep.constants.
bucket = SAGEMAKER_BUCKET
prefix = "huggingface/first"  # Replace with the prefix under which you want to store the data if needed


AmazonSageMaker-ExecutionRole-20210519T102514


### Bucket upload

In [7]:
# S3 location of the model archive (model.tar.gz) that the PyTorchModel below is built from.
# NOTE(review): the "mnist" in the name looks like a leftover from a template; the
# surrounding cells refer to a DistilBERT text model — consider renaming everywhere.
pt_mnist_model_data = (
    's3://sagemaker-us-east-1-961104659532/pytorch-training-2021-05-26-13-34-05-285/output/model.tar.gz'
)

In [8]:
# NOTE: `hyperparameters` is not consumed by PyTorchModel (it has no such argument)
# and nothing else in the visible cells reads it. It is kept defined only so that
# any cell relying on the name keeps working; remove it once confirmed unused.
hyperparameters={
    'train_batch_size': 32,
    'model_name': 'distilbert-base-uncased'
}

# Wrap the trained artifact in a deployable SageMaker model. The inference code
# (`batch_inference.py`) is taken from the project's training-scripts directory.
model = PyTorchModel(
    entry_point="batch_inference.py",
    source_dir=str(SCRIPTS_TRAINING_PATH / 'stefano/pytorch_estimator_base'),
    role=role,
    model_data=pt_mnist_model_data,
    framework_version="1.8.1",
    py_version="py3",
    # Use the session created in the SageMaker prep cell. Without this the SDK
    # silently builds a second default session and stages the repacked model/code
    # in the account-default bucket instead of the configured project bucket.
    sagemaker_session=sess,
)

In [9]:
# Derive a batch-transform object from the PyTorchModel: one ml.m5.large instance,
# multiple records packed into each request, and results assembled line by line
# under the `batch_transform_output` key of `bucket`.
transformer = model.transformer(
    instance_type='ml.m5.large',
    instance_count=1,
    assemble_with='Line',
    strategy='MultiRecord',
    output_path='s3://{}/batch_transform_output'.format(bucket),
)

In [10]:
# Tiny two-sentence payload used to smoke-test the batch transform job.
dummy_data = pd.DataFrame(
    data={"inputs": ['There are many health problems', 'There are many more health problems']}
)
# Write the frame straight to S3 as CSV.
# NOTE(review): with default arguments `to_csv` also emits the header row and the
# index column, and the transform job splits this file by line — confirm that
# the inference script's input handling expects both.
dummy_data.to_csv(path_or_buf='s3://deep-experiments-sagemaker-bucket/test1/prova.csv')

In [11]:
# Run the batch transform over the CSV uploaded to S3: the file is declared as
# text/csv, split into records by line, and the job's logs are streamed into
# the cell output.
transformer.transform(
    data='s3://deep-experiments-sagemaker-bucket/test1/prova.csv',
    split_type='Line',
    content_type='text/csv',
    logs=True,
)

.............................[34mCollecting transformers==4.6.1
  Downloading transformers-4.6.1-py3-none-any.whl (2.2 MB)[0m
[34mCollecting pytorch-lightning==1.3.2
  Downloading pytorch_lightning-1.3.2-py3-none-any.whl (805 kB)[0m
[34mCollecting importlib-metadata
  Downloading importlib_metadata-4.3.0-py3-none-any.whl (16 kB)[0m
[34mCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)[0m
[34mCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)[0m
[34mCollecting huggingface-hub==0.0.8
  Downloading huggingface_hub-0.0.8-py3-none-any.whl (34 kB)[0m
[34mCollecting filelock
  Downloading filelock-3.0.12-py3-none-any.whl (7.6 kB)[0m
[34mCollecting regex!=2019.12.17
  Downloading regex-2021.4.4-cp36-cp36m-manylinux2014_x86_64.whl (722 kB)[0m
[34mCollecting fsspec[http]>=2021.4.0
  Downloading fsspec-2021.5.0-py3-none-any.whl (


[34mSuccessfully installed absl-py-0.12.0 aiohttp-3.7.4.post0 async-timeout-3.0.1 attrs-21.2.0 cachetools-4.2.2 click-8.0.1 filelock-3.0.12 fsspec-2021.5.0 google-auth-1.30.1 google-auth-oauthlib-0.4.4 grpcio-1.38.0 huggingface-hub-0.0.8 idna-ssl-1.1.0 importlib-metadata-4.3.0 markdown-3.3.4 multidict-5.1.0 oauthlib-3.1.0 protobuf-3.17.1 pyDeprecate-0.3.0 pyasn1-modules-0.2.8 pytorch-lightning-1.3.2 regex-2021.4.4 requests-oauthlib-1.3.0 sacremoses-0.0.45 tensorboard-2.4.1 tensorboard-plugin-wit-1.8.0 tokenizers-0.10.3 torchmetrics-0.3.2 transformers-4.6.1 werkzeug-2.0.1 yarl-1.6.3 zipp-3.4.1[0m
[34m2021-05-27 14:14:51,840 [INFO ] main org.pytorch.serve.ModelServer - [0m
[34mTorchserve version: 0.3.1[0m
[34mTS Home: /opt/conda/lib/python3.6/site-packages[0m
[34mCurrent directory: /[0m
[34mTemp directory: /home/model-server/tmp[0m
[34mNumber of GPUs: 0[0m
[34mNumber of CPUs: 2[0m
[34mMax heap size: 986 M[0m
[34mPython executable: /opt/conda/bin/python3.6[0m
[34mCon

[32m2021-05-27T14:14:59.253:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m
