Using kernel `conda_pytorch_latest_p36`

In [1]:
import sys
# Put the directory three levels above the working directory on the import path —
# presumably the repository root, so that the `deep` package imported below resolves;
# confirm against the repo layout. The path is relative, so it depends on the
# directory the kernel was started in.
sys.path.append('../../../')

In [2]:
from pathlib import Path
import os
import random
from tqdm import tqdm

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
import torch

In [4]:
from deep.constants import *

In [5]:
%load_ext autoreload
%autoreload 2

In [6]:
from transformers import (
    AutoTokenizer, 
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    TrainingArguments, 
    Trainer
)

## SageMaker Prep

In [7]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3

# SageMaker session configured with an explicit default S3 bucket.
default_bucket = 'deep-experiments-sagemaker-bucket'
sess = sagemaker.Session(default_bucket=default_bucket)

# IAM role that SageMaker uses to access AWS resources (S3, CloudWatch) on our behalf.
role = 'AmazonSageMaker-ExecutionRole-20210519T102514'
print(role)

# Bucket and key prefix for stored data; replace the prefix if the data
# should be stored under a different location.
bucket = DEV_BUCKET
prefix = "huggingface/first"


AmazonSageMaker-ExecutionRole-20210519T102514


### Bucket upload

In [8]:
# S3 URI of the packaged model artifact (model.tar.gz) that is handed to PyTorchModel
# as `model_data`.
# NOTE(review): the variable name says "mnist", but the rest of this notebook works with
# DistilBERT text classification — presumably a leftover from an example; confirm.
pt_mnist_model_data = 's3://sagemaker-us-east-1-961104659532/pytorch-training-2021-05-26-13-34-05-285/output/model.tar.gz'

In [9]:
from sagemaker.pytorch import PyTorchModel

# NOTE(review): `hyperparameters` is not passed to PyTorchModel in this cell —
# presumably carried over from the training notebook; confirm whether it is still needed.
hyperparameters = dict(
    train_batch_size=32,
    model_name='distilbert-base-uncased',
)

# Wrap the trained artifact in a deployable SageMaker model; `inference.py` inside
# `source_dir` is the entry point script for the serving container.
model = PyTorchModel(
    model_data=pt_mnist_model_data,
    role=role,
    entry_point="inference.py",
    source_dir=str(SCRIPTS_TRAINING_PATH / 'stefano/pytorch_estimator_example'),
    py_version="py3",
    framework_version="1.8.1",
)

In [10]:
# Candidate SageMaker instance types for the endpoint.
# NOTE(review): the deploy cell hard-codes its instance type rather than reading this list.
instances = ['ml.p2.xlarge', 'ml.p3.2xlarge', 'ml.c4.xlarge']

In [11]:
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Deployment target switch: True selects instance_type "local"; leave it False
# to deploy on a remote SageMaker instance.
local_mode = False
instance_type = "local" if local_mode else "ml.p2.xlarge"

# Create the endpoint. Requests are serialized to JSON and responses parsed from JSON.
predictor = model.deploy(
    instance_type=instance_type,
    initial_instance_count=1,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

----------------------!

In [12]:
# Local copies of the base DistilBERT tokenizer and sequence-classification model,
# both loaded from the 'distilbert-base-uncased' checkpoint.
models = dict(
    tokenizer=DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased'),
    model=DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased'),
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [13]:
# Synthetic request payload: three short sentences repeated 100 times (300 inputs).
dummy_data = {
    "inputs": 100 * [
        'There are many health problems',
        'There are many more health problems',
        'hey how are you?',
    ]
}

In [20]:
# Load the v0.4.2 train / validation / test splits, in that order.
train, val, test = (
    pd.read_csv(LATEST_DATA_PATH / csv_name)
    for csv_name in (
        'data_v0.4.2_train.csv',
        'data_v0.4.2_val.csv',
        'data_v0.4.2_test.csv',
    )
)

In [32]:
# Real request payload: the `excerpt` values of training rows 400-499 (positional slice).
# This rebinds `dummy_data`, replacing any earlier payload stored under that name.
dummy_data = {'inputs': train['excerpt'].iloc[400:500].tolist()}

In [33]:
# Send the payload to the deployed endpoint. The predictor was created with
# JSONSerializer / JSONDeserializer, so the dict is sent as JSON and the
# response is parsed from JSON.
res = predictor.predict(dummy_data)

In [29]:
# Display the raw endpoint response. The recorded output is a list with one pair of
# floats per entry — presumably per-input class scores; confirm against inference.py.
# NOTE(review): this cell's execution count (29) is older than the predict cell's (33),
# so the output shown may come from an earlier payload; re-run to refresh it.
res

[[-0.012573882937431335, -0.07482992857694626],
 [-0.009653947316110134, -0.027969634160399437],
 [-0.061148516833782196, -0.05205824598670006],
 [-0.036401353776454926, -0.07345907390117645],
 [-0.030329568311572075, -0.06974519789218903],
 [-0.016993410885334015, -0.05919870361685753],
 [-0.07242336869239807, -0.04151153936982155],
 [-0.056255873292684555, -0.07156162708997726],
 [-0.014177772216498852, -0.014838998205959797],
 [-0.0752091184258461, -0.07641132920980453],
 [0.01882767863571644, -0.013624168001115322],
 [-0.03899262100458145, -0.07284563779830933],
 [0.012607359327375889, -0.06264359503984451],
 [-0.01968485675752163, -0.06865695863962173],
 [-0.02501075156033039, -0.06545975804328918],
 [-0.06887733191251755, -0.0839947834610939],
 [-0.0577133409678936, -0.07013429701328278],
 [-0.04503881558775902, -0.0634227991104126],
 [-0.023082586005330086, -0.10964049398899078],
 [-0.03198631480336189, -0.09623625129461288],
 [-0.033341553062200546, -0.07573923468589783],
 [-0.

In [34]:
# Cleanup is disabled: while the line below stays commented out, this notebook does not
# delete the endpoint created by model.deploy(...). Uncomment and run it to tear the
# endpoint down once it is no longer needed.
# predictor.delete_endpoint()