Using kernel `conda_pytorch_latest_p36`

In [1]:
# !pip install cloudpathlib
# !pip install s3fs
# !pip install transformers
# !pip install pytorch-lightning

# Import

In [2]:
import sys
# Add the directory three levels above the working directory to the import
# path. Presumably this is the repository root that holds the `deep` package
# imported further down — TODO confirm. The path is relative, so it depends on
# the directory the kernel was started in.
sys.path.append('../../../')

In [3]:
from pathlib import Path
import os
import random
import json
from datetime import datetime

In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
import torch
import sagemaker
from sagemaker import get_execution_role
import boto3
import pytorch_lightning as pl
import torch.nn as nn
from transformers import (
    AutoModelForSequenceClassification,
    AutoModel,
    AutoTokenizer,
    AdamW,
    get_linear_schedule_with_warmup,
)
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader,Dataset
from tqdm import tqdm

In [5]:
from deep.constants import *
from deep.utils import *

In [6]:
%load_ext autoreload
%autoreload 2

## Data

In [7]:
# Read the train / validation / test CSVs of data version 0.4.2, in that order.
split_files = (
    'data_v0.4.2_train.csv',
    'data_v0.4.2_val.csv',
    'data_v0.4.2_test.csv',
)
train, val, test = (pd.read_csv(LATEST_DATA_PATH / name) for name in split_files)

In [8]:
def process(df, column='subpillars', classes=None):
    """Convert a raw data split into a two-column ``texts`` / ``labels`` frame.

    The ``column`` cells are parsed from their string form into Python
    objects, multi-hot encoded with a ``MultiLabelBinarizer``, and returned
    next to the excerpt text. The input frame is not modified.

    Args:
        df: DataFrame with an ``excerpt`` column and a ``column`` column.
            Assumes each cell of ``column`` is the string representation of a
            Python list of labels (e.g. ``"['Health', 'WASH']"``) — TODO
            confirm against the CSV export.
        column: Name of the column holding the label lists.
        classes: Optional ordered collection of all label names, forwarded to
            ``MultiLabelBinarizer(classes=...)``. With the default ``None``
            the binarizer is fitted on ``df`` alone, so the number and order
            of label columns depend on which labels occur in this particular
            split. Pass the same ``classes`` for every split to guarantee
            aligned label vectors.

    Returns:
        A new DataFrame with columns ``texts`` (the excerpts) and ``labels``
        (one multi-hot row of the binarizer output per excerpt).

    Raises:
        ValueError / SyntaxError: if a cell of ``column`` is not a valid
            Python literal.
    """
    import ast  # local import so this cell does not depend on the import cell

    df = df.copy()
    # literal_eval only accepts Python literals, so a malformed or malicious
    # CSV cell cannot execute arbitrary code the way eval() would.
    df[column] = df[column].apply(ast.literal_eval)

    mlb = MultiLabelBinarizer(classes=classes)
    labels = mlb.fit_transform(list(df[column]))
    # Store one row of the indicator matrix per DataFrame row.
    df['labels'] = list(labels)

    df = df[['excerpt', 'labels']]
    df = df.rename(columns={'excerpt': 'texts'})

    return df

In [9]:
# Build the texts/labels frames for all three splits from the `sectors` column.
# NOTE(review): each call fits its own MultiLabelBinarizer, so the label
# columns only line up across splits if every split contains the same set of
# sectors — worth verifying.
train_df, val_df, test_df = (
    process(split, column='sectors') for split in (train, val, test)
)

In [10]:
# Checkpoint name used to load the tokenizer in this notebook. The same string
# is repeated literally in the estimator hyperparameters further down.
model_name = 'distilbert-base-uncased'
# Load the tokenizer that matches the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

## Sagemaker Prep

### Session

In [11]:
# SageMaker session whose default bucket is the project's dev bucket.
# NOTE(review): `sess` is not passed to the estimator defined below — confirm
# whether it should be supplied as `sagemaker_session`.
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)
# NOTE(review): the execution role is hard-coded by name, while
# `get_execution_role` is imported above but never called — confirm which is
# intended.
role = 'AmazonSageMaker-ExecutionRole-20210519T102514'

### Bucket upload

In [12]:
# Set to True to run the whole pipeline on 100 random rows per split.
sample = False

if sample:
    train_df = train_df.sample(100)
    val_df = val_df.sample(100)
    test_df = test_df.sample(100)

# Every run gets its own prefix under the dev bucket, named after the job.
job_name = f"pytorch-training-{formatted_time()}"
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name

s3_train_data = str(input_path / 'train_df.pickle')
s3_validation_data = str(input_path / 'val_df.pickle')
s3_test_data = str(input_path / 'test_df.pickle')

# Write each split to its own object. Protocol 4 is presumably pinned so the
# py36 training container can unpickle the frames — TODO confirm.
# Fix: the validation object used to be written from `test_df`, which silently
# replaced the validation data with the test data.
train_df.to_pickle(s3_train_data, protocol=4)
val_df.to_pickle(s3_validation_data, protocol=4)
test_df.to_pickle(s3_test_data, protocol=4)

### Estimator Definition

In [13]:
# Candidate GPU instance types for the training job.
instances = ['ml.p2.xlarge', 'ml.p3.2xlarge']

In [14]:
# Metrics SageMaker should extract from the training log. Each entry looks for
# "'<name>': <number>" in a log line and captures the number in group 1.
_METRIC_NAMES = [
    'loss',
    'learning_rate',
    'eval_loss',
    'eval_accuracy',
    'f1',
    'eval_precision',
    'eval_recall',
    'stupid_metric',
    'eval_runtime',
    'eval_samples_per_second',
    'epoch',
]

# Digits, an optional fractional part and an optional exponent. The previous
# pattern `[0-9]+(.|e\-)[0-9]+` had an unescaped dot (matching any character)
# and treated the fraction and the exponent as alternatives, so a value such
# as 4.5e-05 was captured as 4.5. A raw string also avoids the invalid `\-`
# escape sequence.
_NUMBER_PATTERN = r"([0-9]+(?:\.[0-9]+)?(?:e[-+]?[0-9]+)?)"

metric_definitions = [
    {'Name': name, 'Regex': f"'{name}': {_NUMBER_PATTERN},?"}
    for name in _METRIC_NAMES
]

In [15]:
# # set True if you need spot instance
# use_spot = True
# train_max_run_secs =   2*24 * 60 * 60
# spot_wait_sec =  5 * 60
# max_wait_time_secs = train_max_run_secs +  spot_wait_sec

# if not use_spot:
#     max_wait_time_secs = None
    
# # During local mode, no spot.., use smaller dataset
# if instance_type == 'local':
#     use_spot = False
#     max_wait_time_secs = 0
#     wait = True
#     # Use smaller dataset to run locally
#     inputs = inputs_sample


In [16]:
from sagemaker.pytorch import PyTorch

# Passed to the estimator as hyperparameters for the training script.
hyperparameters = {
    'epochs': 1,
    'train_batch_size': 32,
    'model_name': 'distilbert-base-uncased',
    'n_classes': 11,
}

estimator = PyTorch(
    # What to run
    entry_point='train.py',
    source_dir=str(SCRIPTS_TRAINING_PATH / 'stefano/multiclass-lightning'),
    hyperparameters=hyperparameters,
    # Container runtime
    framework_version='1.8',
    py_version='py36',
    # Where to run and under which role
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    # S3 locations for the model artifacts and the uploaded source
    output_path=str(DEV_BUCKET / 'models/'),
    code_location=str(input_path),
    # Metrics to extract from the job logs
    metric_definitions=metric_definitions,
    # NOTE(review): the job name is also passed to `fit()`; confirm the
    # constructor actually uses this argument.
    job_name=job_name,
)

In [17]:
# Both input channels point at the same prefix, which holds all three pickles.
fit_arguments = {channel: str(input_path) for channel in ('train', 'test')}

In [18]:
# Start the training job under `job_name` and block until it finishes
# (wait=True). The captured output below shows that this run ended with
# "Training job failed".
estimator.fit(fit_arguments, job_name=job_name, wait=True)

2021-06-02 07:55:19 Starting - Starting the training job...
2021-06-02 07:55:22 Starting - Launching requested ML instancesProfilerReport-1622620517: InProgress
......
2021-06-02 07:56:38 Starting - Preparing the instances for training.........
2021-06-02 07:58:19 Downloading - Downloading input data
2021-06-02 07:58:19 Training - Downloading the training image.................[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-06-02 08:01:34,590 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-06-02 08:01:34,614 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m

2021-06-02 08:01:39 Training - Training image download completed. Training in progress.[34m2021-06-02 08:01:40,837 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-06-02 08:01:41,191 sagemaker-trainin

[34mInstalling collected packages: pyasn1-modules, oauthlib, multidict, cachetools, yarl, requests-oauthlib, idna-ssl, google-auth, async-timeout, tensorboard-plugin-wit, regex, markdown, grpcio, google-auth-oauthlib, filelock, aiohttp, absl-py, torchmetrics, tokenizers, tensorboard, sacremoses, pyDeprecate, huggingface-hub, transformers, pytorch-lightning[0m
[34mSuccessfully installed absl-py-0.12.0 aiohttp-3.7.4.post0 async-timeout-3.0.1 cachetools-4.2.2 filelock-3.0.12 google-auth-1.30.1 google-auth-oauthlib-0.4.4 grpcio-1.38.0 huggingface-hub-0.0.8 idna-ssl-1.1.0 markdown-3.3.4 multidict-5.1.0 oauthlib-3.1.1 pyDeprecate-0.3.0 pyasn1-modules-0.2.8 pytorch-lightning-1.3.3 regex-2021.4.4 requests-oauthlib-1.3.0 sacremoses-0.0.45 tensorboard-2.4.1 tensorboard-plugin-wit-1.8.0 tokenizers-0.10.3 torchmetrics-0.3.2 transformers-4.6.1 yarl-1.6.3[0m
[0m
[34m2021-06-02 08:01:55,067 sagemaker-training-toolkit INFO     Invoking user script
[0m
[34mTraining Env:
[0m
[34m{
    "additio

[34m2021-06-02 08:01:59,955 - filelock - INFO - Lock 140188903079216 released on /root/.cache/huggingface/transformers/8c8624b8ac8aa99c60c912161f8332de003484428c47906d7ff7eb7f73eecdbb.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79.lock[0m
[34m2021-06-02 08:02:00,065 - filelock - INFO - Lock 140184926625576 acquired on /root/.cache/huggingface/transformers/9c169103d7e5a73936dd2b627e42851bec0831212b677c637033ee4bce9ab5ee.126183e36667471617ae2f0835fab707baa54b731f991507ebbb55ea85adb12a.lock[0m
[34m2021-06-02 08:02:04,986 - filelock - INFO - Lock 140184926625576 released on /root/.cache/huggingface/transformers/9c169103d7e5a73936dd2b627e42851bec0831212b677c637033ee4bce9ab5ee.126183e36667471617ae2f0835fab707baa54b731f991507ebbb55ea85adb12a.lock[0m
[34m#015Validation sanity check: 0it [00:00, ?it/s]#015Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s][2021-06-02 08:02:09.235 algo-1:37 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2021-0


2021-06-02 08:02:22 Uploading - Uploading generated training model
2021-06-02 08:02:22 Failed - Training job failed
ProfilerReport-1622620517: Stopping
[34m#015Validation sanity check:  50%|█████     | 1/2 [00:01<00:01,  1.52s/it]#015                                                                      #015#015Training: 0it [00:00, ?it/s]#015Training:   0%|          | 0/3008 [00:00<?, ?it/s]#015Epoch 0:   0%|          | 0/3008 [00:00<?, ?it/s] #015Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]#015Downloading: 100%|██████████| 442/442 [00:00<00:00, 694kB/s][0m
[34m#015Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]#015Downloading: 100%|██████████| 232k/232k [00:00<00:00, 44.0MB/s][0m
[34m#015Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]#015Downloading: 100%|██████████| 466k/466k [00:00<00:00, 45.8MB/s][0m
[34m#015Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]#015Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 42.0kB/s][0m
[34m#015Downlo

UnexpectedStatusException: Error for Training job pytorch-training-2021-06-02-09-54-54-518: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/opt/conda/bin/python3.6 train.py --epochs 1 --model_name distilbert-base-uncased --n_classes 11 --train_batch_size 32"
Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]Downloading: 100%|ââââââââââ| 442/442 [00:00<00:00, 694kB/s]
Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]Downloading: 100%|ââââââââââ| 232k/232k [00:00<00:00, 44.0MB/s]
Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]Downloading: 100%|ââââââââââ| 466k/466k [00:00<00:00, 45.8MB/s]
Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]Downloading: 100%|ââââââââââ| 28.0/28.0 [00:00<00:00, 42.0kB/s]
Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]Downloading:   2%|â         | 4.66M/268M [00:00<00:05, 46.6MB/s]Downloading:   4%|â         | 9.49M/268M [00:00<00:05, 47.1MB/s]Downloading:   5%|â         | 14.3M/268M [00:00<00:05, 47.5MB/s]Downloading:   7%|â         | 19.2M

In [None]:
from sagemaker import TrainingJobAnalytics

# Fetch the metrics captured for the most recent training job as a pandas
# DataFrame and preview the first ten rows.
latest_job_name = estimator.latest_training_job.name
df = TrainingJobAnalytics(training_job_name=latest_job_name).dataframe()
df.head(10)

In [None]:
# Display the full metrics frame (the previous cell only previews 10 rows).
df