Using kernel `conda_pytorch_latest_p36`

In [None]:
!pip install s3fs

In [None]:
!pip install cloudpathlib
!pip install s3fs
!pip install transformers
!pip install pytorch-lightning

# Imports

In [1]:
import sys

# Put the directory three levels up at the end of the module search path;
# presumably this is what makes the project-local `deep` package importable — TODO confirm.
sys.path.extend(['../../../'])

In [2]:
from pathlib import Path
import os
import random
import json
from datetime import datetime

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
import torch
import sagemaker
from sagemaker import get_execution_role
import boto3
import pytorch_lightning as pl
import torch.nn as nn
from transformers import (
    AutoModelForSequenceClassification,
    AutoModel,
    AutoTokenizer,
    AdamW,
    get_linear_schedule_with_warmup,
)
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader,Dataset
from tqdm import tqdm

In [4]:
from deep.constants import *
from deep.utils import *

In [5]:
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell runs, so changes to project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

## Data

In [6]:
# Read the train / val / test CSVs of data version v0.3 from FRAMEWORKS_PATH.
train, val, test = (
    pd.read_csv(FRAMEWORKS_PATH / f'data_v0.3_{split}.csv')
    for split in ('train', 'val', 'test')
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [7]:
def process(df, mlb=None):
    """Turn a raw split into a two-column frame of texts and multi-hot labels.

    The input frame is not modified; all work happens on a copy.

    Args:
        df: DataFrame with an ``excerpt`` column and a ``dimension_ids`` column
            whose cells are string representations of Python literals
            (e.g. ``"[1, 3]"``).
        mlb: Optional binarizer exposing ``transform``. When given, it is used
            as-is (it is expected to be fitted already), which keeps the label
            columns aligned across several splits. When omitted, a new
            ``MultiLabelBinarizer`` is fitted on ``df`` alone, as before — in
            that case the label columns only line up between splits if every
            split contains the same set of ids.

    Returns:
        A new DataFrame with the columns ``texts`` (the former ``excerpt``)
        and ``labels`` (one multi-hot row per example).

    Raises:
        ValueError / SyntaxError: if a ``dimension_ids`` cell is not a valid
            Python literal.
    """
    # Local import: `ast` is not among the notebook's top-level imports.
    import ast

    df = df.copy()
    # literal_eval only parses literals; eval would execute whatever expression
    # happens to be stored in the CSV.
    df['dimension_ids'] = df['dimension_ids'].apply(ast.literal_eval)

    id_lists = list(df['dimension_ids'])
    if mlb is None:
        mlb = MultiLabelBinarizer()
        labels = mlb.fit_transform(id_lists)
    else:
        labels = mlb.transform(id_lists)
    # list(...) splits the 2-D matrix into one row per DataFrame cell.
    df['labels'] = list(labels)

    df = df[['excerpt', 'labels']]
    df = df.rename(columns={'excerpt': 'texts'})

    return df

In [8]:
# Run the same preprocessing over each split, in train / val / test order.
# NOTE(review): every call fits its own MultiLabelBinarizer inside `process`, so the
# label columns are only aligned across splits if each split contains the same set
# of dimension ids — confirm.
train_df, val_df, test_df = (process(split) for split in (train, val, test))

In [9]:
# Name of the pretrained checkpoint; the same checkpoint name is also used as the
# `model_name` hyperparameter of the training job further down.
model_name = 'distilbert-base-uncased'
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

## SageMaker Prep

### Session

In [10]:
# SageMaker session whose default bucket is the project's DEV_BUCKET.
# NOTE(review): `sess` is not passed to the estimator defined further down — confirm whether it is needed.
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)
# Execution role used by the training job, hard-coded by name.
# NOTE(review): `get_execution_role` is imported above but not used — confirm the hard-coded role is intentional.
role = 'AmazonSageMaker-ExecutionRole-20210519T102514'

### Bucket upload

In [12]:
# Set to False to upload the full splits instead of a small sample.
sample = True
# Rows kept per split when sampling, and the seed that makes the draw repeatable.
SAMPLE_SIZE = 100
SAMPLE_SEED = 42

if sample:
    # min(...) guards against splits with fewer than SAMPLE_SIZE rows, for which
    # DataFrame.sample would otherwise raise.
    train_df = train_df.sample(min(SAMPLE_SIZE, len(train_df)), random_state=SAMPLE_SEED)
    val_df = val_df.sample(min(SAMPLE_SIZE, len(val_df)), random_state=SAMPLE_SEED)
    test_df = test_df.sample(min(SAMPLE_SIZE, len(test_df)), random_state=SAMPLE_SEED)


# Each run gets its own job name, and its input data lives under a prefix named after it.
job_name = f"pytorch-training-{formatted_time()}"
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name

s3_train_data = str(input_path / 'train_df.pickle')
s3_validation_data = str(input_path / 'val_df.pickle')
s3_test_data = str(input_path / 'test_df.pickle')


# Upload every split to its own pickle. The validation file is written from
# val_df (it used to be written from test_df, silently duplicating the test set).
# presumably protocol 4 is chosen so the py36 training container can read the files — TODO confirm.
train_df.to_pickle(s3_train_data, protocol=4)
val_df.to_pickle(s3_validation_data, protocol=4)
test_df.to_pickle(s3_test_data, protocol=4)

### Estimator Definition

In [13]:
# Candidate SageMaker instance types for the training job.
# NOTE(review): this list is not referenced by the estimator cell below, which
# hard-codes 'ml.p3.2xlarge' — confirm whether it should be used there.
instances = [
    'ml.p2.xlarge',
    'ml.p3.2xlarge'
]

In [14]:
# Number pattern shared by all metrics: digits, an optional fractional part and an
# optional exponent (e.g. 2, 0.6931, 1e-05, 2.5e-05). The outer group is the value
# that gets captured. Raw string so that the backslash reaches the regex engine
# untouched; the dot is escaped so it only matches a literal decimal point.
_METRIC_NUMBER = r"([0-9]+(\.[0-9]+)?(e[-+]?[0-9]+)?)"

# Names of the values to scrape from the training logs, which are expected to
# print them in the form 'name': value (dict-repr style).
_METRIC_NAMES = [
    'loss',
    'learning_rate',
    'eval_loss',
    'eval_accuracy',
    'eval_f1',
    'eval_precision',
    'eval_recall',
    'eval_stupid_metric',
    'eval_runtime',
    'eval_samples_per_second',
    'epoch',
]

# One {'Name', 'Regex'} entry per metric, in the format expected by the
# estimator's `metric_definitions` argument.
metric_definitions = [
    {'Name': name, 'Regex': f"'{name}': {_METRIC_NUMBER},?"}
    for name in _METRIC_NAMES
]

In [15]:
# # set True if you need spot instance
# use_spot = True
# train_max_run_secs =   2*24 * 60 * 60
# spot_wait_sec =  5 * 60
# max_wait_time_secs = train_max_run_secs +  spot_wait_sec

# if not use_spot:
#     max_wait_time_secs = None
    
# # During local mode, no spot.., use smaller dataset
# if instance_type == 'local':
#     use_spot = False
#     max_wait_time_secs = 0
#     wait = True
#     # Use smaller dataset to run locally
#     inputs = inputs_sample


In [16]:
from sagemaker.pytorch import PyTorch

# Hyperparameters handed to the training script (they show up as command-line
# arguments of train.py in the job log).
hyperparameters = {
    'epochs': 2,
    'train_batch_size': 16,
    # Reuse the checkpoint name chosen for the tokenizer so the two cannot drift apart.
    'model_name': model_name,
}

# Estimator describing the remote training job: which script to run, on what
# hardware, with which framework image, and where code and model artifacts go.
estimator = PyTorch(
    entry_point='train.py',
    source_dir=str(SCRIPTS_TRAINING_PATH / 'stefano/pytorch_estimator_example'),
    # Location for the trained model artifacts.
    output_path=str(DEV_BUCKET / 'models/'),
    # The packaged source code is uploaded next to the job's input data.
    code_location=str(input_path),
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    framework_version='1.8',
    py_version='py36',
    hyperparameters=hyperparameters,
    metric_definitions=metric_definitions,
    # NOTE(review): `job_name` does not look like a documented estimator argument
    # (the documented one is `base_job_name`); the name actually used is the one
    # passed to `estimator.fit(..., job_name=...)` — confirm and consider removing.
    job_name=job_name,
)

In [17]:
# Input channels for the training job: both 'train' and 'test' point at the same
# prefix, i.e. the folder that holds all of the uploaded split pickles.
fit_arguments = {channel: str(input_path) for channel in ('train', 'test')}

In [18]:
# Start the training job under the explicit job name and block (wait=True) until
# it finishes; the job's log output appears below the cell.
estimator.fit(fit_arguments, job_name=job_name, wait=True)

2021-05-31 14:11:06 Starting - Starting the training job...
2021-05-31 14:11:30 Starting - Launching requested ML instancesProfilerReport-1622470263: InProgress
......
2021-05-31 14:12:37 Starting - Preparing the instances for training.........
2021-05-31 14:14:23 Downloading - Downloading input data
2021-05-31 14:14:23 Training - Downloading the training image..................
2021-05-31 14:17:34 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-05-31 14:17:34,909 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-05-31 14:17:34,933 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-05-31 14:17:41,165 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-05-31 14:17:41,614 sagemaker-train


2021-05-31 14:18:13 Uploading - Uploading generated training model
2021-05-31 14:18:13 Failed - Training job failed
[34m2021-05-31 14:18:00,658 - filelock - INFO - Lock 139874266892104 acquired on /root/.cache/huggingface/transformers/8c8624b8ac8aa99c60c912161f8332de003484428c47906d7ff7eb7f73eecdbb.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79.lock[0m
[34m2021-05-31 14:18:00,698 - filelock - INFO - Lock 139874266892104 released on /root/.cache/huggingface/transformers/8c8624b8ac8aa99c60c912161f8332de003484428c47906d7ff7eb7f73eecdbb.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79.lock[0m
[34m2021-05-31 14:18:00,777 - filelock - INFO - Lock 139874266802664 acquired on /root/.cache/huggingface/transformers/9c169103d7e5a73936dd2b627e42851bec0831212b677c637033ee4bce9ab5ee.126183e36667471617ae2f0835fab707baa54b731f991507ebbb55ea85adb12a.lock[0m
[34m2021-05-31 14:18:06,264 - filelock - INFO - Lock 139874266802664 released on /root/.cache/huggingface

UnexpectedStatusException: Error for Training job pytorch-training-2021-05-31-16-07-33-804: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/opt/conda/bin/python3.6 train.py --epochs 2 --model_name distilbert-base-uncased --train_batch_size 16"
Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]Downloading: 100%|ââââââââââ| 442/442 [00:00<00:00, 598kB/s]
Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]Downloading: 100%|ââââââââââ| 232k/232k [00:00<00:00, 43.1MB/s]
Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]Downloading: 100%|ââââââââââ| 466k/466k [00:00<00:00, 44.3MB/s]
Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]Downloading: 100%|ââââââââââ| 28.0/28.0 [00:00<00:00, 43.8kB/s]
Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]Downloading:   2%|â         | 4.36M/268M [00:00<00:06, 43.6MB/s]Downloading:   3%|â         | 8.80M/268M [00:00<00:05, 43.8MB/s]Downloading:   5%|â         | 13.5M/268M [00:00<00:05, 44.8MB/s]Downloading:   7%|â         | 18.3M/268M [00:00<00

In [None]:
from sagemaker import TrainingJobAnalytics

# Captured metrics can be accessed as a Pandas dataframe
# (looked up by the name of the estimator's most recent training job).
df = TrainingJobAnalytics(training_job_name=estimator.latest_training_job.name).dataframe()
# Preview the first ten rows.
df.head(10)

In [None]:
# Display the whole metrics frame (the previous cell only previews ten rows).
df