# Import

These packages are required if you launch this notebook from a SageMaker instance.

In [14]:
"""!pip install mlflow
!pip install pytorch-lightning
!pip install transformers
!pip install tqdm
!pip install sagemaker

!pip install s3fs
!pip install smdebug"""

'!pip install mlflow\n!pip install pytorch-lightning\n!pip install transformers\n!pip install tqdm\n!pip install sagemaker\n\n!pip install s3fs\n!pip install smdebug'

In [15]:
import sys
# Add the directory four levels above the working directory to the module
# search path; presumably this is where the `deep` package imported further
# down lives — confirm against the repository layout.
sys.path.append('../../../../')

import os
import sys
import json
import pickle
from sagemaker.pytorch import PyTorch
import sagemaker
import pandas as pd
from typing import Dict

Local constants (data, MLflow server, paths, etc.) are imported below; use them rather than hard-coding values.

In [16]:
# Star imports: the names below are used in this notebook but not defined in it,
# so they are presumably supplied by these two modules (verify which module
# owns each): DEV_BUCKET, SAGEMAKER_ROLE, SAGEMAKER_ROLE_ARN, MLFLOW_SERVER,
# formatted_time.
from deep.constants import *
from deep.utils import *

In [17]:
# Load IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell executes, so edits to project code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Data

Load data

In [18]:
# Toggle between the small bundled sample (True) and the full dataset (False).
sample = True

if sample:
    data_file = 'data/sample_data.json'  # sample data
else:
    data_file = ...  # full data: replace the Ellipsis placeholder with the real path

# Fail early with an actionable message instead of the opaque TypeError that
# open(Ellipsis) would raise when the placeholder was never filled in.
if data_file is ...:
    raise ValueError(
        "sample is False but no full-data path is configured; "
        "set `data_file` to the full dataset's JSON file."
    )

# JSON is UTF-8 by specification, so do not rely on the platform default encoding.
with open(data_file, 'r', encoding='utf-8') as openfile:
    data = json.load(openfile)

## Sagemaker Prep

### Session

Configure SageMaker

In [19]:
# SageMaker session whose default bucket is set to the project bucket's name.
# NOTE(review): `sess` is not passed to the estimator in the visible cells —
# confirm whether it should be (e.g. via `sagemaker_session`).
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)
# Role handed to the PyTorch estimator further down.
role = SAGEMAKER_ROLE
# NOTE(review): `role_arn` and `tracking_uri` are not referenced again in the
# visible cells (the hyperparameters use MLFLOW_SERVER directly) — confirm
# they are still needed.
role_arn = SAGEMAKER_ROLE_ARN
tracking_uri = MLFLOW_SERVER

In [20]:
def get_df_from_dict(data: Dict):
    """Flatten a dictionary into a two-column DataFrame.

    Args:
        data: Mapping to convert. Keys are kept as they are; every value is
            replaced by its ``str()`` representation.

    Returns:
        A DataFrame with one row per dictionary entry: the key in
        ``col_name`` and the stringified value in ``vals``.
    """
    rows = [(key, str(value)) for key, value in data.items()]
    return pd.DataFrame(rows, columns=['col_name', 'vals'])

### Bucket upload

You need to upload data to an S3 bucket. 




In [21]:
# Show the MLflow tracking server URL that is later passed to the training job.
# NOTE(review): the rendered output exposes the server address — consider
# clearing it before sharing the notebook.
MLFLOW_SERVER

'http://mlflow-deep-387470f3-1883319727.us-east-1.elb.amazonaws.com/'

In [22]:
# Inspect the top-level keys of the loaded dataset.
data.keys()

dict_keys(['train', 'val', 'test', 'tagname_to_id'])

### send data to bucket

In [23]:
# Name of this training job; the timestamp part comes from formatted_time().
job_name = f"pytorch-{formatted_time()}-entry-extraction"  # change it as you prefer
# Location of this job's inputs inside the bucket; reused further down as the
# estimator's code_location and as the "train" channel.
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name  # Do not change this

data_path = str(input_path / 'data.pickle') # keep it as it is

# Send the data to the S3 bucket.
# TODO: check the pickle protocol for other data types (protocol 4 was chosen for pandas data inputs).

# Flatten the dict into a two-column frame (key, str(value)) before pickling.
data_df = get_df_from_dict(data)
data_df.to_pickle(data_path, protocol=4)  # protocol 4 is necessary, since SageMaker uses python 3.6

### Estimator Definition

In [24]:
# SageMaker instance types to choose from, grouped by hardware.
# Pricing: https://aws.amazon.com/sagemaker/pricing/instance-types

# GPU instances
gpu_instances = [
    'ml.p2.xlarge',
    'ml.p3.2xlarge'
]

# CPU instances
cpu_instances = [
    'ml.c4.2xlarge',
    'ml.c4.4xlarge',
    'ml.c5n.2xlarge'
]

# Kept for backward compatibility. Both lists used to be assigned to
# `instances`, so the GPU list was silently overwritten and only the CPU list
# survived; `instances` keeps that final value while both lists stay reachable.
instances = list(cpu_instances)

The hyperparameters are passed as command line arguments to the training script. 

You can add/change them as you like. It's important to keep the `tracking_uri` and the `experiment_name` which are used by MLFlow.

The class `PyTorch` is part of the `SageMaker` python API. The parameters are important and you should probably not change most of them. The ones you may want to change are:

- `instance_type`, specify the instance you want
- `source_dir`, specify your script directory. Try to use global variables as much as possible

In [25]:
# Instance type the training job runs on.
instance_type = "ml.p2.xlarge"

# MLflow bookkeeping: the run reuses the experiment's name.
experiment_name = "entry_extraction"
run_name = experiment_name

# Forwarded to train.py as command line arguments.
hyperparameters = dict(
    instance_type=instance_type,
    tracking_uri=MLFLOW_SERVER,
    experiment_name=experiment_name,
    run_name=run_name,
    model_name_or_path="microsoft/xtremedistil-l6-h384-uncased",
    tokenizer_name_or_path="microsoft/xtremedistil-l6-h384-uncased",
    learning_rate=1e-4,
    n_epochs=1 if sample else 3,  # a single epoch is enough for the sample data
    weight_decay=0.01,
    dataloader_num_workers=6,
    val_batch_size=32,
    train_batch_size=16,
    max_len=512,
    extra_context_length=64,
    dropout=0.2,
)
# Options that are currently disabled:
#   "n_separate_layers": 1,
#   "per_device_train_batch_size": 1,
#   "per_device_eval_batch_size": 1,
#   "gradient_accumulation_steps": 8,
#   "save_strategy": "epoch",
#   "adam_beta1": 0.9,
#   "adam_beta2": 0.98,
#   "adam_epsilon": 1e-6,
#   "warmup_ratio": 0.3,
#   "fp16": true,

estimator = PyTorch(
    # Training code and its arguments
    entry_point="train.py",
    source_dir="scripts",
    code_location=str(input_path),
    hyperparameters=hyperparameters,
    # Container runtime
    framework_version="1.8",
    py_version="py3",
    # Infrastructure
    role=role,
    instance_type=instance_type,
    instance_count=1,
    #     train_instance_count=2,
    #     train_instance_type="ml.c4.xlarge",
    # Outputs and bookkeeping
    output_path=str(DEV_BUCKET / "models/"),
    job_name=job_name,
    debugger_hook_config=False,
)

# Input channels handed to estimator.fit(): a single "train" channel.
fit_arguments = dict(train=str(input_path))


In [26]:
# Fit the estimator: launch the training job with the "train" channel defined
# in fit_arguments.
# NOTE(review): job_name is passed here as well as to the PyTorch constructor —
# confirm both are needed.
estimator.fit(fit_arguments, job_name=job_name)

2022-10-04 10:29:19 Starting - Starting the training job...
2022-10-04 10:29:46 Starting - Preparing the instances for trainingProfilerReport-1664879356: InProgress
.........
2022-10-04 10:31:24 Downloading - Downloading input data...
2022-10-04 10:32:04 Training - Downloading the training image..............................
2022-10-04 10:37:25 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-10-04 10:37:30,089 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-10-04 10:37:30,125 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-10-04 10:37:30,135 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-10-04 10:37:30,879 sagemaker-training-toolkit INFO     Installing dependencies from requir

UnexpectedStatusException: Error for Training job pytorch-2022-10-04-13-29-10-040-entry-extraction: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/opt/conda/bin/python3.6 train.py --dataloader_num_workers 6 --dropout 0.2 --experiment_name entry_extraction --extra_context_length 64 --instance_type ml.p2.xlarge --learning_rate 0.0001 --max_len 512 --model_name_or_path microsoft/xtremedistil-l6-h384-uncased --n_epochs 1 --run_name entry_extraction --tokenizer_name_or_path microsoft/xtremedistil-l6-h384-uncased --tracking_uri http://mlflow-deep-387470f3-1883319727.us-east-1.elb.amazonaws.com/ --train_batch_size 16 --val_batch_size 32 --weight_decay 0.01"
INFO:root:building training and testing datasets
INFO:root:training model
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Downloading:   0%|          | 0.00/526 [00:00<?, ?B/s]Downloading: 100%|██████████| 526/526 [00:00<00:00, 448kB/s]
Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]Downloading:   5%|â         | 4.28M/90.9M [00:00<00:02, 42.8MB/s]Downloading:   9%|â         | , exit code: 1