# Import

The packages below are required when you launch this notebook from a SageMaker instance.

In [1]:
# The install commands are wrapped in a string literal, so they are NOT executed:
# the cell only evaluates (and echoes) the string. Remove the surrounding triple
# quotes to actually run the installs.
"""!pip install mlflow
!pip install pytorch-lightning
!pip install transformers
!pip install tqdm
!pip install sagemaker
!pip install s3fs
!pip install smdebug"""

'!pip install mlflow\n!pip install pytorch-lightning\n!pip install transformers\n!pip install tqdm\n!pip install sagemaker\n!pip install s3fs\n!pip install smdebug'

In [2]:
import sys
# Add the directory three levels above this notebook to the import path.
# Presumably this is the repository root that provides the `deep` package
# imported further down; confirm against the repository layout.
sys.path.append('../../../')

import os
import sys
import logging
import argparse
from pathlib import Path
from ast import literal_eval
from collections import Counter
from typing import Any, Dict, Optional

In [3]:
from tqdm.auto import tqdm

import torchmetrics
from torchmetrics.functional import accuracy, f1, auroc

import sagemaker
from sagemaker import get_execution_role
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.core.decorators import auto_move_data
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import MLFlowLogger


import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix


import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc

import transformers
from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoTokenizer,
)
from transformers.optimization import (
    Adafactor,
    get_linear_schedule_with_warmup,
)

Local constants for the data, the MLFlow server, paths, etc. Use them instead of hard-coding these values.

In [4]:
from deep.constants import *
from deep.utils import *

  and should_run_async(code)


In [5]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to local modules are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

# Data

You can use any data you want. We recommend loading it as a `pandas` DataFrame.

In [6]:
# Location of the raw severity tags, relative to this notebook.
DATA_PATH = os.path.join('..', '..', '..', "data", "data_secondary_tags", "severity_tags.csv")

# Fixed seed so that the train/validation split is identical on every
# "Restart & Run All"; without it each run trains and validates on different rows.
SPLIT_SEED = 42

df = pd.read_csv(DATA_PATH, index_col=0, lineterminator='\n')

# Wrap every tag value in a single-element list.
df['severity'] = df['tag_value'].apply(lambda x: [x])

# Round-trip through CSV, keeping only the three columns of interest.
# After re-reading, `severity` holds the *string* form of each list
# (e.g. "['Major']"), not a Python list. Presumably the training script
# parses it back; confirm before removing this round trip.
df[['entry_id', 'excerpt', 'severity']].to_csv('severity_final.csv')
df = pd.read_csv('severity_final.csv', index_col=0)

# 80/20 split, reproducible thanks to the fixed seed.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

In [7]:
# Display the first two training rows as a quick sanity check of the columns.
train_df.head(2)

Unnamed: 0,entry_id,excerpt,severity
128263,27251,• Figures for Chimoio urban area were omitted ...,['No problem']
464404,248212,"[1st Nov2020,North east Nigeria]FOOD AND NUTRI...",['Major']


## Sagemaker Prep

### Session

Configure SageMaker

In [8]:
# SageMaker session whose default bucket is the project's dev bucket.
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)

# Role, role ARN and MLFlow endpoint are taken from names brought in by the
# star imports from `deep` at the top of the notebook.
role, role_arn = SAGEMAKER_ROLE, SAGEMAKER_ROLE_ARN
tracking_uri = MLFLOW_SERVER

  and should_run_async(code)


### Bucket upload

You need to upload data to an S3 bucket. 




In [9]:
sample = False  # Set to True to train on a small subset and make the computations faster.
SAMPLE_SIZE = 1000  # Rows kept per split when `sample` is True.

if sample:
    # Cap the sample size at the split's length: `DataFrame.sample` raises a
    # ValueError when `n` exceeds the number of rows (sampling without replacement).
    train_df = train_df.sample(n=min(SAMPLE_SIZE, len(train_df)))
    val_df = val_df.sample(n=min(SAMPLE_SIZE, len(val_df)))

job_name = f"pytorch-{formatted_time()}-subpillars-model-test-mlflow"  # cannot be changed
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name  # Do not change this

train_path = str(input_path / 'train.pickle')
val_path = str(input_path / 'val.pickle')

# Both splits are written under the job-specific input path.
train_df.to_pickle(train_path, protocol=4)  # protocol 4 is necessary, since SageMaker uses python 3.6
val_df.to_pickle(val_path, protocol=4)

### Estimator Definition

In [10]:
# GPU instance types.
instances = ['ml.p2.xlarge', 'ml.p3.2xlarge']

The hyperparameters are passed as command line arguments to the training script. 

You can add/change them as you like. It's important to keep the `tracking_uri` and the `experiment_name` which are used by MLFlow.

The class `PyTorch` is part of the `SageMaker` python API. The parameters are important and you should probably not change most of them. The ones you may want to change are:

- `instance_type`, specify the instance you want
- `source_dir`, specify your script directory. Try to use global variables as much as possible

In [11]:
from sagemaker.pytorch import PyTorch

# Hyperparameters handed to the estimator; key order is kept as-is.
# `tracking_uri` and `experiment_name` are the MLFlow-related entries.
hyperparameters = dict(
    tracking_uri=MLFLOW_SERVER,
    max_len=512,
    epochs=5,
    model_name='microsoft/xtremedistil-l6-h384-uncased',
    tokenizer_name='microsoft/xtremedistil-l6-h384-uncased',
    dropout_rate=0.3,
    pred_threshold=0.4,
    output_length=384,
    learning_rate=7e-5,
    experiment_name="pl-severity",
    training_column='severity',
    multiclass_bool=False,
    train_with_whole_dataset=True,
)

estimator = PyTorch(
    # Training code: entry script and the directory that contains it.
    entry_point='train_mlflow.py',
    source_dir='../../../scripts/training/selim/multiclass-lightning',
    # Output and code locations for the job.
    output_path=str(DEV_BUCKET / 'models/'),
    code_location=str(input_path),
    # Compute resources.
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    # Container runtime; py36 is why the pickles are written with protocol 4.
    framework_version='1.8',
    py_version='py36',
    hyperparameters=hyperparameters,
    # NOTE(review): `job_name` is also passed to `estimator.fit`; confirm
    # whether the constructor actually uses this keyword.
    job_name=job_name,
)

In [12]:
# Both channels point at the same job-specific input location.
input_uri = str(input_path)
fit_arguments = dict(train=input_uri, test=input_uri)

In [13]:
# Launch the training job under the explicit job name, using the channels
# defined in `fit_arguments`.
estimator.fit(inputs=fit_arguments, job_name=job_name)

2021-08-27 11:35:20 Starting - Starting the training job...
2021-08-27 11:35:49 Starting - Launching requested ML instancesProfilerReport-1630064117: InProgress
...
2021-08-27 11:36:22 Starting - Preparing the instances for training.........
2021-08-27 11:38:14 Downloading - Downloading input data
2021-08-27 11:38:14 Training - Downloading the training image.........................[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-08-27 11:42:33,985 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-08-27 11:42:34,009 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-08-27 11:42:34,018 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-08-27 11:42:34,359 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/opt/con


2021-08-27 11:43:15 Training - Training image download completed. Training in progress.[34mCollecting boto3>=1.16.32
  Downloading boto3-1.18.29-py3-none-any.whl (131 kB)
  Downloading boto3-1.18.28-py3-none-any.whl (131 kB)
  Downloading boto3-1.18.27-py3-none-any.whl (131 kB)
  Downloading boto3-1.18.26-py3-none-any.whl (131 kB)[0m
  Downloading boto3-1.18.25-py3-none-any.whl (131 kB)[0m
[34m  Downloading boto3-1.18.24-py3-none-any.whl (131 kB)
  Downloading boto3-1.18.23-py3-none-any.whl (131 kB)
  Downloading boto3-1.18.22-py3-none-any.whl (131 kB)
  Downloading boto3-1.18.21-py3-none-any.whl (131 kB)[0m
[34m  Downloading boto3-1.18.20-py3-none-any.whl (131 kB)
  Downloading boto3-1.18.19-py3-none-any.whl (131 kB)
  Downloading boto3-1.18.18-py3-none-any.whl (131 kB)
  Downloading boto3-1.18.17-py3-none-any.whl (131 kB)
  Downloading boto3-1.18.16-py3-none-any.whl (131 kB)[0m
[34m  Downloading boto3-1.18.15-py3-none-any.whl (131 kB)
  Downloading boto3-1.18.14-py3-none-any

[34m  Attempting uninstall: h5py
    Found existing installation: h5py 2.8.0
    Uninstalling h5py-2.8.0:[0m
[34m      Successfully uninstalled h5py-2.8.0
  Attempting uninstall: boto3
    Found existing installation: boto3 1.17.110
    Uninstalling boto3-1.17.110:
      Successfully uninstalled boto3-1.17.110[0m
[34m  Attempting uninstall: smdebug
    Found existing installation: smdebug 1.0.9
    Uninstalling smdebug-1.0.9:
      Successfully uninstalled smdebug-1.0.9[0m
[34m  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.24.2
    Uninstalling scikit-learn-0.24.2:
      Successfully uninstalled scikit-learn-0.24.2[0m
[34m  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.48.2
    Uninstalling sagemaker-2.48.2:
      Successfully uninstalled sagemaker-2.48.2[0m
[34m  Attempting uninstall: s3fs
    Found existing installation: s3fs 0.4.2
    Uninstalling s3fs-0.4.2:
      Successfully uninstalled s3fs-0.4.2[0m

[34m[nltk_data] Downloading package averaged_perceptron_tagger to[0m
[34m[nltk_data]     /root/nltk_data...[0m
[34m[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.[0m
[34m[nltk_data] Downloading package wordnet to /root/nltk_data...[0m
[34m[nltk_data]   Unzipping corpora/wordnet.zip.[0m
[34m[nltk_data] Downloading package omw to /root/nltk_data...[0m
[34m[nltk_data]   Unzipping corpora/omw.zip.[0m
[34m#015Validation sanity check: 0it [00:00, ?it/s]#015Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s][2021-08-27 11:45:47.034 algo-1:91 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2021-08-27 11:45:47.070 algo-1:91 INFO profiler_config_parser.py:102] User has disabled profiler.[0m
[34m[2021-08-27 11:45:47.071 algo-1:91 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.[0m
[34m[2021-08-27 11:45:47.072 algo-1:91 INFO hook.py:201] tensorboard_dir has not been set for the hook. S

[34m#015                                                              #015#015Training: 0it [00:00, ?it/s]#015Training:   0%|          | 0/1355 [00:00<?, ?it/s]#015Epoch 0:   0%|          | 0/1355 [00:00<?, ?it/s] #015Epoch 0:   2%|▏         | 30/1355 [00:02<01:53, 11.71it/s]#015Epoch 0:   2%|▏         | 30/1355 [00:02<01:53, 11.71it/s, loss=3.32, v_num=0, val_f1_epoch=0.167, val_loss_epoch=0.702, train_f1=0.550]#015Epoch 0:   4%|▍         | 60/1355 [00:04<01:42, 12.63it/s, loss=3.32, v_num=0, val_f1_epoch=0.167, val_loss_epoch=0.702, train_f1=0.550]#015Epoch 0:   4%|▍         | 60/1355 [00:04<01:42, 12.63it/s, loss=2.06, v_num=0, val_f1_epoch=0.167, val_loss_epoch=0.702, train_f1=0.570]#015Epoch 0:   7%|▋         | 90/1355 [00:06<01:36, 13.11it/s, loss=2.06, v_num=0, val_f1_epoch=0.167, val_loss_epoch=0.702, train_f1=0.570]#015Epoch 0:   7%|▋         | 90/1355 [00:06<01:36, 13.11it/s, loss=1.4, v_num=0, val_f1_epoch=0.167, val_loss_epoch=0.702, train_f1=0.651] #015Epoch 0:   9%|▉    

[34m#015Validating:  73%|███████▎  | 90/124 [00:03<00:01, 26.03it/s]#033[A#015Epoch 0: 100%|█████████▉| 1350/1355 [01:32<00:00, 14.54it/s, loss=0.6, v_num=0, val_f1_epoch=0.167, val_loss_epoch=0.702, train_f1=0.764][0m
[34m#015Validating:  97%|█████████▋| 120/124 [00:04<00:00, 26.65it/s]#033[A[0m
[34m#015Validating: 100%|██████████| 124/124 [00:04<00:00, 28.11it/s]#033[A#015Epoch 0: 100%|██████████| 1355/1355 [01:34<00:00, 14.40it/s, loss=0.597, v_num=0, val_f1_epoch=0.772, val_loss_epoch=0.302, train_f1=0.853, val_f1_step=0.844, val_loss_step=0.329][0m
[34m#015                                                             #033[A#015Epoch 0:   0%|          | 0/1355 [00:00<?, ?it/s, loss=0.597, v_num=0, val_f1_epoch=0.772, val_loss_epoch=0.302, train_f1=0.853, val_f1_step=0.844, val_loss_step=0.329]           #015Epoch 1:   0%|          | 0/1355 [00:00<?, ?it/s, loss=0.597, v_num=0, val_f1_epoch=0.772, val_loss_epoch=0.302, train_f1=0.853, val_f1_step=0.844, val_loss_step=0.329]#01

[34m#015                                                             #033[A#015Epoch 1:   0%|          | 0/1355 [00:00<?, ?it/s, loss=1.14, v_num=0, val_f1_epoch=0.802, val_loss_epoch=0.266, train_f1=0.838, val_f1_step=0.844, val_loss_step=0.242]           #015Epoch 2:   0%|          | 0/1355 [00:00<?, ?it/s, loss=1.14, v_num=0, val_f1_epoch=0.802, val_loss_epoch=0.266, train_f1=0.838, val_f1_step=0.844, val_loss_step=0.242]#015Epoch 2:   2%|▏         | 30/1355 [00:02<01:46, 12.49it/s, loss=1.14, v_num=0, val_f1_epoch=0.802, val_loss_epoch=0.266, train_f1=0.838, val_f1_step=0.844, val_loss_step=0.242]#015Epoch 2:   2%|▏         | 30/1355 [00:02<01:46, 12.49it/s, loss=0.778, v_num=0, val_f1_epoch=0.802, val_loss_epoch=0.266, train_f1=0.765, val_f1_step=0.844, val_loss_step=0.242]#015Epoch 2:   4%|▍         | 60/1355 [00:04<01:42, 12.67it/s, loss=0.778, v_num=0, val_f1_epoch=0.802, val_loss_epoch=0.266, train_f1=0.765, val_f1_step=0.844, val_loss_step=0.242]#015Epoch 2:   4%|▍         |

[34m#015                                                             #033[A#015Epoch 2:   0%|          | 0/1355 [00:00<?, ?it/s, loss=0.978, v_num=0, val_f1_epoch=0.827, val_loss_epoch=0.246, train_f1=0.779, val_f1_step=0.844, val_loss_step=0.290]           #015Epoch 3:   0%|          | 0/1355 [00:00<?, ?it/s, loss=0.978, v_num=0, val_f1_epoch=0.827, val_loss_epoch=0.246, train_f1=0.779, val_f1_step=0.844, val_loss_step=0.290]#015Epoch 3:   2%|▏         | 30/1355 [00:02<01:47, 12.36it/s, loss=0.978, v_num=0, val_f1_epoch=0.827, val_loss_epoch=0.246, train_f1=0.779, val_f1_step=0.844, val_loss_step=0.290]#015Epoch 3:   2%|▏         | 30/1355 [00:02<01:47, 12.36it/s, loss=1.04, v_num=0, val_f1_epoch=0.827, val_loss_epoch=0.246, train_f1=0.638, val_f1_step=0.844, val_loss_step=0.290] #015Epoch 3:   4%|▍         | 60/1355 [00:04<01:40, 12.95it/s, loss=1.04, v_num=0, val_f1_epoch=0.827, val_loss_epoch=0.246, train_f1=0.638, val_f1_step=0.844, val_loss_step=0.290]#015Epoch 3:   4%|▍        

[34m#015Validating:  97%|█████████▋| 120/124 [00:04<00:00, 26.27it/s]#033[A[0m
[34m#015Validating: 100%|██████████| 124/124 [00:04<00:00, 27.76it/s]#033[A#015Epoch 3: 100%|██████████| 1355/1355 [01:34<00:00, 14.34it/s, loss=0.632, v_num=0, val_f1_epoch=0.844, val_loss_epoch=0.226, train_f1=0.599, val_f1_step=0.844, val_loss_step=0.284][0m
[34m#015                                                             #033[A#015Epoch 3:   0%|          | 0/1355 [00:00<?, ?it/s, loss=0.632, v_num=0, val_f1_epoch=0.844, val_loss_epoch=0.226, train_f1=0.599, val_f1_step=0.844, val_loss_step=0.284]           #015Epoch 4:   0%|          | 0/1355 [00:00<?, ?it/s, loss=0.632, v_num=0, val_f1_epoch=0.844, val_loss_epoch=0.226, train_f1=0.599, val_f1_step=0.844, val_loss_step=0.284]#015Epoch 4:   2%|▏         | 30/1355 [00:02<01:44, 12.64it/s, loss=0.632, v_num=0, val_f1_epoch=0.844, val_loss_epoch=0.226, train_f1=0.599, val_f1_step=0.844, val_loss_step=0.284]#015Epoch 4:   2%|▏         | 30/1355 [00:0

[34m(7806, 5)[0m
[34m(7806, 5)[0m
[34m2021-08-27 11:44:02.112957: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0[0m
[34mINFO:root:reading, preprocessing data[0m
[34mINFO:filelock:Lock 139788394686952 acquired on /root/.cache/huggingface/transformers/31d6577412393ebb07c02de876b2d1397fcae2d85cb053b588145f6869ab1a15.44cd178af39e607af310bc4cc48a944f5e5f746b372c161b32511f0fd585789b.lock[0m
[34m#015Downloading:   0%|          | 0.00/526 [00:00<?, ?B/s]#015Downloading: 100%|██████████| 526/526 [00:00<00:00, 607kB/s][0m
[34mINFO:filelock:Lock 139788394686952 released on /root/.cache/huggingface/transformers/31d6577412393ebb07c02de876b2d1397fcae2d85cb053b588145f6869ab1a15.44cd178af39e607af310bc4cc48a944f5e5f746b372c161b32511f0fd585789b.lock[0m
[34mINFO:filelock:Lock 139788394687344 acquired on /root/.cache/huggingface/transformers/a9c548057d82391e2bd98d883850cb32ebea77d731e8aef568b3a62626fcb8b3.d789d64ebfe299b


2021-08-27 11:54:34 Uploading - Uploading generated training model
2021-08-27 11:55:00 Completed - Training job completed
Training seconds: 1013
Billable seconds: 1013
