# Import

These packages are required if you launch this notebook from a SageMaker instance.

In [1]:
# Dependencies needed when this notebook is launched from a SageMaker instance.
# The install is disabled by default; uncomment the line below to run it.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# Consider pinning versions (pkg==x.y.z) so that runs are reproducible.
# %pip install -q mlflow pytorch-lightning transformers tqdm sagemaker s3fs smdebug

'!pip install mlflow\n!pip install pytorch-lightning\n!pip install transformers\n!pip install tqdm\n!pip install sagemaker\n!pip install s3fs\n!pip install smdebug'

In [2]:
import sys
# Add the directory three levels above the working directory to the import path.
# NOTE(review): presumably this is the repository root that holds the `deep` package — confirm.
sys.path.append('../../../')

import os
import sys
import logging
import argparse
from pathlib import Path
from ast import literal_eval
from collections import Counter
from typing import Any, Dict, Optional

In [3]:
from tqdm.auto import tqdm

import torchmetrics
from torchmetrics.functional import accuracy, f1, auroc

import sagemaker
from sagemaker import get_execution_role
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.core.decorators import auto_move_data
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import MLFlowLogger


import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix


import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc

import transformers
from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoTokenizer,
)
from transformers.optimization import (
    Adafactor,
    get_linear_schedule_with_warmup,
)

Local constants for the data, the MLflow server, paths, etc. Use them instead of hard-coding values.

In [4]:
from deep.constants import *
from deep.utils import *

  and should_run_async(code)


In [5]:
%load_ext autoreload
%autoreload 2

# Data

You can use any data you want. We recommend loading it into a `pandas` DataFrame.

In [6]:
# The three splits of data version 0.4.4 live side by side in one folder.
data_dir = os.path.join('..', '..', '..', "data", "frameworks_data", "data_v0.4.4")

TRAIN_PATH = os.path.join(data_dir, "data_v0.4.4_train.csv")
VAL_PATH = os.path.join(data_dir, "data_v0.4.4_val.csv")
TEST_PATH = os.path.join(data_dir, "data_v0.4.4_test.csv")

train_df = pd.read_csv(TRAIN_PATH)

# The validation frame is the test split followed by the validation split.
held_out_frames = [pd.read_csv(csv_path) for csv_path in (TEST_PATH, VAL_PATH)]
val_df = pd.concat(held_out_frames)

## Sagemaker Prep

### Session

Configure SageMaker

In [7]:
# SageMaker session whose default bucket is the project's development bucket.
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)

# Short local aliases for the project-level role and MLflow server constants.
role, role_arn, tracking_uri = SAGEMAKER_ROLE, SAGEMAKER_ROLE_ARN, MLFLOW_SERVER

### Bucket upload

You need to upload data to an S3 bucket. 




In [8]:
# Display the MLflow tracking server URL as the cell output.
# NOTE(review): the name is not defined in this notebook; presumably it comes from the star imports — confirm.
MLFLOW_SERVER

'http://mlflow-deep-387470f3-1883319727.us-east-1.elb.amazonaws.com/'

In [9]:
sample = False  # Set to True to work on a small subset and make the computations faster.
SAMPLE_SIZE = 1000  # maximum number of rows kept per split when `sample` is True
SAMPLE_SEED = 42  # fixed seed so that a sampled run can be reproduced

if sample:
    # Cap `n` at the frame length: DataFrame.sample raises ValueError when asked
    # for more rows than the frame holds (sampling is without replacement).
    train_df = train_df.sample(n=min(SAMPLE_SIZE, len(train_df)), random_state=SAMPLE_SEED)
    val_df = val_df.sample(n=min(SAMPLE_SIZE, len(val_df)), random_state=SAMPLE_SEED)

job_name = f"pytorch-{formatted_time()}-subpillars-model-test-mlflow"  # change it as you prefer
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name  # Do not change this

# Destination of the two pickled frames, under the job-specific input folder.
train_path = str(input_path / 'train.pickle')
val_path = str(input_path / 'val.pickle')

# protocol 4 is necessary, since SageMaker uses python 3.6
train_df.to_pickle(train_path, protocol=4)
val_df.to_pickle(val_path, protocol=4)

### Estimator Definition

In [10]:
# GPU instances

# Candidate GPU instance types for the training job.
instances = list((
    'ml.p2.xlarge',
    'ml.p3.2xlarge',
))

The hyperparameters are passed as command line arguments to the training script. 

You can add/change them as you like. It's important to keep the `tracking_uri` and the `experiment_name` which are used by MLFlow.

The class `PyTorch` is part of the `SageMaker` python API. The parameters are important and you should probably not change most of them. The ones you may want to change are:

- `instance_type`, specify the instance you want
- `source_dir`, specify your script directory. Try to use global variables as much as possible

In [11]:
from sagemaker.pytorch import PyTorch

# The same checkpoint name is used for the model and for its tokenizer.
pretrained_checkpoint = 'microsoft/xtremedistil-l6-h384-uncased'

# Passed to the training script as command line arguments.
# `tracking_uri` and `experiment_name` are the two entries used by MLFlow.
# NOTE(review): "subpilllars" is spelled with three l's — confirm this is the intended experiment name.
hyperparameters = dict(
    tracking_uri=MLFLOW_SERVER,
    experiment_name="pl-2d-subpilllars",
    max_len=256,
    epochs=5,
    model_name=pretrained_checkpoint,
    tokenizer_name=pretrained_checkpoint,
    dropout_rate=0.3,
    pred_threshold=0.4,
    output_length=384,
    learning_rate=7e-5,
    training_column='subpillars',
)

# Keyword arguments for the SageMaker PyTorch estimator; the entries most likely
# to need changing are `instance_type` and `source_dir`.
estimator_settings = dict(
    entry_point='train_mlflow.py',
    source_dir='../../../scripts/training/selim/multiclass-lightning',
    output_path=str(DEV_BUCKET / 'models/'),
    code_location=str(input_path),
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    framework_version='1.8',
    py_version='py36',
    hyperparameters=hyperparameters,
    job_name=job_name,
)

estimator = PyTorch(**estimator_settings)

In [12]:
# Both inputs point at the same job-specific input folder, which is where
# train.pickle and val.pickle were written.
input_location = str(input_path)
fit_arguments = {'train': input_location, 'test': input_location}

In [13]:
# Launch the training job under the name chosen above, reading data from the
# locations listed in `fit_arguments`.
estimator.fit(inputs=fit_arguments, job_name=job_name)

2021-08-06 14:39:03 Starting - Starting the training job..
2021-08-06 14:39:47 Starting - Launching requested ML instancesProfilerReport-1628260735: InProgress
...
2021-08-06 14:40:27 Starting - Preparing the instances for training.....
2021-08-06 14:42:28 Downloading - Downloading input data
2021-08-06 14:42:28 Training - Downloading the training image..........
2021-08-06 14:45:48 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-08-06 14:45:49,047 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-08-06 14:45:49,072 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-08-06 14:45:50,493 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-08-06 14:45:51,037 sagemaker-training-toolkit INFO 

[34m  Building wheel for wrapt (setup.py): finished with status 'done'
  Created wheel for wrapt: filename=wrapt-1.12.1-cp36-cp36m-linux_x86_64.whl size=69751 sha256=f0ca420c96cf6aecaecc21de04761bf00802361c965719b14cfc20b23b142949
  Stored in directory: /root/.cache/pip/wheels/32/42/7f/23cae9ff6ef66798d00dc5d659088e57dbba01566f6c60db63
  Building wheel for prometheus-flask-exporter (setup.py): started
  Building wheel for prometheus-flask-exporter (setup.py): finished with status 'done'
  Created wheel for prometheus-flask-exporter: filename=prometheus_flask_exporter-0.18.2-py3-none-any.whl size=17398 sha256=705ebe5e6a7b4990dc662beee3d27c9549f236d1b33fa294f90d0d4e9256167a
  Stored in directory: /root/.cache/pip/wheels/15/77/e8/3ca90b66243b0b58d5a5323a3da02cc8c5daf1de7a65141701[0m
[34mSuccessfully built nltk sagemaker aiobotocore alembic databricks-cli idna-ssl termcolor wrapt prometheus-flask-exporter[0m
[34mInstalling collected packages: six, typing-extensions, pyasn1-modules, oa

[34m#015Validation sanity check: 0it [00:00, ?it/s]#015Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s][2021-08-06 14:49:07.409 algo-1:94 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2021-08-06 14:49:07.446 algo-1:94 INFO profiler_config_parser.py:102] User has disabled profiler.[0m
[34m[2021-08-06 14:49:07.446 algo-1:94 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.[0m
[34m[2021-08-06 14:49:07.447 algo-1:94 INFO hook.py:201] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.[0m
[34m[2021-08-06 14:49:07.447 algo-1:94 INFO hook.py:255] Saving to /opt/ml/output/tensors[0m
[34m[2021-08-06 14:49:07.448 algo-1:94 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.[0m
[34m[2021-08-06 14:49:07.645 algo-1:94 INFO hook.py:594] name:model.l0.embeddings.word_embeddings.weight count_params:117204

[34m#015                                                              #015#015Training: 0it [00:00, ?it/s]#015Training:   0%|          | 0/1731 [00:00<?, ?it/s]#015Epoch 0:   0%|          | 0/1731 [00:00<?, ?it/s] #015Epoch 0:   2%|▏         | 30/1731 [00:02<02:29, 11.41it/s]#015Epoch 0:   2%|▏         | 30/1731 [00:02<02:29, 11.41it/s, loss=8.88, v_num=0, val_f1_epoch=0.0668, val_loss_epoch=0.687, train_f1=0.434]#015Epoch 0:   3%|▎         | 60/1731 [00:04<02:15, 12.33it/s, loss=8.88, v_num=0, val_f1_epoch=0.0668, val_loss_epoch=0.687, train_f1=0.434]#015Epoch 0:   3%|▎         | 60/1731 [00:04<02:15, 12.33it/s, loss=2.69, v_num=0, val_f1_epoch=0.0668, val_loss_epoch=0.687, train_f1=0.543]#015Epoch 0:   5%|▌         | 90/1731 [00:06<02:07, 12.88it/s, loss=2.69, v_num=0, val_f1_epoch=0.0668, val_loss_epoch=0.687, train_f1=0.543]#015Epoch 0:   5%|▌         | 90/1731 [00:06<02:07, 12.88it/s, loss=2.5, v_num=0, val_f1_epoch=0.0668, val_loss_epoch=0.687, train_f1=0.595] #015Epoch 0:   7%|

[34m#015                                                             #033[A#015Epoch 0:   0%|          | 0/1731 [00:00<?, ?it/s, loss=0.807, v_num=0, val_f1_epoch=0.665, val_loss_epoch=0.178, train_f1=0.464, val_f1_step=0.654, val_loss_step=0.143]           #015Epoch 1:   0%|          | 0/1731 [00:00<?, ?it/s, loss=0.807, v_num=0, val_f1_epoch=0.665, val_loss_epoch=0.178, train_f1=0.464, val_f1_step=0.654, val_loss_step=0.143]#015Epoch 1:   2%|▏         | 30/1731 [00:02<02:21, 12.00it/s, loss=0.807, v_num=0, val_f1_epoch=0.665, val_loss_epoch=0.178, train_f1=0.464, val_f1_step=0.654, val_loss_step=0.143]#015Epoch 1:   2%|▏         | 30/1731 [00:02<02:21, 12.00it/s, loss=1.04, v_num=0, val_f1_epoch=0.665, val_loss_epoch=0.178, train_f1=0.683, val_f1_step=0.654, val_loss_step=0.143] #015Epoch 1:   3%|▎         | 60/1731 [00:04<02:08, 13.02it/s, loss=1.04, v_num=0, val_f1_epoch=0.665, val_loss_epoch=0.178, train_f1=0.683, val_f1_step=0.654, val_loss_step=0.143]#015Epoch 1:   3%|▎        

[34moss=0.969, v_num=0, val_f1_epoch=0.665, val_loss_epoch=0.178, train_f1=0.744, val_f1_step=0.654, val_loss_step=0.143]#015Epoch 1:  75%|███████▍  | 1290/1731 [01:56<00:39, 11.11it/s, loss=0.821, v_num=0, val_f1_epoch=0.665, val_loss_epoch=0.178, train_f1=0.756, val_f1_step=0.654, val_loss_step=0.143]#015Epoch 1:  76%|███████▋  | 1320/1731 [01:58<00:36, 11.16it/s, loss=0.821, v_num=0, val_f1_epoch=0.665, val_loss_epoch=0.178, train_f1=0.756, val_f1_step=0.654, val_loss_step=0.143]#015Epoch 1:  76%|███████▋  | 1320/1731 [01:58<00:36, 11.16it/s, loss=1.21, v_num=0, val_f1_epoch=0.665, val_loss_epoch=0.178, train_f1=0.759, val_f1_step=0.654, val_loss_step=0.143] #015Epoch 1:  78%|███████▊  | 1350/1731 [02:00<00:33, 11.21it/s, loss=1.21, v_num=0, val_f1_epoch=0.665, val_loss_epoch=0.178, train_f1=0.759, val_f1_step=0.654, val_loss_step=0.143]#015Epoch 1:  78%|███████▊  | 1350/1731 [02:00<00:33, 11.21it/s, loss=0.705, v_num=0, val_f1_epoch=0.665, val_loss_epoch=0.178, train_f1=0.790, val

[34moss=1.03, v_num=0, val_f1_epoch=0.716, val_loss_epoch=0.163, train_f1=0.750, val_f1_step=0.709, val_loss_step=0.120] #015Epoch 2:  76%|███████▋  | 1320/1731 [01:37<00:30, 13.54it/s, loss=1.03, v_num=0, val_f1_epoch=0.716, val_loss_epoch=0.163, train_f1=0.750, val_f1_step=0.709, val_loss_step=0.120]#015Epoch 2:  76%|███████▋  | 1320/1731 [01:37<00:30, 13.54it/s, loss=0.758, v_num=0, val_f1_epoch=0.716, val_loss_epoch=0.163, train_f1=0.753, val_f1_step=0.709, val_loss_step=0.120]#015Epoch 2:  78%|███████▊  | 1350/1731 [01:39<00:28, 13.54it/s, loss=0.758, v_num=0, val_f1_epoch=0.716, val_loss_epoch=0.163, train_f1=0.753, val_f1_step=0.709, val_loss_step=0.120]#015Epoch 2:  78%|███████▊  | 1350/1731 [01:39<00:28, 13.54it/s, loss=0.687, v_num=0, val_f1_epoch=0.716, val_loss_epoch=0.163, train_f1=0.705, val_f1_step=0.709, val_loss_step=0.120]#015Epoch 2:  80%|███████▉  | 1380/1731 [01:41<00:25, 13.54it/s, loss=0.687, v_num=0, val_f1_epoch=0.716, val_loss_epoch=0.163, train_f1=0.705, val

[34m [01:35<00:32, 13.47it/s, loss=0.867, v_num=0, val_f1_epoch=0.734, val_loss_epoch=0.157, train_f1=0.754, val_f1_step=0.805, val_loss_step=0.103]#015Epoch 3:  76%|███████▋  | 1320/1731 [01:37<00:30, 13.48it/s, loss=0.867, v_num=0, val_f1_epoch=0.734, val_loss_epoch=0.157, train_f1=0.754, val_f1_step=0.805, val_loss_step=0.103]#015Epoch 3:  76%|███████▋  | 1320/1731 [01:37<00:30, 13.48it/s, loss=1.02, v_num=0, val_f1_epoch=0.734, val_loss_epoch=0.157, train_f1=0.697, val_f1_step=0.805, val_loss_step=0.103] #015Epoch 3:  78%|███████▊  | 1350/1731 [01:40<00:28, 13.47it/s, loss=1.02, v_num=0, val_f1_epoch=0.734, val_loss_epoch=0.157, train_f1=0.697, val_f1_step=0.805, val_loss_step=0.103]#015Epoch 3:  78%|███████▊  | 1350/1731 [01:40<00:28, 13.47it/s, loss=0.54, v_num=0, val_f1_epoch=0.734, val_loss_epoch=0.157, train_f1=0.766, val_f1_step=0.805, val_loss_step=0.103]#015Epoch 3:  80%|███████▉  | 1380/1731 [01:42<00:26, 13.48it/s, loss=0.54, v_num=0, val_f1_epoch=0.734, val_loss_epoch=0

[34m1 [01:35<00:32, 13.46it/s, loss=1.18, v_num=0, val_f1_epoch=0.745, val_loss_epoch=0.154, train_f1=0.740, val_f1_step=0.812, val_loss_step=0.116] #015Epoch 4:  76%|███████▋  | 1320/1731 [01:38<00:30, 13.45it/s, loss=1.18, v_num=0, val_f1_epoch=0.745, val_loss_epoch=0.154, train_f1=0.740, val_f1_step=0.812, val_loss_step=0.116]#015Epoch 4:  76%|███████▋  | 1320/1731 [01:38<00:30, 13.45it/s, loss=0.487, v_num=0, val_f1_epoch=0.745, val_loss_epoch=0.154, train_f1=0.751, val_f1_step=0.812, val_loss_step=0.116]#015Epoch 4:  78%|███████▊  | 1350/1731 [01:40<00:28, 13.45it/s, loss=0.487, v_num=0, val_f1_epoch=0.745, val_loss_epoch=0.154, train_f1=0.751, val_f1_step=0.812, val_loss_step=0.116]#015Epoch 4:  78%|███████▊  | 1350/1731 [01:40<00:28, 13.45it/s, loss=0.579, v_num=0, val_f1_epoch=0.745, val_loss_epoch=0.154, train_f1=0.714, val_f1_step=0.812, val_loss_step=0.116]#015Epoch 4:  80%|███████▉  | 1380/1731 [01:42<00:26, 13.46it/s, loss=0.579, v_num=0, val_f1_epoch=0.745, val_loss_epoc


2021-08-06 15:01:04 Uploading - Uploading generated training model
2021-08-06 15:01:55 Completed - Training job completed
ProfilerReport-1628260735: IssuesFound
Training seconds: 1190
Billable seconds: 1190
