# Import

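These packages are required if you launch this notebook from a SageMaker instance. The install commands in the next cell are wrapped in a string literal, so they do not run by default; remove the surrounding triple quotes to execute them.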

In [1]:
"""!pip install mlflow
!pip install pytorch-lightning
!pip install transformers
!pip install tqdm
!pip install sagemaker
!pip install s3fs"""

'!pip install mlflow\n!pip install pytorch-lightning\n!pip install transformers\n!pip install tqdm\n!pip install sagemaker\n!pip install s3fs'

In [2]:
import sys
# Add the directory three levels above this notebook to the module search path.
# Presumably this is what makes the project-local `deep` package importable — TODO confirm.
sys.path.append('../../../')

import os
import sys
import logging
import argparse
from pathlib import Path
from ast import literal_eval
from collections import Counter
from typing import Any, Dict, Optional

In [3]:
from tqdm.auto import tqdm

import torchmetrics
from torchmetrics.functional import accuracy, f1, auroc

import sagemaker
from sagemaker import get_execution_role
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.core.decorators import auto_move_data
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import MLFlowLogger


import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix


import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc

import transformers
from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoTokenizer,
)
from transformers.optimization import (
    Adafactor,
    get_linear_schedule_with_warmup,
)

Local constants (data locations, MLflow server, paths, etc.) are imported below from `deep.constants` and `deep.utils`: use them rather than hard-coding values.

In [4]:
from deep.constants import *
from deep.utils import *

  and should_run_async(code)


In [5]:
# Enable IPython's autoreload extension in mode 2, so that imported modules are
# reloaded before each cell runs and edits to them are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Data

You can use any data you want. We recommend loading it as a `pandas` DataFrame.

In [6]:
# The three CSV splits of data release v0.4.4 sit side by side in the same directory,
# so the paths only differ by file name.
TRAIN_PATH, VAL_PATH, TEST_PATH = (
    os.path.join('..', '..', '..', "data", "frameworks_data", "data_v0.4.4", file_name)
    for file_name in (
        "data_v0.4.4_train.csv",
        "data_v0.4.4_val.csv",
        "data_v0.4.4_test.csv",
    )
)

train_df = pd.read_csv(TRAIN_PATH)

# NOTE: `val_df` is the test split followed by the validation split, stacked into
# a single frame (the original row indices of each file are kept).
val_df = pd.concat([pd.read_csv(split_path) for split_path in (TEST_PATH, VAL_PATH)])

## Sagemaker Prep

### Session

Configure SageMaker

In [7]:
# SageMaker session whose default bucket is the one named by DEV_BUCKET.
sess = sagemaker.Session(
    default_bucket=DEV_BUCKET.name,
)

# Short aliases for the project constants (presumably provided by the
# `deep.constants` star import — verify) used when configuring the training job.
tracking_uri = MLFLOW_SERVER
role_arn = SAGEMAKER_ROLE_ARN
role = SAGEMAKER_ROLE

### Bucket upload

You need to upload data to an S3 bucket. 




In [8]:
# Display the MLflow server address that is passed to the training job as `tracking_uri`.
MLFLOW_SERVER

'http://mlflow-deep-387470f3-1883319727.us-east-1.elb.amazonaws.com/'

In [9]:
sample = False  # Set to True to train on a small random subset and make the run faster.
SAMPLE_SIZE = 1000  # maximum number of rows kept per split when sampling
SAMPLE_SEED = 42  # fixed seed so the sampled subset is the same on every run

if sample:
    # Cap `n` at the frame length: DataFrame.sample raises ValueError when asked
    # for more rows than the frame holds (sampling is done without replacement).
    train_df = train_df.sample(n=min(SAMPLE_SIZE, len(train_df)), random_state=SAMPLE_SEED)
    val_df = val_df.sample(n=min(SAMPLE_SIZE, len(val_df)), random_state=SAMPLE_SEED)

# The job name embeds the current time (via `formatted_time()`), so every run of
# this cell produces a new name and therefore a new input location.
job_name = f"pytorch-{formatted_time()}-subpillars-model-test-mlflow"  # change it as you prefer
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name  # Do not change this

train_path = str(input_path / 'train.pickle')
val_path = str(input_path / 'val.pickle')

# Write both frames under `input_path`, where the training job reads them from.
# protocol 4 is necessary, since SageMaker uses python 3.6
train_df.to_pickle(train_path, protocol=4)
val_df.to_pickle(val_path, protocol=4)

### Estimator Definition

In [10]:
# GPU instance types (candidate values for the estimator's `instance_type`).
instances = ['ml.p2.xlarge', 'ml.p3.2xlarge']

The hyperparameters are passed as command line arguments to the training script. 

You can add/change them as you like. It's important to keep the `tracking_uri` and the `experiment_name` which are used by MLFlow.

The class `PyTorch` is part of the `SageMaker` python API. The parameters are important and you should probably not change most of them. The ones you may want to change are:

- `instance_type`, specify the instance you want
- `source_dir`, specify your script directory. Try to use global variables as much as possible.

In [11]:
from sagemaker.pytorch import PyTorch


# Hyperparameters are forwarded to the training script as command line arguments.
# `tracking_uri` and `experiment_name` are the ones consumed by MLflow.
hyperparameters = dict(
    # MLflow
    tracking_uri=MLFLOW_SERVER,
    experiment_name="pl-2d-subpilllars",
    # Model / tokenizer
    model_name='microsoft/xtremedistil-l6-h384-uncased',
    tokenizer_name='microsoft/xtremedistil-l6-h384-uncased',
    max_len=256,
    output_length=384,
    dropout_rate=0.2,
    # Training
    epochs=5,
    learning_rate=7e-5,
    pred_threshold=0.4,
    training_column='subpillars',
)

# SageMaker PyTorch estimator: runs `train_mlflow.py` from `source_dir` on a single
# GPU instance, stores model artifacts under DEV_BUCKET/models/ and stages the
# code next to the input data.
# NOTE(review): `job_name` is also passed to `estimator.fit(...)` in a later cell;
# confirm whether the constructor actually uses this keyword.
estimator = PyTorch(
    # What to run
    entry_point='train_mlflow.py',
    source_dir='../../../scripts/training/selim/multiclass-lightning',
    hyperparameters=hyperparameters,
    # Where to run it
    role=role,
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    framework_version='1.8',
    py_version='py36',
    # Where inputs/outputs live
    code_location=str(input_path),
    output_path=str(DEV_BUCKET / 'models/'),
    job_name=job_name,
)

In [12]:
# Both input channels point at the same location: the train and validation
# pickles were written side by side under `input_path`.
fit_arguments = {channel: str(input_path) for channel in ('train', 'test')}

In [13]:
# Fit the estimator: start the SageMaker training job under `job_name`, using the
# channel -> location mapping in `fit_arguments` as its inputs. The job's log
# output is shown below the cell.

estimator.fit(fit_arguments, job_name=job_name)

2021-08-02 11:56:23 Starting - Starting the training job...
2021-08-02 11:56:55 Starting - Launching requested ML instancesProfilerReport-1627905380: InProgress
......
2021-08-02 11:57:48 Starting - Preparing the instances for training......
2021-08-02 11:58:59 Downloading - Downloading input data...
2021-08-02 11:59:28 Training - Downloading the training image......................[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-08-02 12:03:22,799 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-08-02 12:03:22,822 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-08-02 12:03:25,877 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-08-02 12:03:26,322 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/opt/con

[34m  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.51.0
    Uninstalling tqdm-4.51.0:
      Successfully uninstalled tqdm-4.51.0[0m
[34m  Attempting uninstall: h5py
    Found existing installation: h5py 2.8.0
    Uninstalling h5py-2.8.0:
      Successfully uninstalled h5py-2.8.0[0m

2021-08-02 12:04:30 Training - Training image download completed. Training in progress.[34mSuccessfully installed Flask-2.0.1 Mako-1.1.4 absl-py-0.13.0 aiohttp-3.7.4.post0 alembic-1.4.1 astunparse-1.6.3 async-timeout-3.0.1 cachetools-4.2.2 databricks-cli-0.14.3 docker-5.0.0 entrypoints-0.3 filelock-3.0.12 flatbuffers-1.12 gast-0.3.3 gitdb-4.0.7 gitpython-3.1.18 google-auth-1.34.0 google-auth-oauthlib-0.4.5 grpcio-1.32.0 gunicorn-20.1.0 h5py-2.10.0 huggingface-hub-0.0.12 idna-ssl-1.1.0 itsdangerous-2.0.1 keras-preprocessing-1.1.2 markdown-3.3.4 mlflow-1.18.0 multidict-5.1.0 nlpaug-1.1.6 nltk-3.2.5 numpy-1.19.5 oauthlib-3.1.1 opt-einsum-3.3.0 prometheus-client-0.11.0 prometheus-flas

[34m[nltk_data] Downloading package averaged_perceptron_tagger to[0m
[34m[nltk_data]     /root/nltk_data...[0m
[34m[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.[0m
[34m[nltk_data] Downloading package wordnet to /root/nltk_data...[0m
[34m[nltk_data]   Unzipping corpora/wordnet.zip.[0m
[34m[nltk_data] Downloading package omw to /root/nltk_data...[0m
[34m[nltk_data]   Unzipping corpora/omw.zip.[0m
[34m#015Validation sanity check: 0it [00:00, ?it/s]#015Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s][2021-08-02 12:06:21.224 algo-1:79 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2021-08-02 12:06:21.290 algo-1:79 INFO profiler_config_parser.py:102] User has disabled profiler.[0m
[34m[2021-08-02 12:06:21.290 algo-1:79 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.[0m
[34m[2021-08-02 12:06:21.291 algo-1:79 INFO hook.py:201] tensorboard_dir has not been set for the hook. S

[34m#015                                                              #015#015Training: 0it [00:00, ?it/s]#015Training:   0%|          | 0/1731 [00:00<?, ?it/s]#015Epoch 0:   0%|          | 0/1731 [00:00<?, ?it/s] #015Epoch 0:   2%|▏         | 30/1731 [00:02<02:28, 11.42it/s]#015Epoch 0:   2%|▏         | 30/1731 [00:02<02:28, 11.42it/s, loss=11.5, v_num=0, val_f1_epoch=0.0681, val_loss_epoch=0.690, train_f1=0.464]#015Epoch 0:   3%|▎         | 60/1731 [00:04<02:14, 12.39it/s, loss=11.5, v_num=0, val_f1_epoch=0.0681, val_loss_epoch=0.690, train_f1=0.464]#015Epoch 0:   3%|▎         | 60/1731 [00:04<02:14, 12.39it/s, loss=2.52, v_num=0, val_f1_epoch=0.0681, val_loss_epoch=0.690, train_f1=0.519]#015Epoch 0:   5%|▌         | 90/1731 [00:07<02:08, 12.77it/s, loss=2.52, v_num=0, val_f1_epoch=0.0681, val_loss_epoch=0.690, train_f1=0.519]#015Epoch 0:   5%|▌         | 90/1731 [00:07<02:08, 12.77it/s, loss=1.81, v_num=0, val_f1_epoch=0.0681, val_loss_epoch=0.690, train_f1=0.558]#015Epoch 0:   7%|

[34m#015                                                             #033[A#015Epoch 1:   0%|          | 0/1731 [00:00<?, ?it/s, loss=0.904, v_num=0, val_f1_epoch=0.727, val_loss_epoch=0.165, train_f1=0.940, val_f1_step=0.863, val_loss_step=0.105]           #015Epoch 2:   0%|          | 0/1731 [00:00<?, ?it/s, loss=0.904, v_num=0, val_f1_epoch=0.727, val_loss_epoch=0.165, train_f1=0.940, val_f1_step=0.863, val_loss_step=0.105]#015Epoch 2:   2%|▏         | 30/1731 [00:02<02:19, 12.18it/s, loss=0.904, v_num=0, val_f1_epoch=0.727, val_loss_epoch=0.165, train_f1=0.940, val_f1_step=0.863, val_loss_step=0.105]#015Epoch 2:   2%|▏         | 30/1731 [00:02<02:19, 12.18it/s, loss=0.694, v_num=0, val_f1_epoch=0.727, val_loss_epoch=0.165, train_f1=0.729, val_f1_step=0.863, val_loss_step=0.105]#015Epoch 2:   3%|▎         | 60/1731 [00:04<02:08, 12.97it/s, loss=0.694, v_num=0, val_f1_epoch=0.727, val_loss_epoch=0.165, train_f1=0.729, val_f1_step=0.863, val_loss_step=0.105]#015Epoch 2:   3%|▎       

[34m v_num=0, val_f1_epoch=0.727, val_loss_epoch=0.165, train_f1=0.805, val_f1_step=0.863, val_loss_step=0.105]#015Epoch 2:  75%|███████▍  | 1290/1731 [01:56<00:39, 11.11it/s, loss=0.71, v_num=0, val_f1_epoch=0.727, val_loss_epoch=0.165, train_f1=0.769, val_f1_step=0.863, val_loss_step=0.105]#015Epoch 2:  76%|███████▋  | 1320/1731 [01:58<00:36, 11.16it/s, loss=0.71, v_num=0, val_f1_epoch=0.727, val_loss_epoch=0.165, train_f1=0.769, val_f1_step=0.863, val_loss_step=0.105]#015Epoch 2:  76%|███████▋  | 1320/1731 [01:58<00:36, 11.16it/s, loss=0.561, v_num=0, val_f1_epoch=0.727, val_loss_epoch=0.165, train_f1=0.700, val_f1_step=0.863, val_loss_step=0.105]#015Epoch 2:  78%|███████▊  | 1350/1731 [02:00<00:34, 11.20it/s, loss=0.561, v_num=0, val_f1_epoch=0.727, val_loss_epoch=0.165, train_f1=0.700, val_f1_step=0.863, val_loss_step=0.105]#015Epoch 2:  78%|███████▊  | 1350/1731 [02:00<00:34, 11.20it/s, loss=0.815, v_num=0, val_f1_epoch=0.727, val_loss_epoch=0.165, train_f1=0.717, val_f1_step=0.

[34mf1_step=0.892, val_loss_step=0.0966]#015Epoch 3:  75%|███████▍  | 1290/1731 [01:35<00:32, 13.44it/s, loss=1.04, v_num=0, val_f1_epoch=0.734, val_loss_epoch=0.160, train_f1=0.764, val_f1_step=0.892, val_loss_step=0.0966] #015Epoch 3:  76%|███████▋  | 1320/1731 [01:38<00:30, 13.45it/s, loss=1.04, v_num=0, val_f1_epoch=0.734, val_loss_epoch=0.160, train_f1=0.764, val_f1_step=0.892, val_loss_step=0.0966]#015Epoch 3:  76%|███████▋  | 1320/1731 [01:38<00:30, 13.45it/s, loss=1.11, v_num=0, val_f1_epoch=0.734, val_loss_epoch=0.160, train_f1=0.736, val_f1_step=0.892, val_loss_step=0.0966]#015Epoch 3:  78%|███████▊  | 1350/1731 [01:40<00:28, 13.45it/s, loss=1.11, v_num=0, val_f1_epoch=0.734, val_loss_epoch=0.160, train_f1=0.736, val_f1_step=0.892, val_loss_step=0.0966]#015Epoch 3:  78%|███████▊  | 1350/1731 [01:40<00:28, 13.45it/s, loss=1.53, v_num=0, val_f1_epoch=0.734, val_loss_epoch=0.160, train_f1=0.740, val_f1_step=0.892, val_loss_step=0.0966]#015Epoch 3:  80%|███████▉  | 1380/1731 [01

[34m=0.871, val_loss_step=0.0923]#015Epoch 4:  75%|███████▍  | 1290/1731 [01:35<00:32, 13.45it/s, loss=0.494, v_num=0, val_f1_epoch=0.744, val_loss_epoch=0.156, train_f1=0.785, val_f1_step=0.871, val_loss_step=0.0923]#015Epoch 4:  76%|███████▋  | 1320/1731 [01:38<00:30, 13.45it/s, loss=0.494, v_num=0, val_f1_epoch=0.744, val_loss_epoch=0.156, train_f1=0.785, val_f1_step=0.871, val_loss_step=0.0923]#015Epoch 4:  76%|███████▋  | 1320/1731 [01:38<00:30, 13.45it/s, loss=0.532, v_num=0, val_f1_epoch=0.744, val_loss_epoch=0.156, train_f1=0.786, val_f1_step=0.871, val_loss_step=0.0923]#015Epoch 4:  78%|███████▊  | 1350/1731 [01:40<00:28, 13.44it/s, loss=0.532, v_num=0, val_f1_epoch=0.744, val_loss_epoch=0.156, train_f1=0.786, val_f1_step=0.871, val_loss_step=0.0923]#015Epoch 4:  78%|███████▊  | 1350/1731 [01:40<00:28, 13.44it/s, loss=0.494, v_num=0, val_f1_epoch=0.744, val_loss_epoch=0.156, train_f1=0.697, val_f1_step=0.871, val_loss_step=0.0923]#015Epoch 4:  80%|███████▉  | 1380/1731 [01:42


2021-08-02 12:17:34 Uploading - Uploading generated training model
2021-08-02 12:18:15 Completed - Training job completed
ProfilerReport-1627905380: IssuesFound
Training seconds: 1141
Billable seconds: 1141
