# Import

These requirements are necessary if you launch this notebook from SageMaker instances

In [1]:
"""!pip install mlflow
!pip install pytorch-lightning
!pip install transformers
!pip install tqdm
!pip install sagemaker

!pip install s3fs
!pip install smdebug"""

'!pip install mlflow\n!pip install pytorch-lightning\n!pip install transformers\n!pip install tqdm\n!pip install sagemaker\n\n!pip install s3fs\n!pip install smdebug'

In [2]:
import sys
sys.path.append('../../../')

import os
import sys
import logging
import argparse
from pathlib import Path
from typing import Any, Dict, Optional

In [3]:
from tqdm.auto import tqdm

import torchmetrics
from torchmetrics.functional import accuracy, f1, auroc

import sagemaker
from sagemaker import get_execution_role
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.core.decorators import auto_move_data
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import MLFlowLogger


import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc

import pandas as pd

Local constants for the data, the MLflow server, paths, etc. Use them.

In [4]:
from deep.constants import *
from deep.utils import *

In [5]:
%load_ext autoreload
%autoreload 2

# Data

You can use any data you want. We recommend loading it into a `pandas` DataFrame.

In [6]:
# Folder containing the generated entries of data_v0.6.2, relative to this notebook.
_data_dir_parts = ('..', '..', '..', 'data', 'frameworks_data', 'data_v0.6.2', 'generated_entries')
DATA_PATH = os.path.join(*_data_dir_parts)

# Disabled alternative (kept as an inert string literal): read the original and
# the generated datasets separately.
"""
original_df = pd.read_csv(os.path.join(DATA_PATH, 'full_dataset.csv'))
augmented_data = pd.read_csv(os.path.join(DATA_PATH, 'generated_text.csv'))"""

# Read total_df.csv; its first column is used as the DataFrame index.
total_csv_path = os.path.join(DATA_PATH, 'total_df.csv')
tot_df = pd.read_csv(total_csv_path, index_col=0)

In [7]:
# Excerpts from the feedback spreadsheet: only the 'Entry' column is kept, renamed to 'excerpt'.
feedback_path = os.path.join('..', '..', '..', '..', 'feedback_output.xlsx')
test_data = (
    pd.read_excel(feedback_path)[['Entry']]
    .rename(columns={'Entry': 'excerpt'})
    # Blank spreadsheet cells are read as NaN, which has no .upper(); drop them
    # before the text filter instead of letting the cell crash.
    .dropna(subset=['excerpt'])
)

# Discard rows whose text is "none" in any letter case. The comparison runs on a
# string cast so that non-text cells (e.g. numbers) are kept rather than raising.
is_placeholder = test_data['excerpt'].astype(str).str.upper() == 'NONE'
test_data = test_data[~is_placeholder]

In [9]:
"""augmented_data = pd.merge(
    right=original_df.drop(columns=['excerpt']),
    left=augmented_data[['entry_id', 'excerpt']],
    on='entry_id',
    how='right'
)"""

"augmented_data = pd.merge(\n    right=original_df.drop(columns=['excerpt']),\n    left=augmented_data[['entry_id', 'excerpt']],\n    on='entry_id',\n    how='right'\n)"

In [10]:
"""tot_df = pd.concat([original_df, augmented_data])"""

'tot_df = pd.concat([original_df, augmented_data])'

In [12]:
"""from ast import literal_eval"""

'from ast import literal_eval'

In [13]:
"""tot_df['pillars_1d'] = tot_df.subpillars_1d.apply(
    lambda item: [x.split('->')[0] for x in literal_eval(item)]
)
tot_df['pillars_2d'] = tot_df.subpillars_2d.apply(
    lambda item: [x.split('->')[0] for x in literal_eval(item)]
)"""

"tot_df['pillars_1d'] = tot_df.subpillars_1d.apply(\n    lambda item: [x.split('->')[0] for x in literal_eval(item)]\n)\ntot_df['pillars_2d'] = tot_df.subpillars_2d.apply(\n    lambda item: [x.split('->')[0] for x in literal_eval(item)]\n)"

In [14]:
# Restrict tot_df to the columns below, in this order (a missing column raises KeyError).
columns_to_keep = [
    'entry_id',
    'excerpt',
    'lead_id',
    'sectors',
    'severity',
    'demographic_groups',
    'subpillars_1d',
    'specific_needs_groups',
    'subpillars_2d',
    'affected_groups',
    'pillars_2d',
    'pillars_1d',
]
tot_df = tot_df[columns_to_keep]

## Sagemaker Prep

### Session

Configure SageMaker

In [15]:
# SageMaker session whose default bucket is the development bucket.
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)

# Short aliases for the execution role, its ARN and the MLflow server address.
# The upper-case names are not defined in this notebook (presumably they come
# from the `deep` star imports).
role, role_arn, tracking_uri = SAGEMAKER_ROLE, SAGEMAKER_ROLE_ARN, MLFLOW_SERVER

### Bucket upload

You need to upload data to an S3 bucket. 




In [16]:
# Echo the MLflow server address (passed below as the `tracking_uri` hyperparameter)
# so that it is shown as the cell output.
MLFLOW_SERVER

'http://mlflow-deep-387470f3-1883319727.us-east-1.elb.amazonaws.com/'

In [17]:
sample = True  # To make the computations faster, sample = True.
sample_size = 100_000  # maximum number of rows kept when sampling
sample_seed = 42  # fixed seed so that every run draws the same subset

if sample:
    # Cap the request at the number of available rows: DataFrame.sample raises
    # ValueError when n exceeds the frame length (without replacement).
    tot_df = tot_df.sample(n=min(sample_size, len(tot_df)), random_state=sample_seed)

job_name = f"pytorch-{formatted_time()}-all-models"  # change it as you prefer
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name  # Do not change this

# Both pickles are written under the job-specific input folder.
train_path = str(input_path / 'train.pickle')
val_path = str(input_path / 'val.pickle')

# protocol 4 is necessary, since SageMaker uses python 3.6
tot_df.to_pickle(train_path, protocol=4)
# Note: the file named val.pickle holds the feedback excerpts (`test_data`).
test_data.to_pickle(val_path, protocol=4)

### Estimator Definition

In [18]:
# SageMaker GPU instance types.
instances = ['ml.p2.xlarge', 'ml.p3.2xlarge']

The hyperparameters are passed as command line arguments to the training script. 

You can add/change them as you like. It's important to keep the `tracking_uri` and the `experiment_name` which are used by MLFlow.

The class `PyTorch` is part of the `SageMaker` python API. The parameters are important and you should probably not change most of them. The ones you may want to change are:

- `instance_type`, specify the instance you want
- `source_dir`, specify your script directory. Try to use global variables as much as possible

In [19]:
from sagemaker.pytorch import PyTorch

# Proportion of negative examples per label group for the test data, as the
# variable name suggests; the exact semantics are defined by the training script,
# which receives this dict (as a string) through the hyperparameters.
proportions_negative_examples_test = dict(
    sectors=0.16,
    subpillars_2d=0.3,
    pillars_2d=0.3,
    subpillars_1d=0.69,
    pillars_1d=0.69,
    demographic_groups=0.74,
    specific_needs_groups=0.86,
    affected_groups=0.35,
)

# The training proportions are the test proportions scaled by one common factor.
factor_prop_tot_train = 0.05
proportions_negative_examples_train = dict(
    (group, share * factor_prop_tot_train)
    for group, share in proportions_negative_examples_test.items()
)

# Instance type requested for the training job; also forwarded as a hyperparameter.
instance_type = 'ml.p3.2xlarge'

# The same pretrained checkpoint is used for the model and for its tokenizer.
pretrained_checkpoint = 'microsoft/xtremedistil-l6-h256-uncased'

# Hyperparameters passed to the training script as command-line arguments.
hyperparameters = {
    # --- MLflow (keep these two) ---
    'tracking_uri': MLFLOW_SERVER,
    'experiment_name': 'pl-trials',

    # --- model and optimisation ---
    'max_len': 256,
    'epochs': 1,
    'model_name': pretrained_checkpoint,
    'tokenizer_name': pretrained_checkpoint,
    'dropout_rate': 0.3,
    'pred_threshold': 0.4,
    'output_length': 256,
    'learning_rate': 5e-5,

    # --- label groups to train on (comma-separated); alternatives kept disabled ---
    'training_names': 'sectors,subpillars_2d,pillars_2d,pillars_1d,subpillars_1d,specific_needs_groups,affected_groups,demographic_groups,severity',
    # 'training_names': 'sectors,pillars_2d,subpillars_2d',
    # 'training_names': 'subpillars_1d,specific_needs_groups,demographic_groups',
    # 'training_names': 'sectors,subpillars_2d,subpillars_1d,severity,specific_needs_groups,affected_groups,demographic_groups',
    # 'train_with_all_positive_examples': True,

    # --- run mode and data balancing (dicts are serialised with str()) ---
    'model_mode': 'train',
    'proportions_negative_examples_test': str(proportions_negative_examples_test),
    'proportions_negative_examples_train': str(proportions_negative_examples_train),
    'instance_type': instance_type,

    # 'numbers_augmentation': 'with',
}

# Directory holding the training code, relative to this notebook.
training_source_dir = '../../../scripts/training/selim/multiclass-lightning'

# Keyword arguments for the SageMaker PyTorch estimator, collected in one place.
estimator_settings = dict(
    entry_point='train_mlflow.py',
    source_dir=training_source_dir,
    output_path=str(DEV_BUCKET / 'models/'),
    code_location=str(input_path),
    instance_type=instance_type,
    instance_count=1,
    role=role,
    framework_version='1.8',
    py_version='py36',
    hyperparameters=hyperparameters,
    job_name=job_name,
    # Options left disabled:
    # distribution={"smdistributed": {"dataparallel": {"enabled": False}}},
    # train_instance_count=2,
    # train_instance_type="ml.c4.xlarge",
)

estimator = PyTorch(**estimator_settings)

In [20]:
# Both input channels point at the same location: the folder that received
# train.pickle and val.pickle.
input_location = str(input_path)
fit_arguments = dict(train=input_location, test=input_location)

In [None]:
# Launch the training job under the chosen job name; `fit_arguments` maps each
# input channel name to its data location.
estimator.fit(inputs=fit_arguments, job_name=job_name)

2021-10-25 16:19:26 Starting - Starting the training job...
2021-10-25 16:19:29 Starting - Launching requested ML instancesProfilerReport-1635178763: InProgress
.........
2021-10-25 16:21:27 Starting - Preparing the instances for training.........
2021-10-25 16:23:07 Downloading - Downloading input data...
2021-10-25 16:23:48 Training - Downloading the training image...............
2021-10-25 16:26:29 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-10-25 16:26:30,011 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-10-25 16:26:30,034 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-10-25 16:26:33,060 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-10-25 16:26:33,576 sagemaker-tr

[34mCollecting botocore<1.20.107,>=1.20.106
  Downloading botocore-1.20.106-py2.py3-none-any.whl (7.7 MB)[0m
[34mCollecting aiohttp>=3.3.1
  Downloading aiohttp-3.7.4.post0-cp36-cp36m-manylinux2014_x86_64.whl (1.3 MB)[0m
[34mCollecting aioitertools>=0.5.1
  Downloading aioitertools-0.8.0-py3-none-any.whl (21 kB)[0m
[34mCollecting Mako
  Downloading Mako-1.1.5-py2.py3-none-any.whl (75 kB)[0m
[34mCollecting python-editor>=0.3
  Downloading python_editor-1.0.4-py3-none-any.whl (4.9 kB)[0m
[34mCollecting boto3>=1.16.32
  Downloading boto3-1.19.2-py3-none-any.whl (131 kB)
  Downloading boto3-1.19.1-py3-none-any.whl (131 kB)
  Downloading boto3-1.19.0-py3-none-any.whl (131 kB)
  Downloading boto3-1.18.65-py3-none-any.whl (131 kB)
  Downloading boto3-1.18.64-py3-none-any.whl (131 kB)
  Downloading boto3-1.18.62-py3-none-any.whl (131 kB)[0m
[34m  Downloading boto3-1.18.61-py3-none-any.whl (131 kB)
  Downloading boto3-1.18.60-py3-none-any.whl (131 kB)
  Downloading boto3-1.18.59-py

[34mInstalling collected packages: typing-extensions, six, pyasn1-modules, oauthlib, multidict, cachetools, yarl, smmap, requests-oauthlib, numpy, idna-ssl, google-auth, botocore, async-timeout, wrapt, tqdm, tensorboard-plugin-wit, tensorboard-data-server, sqlalchemy, s3transfer, regex, python-editor, markdown, Mako, grpcio, google-auth-oauthlib, gitdb, fsspec, aioitertools, aiohttp, absl-py, torchmetrics, tokenizers, termcolor, tensorflow-estimator, tensorboard, sqlparse, sacremoses, querystring-parser, pyDeprecate, prometheus-flask-exporter, opt-einsum, keras-preprocessing, huggingface-hub, h5py, gunicorn, gitpython, gast, flatbuffers, docker, databricks-cli, boto3, astunparse, alembic, aiobotocore, transformers, tensorflow, smdebug, scikit-multilearn, scikit-learn, sagemaker, s3fs, pytorch-lightning, nltk, nlpaug, mlflow
  Attempting uninstall: typing-extensions
    Found existing installation: typing-extensions 3.10.0.2
    Uninstalling typing-extensions-3.10.0.2:[0m
[34m      S

[34m[2021-10-25 16:29:11.698 algo-1:85 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2021-10-25 16:29:11.738 algo-1:85 INFO profiler_config_parser.py:102] User has disabled profiler.[0m
[34m[2021-10-25 16:29:11.739 algo-1:85 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.[0m
[34m[2021-10-25 16:29:11.739 algo-1:85 INFO hook.py:201] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.[0m
[34m[2021-10-25 16:29:11.740 algo-1:85 INFO hook.py:255] Saving to /opt/ml/output/tensors[0m
[34m[2021-10-25 16:29:11.740 algo-1:85 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.[0m
[34m[2021-10-25 16:29:11.843 algo-1:85 INFO hook.py:594] name:model.l0.embeddings.word_embeddings.weight count_params:7813632[0m
[34m[2021-10-25 16:29:11.843 algo-1:85 INFO hook.py:594] name:model.l0.embeddings.position_embeddings.weig

[34m#015Validation sanity check: 0it [00:00, ?it/s]#015Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]#015                                                              #015#015Training: 99it [00:00, ?it/s]#015Training:   0%|          | 0/1605 [00:00<?, ?it/s]#015Epoch 0:   0%|          | 0/1605 [00:00<?, ?it/s] #015Epoch 0:   2%|▏         | 30/1605 [00:02<01:45, 14.98it/s]#015Epoch 0:   2%|▏         | 30/1605 [00:02<01:45, 14.97it/s, loss=2.39, v_num=0, val_f1_epoch=0.0746, val_loss_epoch=0.673, train_f1=0.476]#015Epoch 0:   4%|▎         | 60/1605 [00:03<01:40, 15.33it/s, loss=2.39, v_num=0, val_f1_epoch=0.0746, val_loss_epoch=0.673, train_f1=0.476]#015Epoch 0:   4%|▎         | 60/1605 [00:03<01:40, 15.32it/s, loss=2.11, v_num=0, val_f1_epoch=0.0746, val_loss_epoch=0.673, train_f1=0.476]#015Epoch 0:   6%|▌         | 90/1605 [00:05<01:36, 15.65it/s, loss=2.11, v_num=0, val_f1_epoch=0.0746, val_loss_epoch=0.673, train_f1=0.476]#015Epoch 0:   6%|▌         | 90/1605 [00:05<

[34m#015                                                             #033[A#015Epoch 0: 100%|██████████| 1605/1605 [01:48<00:00, 14.78it/s, loss=1.83, v_num=0, val_f1_epoch=0.488, val_loss_epoch=0.238, train_f1=0.476, val_f1_step=0.482, val_loss_step=0.231]#015Validation sanity check: 0it [00:00, ?it/s]#015Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]#015                                                              #015#015Training: 99it [00:00, ?it/s]#015Training:   0%|          | 0/1661 [00:00<?, ?it/s]#015Epoch 0:   0%|          | 0/1661 [00:00<?, ?it/s] #015Epoch 0:   2%|▏         | 30/1661 [00:01<01:44, 15.65it/s]#015Epoch 0:   2%|▏         | 30/1661 [00:01<01:44, 15.64it/s, loss=6.74, v_num=0, val_f1_epoch=0.0514, val_loss_epoch=0.694, train_f1=0.557]#015Epoch 0:   4%|▎         | 60/1661 [00:03<01:40, 15.91it/s, loss=6.74, v_num=0, val_f1_epoch=0.0514, val_loss_epoch=0.694, train_f1=0.557]#015Epoch 0:   4%|▎         | 60/1661 [00:03<01:40, 15.90it/s, loss=3.79, 

[34m#015                                                             #033[A#015Epoch 0: 100%|██████████| 1661/1661 [01:51<00:00, 14.88it/s, loss=2.14, v_num=0, val_f1_epoch=0.489, val_loss_epoch=0.172, train_f1=0.537, val_f1_step=0.490, val_loss_step=0.168]#015Validation sanity check: 0it [00:00, ?it/s]#015Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]#015                                                              #015#015Training: 99it [00:00, ?it/s]#015Training:   0%|          | 0/1677 [00:00<?, ?it/s]#015Epoch 0:   0%|          | 0/1677 [00:00<?, ?it/s] #015Epoch 0:   2%|▏         | 30/1677 [00:02<01:51, 14.75it/s]#015Epoch 0:   2%|▏         | 30/1677 [00:02<01:51, 14.74it/s, loss=1.4, v_num=0, val_f1_epoch=0.107, val_loss_epoch=0.711, train_f1=0.527]#015Epoch 0:   4%|▎         | 60/1677 [00:04<01:47, 14.99it/s, loss=1.4, v_num=0, val_f1_epoch=0.107, val_loss_epoch=0.711, train_f1=0.527]#015Epoch 0:   4%|▎         | 60/1677 [00:04<01:47, 14.99it/s, loss=1.47, v_nu

[34m#015                                                             #033[A#015Epoch 0: 100%|██████████| 1677/1677 [01:52<00:00, 14.90it/s, loss=1.19, v_num=0, val_f1_epoch=0.582, val_loss_epoch=0.354, train_f1=0.665, val_f1_step=0.598, val_loss_step=0.331]#015Validation sanity check: 0it [00:00, ?it/s]#015Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]#015                                                              #015#015Training: 99it [00:00, ?it/s]#015Training:   0%|          | 0/587 [00:00<?, ?it/s]#015Epoch 0:   0%|          | 0/587 [00:00<?, ?it/s] #015Epoch 0:   5%|▌         | 30/587 [00:01<00:34, 16.12it/s]#015Epoch 0:   5%|▌         | 30/587 [00:01<00:34, 16.11it/s, loss=1.97, v_num=0, val_f1_epoch=0.0533, val_loss_epoch=0.657, train_f1=0.728]#015Epoch 0:  10%|█         | 60/587 [00:03<00:32, 16.28it/s, loss=1.97, v_num=0, val_f1_epoch=0.0533, val_loss_epoch=0.657, train_f1=0.728]#015Epoch 0:  10%|█         | 60/587 [00:03<00:32, 16.27it/s, loss=1.74, v_num=

[34m#015                                                           #033[A#015Epoch 0: 100%|██████████| 578/578 [00:42<00:00, 13.50it/s, loss=2.19, v_num=0, val_f1_epoch=0.496, val_loss_epoch=0.0853, train_f1=0.485, val_f1_step=0.495, val_loss_step=0.0981]#015Validation sanity check: 0it [00:00, ?it/s]#015Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]#015                                                              #015#015Training: 99it [00:00, ?it/s]#015Training:   0%|          | 0/179 [00:00<?, ?it/s]#015Epoch 0:   0%|          | 0/179 [00:00<?, ?it/s] #015Epoch 0:  17%|█▋        | 30/179 [00:01<00:09, 16.45it/s]#015Epoch 0:  17%|█▋        | 30/179 [00:01<00:09, 16.44it/s, loss=2.38, v_num=0, val_f1_epoch=0.00962, val_loss_epoch=0.708, train_f1=0.514]#015Epoch 0:  34%|███▎      | 60/179 [00:03<00:07, 16.54it/s, loss=2.38, v_num=0, val_f1_epoch=0.00962, val_loss_epoch=0.708, train_f1=0.514]#015Epoch 0:  34%|███▎      | 60/179 [00:03<00:07, 16.53it/s, loss=2.07, v_num=

[34m#015                                                             #033[A#015Epoch 0: 100%|██████████| 1294/1294 [01:28<00:00, 14.59it/s, loss=1.46, v_num=0, val_f1_epoch=0.706, val_loss_epoch=0.319, train_f1=0.845, val_f1_step=0.731, val_loss_step=0.310]#015Validation sanity check: 0it [00:00, ?it/s]#015Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]#015                                                              #015#015Training: 99it [00:00, ?it/s]#015Training:   0%|          | 0/446 [00:00<?, ?it/s]#015Epoch 0:   0%|          | 0/446 [00:00<?, ?it/s] #015Epoch 0:   7%|▋         | 30/446 [00:01<00:26, 15.88it/s]#015Epoch 0:   7%|▋         | 30/446 [00:01<00:26, 15.87it/s, loss=1.08, v_num=0, val_f1_epoch=0.0722, val_loss_epoch=0.711, train_f1=0.622]#015Epoch 0:  13%|█▎        | 60/446 [00:03<00:24, 16.01it/s, loss=1.08, v_num=0, val_f1_epoch=0.0722, val_loss_epoch=0.711, train_f1=0.622]#015Epoch 0:  13%|█▎        | 60/446 [00:03<00:24, 16.01it/s, loss=1.16, v_num=


2021-10-25 16:52:36 Uploading - Uploading generated training model[34m#015                                                           #033[A#015Epoch 0: 100%|██████████| 726/726 [01:00<00:00, 12.06it/s, loss=2.32, v_num=0, val_f1_epoch=0.613, val_loss_epoch=0.380, train_f1=0.511, val_f1_step=0.589, val_loss_step=0.375]#015Epoch 0: 100%|██████████| 1677/1677 [13:29<00:00,  2.07it/s, loss=1.19, v_num=0, val_f1_epoch=0.582, val_loss_epoch=0.354, train_f1=0.665, val_f1_step=0.598, val_loss_step=0.331][0m
[34m#015Epoch 0: 100%|██████████| 726/726 [01:26<00:00,  8.35it/s, loss=2.32, v_num=0, val_f1_epoch=0.613, val_loss_epoch=0.380, train_f1=0.511, val_f1_step=0.589, val_loss_step=0.375][0m
[34m#015Epoch 0: 100%|██████████| 1294/1294 [05:25<00:00,  3.98it/s, loss=1.46, v_num=0, val_f1_epoch=0.706, val_loss_epoch=0.319, train_f1=0.845, val_f1_step=0.731, val_loss_step=0.310][0m
[34m#015Epoch 0: 100%|██████████| 179/179 [07:11<00:00,  2.41s/it, loss=2, v_num=0, val_f1_epoch=0.496, val_l

[34m#015 20%|██        | 22/110 [00:00<00:03, 22.75it/s]#033[A[0m
[34m#015 23%|██▎       | 25/110 [00:01<00:03, 24.14it/s]#033[A[0m
[34m#015 25%|██▌       | 28/110 [00:01<00:03, 25.16it/s]#033[A[0m
[34m#015 28%|██▊       | 31/110 [00:01<00:03, 26.07it/s]#033[A[0m
[34m#015 31%|███       | 34/110 [00:01<00:02, 26.48it/s]#033[A[0m
[34m#015 34%|███▎      | 37/110 [00:01<00:02, 26.74it/s]#033[A[0m
[34m#015 36%|███▋      | 40/110 [00:01<00:02, 26.63it/s]#033[A[0m
[34m#015 39%|███▉      | 43/110 [00:01<00:02, 26.46it/s]#033[A[0m
[34m#015 42%|████▏     | 46/110 [00:01<00:02, 26.67it/s]#033[A[0m
[34m#015 45%|████▍     | 49/110 [00:01<00:02, 26.56it/s]#033[A[0m
[34m#015 47%|████▋     | 52/110 [00:02<00:02, 26.80it/s]#033[A[0m
[34m#015 50%|█████     | 55/110 [00:02<00:02, 27.30it/s]#033[A[0m
[34m#015 53%|█████▎    | 58/110 [00:02<00:01, 27.38it/s]#033[A[0m
[34m#015 55%|█████▌    | 61/110 [00:02<00:01, 27.28it/s]#033[A[0m
[34m#015 58%|█████▊    | 64/110 [00:02<00:01, 2


2021-10-25 16:55:20 Completed - Training job completed


In [None]:
# This assertion always fails, because 1 == 2 is False.
# NOTE(review): presumably a deliberate stop so that running every cell in order
# halts here, before the deployment/testing cells below — confirm intent.
assert (1==2)


## Code used for deploying and testing models:

In [None]:
import mlflow
import torch

In [None]:
# S3 location of the logged PyTorch model artifact.
model_uri = 's3://deep-mlflow-artifact/16/21a5ece6091b4ddf8b223e78159ce1c7/artifacts/pytorch_model_all'

# map_location targets the CPU device (presumably forwarded to torch.load so the
# weights can be loaded without a GPU — verify against the MLflow docs).
cpu_device = torch.device('cpu')
model = mlflow.pytorch.load_model(model_uri, map_location=cpu_device)

In [None]:
# Run the loaded model on the feedback excerpts; the result is shown as the cell output.
excerpts = test_data['excerpt']
model.predict(excerpts)

In [None]:
from mlflow import sagemaker

In [None]:
sagemaker.deploy(
    'testcpu-pytorch-trained-gpu',
    's3://deep-mlflow-artifact/16/21a5ece6091b4ddf8b223e78159ce1c7/artifacts/pytorch_model_all',
    execution_role_arn=SAGEMAKER_ROLE_ARN,
    image_url="961104659532.dkr.ecr.us-east-1.amazonaws.com/mlflow-pyfunc:latest",
    region_name="us-east-1",
    instance_type="ml.c4.xlarge",
    synchronous=False,
    archive=True,
)