# Import

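These packages are required if you launch this notebook from a SageMaker instance. The install commands in the cell below are wrapped in a string, so they do not run as-is; move them out of the string to actually install the packages.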

In [1]:
"""!pip install mlflow
!pip install pytorch-lightning
!pip install transformers
!pip install tqdm
!pip install sagemaker
!pip install s3fs
!pip install smdebug"""

'!pip install mlflow\n!pip install pytorch-lightning\n!pip install transformers\n!pip install tqdm\n!pip install sagemaker\n!pip install s3fs\n!pip install smdebug'

In [2]:
import sys
# Extend the module search path three directories up from the notebook --
# presumably the repository root, so that the project-local `deep` package
# imported further down can be resolved (TODO confirm the layout).
sys.path.append('../../../')

import os
import sys
import logging
import argparse
from pathlib import Path
from ast import literal_eval
from collections import Counter
from typing import Any, Dict, Optional

In [3]:
from tqdm.auto import tqdm

import torchmetrics
from torchmetrics.functional import accuracy, f1, auroc

import sagemaker
from sagemaker import get_execution_role
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.core.decorators import auto_move_data
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import MLFlowLogger


import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix


import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc

import transformers
from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoTokenizer,
)
from transformers.optimization import (
    Adafactor,
    get_linear_schedule_with_warmup,
)

Local constants for the data, the MLflow server, paths, etc. Use them instead of hard-coding these values.

In [4]:
# Project-local wildcard imports. This notebook later uses the following names
# without defining them, so they are presumably provided here: DEV_BUCKET,
# SAGEMAKER_ROLE, SAGEMAKER_ROLE_ARN, MLFLOW_SERVER and formatted_time.
# NOTE(review): which of the two modules exports which name is not visible
# from this notebook -- confirm before replacing the wildcards with explicit imports.
from deep.constants import *
from deep.utils import *

  and should_run_async(code)


In [5]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell is executed, so edits to local .py files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Data

You can use any data you want. We recommend loading it into a `pandas` DataFrame.

In [6]:
# Location of the augmented "specific needs groups" dataset, relative to this notebook.
DATA_PATH = os.path.join('..', '..', '..', "data", "secondary_tags", "augmented_specific_needs_groups.csv")

# Fixed seed so that the train/validation split is identical on every
# "Restart & Run All"; without it each run trains and validates on different rows.
RANDOM_STATE = 42

# The first CSV column is used as the index. Only '\n' is treated as a line
# terminator when parsing the file.
df = pd.read_csv(DATA_PATH, index_col=0, lineterminator='\n')

# Hold out 20% of the rows for validation.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=RANDOM_STATE)

In [7]:
# Preview the first two rows of the training split as a quick sanity check.
train_df.head(2)

Unnamed: 0,entry_id,excerpt,specific_needs_groups
6048,323616,(Sahel) Para este mes se han registrado 54 nue...,"['Female Head of Household', 'Chronically Ill'..."
2902,287266,Effets des inondations Sur les moyens de subsi...,['Female Head of Household']


## Sagemaker Prep

### Session

Configure SageMaker

In [8]:
# Short local aliases for the project-level AWS role and MLflow settings.
role, role_arn = SAGEMAKER_ROLE, SAGEMAKER_ROLE_ARN
tracking_uri = MLFLOW_SERVER

# SageMaker session whose default bucket is the project's development bucket.
sess = sagemaker.Session(
    default_bucket=DEV_BUCKET.name,
)

  and should_run_async(code)


### Bucket upload

You need to upload data to an S3 bucket. 




In [9]:
sample = False  # Set to True to work on a small random subset (faster computations).

if sample:
    # Cap the sample size at the number of available rows: DataFrame.sample
    # raises ValueError when n is larger than the frame (sampling without
    # replacement), which can happen for the smaller validation split.
    train_df = train_df.sample(n=min(1000, len(train_df)))
    val_df = val_df.sample(n=min(1000, len(val_df)))

job_name = f"pytorch-{formatted_time()}-subpillars-model-test-mlflow"  # cannot be changed
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name  # Do not change this

# Both splits are stored side by side under the job-specific input location.
train_path = str(input_path / 'train.pickle')
val_path = str(input_path / 'val.pickle')


train_df.to_pickle(train_path, protocol=4)  # protocol 4 is necessary, since SageMaker uses python 3.6
val_df.to_pickle(val_path, protocol=4)

### Estimator Definition

In [10]:
# GPU instance types that can be used for the training job.
instances = ['ml.p2.xlarge', 'ml.p3.2xlarge']

The hyperparameters are passed as command line arguments to the training script. 

You can add/change them as you like. It's important to keep the `tracking_uri` and the `experiment_name` which are used by MLFlow.

The class `PyTorch` is part of the `SageMaker` python API. The parameters are important and you should probably not change most of them. The ones you may want to change are:

- `instance_type`, specify the instance you want
- `source_dir`, specify your script directory. Try to use global variables as much as possible

In [11]:
from sagemaker.pytorch import PyTorch

# The model and its tokenizer are loaded from the same pretrained checkpoint.
pretrained_checkpoint = 'microsoft/xtremedistil-l6-h384-uncased'

# Passed to the training script as command line arguments.
# `tracking_uri` and `experiment_name` are the entries used by MLflow.
hyperparameters = dict(
    tracking_uri=MLFLOW_SERVER,
    max_len=512,
    epochs=5,
    model_name=pretrained_checkpoint,
    tokenizer_name=pretrained_checkpoint,
    dropout_rate=0.3,
    pred_threshold=0.4,
    output_length=384,
    learning_rate=7e-5,
    experiment_name="pl-specific-needs-groups",
    training_column='specific_needs_groups',
    multiclass_bool=True,
    train_with_whole_dataset=True,
)

# SageMaker PyTorch estimator describing the training job.
estimator = PyTorch(
    # Training code: script name and the directory that contains it.
    entry_point='train_mlflow.py',
    source_dir='../../../scripts/training/selim/multiclass-lightning',
    # Bucket locations for the model output and the uploaded code.
    output_path=str(DEV_BUCKET/'models/'),
    code_location=str(input_path),
    # Compute resources and execution role.
    instance_type='ml.p2.xlarge',
    instance_count=1,
    role=role,
    # Container runtime.
    framework_version='1.8',
    py_version='py36',
    # Script arguments and job name.
    hyperparameters=hyperparameters,
    job_name=job_name,
)

In [12]:
# Input channels for the training job. 'train' and 'test' both point to the
# same location, `input_path`, where train.pickle and val.pickle were written.
fit_arguments = dict.fromkeys(('train', 'test'), str(input_path))

In [13]:
# Fit the estimator: start the SageMaker training job under `job_name`, using
# the input channels defined in `fit_arguments`. The job's log output is
# printed below the cell while it runs.

estimator.fit(fit_arguments, job_name=job_name)

2021-08-25 10:59:59 Starting - Starting the training job...
2021-08-25 11:00:26 Starting - Launching requested ML instancesProfilerReport-1629889196: InProgress
...............
2021-08-25 11:03:07 Starting - Preparing the instances for training.........
2021-08-25 11:05:08 Downloading - Downloading input data
2021-08-25 11:05:08 Training - Downloading the training image........................[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-08-25 11:09:30,130 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-08-25 11:09:30,155 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-08-25 11:09:31,593 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-08-25 11:09:32,578 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[


2021-08-25 11:10:09 Training - Training image download completed. Training in progress.[34mCollecting aiohttp>=3.3.1
  Downloading aiohttp-3.7.4.post0-cp36-cp36m-manylinux2014_x86_64.whl (1.3 MB)[0m
[34mCollecting aioitertools>=0.5.1
  Downloading aioitertools-0.8.0-py3-none-any.whl (21 kB)[0m
[34mCollecting async-timeout<4.0,>=3.0
  Downloading async_timeout-3.0.1-py3-none-any.whl (8.2 kB)[0m
[34mCollecting idna-ssl>=1.0
  Downloading idna-ssl-1.1.0.tar.gz (3.4 kB)[0m
[34mCollecting multidict<7.0,>=4.5
  Downloading multidict-5.1.0-cp36-cp36m-manylinux2014_x86_64.whl (141 kB)[0m
[34mCollecting yarl<2.0,>=1.0
  Downloading yarl-1.6.3-cp36-cp36m-manylinux2014_x86_64.whl (293 kB)[0m
[34mCollecting Mako
  Downloading Mako-1.1.5-py2.py3-none-any.whl (75 kB)[0m
[34mCollecting python-editor>=0.3
  Downloading python_editor-1.0.4-py3-none-any.whl (4.9 kB)[0m
[34mCollecting boto3>=1.16.32
  Downloading boto3-1.18.28-py3-none-any.whl (131 kB)
  Downloading boto3-1.18.27-py3-no

[34mInstalling collected packages: six, typing-extensions, pyasn1-modules, oauthlib, multidict, cachetools, yarl, smmap, requests-oauthlib, numpy, itsdangerous, idna-ssl, google-auth, botocore, async-timeout, wrapt, tqdm, tensorboard-plugin-wit, tensorboard-data-server, sqlalchemy, regex, python-editor, prometheus-client, markdown, Mako, grpcio, google-auth-oauthlib, gitdb, Flask, filelock, aioitertools, aiohttp, absl-py, torchmetrics, tokenizers, termcolor, tensorflow-estimator, tensorboard, sqlparse, sacremoses, querystring-parser, pyDeprecate, prometheus-flask-exporter, opt-einsum, keras-preprocessing, huggingface-hub, h5py, gunicorn, gitpython, gast, flatbuffers, entrypoints, docker, databricks-cli, boto3, astunparse, alembic, aiobotocore, transformers, tensorflow, smdebug, scikit-learn, sagemaker, s3fs, pytorch-lightning, nltk, nlpaug, mlflow
  Attempting uninstall: six
    Found existing installation: six 1.16.0
    Uninstalling six-1.16.0:[0m
[34m      Successfully uninstalle

[34m#015Validation sanity check: 0it [00:00, ?it/s]#015Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s][2021-08-25 11:12:12.082 algo-1:92 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2021-08-25 11:12:12.132 algo-1:92 INFO profiler_config_parser.py:102] User has disabled profiler.[0m
[34m[2021-08-25 11:12:12.133 algo-1:92 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.[0m
[34m[2021-08-25 11:12:12.133 algo-1:92 INFO hook.py:201] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.[0m
[34m[2021-08-25 11:12:12.134 algo-1:92 INFO hook.py:255] Saving to /opt/ml/output/tensors[0m
[34m[2021-08-25 11:12:12.134 algo-1:92 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.[0m
[34m[2021-08-25 11:12:12.343 algo-1:92 INFO hook.py:594] name:model.l0.embeddings.word_embeddings.weight count_params:117204

[34m#015                                                              #015#015Training: 0it [00:00, ?it/s]#015Training:   0%|          | 0/737 [00:00<?, ?it/s]#015Epoch 0:   0%|          | 0/737 [00:00<?, ?it/s] #015Epoch 0:   4%|▍         | 30/737 [00:09<03:36,  3.27it/s]#015Epoch 0:   4%|▍         | 30/737 [00:09<03:36,  3.27it/s, loss=2, v_num=0, val_f1_epoch=0.0797, val_loss_epoch=0.713, train_f1=0.524]#015Epoch 0:   8%|▊         | 60/737 [00:17<03:22,  3.34it/s, loss=2, v_num=0, val_f1_epoch=0.0797, val_loss_epoch=0.713, train_f1=0.524]#015Epoch 0:   8%|▊         | 60/737 [00:17<03:22,  3.34it/s, loss=0.996, v_num=0, val_f1_epoch=0.0797, val_loss_epoch=0.713, train_f1=0.479]#015Epoch 0:  12%|█▏        | 90/737 [00:26<03:12,  3.37it/s, loss=0.996, v_num=0, val_f1_epoch=0.0797, val_loss_epoch=0.713, train_f1=0.479]#015Epoch 0:  12%|█▏        | 90/737 [00:26<03:12,  3.37it/s, loss=0.942, v_num=0, val_f1_epoch=0.0797, val_loss_epoch=0.713, train_f1=0.480]#015Epoch 0:  16%|█▋        |

[34m#015Validating:   0%|          | 0/67 [00:00<?, ?it/s]#033[A[0m
[34m#015Validating:  45%|████▍     | 30/67 [00:06<00:07,  4.84it/s]#033[A#015Epoch 1:  98%|█████████▊| 720/737 [03:25<00:04,  3.51it/s, loss=0.497, v_num=0, val_f1_epoch=0.805, val_loss_epoch=0.155, train_f1=0.850, val_f1_step=0.766, val_loss_step=0.171][0m
[34m#015Validating:  90%|████████▉ | 60/67 [00:12<00:01,  4.88it/s]#033[A[0m
[34m#015Validating: 100%|██████████| 67/67 [00:13<00:00,  4.92it/s]#033[A#015Epoch 1: 100%|██████████| 737/737 [03:32<00:00,  3.46it/s, loss=0.481, v_num=0, val_f1_epoch=0.858, val_loss_epoch=0.114, train_f1=0.789, val_f1_step=0.815, val_loss_step=0.136][0m
[34m#015                                                           #033[A#015Epoch 1:   0%|          | 0/737 [00:00<?, ?it/s, loss=0.481, v_num=0, val_f1_epoch=0.858, val_loss_epoch=0.114, train_f1=0.789, val_f1_step=0.815, val_loss_step=0.136]          #015Epoch 2:   0%|          | 0/737 [00:00<?, ?it/s, loss=0.481, v_num=0, va

[34m#015Validating:   0%|          | 0/67 [00:00<?, ?it/s]#033[A[0m
[34m#015Validating:  45%|████▍     | 30/67 [00:06<00:07,  4.78it/s]#033[A#015Epoch 2:  98%|█████████▊| 720/737 [03:25<00:04,  3.50it/s, loss=0.409, v_num=0, val_f1_epoch=0.858, val_loss_epoch=0.114, train_f1=0.807, val_f1_step=0.815, val_loss_step=0.136][0m
[34m#015Validating:  90%|████████▉ | 60/67 [00:12<00:01,  4.84it/s]#033[A[0m
[34m#015Validating: 100%|██████████| 67/67 [00:13<00:00,  4.89it/s]#033[A#015Epoch 2: 100%|██████████| 737/737 [03:33<00:00,  3.46it/s, loss=0.362, v_num=0, val_f1_epoch=0.880, val_loss_epoch=0.0981, train_f1=0.899, val_f1_step=0.820, val_loss_step=0.127][0m
[34m#015                                                           #033[A#015Epoch 2:   0%|          | 0/737 [00:00<?, ?it/s, loss=0.362, v_num=0, val_f1_epoch=0.880, val_loss_epoch=0.0981, train_f1=0.899, val_f1_step=0.820, val_loss_step=0.127]          #015Epoch 3:   0%|          | 0/737 [00:00<?, ?it/s, loss=0.362, v_num=0, 

[34m#015Validating:   0%|          | 0/67 [00:00<?, ?it/s]#033[A[0m
[34m#015Validating:  45%|████▍     | 30/67 [00:06<00:07,  4.86it/s]#033[A#015Epoch 3:  98%|█████████▊| 720/737 [03:25<00:04,  3.51it/s, loss=0.298, v_num=0, val_f1_epoch=0.880, val_loss_epoch=0.0981, train_f1=0.917, val_f1_step=0.820, val_loss_step=0.127][0m
[34m#015Validating:  90%|████████▉ | 60/67 [00:12<00:01,  4.90it/s]#033[A[0m
[34m#015Validating: 100%|██████████| 67/67 [00:13<00:00,  4.94it/s]#033[A#015Epoch 3: 100%|██████████| 737/737 [03:32<00:00,  3.46it/s, loss=0.316, v_num=0, val_f1_epoch=0.896, val_loss_epoch=0.0884, train_f1=0.784, val_f1_step=0.845, val_loss_step=0.116][0m
[34m#015                                                           #033[A#015Epoch 3:   0%|          | 0/737 [00:00<?, ?it/s, loss=0.316, v_num=0, val_f1_epoch=0.896, val_loss_epoch=0.0884, train_f1=0.784, val_f1_step=0.845, val_loss_step=0.116]          #015Epoch 4:   0%|          | 0/737 [00:00<?, ?it/s, loss=0.316, v_num=0,

[34m#015Validating:   0%|          | 0/67 [00:00<?, ?it/s]#033[A[0m
[34m#015Validating:  45%|████▍     | 30/67 [00:06<00:07,  4.84it/s]#033[A#015Epoch 4:  98%|█████████▊| 720/737 [03:25<00:04,  3.51it/s, loss=0.343, v_num=0, val_f1_epoch=0.896, val_loss_epoch=0.0884, train_f1=0.847, val_f1_step=0.845, val_loss_step=0.116][0m
[34m#015Validating:  90%|████████▉ | 60/67 [00:12<00:01,  4.89it/s]#033[A[0m
[34m#015Validating: 100%|██████████| 67/67 [00:13<00:00,  4.93it/s]#033[A#015Epoch 4: 100%|██████████| 737/737 [03:32<00:00,  3.46it/s, loss=0.277, v_num=0, val_f1_epoch=0.900, val_loss_epoch=0.0843, train_f1=0.845, val_f1_step=0.849, val_loss_step=0.113][0m
[34m#015                                                           #033[A#015Epoch 4: 100%|██████████| 737/737 [03:34<00:00,  3.44it/s, loss=0.277, v_num=0, val_f1_epoch=0.900, val_loss_epoch=0.0843, train_f1=0.845, val_f1_step=0.849, val_loss_step=0.113][0m
[34m(4049, 15)[0m
[34m(4049, 15)[0m
[34m2021-08-25 11:11:17.652


2021-08-25 11:30:55 Uploading - Uploading generated training model
2021-08-25 11:31:15 Completed - Training job completed
ProfilerReport-1629889196: IssuesFound
Training seconds: 1585
Billable seconds: 1585
