# Import

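These packages are only required if you launch this notebook from a SageMaker instance. The install commands below are wrapped in a string literal, so running the cell installs nothing; move them out of the string to execute them.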

In [None]:
"""!pip install mlflow
!pip install pytorch-lightning
!pip install transformers
!pip install tqdm
!pip install sagemaker

!pip install s3fs
!pip install smdebug"""

In [1]:
import sys
# Add the directory three levels up to the import path.
# Presumably this is where the `deep` package imported below lives — confirm.
sys.path.append('../../../')

import os
import sys
import logging
import argparse
from pathlib import Path
from typing import Any, Dict, Optional

In [2]:
from tqdm.auto import tqdm

import torchmetrics
from torchmetrics.functional import accuracy, f1, auroc

import sagemaker
from sagemaker import get_execution_role
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.core.decorators import auto_move_data
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import MLFlowLogger


import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc

import pandas as pd

Local constants (data, MLflow server, paths, etc.) are imported below: use them instead of hard-coding values.

In [3]:
from deep.constants import *
from deep.utils import *

In [4]:
%load_ext autoreload
%autoreload 2

# Data

You can use any data you want. We recommend loading it as a `pandas` DataFrame.

In [5]:
# Folder holding the data files, expressed relative to the working directory.
DATA_PATH = os.path.join('..', '..', '..', "data", "frameworks_data", 'data_v0.7.1')

train_val_csv = os.path.join(DATA_PATH, 'new_columns_train_val.csv')
test_csv = os.path.join(DATA_PATH, 'new_columns_test_v0.7.1.csv')

# Duplicate rows are dropped from the train/validation data only;
# the test data is loaded as-is.
tot_df = pd.read_csv(train_val_csv).drop_duplicates()
test_df = pd.read_csv(test_csv)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [18]:
# Per-task thresholds, forwarded to the training script as the `min_results`
# hyperparameter (presumably the minimum acceptable score per task — confirm
# in the training script). Key order is kept stable because the dict is
# serialised with `str()` when the hyperparameters are built.
minimal_results = {
    # tag-presence tasks
    'present_prim_tags': 0.8,
    'present_sec_tags': 0.85,
    # sectors / pillars / sub-pillars
    'sectors': 0.89,
    'pillars_1d': 0.85,
    'pillars_2d': 0.81,
    'subpillars_2d_part1': 0.8,
    'subpillars_2d_part2': 0.72,
    'subpillars_1d_part1': 0.89,
    'subpillars_1d_part2': 0.85,
    'subpillars_1d_part3': 0.84,
    # demographic and affected-group tasks
    'gender': 0.84,
    'age': 0.84,
    'specific_needs_groups': 0.79,
    'affected_groups_levels_2_3': 0.99,
    'gender_snorkel': 0.87,
}

In [None]:
# Primary-tags column selection, with the currently unused targets left commented out.
# As written it evaluates to the same list as the next `columns` cell.
columns = ['excerpt', 'entry_id',
           'present_prim_tags',
           #'sectors',
           'pillars_2d',
           #'pillars_1d',
           'subpillars_2d_part1', 
           'subpillars_2d_part2'
           #'subpillars_1d_part1',
           #'subpillars_1d_part2',
           #'subpillars_1d_part3'
        ]

In [19]:
# Primary-tags selection: the first two columns identify the excerpt, the
# remaining ones (`columns[2:]`) are passed to the training job as `training_names`.
columns = ['excerpt', 'entry_id'] + [
    'present_prim_tags',
    'pillars_2d',
    'subpillars_2d_part1',
    'subpillars_2d_part2',
]

In [None]:
# Secondary-tags selection: identifier columns followed by the target columns.
# NOTE: executing this cell overrides any `columns` list defined earlier, so run
# only the `columns` cell that matches the experiment being launched.
columns = ['excerpt', 'entry_id'] + [
    'present_sec_tags',
    'gender',
    'age',
    'specific_needs_groups',
    'affected_groups_levels_2_3',
    'gender_snorkel',
]

In [None]:
# Secondary-tags selection without `affected_groups_levels_2_3`.
# NOTE: executing this cell overrides any `columns` list defined earlier, so run
# only the `columns` cell that matches the experiment being launched.
columns = ['excerpt', 'entry_id'] + [
    'present_sec_tags',
    'gender',
    'age',
    'specific_needs_groups',
    'gender_snorkel',
]

In [8]:

# Restrict both frames to the selected columns (identifiers + targets).
tot_df, test_df = tot_df[columns], test_df[columns]

In [9]:
# Preview the first rows of the column-filtered train/validation frame.
tot_df.head()

Unnamed: 0,excerpt,entry_id,present_prim_tags,pillars_2d,subpillars_2d_part1,subpillars_2d_part2
0,"After past, partially implemented attempts, th...",489435,"['sectors', 'subpillars_2d', 'subpillars_1d']",['Capacities & Response'],['Capacities & Response->International Response'],['Capacities & Response->National Response']
1,"[10th November, NW Syria] Now with the lockdow...",194719,['subpillars_1d'],[],[],[]
2,El Salvador hace frente a una combinación de c...,186152,"['sectors', 'subpillars_2d']",['Humanitarian Conditions'],['Humanitarian Conditions->Living Standards'],[]
3,Extreme poverty and the government’s fiscal li...,489431,"['sectors', 'subpillars_2d', 'subpillars_1d']",['Humanitarian Conditions'],['Humanitarian Conditions->Living Standards'],[]
4,Las personas que habitan en cantones donde no ...,186986,['sectors'],[],[],[]


## Sagemaker Prep

### Session

Configure SageMaker

In [10]:
# SageMaker session whose default bucket is the project's development bucket.
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)

# Short aliases for the project constants (`role` is handed to the estimator below).
role, role_arn = SAGEMAKER_ROLE, SAGEMAKER_ROLE_ARN
tracking_uri = MLFLOW_SERVER

In [11]:
# Echo the role constant to check which execution role is configured.
SAGEMAKER_ROLE

'AmazonSageMaker-ExecutionRole-20210519T102514'

### Bucket upload

You need to upload data to an S3 bucket. 




In [12]:
# Echo the MLflow server URL that is passed to the training job as `tracking_uri`.
MLFLOW_SERVER

'http://mlflow-deep-387470f3-1883319727.us-east-1.elb.amazonaws.com/'

In [13]:
# Echo the S3 bucket under which the training inputs and model outputs are stored.
DEV_BUCKET

S3Path('s3://sagemaker-deep-experiments-dev')

In [14]:
sample = True  # To make the computations faster, sample = True.
SAMPLE_SIZE = 20000  # number of training rows kept when sampling
SAMPLE_SEED = 42  # fixed seed so that the sampled subset is reproducible across runs

if sample:
    # Cap the size at the number of available rows: `DataFrame.sample` raises a
    # ValueError when asked for more rows than exist (sampling without replacement).
    tot_df = tot_df.sample(n=min(SAMPLE_SIZE, len(tot_df)), random_state=SAMPLE_SEED)

job_name = f"pytorch-{formatted_time()}-all-models"  # change it as you prefer
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name  # Do not change this

# Both files live under the job-specific input prefix.
train_path = str(input_path / 'train.pickle')
val_path = str(input_path / 'val.pickle')

# protocol 4 is necessary, since SageMaker uses python 3.6
tot_df.to_pickle(train_path, protocol=4)
# The test frame is uploaded as `val.pickle`.
test_df.to_pickle(val_path, protocol=4)

In [15]:
# Echo the generated job name.
job_name

'pytorch-2021-11-29-17-29-28-211-all-models'

### Estimator Definition

In [16]:
# GPU instance types
instances = ['ml.p2.xlarge', 'ml.p3.2xlarge']

The hyperparameters are passed as command line arguments to the training script. 

You can add or change them as you like, but keep `tracking_uri` and `experiment_name`, which are used by MLflow.

The class `PyTorch` is part of the `SageMaker` python API. The parameters are important and you should probably not change most of them. The ones you may want to change are:

- `instance_type`: the instance type you want to train on
- `source_dir`: the directory containing your training script. Use global variables (constants) as much as possible

In [20]:
from sagemaker.pytorch import PyTorch

# Instance type the job runs on; it is also forwarded to the training script.
instance_type = 'ml.p3.2xlarge'

# Passed to the training script as command-line arguments.
hyperparameters = {
    # MLflow tracking
    'tracking_uri': MLFLOW_SERVER,
    'experiment_name': "pl-all-models-experiments",
    # model / tokenizer settings
    'max_len': 512,
    'epochs': 1,
    'model_name': 'microsoft/xtremedistil-l6-h256-uncased',
    'tokenizer_name': 'microsoft/xtremedistil-l6-h256-uncased',
    'output_length': 256,
    # every selected column after the two identifier columns, comma-separated
    'training_names': ','.join(columns[2:]),
    "instance_type": instance_type,
    'beta_f1': 0.5,
    'nb_repetitions': 1,
    'run_name': 'models_primary_tags',
    # the thresholds dict is sent as its Python `str()` representation
    'min_results': str(minimal_results),
}

# NOTE(review): `job_name` is not among the documented estimator constructor
# arguments (`base_job_name` is) — confirm it has any effect here. The explicit
# job name is also passed to `estimator.fit`.
estimator = PyTorch(
    # training code
    entry_point='train_mlflow.py',
    source_dir='../../../scripts/training/nicolo/multiclass-lightning',
    code_location=str(input_path),
    # S3 location given to the estimator as `output_path`
    output_path=str(DEV_BUCKET / 'models/'),
    # compute
    instance_type=instance_type,
    instance_count=1,
    role=role,
    # container runtime
    framework_version="1.8",
    py_version="py36",
    hyperparameters=hyperparameters,
    job_name=job_name,
)

In [21]:
# Both input channels point at the same S3 prefix, which holds
# `train.pickle` and `val.pickle`.
fit_arguments = {channel: str(input_path) for channel in ('train', 'test')}

In [None]:
# Launch the training job under the explicit job name; the job's logs are
# streamed into the cell output.
estimator.fit(inputs=fit_arguments, job_name=job_name)

2021-11-29 16:29:59 Starting - Starting the training job...
2021-11-29 16:30:25 Starting - Launching requested ML instancesProfilerReport-1638203396: InProgress
......
2021-11-29 16:31:34 Starting - Preparing the instances for training............
2021-11-29 16:33:33 Downloading - Downloading input data
2021-11-29 16:33:33 Training - Downloading the training image.....................
2021-11-29 16:37:24 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-11-29 16:37:25,491 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-11-29 16:37:25,525 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-11-29 16:37:31,756 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-11-29 16:37:32,225 sagemaker

[34m  Downloading boto3-1.20.9-py3-none-any.whl (131 kB)
  Downloading boto3-1.20.8-py3-none-any.whl (131 kB)
  Downloading boto3-1.20.7-py3-none-any.whl (131 kB)
  Downloading boto3-1.20.6-py3-none-any.whl (131 kB)
  Downloading boto3-1.20.5-py3-none-any.whl (131 kB)
  Downloading boto3-1.20.4-py3-none-any.whl (131 kB)
  Downloading boto3-1.20.3-py3-none-any.whl (131 kB)[0m
[34m  Downloading boto3-1.20.2-py3-none-any.whl (131 kB)
  Downloading boto3-1.20.1-py3-none-any.whl (131 kB)
  Downloading boto3-1.20.0-py3-none-any.whl (131 kB)
  Downloading boto3-1.19.12-py3-none-any.whl (131 kB)
  Downloading boto3-1.19.11-py3-none-any.whl (131 kB)
  Downloading boto3-1.19.10-py3-none-any.whl (131 kB)
  Downloading boto3-1.19.9-py3-none-any.whl (131 kB)[0m
[34m  Downloading boto3-1.19.8-py3-none-any.whl (131 kB)[0m
[34mCollecting fsspec[http]!=2021.06.0,>=2021.05.0
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
  Downloading fsspec-2021.11.0-py3-none-any.whl (132 kB)
  Downloa

[34m  Building wheel for wrapt (setup.py): finished with status 'done'
  Created wheel for wrapt: filename=wrapt-1.12.1-cp36-cp36m-linux_x86_64.whl size=69750 sha256=f1bdf749fb74b12d3668261c1fa36981bafaee2d37215e4d82f8683567806c20
  Stored in directory: /root/.cache/pip/wheels/32/42/7f/23cae9ff6ef66798d00dc5d659088e57dbba01566f6c60db63
  Building wheel for idna-ssl (setup.py): started
  Building wheel for idna-ssl (setup.py): finished with status 'done'
  Created wheel for idna-ssl: filename=idna_ssl-1.1.0-py3-none-any.whl size=3160 sha256=306adde6b91d03a4651485ec4e8388c3ea17bc16b3b32d6ee6d971d2e3937f68
  Stored in directory: /root/.cache/pip/wheels/6a/f5/9c/f8331a854f7a8739cf0e74c13854e4dd7b1af11b04fe1dde13[0m
[34mSuccessfully built sagemaker aiobotocore alembic databricks-cli termcolor wrapt idna-ssl[0m
[34mInstalling collected packages: typing-extensions, six, pyasn1-modules, oauthlib, multidict, frozenlist, cachetools, yarl, smmap, requests-oauthlib, numpy, idna-ssl, google-au

[34m[2021-11-29 16:38:57.858 algo-1:80 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2021-11-29 16:38:57.894 algo-1:80 INFO profiler_config_parser.py:102] User has disabled profiler.[0m
[34m[2021-11-29 16:38:57.895 algo-1:80 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.[0m
[34m[2021-11-29 16:38:57.896 algo-1:80 INFO hook.py:201] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.[0m
[34m[2021-11-29 16:38:57.896 algo-1:80 INFO hook.py:255] Saving to /opt/ml/output/tensors[0m
[34m[2021-11-29 16:38:57.896 algo-1:80 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.[0m
[34m[2021-11-29 16:38:58.098 algo-1:80 INFO hook.py:594] name:model.l0.embeddings.word_embeddings.weight count_params:7813632[0m
[34m[2021-11-29 16:38:58.099 algo-1:80 INFO hook.py:594] name:model.l0.embeddings.position_embeddings.weig

[34m#015Validation sanity check: 0it [00:00, ?it/s]#015Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]#015                                                              #015#015Training: 88it [00:00, ?it/s]#015Training:   0%|          | 0/595 [00:00<?, ?it/s]#015Epoch 0:   0%|          | 0/595 [00:00<?, ?it/s] #015Epoch 0:   5%|▌         | 30/595 [00:03<01:07,  8.43it/s]#015Epoch 0:   5%|▌         | 30/595 [00:03<01:07,  8.43it/s, loss=0.0229, v_num=0, val_loss_epoch=0.0433, train_loss=0.0219]#015Epoch 0:  10%|█         | 60/595 [00:06<01:02,  8.59it/s, loss=0.0229, v_num=0, val_loss_epoch=0.0433, train_loss=0.0219]#015Epoch 0:  10%|█         | 60/595 [00:06<01:02,  8.59it/s, loss=0.0205, v_num=0, val_loss_epoch=0.0433, train_loss=0.0218]#015Epoch 0:  15%|█▌        | 90/595 [00:10<00:58,  8.70it/s, loss=0.0205, v_num=0, val_loss_epoch=0.0433, train_loss=0.0218]#015Epoch 0:  15%|█▌        | 90/595 [00:10<00:58,  8.70it/s, loss=0.0193, v_num=0, val_loss_epoch=0.0433, train

[34m#015                                                           #033[A#015Epoch 0: 100%|██████████| 380/380 [00:53<00:00,  7.09it/s, loss=0.0228, v_num=0, val_loss_epoch=0.0235, train_loss=0.0184, val_loss_step=0.0248]#015Validation sanity check: 0it [00:00, ?it/s]#015Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]#015                                                              #015#015Training: 85it [00:00, ?it/s]#015Training:   0%|          | 0/323 [00:00<?, ?it/s]#015Epoch 0:   0%|          | 0/323 [00:00<?, ?it/s] #015Epoch 0:   9%|▉         | 30/323 [00:03<00:35,  8.32it/s]#015Epoch 0:   9%|▉         | 30/323 [00:03<00:35,  8.32it/s, loss=0.0271, v_num=0, val_loss_epoch=0.0245, train_loss=0.0227]#015Epoch 0:  19%|█▊        | 60/323 [00:07<00:30,  8.50it/s, loss=0.0271, v_num=0, val_loss_epoch=0.0245, train_loss=0.0227]#015Epoch 0:  19%|█▊        | 60/323 [00:07<00:30,  8.50it/s, loss=0.0256, v_num=0, val_loss_epoch=0.0245, train_loss=0.0269]#015Epoch 0:  28%|██