# Import

These packages are required if you launch this notebook from a SageMaker instance.

In [1]:
"""!pip install mlflow
!pip install pytorch-lightning
!pip install transformers
!pip install tqdm
!pip install sagemaker
!pip install s3fs
!pip install smdebug"""

'!pip install mlflow\n!pip install pytorch-lightning\n!pip install transformers\n!pip install tqdm\n!pip install sagemaker\n!pip install s3fs\n!pip install smdebug'

In [2]:
import sys
sys.path.append('../../../')

import os
import sys
import logging
import argparse
from pathlib import Path
from typing import Any, Dict, Optional

In [3]:
from tqdm.auto import tqdm

import torchmetrics
from torchmetrics.functional import accuracy, f1, auroc

import sagemaker
from sagemaker import get_execution_role
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.core.decorators import auto_move_data
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import MLFlowLogger


import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc

import pandas as pd

Local constants regarding the data, the MLFlow server, paths, etc. Use them.

In [4]:
from deep.constants import *
from deep.utils import *

In [5]:
# Enable IPython's autoreload extension in mode 2: every imported module is
# reloaded before a cell executes, so edits to the local `deep` package are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Data

You can use any data you want. We recommend loading it into a `pandas` DataFrame.

In [6]:
# Folder with the v0.6.2 generated entries, addressed relative to the working
# directory (three levels up, then data/frameworks_data/...).
DATA_PATH = os.path.join(
    *(['..'] * 3), "data", "frameworks_data", 'data_v0.6.2', 'generated_entries'
)

# Read the original dataset first, then the generated (augmented) excerpts.
original_df, augmented_data = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in ('full_dataset.csv', 'generated_text.csv')
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [7]:
# Load the feedback spreadsheet and keep only its 'Entry' column, renamed to
# 'excerpt' so it matches the column name used by the training data.
test_data = (
    pd.read_excel(os.path.join('..', '..', '..', '..', 'feedback_output.xlsx'))[['Entry']]
    .rename(columns={'Entry': 'excerpt'})
)

# Drop rows whose excerpt is the placeholder "none" (any casing).
# Empty cells are read as NaN, and calling `.upper()` on them would raise an
# AttributeError, so missing excerpts are dropped explicitly and the
# comparison is done on a string view of the column.
test_data = test_data[
    test_data.excerpt.notna()
    & test_data.excerpt.astype(str).str.upper().ne('NONE')
]
test_data

Unnamed: 0,excerpt
0,"In Southeast Myanmar, UNHCR and the Myanmar Re..."
1,UNHCR provided technical support to health par...
2,"In March, UNHCR and UTBA organized two interna..."
3,To strengthen the provision of legal aid for r...
4,Some 625 young women aged 18-24 have earned an...
...,...
124,"In addition to the 18,000 Rohingya already rel..."
125,"In 2020, WFP advocated with the GoT to increas..."
126,The mission of the United Nations Office for t...
127,"As of June 2021, 771,458 Syrian refugee childr..."


In [8]:
# Attach the metadata/label columns of the original entries to each generated
# excerpt, matching rows on `entry_id`. The original `excerpt` column is dropped
# first so the generated text is the only `excerpt` in the result.
# NOTE(review): with how='right' every original entry is kept, so entries that
# have no generated text end up with a NaN excerpt - confirm this is intended.
augmented_data = augmented_data[['entry_id', 'excerpt']].merge(
    original_df.drop(columns=['excerpt']),
    how='right',
    on='entry_id',
)

In [9]:
# Stack the original entries on top of the augmented ones; columns are aligned
# by name and the row index of each frame is kept as-is (no ignore_index).
tot_df = pd.concat((original_df, augmented_data), axis=0)

In [10]:
# Sanity check: (rows, columns) of the combined original + augmented frame.
tot_df.shape

(489869, 21)

In [11]:
# Keep the identifiers, the text, and the two columns listed in the
# `training_names` hyperparameter further down.
tot_df = tot_df[[
    'entry_id',
    'excerpt',
    'lead_id',
    'sectors',
    'demographic_groups',
    # Currently left out:
    # 'subpillars_1d', 'specific_needs_groups',
    # 'subpillars_2d', 'affected_groups',
    # 'severity'
]]

## Sagemaker Prep

### Session

Configure SageMaker

In [12]:
# Short aliases for the project-level constants used in the rest of the notebook.
role, role_arn, tracking_uri = SAGEMAKER_ROLE, SAGEMAKER_ROLE_ARN, MLFLOW_SERVER

# SageMaker session whose default bucket is the development bucket.
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)

### Bucket upload

You need to upload data to an S3 bucket. 




In [13]:
# Display the MLFlow tracking server address that is passed to the training job.
# NOTE(review): the rendered output exposes the server URL - consider clearing
# it before sharing the notebook.
MLFLOW_SERVER

'http://mlflow-deep-387470f3-1883319727.us-east-1.elb.amazonaws.com/'

In [14]:
sample = True  # True: train on a random subset so the job runs faster; False: use every row.
n_sample_rows = 10_000  # size of the subset drawn when `sample` is True
sample_seed = 42  # fixed seed so a fresh "Restart & Run All" uploads the same subset

if sample:
    # Cap at the frame length: DataFrame.sample raises a ValueError when n is
    # larger than the number of rows (sampling without replacement).
    tot_df = tot_df.sample(
        n=min(n_sample_rows, len(tot_df)),
        random_state=sample_seed,
    )

job_name = f"pytorch-{formatted_time()}-all-models"  # change it as you prefer
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name  # Do not change this

train_path = str(input_path / 'train.pickle')
val_path = str(input_path / 'val.pickle')

# Write the training frame and the feedback (validation) frame under the job's
# input folder (presumably an S3 location, since it is rooted at DEV_BUCKET).
# protocol 4 is necessary, since SageMaker uses python 3.6
tot_df.to_pickle(train_path, protocol=4)
test_data.to_pickle(val_path, protocol=4)

### Estimator Definition

In [15]:
# GPU instance types that can be passed as `instance_type` to the estimator.
instances = ['ml.p2.xlarge', 'ml.p3.2xlarge']

The hyperparameters are passed as command line arguments to the training script. 

You can add/change them as you like. It's important to keep the `tracking_uri` and the `experiment_name` which are used by MLFlow.

The class `PyTorch` is part of the `SageMaker` python API. The parameters are important and you should probably not change most of them. The ones you may want to change are:

- `instance_type`, specify the instance you want
- `source_dir`, specify your script directory. Try to use global variables as much as possible

In [16]:
from sagemaker.pytorch import PyTorch

# Hyperparameters are forwarded to the training script as command-line
# arguments. `tracking_uri` and `experiment_name` are used by MLFlow and must
# be kept; the others can be tuned freely.
hyperparameters = {
    # --- MLFlow ---
    'tracking_uri': MLFLOW_SERVER,
    'experiment_name': "pl-trials",
    # --- model / tokenizer ---
    'max_len': 256,
    'epochs': 1,
    'model_name': 'microsoft/xtremedistil-l6-h256-uncased',
    'tokenizer_name': 'microsoft/xtremedistil-l6-h256-uncased',
    'dropout_rate': 0.3,
    'pred_threshold': 0.4,
    'output_length': 256,
    'learning_rate': 9e-5,
    # --- columns to train on (comma-separated) ---
    'training_names': 'sectors,demographic_groups',
    # Alternative column sets:
    #'training_names':'subpillars_1d,specific_needs_groups,demographic_groups',
    #'training_names':'sectors_subpillars_2d,subpillars_1d,specific_needs_groups,severity_affected_groups,demographic_groups',
    # Optional switches, currently disabled:
    #'train_with_all_positive_examples':True,
    #'balance_trainig_data':False,
    "proportion_negative_examples_train_df": 0.3,
    "model_mode": "deploy",
}

estimator = PyTorch(
    # Training code: entry script and the directory that contains it.
    entry_point='train_mlflow.py',
    source_dir='../../../scripts/training/selim/multiclass-lightning',
    code_location=str(input_path),
    # Container: PyTorch framework version and Python version.
    framework_version='1.8',
    py_version='py36',
    # Hardware.
    instance_type='ml.p2.xlarge',
    instance_count=1,
    # Run configuration.
    role=role,
    hyperparameters=hyperparameters,
    output_path=str(DEV_BUCKET/'models/'),
    # NOTE(review): the job name is also passed to `estimator.fit`; confirm the
    # constructor actually uses this argument rather than ignoring it.
    job_name=job_name,
)

In [17]:
# Both input channels point at the same folder, which holds train.pickle and
# val.pickle.
fit_arguments = {channel: str(input_path) for channel in ('train', 'test')}

In [18]:
# Launch the SageMaker training job with the input channels and job name
# prepared above; the job's log is streamed into the cell output.
estimator.fit(inputs=fit_arguments, job_name=job_name)

2021-10-03 01:44:13 Starting - Starting the training job...
2021-10-03 01:44:36 Starting - Launching requested ML instancesProfilerReport-1633225450: InProgress
...
2021-10-03 01:45:16 Starting - Preparing the instances for training............
2021-10-03 01:47:21 Downloading - Downloading input data
2021-10-03 01:47:21 Training - Downloading the training image...........................
2021-10-03 01:52:30 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-10-03 01:52:32,160 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-10-03 01:52:32,185 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-10-03 01:52:32,864 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-10-03 01:52:33,746 sagema

[34mCollecting aiohttp>=3.3.1
  Downloading aiohttp-3.7.4.post0-cp36-cp36m-manylinux2014_x86_64.whl (1.3 MB)[0m
[34mCollecting aioitertools>=0.5.1
  Downloading aioitertools-0.8.0-py3-none-any.whl (21 kB)[0m
[34mCollecting idna-ssl>=1.0
  Downloading idna-ssl-1.1.0.tar.gz (3.4 kB)[0m
[34mCollecting multidict<7.0,>=4.5
  Downloading multidict-5.1.0-cp36-cp36m-manylinux2014_x86_64.whl (141 kB)[0m
[34mCollecting yarl<2.0,>=1.0
  Downloading yarl-1.6.3-cp36-cp36m-manylinux2014_x86_64.whl (293 kB)[0m
[34mCollecting async-timeout<4.0,>=3.0
  Downloading async_timeout-3.0.1-py3-none-any.whl (8.2 kB)[0m
[34mCollecting Mako
  Downloading Mako-1.1.5-py2.py3-none-any.whl (75 kB)[0m
[34mCollecting python-editor>=0.3
  Downloading python_editor-1.0.4-py3-none-any.whl (4.9 kB)[0m
[34mCollecting boto3>=1.16.32
  Downloading boto3-1.18.53-py3-none-any.whl (131 kB)[0m
[34mCollecting s3transfer<0.6.0,>=0.5.0
  Downloading s3transfer-0.5.0-py3-none-any.whl (79 kB)[0m
[34mCollecting b

[34m  Building wheel for databricks-cli (setup.py): finished with status 'done'
  Created wheel for databricks-cli: filename=databricks_cli-0.15.0-py3-none-any.whl size=105259 sha256=02d6246f712e4cd2ba569dfd01bbc6ef0ce0ed706bef7bb20aafafeb177eb6f3
  Stored in directory: /root/.cache/pip/wheels/c0/0b/2a/ba06e44bcbf2a48da34fde3c3ebcf5f7d5ef8cb975f9571305
  Building wheel for idna-ssl (setup.py): started
  Building wheel for idna-ssl (setup.py): finished with status 'done'
  Created wheel for idna-ssl: filename=idna_ssl-1.1.0-py3-none-any.whl size=3161 sha256=c711a02851538b5eaf3b537e1764c1ba080b129f32ae6b19e37ee2ed4c46b62c
  Stored in directory: /root/.cache/pip/wheels/6a/f5/9c/f8331a854f7a8739cf0e74c13854e4dd7b1af11b04fe1dde13
  Building wheel for termcolor (setup.py): started[0m
[34m  Building wheel for termcolor (setup.py): finished with status 'done'
  Created wheel for termcolor: filename=termcolor-1.1.0-py3-none-any.whl size=4830 sha256=5cd468d029659253f621db32ab34439a539c5fc16a4

[34m#015Validation sanity check: 0it [00:00, ?it/s]#015Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s][2021-10-03 01:54:54.808 algo-1:85 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2021-10-03 01:54:54.861 algo-1:85 INFO profiler_config_parser.py:102] User has disabled profiler.[0m
[34m[2021-10-03 01:54:54.862 algo-1:85 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.[0m
[34m[2021-10-03 01:54:54.862 algo-1:85 INFO hook.py:201] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.[0m
[34m[2021-10-03 01:54:54.863 algo-1:85 INFO hook.py:255] Saving to /opt/ml/output/tensors[0m
[34m[2021-10-03 01:54:54.863 algo-1:85 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.[0m
[34m[2021-10-03 01:54:55.068 algo-1:85 INFO hook.py:594] name:model.l0.embeddings.word_embeddings.weight count_params:781363

[34m#015                                                              #015#015Training: 0it [00:00, ?it/s]#015Training:   0%|          | 0/297 [00:00<?, ?it/s]#015Epoch 0:   0%|          | 0/297 [00:00<?, ?it/s] #015Epoch 0:  10%|█         | 30/297 [00:06<00:54,  4.88it/s]#015Epoch 0:  10%|█         | 30/297 [00:06<00:54,  4.88it/s, loss=3, v_num=0, val_f1_epoch=0.0817, val_loss_epoch=0.684, train_f1=0.542]#015Epoch 0:  20%|██        | 60/297 [00:11<00:46,  5.07it/s, loss=3, v_num=0, val_f1_epoch=0.0817, val_loss_epoch=0.684, train_f1=0.542]#015Epoch 0:  20%|██        | 60/297 [00:11<00:46,  5.07it/s, loss=1.81, v_num=0, val_f1_epoch=0.0817, val_loss_epoch=0.684, train_f1=0.485]#015Epoch 0:  30%|███       | 90/297 [00:17<00:39,  5.18it/s, loss=1.81, v_num=0, val_f1_epoch=0.0817, val_loss_epoch=0.684, train_f1=0.485]#015Epoch 0:  30%|███       | 90/297 [00:17<00:39,  5.18it/s, loss=1.72, v_num=0, val_f1_epoch=0.0817, val_loss_epoch=0.684, train_f1=0.474]#015Epoch 0:  40%|████      | 12


2021-10-03 01:56:39 Uploading - Uploading generated training model
2021-10-03 01:57:00 Completed - Training job completed
Training seconds: 596
Billable seconds: 596
