# Import

These requirements are necessary if you launch this notebook from SageMaker instances

In [1]:
# Dependency installs, intentionally disabled: the `!pip` lines are wrapped in a
# string literal, so nothing is installed and the cell only echoes the string.
# To actually install (e.g. on a SageMaker instance), remove the triple quotes.
"""!pip install mlflow
!pip install pytorch-lightning
!pip install transformers
!pip install tqdm
!pip install sagemaker
!pip install s3fs"""

'!pip install mlflow\n!pip install pytorch-lightning\n!pip install transformers\n!pip install tqdm\n!pip install sagemaker\n!pip install s3fs'

In [2]:
import sys
# Add the directory three levels above the working directory to the import path.
# Presumably this is what makes the project-local `deep` package importable in
# the cells below — TODO confirm against the repository layout.
sys.path.append('../../../')

import os
import sys
import logging
import argparse
from pathlib import Path
from ast import literal_eval
from collections import Counter
from typing import Any, Dict, Optional

In [3]:
from tqdm.auto import tqdm

import torchmetrics
from torchmetrics.functional import accuracy, f1, auroc

import sagemaker
from sagemaker import get_execution_role
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.core.decorators import auto_move_data
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import MLFlowLogger


import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix


import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc

import transformers
from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoTokenizer,
)
from transformers.optimization import (
    Adafactor,
    get_linear_schedule_with_warmup,
)

Local constants (data locations, MLflow server, paths, etc.) are defined in `deep.constants`; use them instead of hard-coding values.

In [4]:
from deep.constants import *
from deep.utils import *

In [5]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Data

You can use whatever data you want. We recommend loading it as a `pandas` DataFrame.

In [6]:
# Train / validation CSVs of dataset version 0.4.4, addressed relative to the
# notebook's working directory.
TRAIN_PATH = os.path.join(
    '..', '..', '..',
    "data", "frameworks_data", "data_v0.4.4",
    "data_v0.4.4_train.csv",
)
VAL_PATH = os.path.join(
    '..', '..', '..',
    "data", "frameworks_data", "data_v0.4.4",
    "data_v0.4.4_val.csv",
)

# Read both splits (train first, then validation) into DataFrames.
train_df, val_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, VAL_PATH))

## Sagemaker Prep

### Session

Configure SageMaker

In [7]:
# Execution role (name and ARN) as exposed by the star-imported project constants.
role, role_arn = SAGEMAKER_ROLE, SAGEMAKER_ROLE_ARN

# SageMaker session whose default bucket is the project's DEV_BUCKET.
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)

### Bucket upload

You need to upload data to an S3 bucket. 




In [8]:
# Quick-run switch: when True, train on a small subsample to make the job faster.
sample = True  # To make the computations faster, sample = True.
sample_size = 1000  # maximum number of rows kept per split when sampling
sample_seed = 42    # fixed seed so the uploaded subsample is reproducible across runs

if sample:
    # Cap n at the frame length: DataFrame.sample raises ValueError when asked for
    # more rows than exist (sampling is without replacement by default).
    train_df = train_df.sample(n=min(sample_size, len(train_df)), random_state=sample_seed)
    val_df = val_df.sample(n=min(sample_size, len(val_df)), random_state=sample_seed)

# The job name doubles as the input-data prefix, so every run gets its own folder.
job_name = f"pytorch-{formatted_time()}-test"  # change it as you prefer
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name  # Do not change this

train_path = str(input_path / 'train.pickle')
val_path = str(input_path / 'val.pickle')

# Write both splits under input_path (presumably an S3 location handled through
# s3fs — confirm against DEV_BUCKET's type).
train_df.to_pickle(train_path, protocol=4)  # protocol 4 is necessary, since SageMaker uses python 3.6
val_df.to_pickle(val_path, protocol=4)

### Estimator Definition

In [9]:
# GPU instance types
instances = ['ml.p2.xlarge', 'ml.p3.2xlarge']

The hyperparameters are passed as command line arguments to the training script. 

You can add/change them as you like. It's important to keep the `tracking_uri` and the `experiment_name` which are used by MLFlow.

The class `PyTorch` is part of the `SageMaker` python API. The parameters are important and you should probably not change most of them. The ones you may want to change are:

- `instance_type`, specify the instance you want
- `source_dir`, specify your script directory. Try to use global variables as much as possible

In [10]:
from sagemaker.pytorch import PyTorch

# Hyperparameters are handed to the entry point as command-line flags
# (e.g. `--epochs 1 --max_len 128 ...`).
hyperparameters = {
    # MLflow bookkeeping: keep these two keys
    'tracking_uri': MLFLOW_SERVER,
    'experiment_name': 'en_language_subpillars',
    # training / model settings
    'max_len': 128,
    'epochs': 1,
    'model_name': 'distilbert-base-uncased',
    'tokenizer_name': 'distilbert-base-uncased',
    'language_method': 'keep',
    'pred_threshold': 0.38,
}

estimator = PyTorch(
    # training code
    entry_point='train.py',
    source_dir='../../../scripts/training/selim/multiclass-lightning',
    hyperparameters=hyperparameters,
    # where the packaged code and the model artifacts are stored
    code_location=str(input_path),
    output_path=str(DEV_BUCKET / 'models/'),
    # compute
    instance_type='ml.p2.xlarge',
    instance_count=1,
    # container / permissions
    framework_version='1.8',
    py_version='py36',
    role=role,
    # NOTE(review): the job name actually used comes from `estimator.fit(job_name=...)`;
    # this constructor kwarg looks redundant — confirm against the SDK version in use.
    job_name=job_name,
)

In [11]:
# Both input channels point at the same prefix, which holds train.pickle and val.pickle.
fit_arguments = {channel: str(input_path) for channel in ('train', 'test')}

In [12]:
# Launch the training job under `job_name`, feeding it the input channels above.
# NOTE(review): the run recorded in this cell's output failed inside the training
# script — utils.py line 41 reads the literal path "f{TRAIN_PATH}/train.pickle"
# (the `f` prefix sits inside the quotes). Fix that in `source_dir` before re-running.
estimator.fit(inputs=fit_arguments, job_name=job_name)

2021-07-21 10:04:43 Starting - Starting the training job...
2021-07-21 10:05:07 Starting - Launching requested ML instancesProfilerReport-1626861880: InProgress
...
2021-07-21 10:05:47 Starting - Preparing the instances for training.........
2021-07-21 10:07:27 Downloading - Downloading input data...
2021-07-21 10:07:47 Training - Downloading the training image.............................[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-07-21 10:13:15,099 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-07-21 10:13:15,124 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-07-21 10:13:18,229 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-07-21 10:13:18,800 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/

[34m  Building wheel for wrapt (setup.py): finished with status 'done'
  Created wheel for wrapt: filename=wrapt-1.12.1-cp36-cp36m-linux_x86_64.whl size=69745 sha256=832a7237eca0923f28192c397a3ef4539b1743db6801ec363c365af7b34fecb3
  Stored in directory: /root/.cache/pip/wheels/32/42/7f/23cae9ff6ef66798d00dc5d659088e57dbba01566f6c60db63
  Building wheel for idna-ssl (setup.py): started
  Building wheel for idna-ssl (setup.py): finished with status 'done'
  Created wheel for idna-ssl: filename=idna_ssl-1.1.0-py3-none-any.whl size=3161 sha256=0131e2e71b70596af00c55cef8e55ab8b17066b6345ca8f9ff4a66d97e278b07
  Stored in directory: /root/.cache/pip/wheels/6a/f5/9c/f8331a854f7a8739cf0e74c13854e4dd7b1af11b04fe1dde13[0m
[34mSuccessfully built nltk termcolor wrapt idna-ssl[0m
[34mInstalling collected packages: typing-extensions, six, pyasn1-modules, oauthlib, multidict, cachetools, yarl, requests-oauthlib, numpy, idna-ssl, google-auth, async-timeout, tqdm, tensorboard-plugin-wit, regex, mar


2021-07-21 10:14:49 Uploading - Uploading generated training model
2021-07-21 10:14:49 Failed - Training job failed
[34m[nltk_data] Downloading package averaged_perceptron_tagger to[0m
[34m[nltk_data]     /root/nltk_data...[0m
[34m[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.[0m
[34m[nltk_data] Downloading package wordnet to /root/nltk_data...[0m
[34m[nltk_data]   Unzipping corpora/wordnet.zip.[0m
[34m[nltk_data] Downloading package omw to /root/nltk_data...[0m
[34m[nltk_data]   Unzipping corpora/omw.zip.[0m
[34mimporting data ............[0m
[34m2021-07-21 10:14:30.789458: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0[0m
[34mTraceback (most recent call last):
  File "train.py", line 64, in <module>
    all_dataset = read_merge_data (args.training_dir, args.val_dir, data_format='pickle')
  File "/opt/ml/code/utils.py", line 41, in read_merge_data
    train_df = pd.read_pickle("

UnexpectedStatusException: Error for Training job pytorch-2021-07-21-12-04-30-848-test: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/opt/conda/bin/python3.6 train.py --epochs 1 --experiment_name en_language_subpillars --language_method keep --max_len 128 --model_name distilbert-base-uncased --pred_threshold 0.38 --tokenizer_name distilbert-base-uncased --tracking_uri http://mlflow-deep-387470f3-1883319727.us-east-1.elb.amazonaws.com/"
2021-07-21 10:14:30.789458: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Traceback (most recent call last):
  File "train.py", line 64, in <module>
    all_dataset = read_merge_data (args.training_dir, args.val_dir, data_format='pickle')
  File "/opt/ml/code/utils.py", line 41, in read_merge_data
    train_df = pd.read_pickle("f{TRAIN_PATH}/train.pickle")
  File "/opt/conda/lib/python3.6/site-packages/pandas/io/pickle.py", line 169, in read_pickle
    f, fh = get_handle(fp_or_buf, "rb", compression=compression, is_text=False)
  File "/opt/conda/lib/python3.6/site-packages/pandas/io/common