# Import

These requirements are necessary if you launch this notebook from SageMaker instances

In [1]:
"""!pip install mlflow
!pip install pytorch-lightning
!pip install transformers
!pip install tqdm
!pip install sagemaker

!pip install s3fs
!pip install smdebug"""

'!pip install mlflow\n!pip install pytorch-lightning\n!pip install transformers\n!pip install tqdm\n!pip install sagemaker\n\n!pip install s3fs\n!pip install smdebug'

In [2]:
import sys
import os
import json
import pickle
from sagemaker.pytorch import PyTorch
import sagemaker
import pandas as pd
from typing import Dict
from scripts.script_token_mean_postprocessing.merge_leads_excerpts import get_training_dict

The cell below imports the project's local constants (data, MLflow server, paths, etc.): use them instead of hard-coding values.

In [3]:
# Add the directory four levels up to the import path (presumably the repository
# root that contains the `deep` package -- TODO confirm).
sys.path.append('../../../../')
# NOTE(review): star imports hide where names come from. Later cells use
# DEV_BUCKET, SAGEMAKER_ROLE, SAGEMAKER_ROLE_ARN, MLFLOW_SERVER and
# formatted_time, none of which are defined in this notebook; presumably they
# are provided by these two modules -- confirm and prefer explicit imports.
from deep.constants import *
from deep.utils import *

In [4]:
# Reload imported modules automatically before each cell execution, so edits to
# local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Data

Load data

In [5]:
# Toggle between a quick run on a 1% sample and a run on the full dataset.
use_sample = True

# Locations of the raw inputs, relative to this notebook.
DATA_PATH = os.path.join(
    "..", "..", "..", "..", "data", "frameworks_data", "data_v0.7.1"
)
EXCERPTS_PATH = os.path.join(DATA_PATH, "full_dataset_with_translations.csv")
LEADS_PATH = os.path.join(DATA_PATH, "leads_data.json")

# Local folder and file name where the merged training data is stored.
data_folder = "data"
if use_sample:
    sample_percentage = 0.01
    data_file_name = "sample_data.csv"  # sample data
else:
    sample_percentage = 1
    data_file_name = "full_data.csv"  # full data

local_data_path = os.path.join(data_folder, data_file_name)

# Rebuilt on every run to make sure changes in the source files are taken into account.
data_df = get_training_dict(
    leads_data_path=LEADS_PATH,
    excerpts_df_path=EXCERPTS_PATH,
    use_sample=use_sample,
    sample_percentage=sample_percentage,
)

# `to_csv` does not create missing directories, so make sure the output folder
# exists first (it is absent on a fresh checkout).
os.makedirs(data_folder, exist_ok=True)
data_df.to_csv(local_data_path, index=False)

# Reload the frame from the CSV that was just written.
# NOTE(review): the original comment did not say why this round trip is needed;
# presumably it normalises the column types to what a CSV read gives -- confirm.
data_df = pd.read_csv(local_data_path)

  if (await self.run_code(code, result,  async_=asy)):
Using custom data configuration default-039825f33ea33186
Reusing dataset json (/home/selim/.cache/huggingface/datasets/json/default-039825f33ea33186/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)
Loading cached processed dataset at /home/selim/.cache/huggingface/datasets/json/default-039825f33ea33186/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b/cache-f116e2040d270b6c.arrow


## Sagemaker Prep

### Session

Configure SageMaker

In [6]:
# MLflow tracking endpoint and SageMaker role identifiers, taken from the
# project-level constants.
tracking_uri = MLFLOW_SERVER
role_arn = SAGEMAKER_ROLE_ARN
role = SAGEMAKER_ROLE

# SageMaker session that uses the development bucket as its default bucket.
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)

### Bucket upload

You need to upload data to an S3 bucket. 




In [7]:
# Display the MLflow tracking server URL as a quick sanity check.
MLFLOW_SERVER

'http://mlflow-deep-387470f3-1883319727.us-east-1.elb.amazonaws.com/'

### Send data to bucket

In [8]:
# Name of the training job; it embeds the output of formatted_time().
# Change the naming as you prefer.
job_name = f"pytorch-{formatted_time()}-entry-extraction"

# Bucket prefix holding the inputs of this job. Do not change this.
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name

# Full destination of the pickled training data. Keep it as it is.
data_path = str(input_path / 'data.pickle')

# Send the data to the S3 bucket.
# Pickle protocol 4 is necessary, since SageMaker uses python 3.6.
# The protocol may need to be re-checked for other data types (protocol 4 was
# chosen for pandas data inputs).
data_df.to_pickle(path=data_path, protocol=4)

### Estimator Definition

In [9]:
# Reference lists of SageMaker instance types to choose `instance_type` from.
# Pricing: https://aws.amazon.com/sagemaker/pricing/instance-types

# GPU instances
gpu_instances = [
    'ml.p2.xlarge',
    'ml.p3.2xlarge',
]

# CPU instances
cpu_instances = [
    'ml.c4.2xlarge',
    'ml.c4.4xlarge',
    'ml.c5n.2xlarge',
]

# Both lists used to be assigned to `instances`, so the GPU list was silently
# overwritten by the CPU one. Each list now has its own name; `instances` keeps
# the value it previously ended up with (the CPU list).
instances = cpu_instances

The hyperparameters are passed as command line arguments to the training script. 

You can add/change them as you like. It's important to keep the `tracking_uri` and the `experiment_name` which are used by MLFlow.

The class `PyTorch` is part of the `SageMaker` python API. The parameters are important and you should probably not change most of them. The ones you may want to change are:

- `instance_type`, specify the instance you want
- `source_dir`, specify your script directory. Try to use global variables as much as possible

In [10]:
# Instance the training job runs on; it is also forwarded to the training
# script as a hyperparameter.
instance_type = "ml.p3.2xlarge"

# MLflow experiment name; the run gets the same name.
experiment_name = "entry-extraction"
run_name = experiment_name

# The same checkpoint identifier is used for both the model and the tokenizer.
model_checkpoint = "nreimers/mMiniLMv2-L6-H384-distilled-from-XLMR-Large"

# Hyperparameters are passed as command line arguments to the training script.
hyperparameters = {
    # --- run bookkeeping (tracking_uri / experiment_name are used by MLflow) ---
    "instance_type": instance_type,
    "tracking_uri": MLFLOW_SERVER,
    "experiment_name": experiment_name,
    "run_name": run_name,
    # --- model ---
    "model_name_or_path": model_checkpoint,
    "tokenizer_name_or_path": model_checkpoint,
    # --- optimisation ---
    "learning_rate": 1e-4,
    "n_epochs": 1 if use_sample else 3,  # a single epoch is enough for the sample run
    "weight_decay": 0.01,
    # --- data loading ---
    "dataloader_num_workers": 6,
    "val_batch_size": 16,
    "train_batch_size": 8,
    "max_len": 512,
    "extra_context_length": 48,
    # --- regularisation, losses and metrics ---
    "dropout": 0.2,
    "tokens_focal_loss_gamma": 1,
    "cls_focal_loss_gamma": 1,
    "fbeta": 1,
    "sample_percentage": sample_percentage,
    "proportions_pow": 0.2,
    # --- disabled options, kept for reference ---
    # "n_separate_layers": 1,
    # "per_device_train_batch_size": 1,
    # "per_device_eval_batch_size": 1,
    # "gradient_accumulation_steps": 8,
    # "save_strategy": "epoch",
    # "adam_beta1": 0.9,
    # "adam_beta2": 0.98,
    # "adam_epsilon": 1e-6,
    # "warmup_ratio": 0.3,
    # "fp16": True,
}

# Estimator describing the training job: which script to run, where the code
# and the model artefacts are stored, and on which instance the job runs.
# NOTE(review): `job_name` is also passed to `fit()`; confirm that the
# constructor actually uses it (the SDK documents `base_job_name` there).
estimator = PyTorch(
    entry_point="train.py",
    source_dir="scripts/script_token_mean_postprocessing",
    code_location=str(input_path),
    output_path=str(DEV_BUCKET / "models/"),
    role=role,
    instance_type=instance_type,
    instance_count=1,
    framework_version="1.8",
    py_version="py3",
    hyperparameters=hyperparameters,
    job_name=job_name,
    debugger_hook_config=False,
)

# Input channels handed to `fit`: the "train" channel points at the job's input prefix.
fit_arguments = dict(train=str(input_path))

In [11]:
# Launch the training job under the chosen job name, feeding it the input
# channels prepared above.
estimator.fit(inputs=fit_arguments, job_name=job_name)

2022-12-16 07:48:38 Starting - Starting the training job...
2022-12-16 07:49:04 Starting - Preparing the instances for trainingProfilerReport-1671176913: InProgress
.........
2022-12-16 07:50:50 Downloading - Downloading input data
2022-12-16 07:50:50 Training - Downloading the training image..................
2022-12-16 07:54:26 Training - Training image download completed. Training in progress....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-12-16 07:54:44,295 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-12-16 07:54:44,323 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-12-16 07:54:44,325 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-12-16 07:54:44,536 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt: