# Import

The packages below are only required when you run this notebook on a SageMaker instance.

In [None]:
# Optional dependency install, only needed on a fresh SageMaker instance.
# Uncomment the lines below to run them. `%pip` (rather than `!pip`) installs
# into the environment of the running kernel.
# %pip install mlflow
# %pip install pytorch-lightning
# %pip install transformers
# %pip install tqdm
# %pip install sagemaker
#
# %pip install s3fs
# %pip install smdebug

In [None]:
import os
import sys

# Add the directory three levels up to the import path -- presumably the
# repository root, so that project-local packages can be imported by later cells.
sys.path.append('../../../')

In [None]:
import sagemaker
import pandas as pd

Local constants for the data, the MLflow server, paths, etc. Use them instead of hard-coding values.

In [None]:
# NOTE(review): wildcard imports hide where names come from. Later cells rely on
# DEV_BUCKET, SAGEMAKER_ROLE, SAGEMAKER_ROLE_ARN, MLFLOW_SERVER and
# formatted_time, none of which are defined in this notebook, so they are
# presumably provided by these two modules -- confirm and import them explicitly.
from deep.constants import *
from deep.utils import *

In [None]:
# Load IPython's autoreload extension; mode 2 reloads modules before each cell
# runs, so edits to imported project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Data

You can use any data you want. We recommend loading it as a `pandas` DataFrame.

In [None]:
# Directory holding the v0.7.1 framework data, relative to this notebook.
DATA_PATH = os.path.join(
    '..', '..', '..', "data", "frameworks_data", 'data_v0.7.1'
)

# Training/validation rows, with exact duplicate rows removed.
train_val_df = pd.read_csv(os.path.join(DATA_PATH, 'new_columns_train_val.csv')).drop_duplicates()
# Only the `excerpt` column of the test set is kept.
test_df = pd.read_csv(os.path.join(DATA_PATH, 'new_columns_test_v0.7.1.csv'))[['excerpt']]

# Disabled alternative data source (subsectors), kept for reference.
# DATA_PATH = os.path.join(
#     '..', '..', '..', "data", "frameworks_data", 'subsectors', 'training_data'
# )
#
# train_val_df = pd.read_csv(os.path.join(DATA_PATH, 'train_subsectors.csv')).drop_duplicates()
# test_df = pd.read_csv(os.path.join(DATA_PATH, 'test_subsectors.csv'))[['excerpt']]

In [None]:
# Disabled one-off preprocessing, kept for reference: keep only the target
# items containing 'first_level', write the result to a CSV, then read it back.
# from ast import literal_eval
#
# train_val_df['target'] = train_val_df['target'].apply(
#     lambda x: [item for item in literal_eval(x) if 'first_level' in item]
# )
# train_val_df.to_csv(os.path.join(DATA_PATH, 'tmp_train_val.csv'))
#
# train_val_df = pd.read_csv(os.path.join(DATA_PATH, 'tmp_train_val.csv'))
# train_val_df['target']

In [None]:
from ast import literal_eval
def flatten(t):
    """Concatenate the items of every sub-iterable of ``t`` into one flat list.

    Only one level of nesting is removed, and the original order is preserved.
    """
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

# Number of distinct labels over all rows; each `target` cell is parsed from
# its string form with literal_eval before being flattened.
len({*flatten(train_val_df['target'].apply(literal_eval))})

In [None]:
# Display the `target` column as loaded from the CSV.
train_val_df['target']

In [None]:
# Keep only these three columns of the training/validation frame.
columns = ['excerpt', 'entry_id', 'target']
train_val_df = train_val_df.loc[:, columns]

In [None]:
# Sanity check: list the columns that remain in the frame.
train_val_df.columns

In [None]:
# Sanity check: (rows, columns) of the training/validation frame.
train_val_df.shape

## Sagemaker Prep

### Session

Configure SageMaker

In [None]:
# SageMaker session whose default bucket is the project's dev bucket.
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)

# Short aliases for the project-level constants.
role, role_arn = SAGEMAKER_ROLE, SAGEMAKER_ROLE_ARN
tracking_uri = MLFLOW_SERVER

### Bucket upload

You need to upload data to an S3 bucket. 




In [None]:
# Display the configured MLflow server value (clear this output before sharing
# the notebook if the address should not be public).
MLFLOW_SERVER

In [None]:
# Sub-sector names grouped by parent sector. Insertion order (sectors, then
# names within a sector) defines the order of `subsectors_list`.
# Spellings are label values and are reproduced exactly ("Enveloppe" sic).
_SUBSECTORS_BY_SECTOR = {
    "Education": (
        "Learning Environment",
        "Teachers and Education Personnel",
        "Teaching and Learning",
    ),
    "Health": (
        "Health care",
        "Health status",
    ),
    "Livelihoods": (
        "Expenditures",
        "Income",
        "Productive Assets",
        "Skills & Qualifications",
    ),
    "Nutrition": (
        "Nutrition services",
        "Nutrition status",
    ),
    "Protection": (
        "Child Protection",
        "Civil and Political Rights",
        "Documentation",
        "Freedom of Movement",
        "Housing Land and Property",
        "Human Trafficking",
        "Human rights",
        "Justice and Rule of Law",
        "Mines and UXOs",
        "Physical Safety and Security",
        "Sexual and Gender Based Violence",
    ),
    "Shelter": (
        "Domestic Living Space",
        "Dwelling Enveloppe",
        "Housing Land and Property",
        "Non Food Items",
    ),
    "WASH": (
        "Hygiene",
        "Sanitation",
        "Vector control",
        "Waste management",
        "Water Supply",
    ),
}

# Fully qualified labels of the form "subsector->Sector->Sub-sector".
subsectors_list = [
    f"subsector->{sector}->{name}"
    for sector, names in _SUBSECTORS_BY_SECTOR.items()
    for name in names
]


In [None]:
import random  # only referenced by the disabled test snippet below

sample = False  # To make the computations faster, sample = True.
SAMPLE_SIZE = 20_000  # number of rows kept when `sample` is True
SAMPLE_SEED = 42  # fixed seed so the subsample is identical on every run

if sample:
    # Cap the size at the frame length: DataFrame.sample raises ValueError when
    # n exceeds the number of rows and replace is False (the default).
    train_val_df = train_val_df.sample(
        n=min(SAMPLE_SIZE, len(train_val_df)), random_state=SAMPLE_SEED
    )

# Disabled test-only snippet, kept for reference: prepend two random
# sub-sector labels to every target.
# train_val_df['target'] = train_val_df['target'].apply(
#     lambda x: str(random.sample(subsectors_list, 2) + literal_eval(x))
# )

job_name = f"pytorch-{formatted_time()}-all-models"  # change it as you prefer
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name  # Do not change this

train_path = str(input_path / 'train.pickle')
val_path = str(input_path / 'val.pickle')

# protocol 4 is necessary, since SageMaker uses python 3.6
train_val_df.to_pickle(train_path, protocol=4)
print('finished uploading train val df.')
# NOTE(review): the *test* frame is what gets stored as val.pickle -- confirm
# this is intended.
test_df.to_pickle(val_path, protocol=4)

### Estimator Definition

In [None]:
# Candidate SageMaker training instance types, kept for reference.
# https://aws.amazon.com/sagemaker/pricing/instance-types

# GPU instances
GPU_INSTANCES = [
    'ml.p2.xlarge',
    'ml.p3.2xlarge'
]

# CPU instances
CPU_INSTANCES = [
    'ml.c4.2xlarge',
    'ml.c4.4xlarge',
    'ml.c5n.2xlarge'
]

# `instances` was assigned twice, so the GPU list was silently overwritten.
# Both lists now have their own name; `instances` keeps its final value (the
# CPU list) for any code that still reads it.
instances = list(CPU_INSTANCES)

The hyperparameters are passed as command line arguments to the training script. 

You can add or change them as you like. It's important to keep `tracking_uri` and `experiment_name`, which are used by MLflow.

The class `PyTorch` is part of the `SageMaker` python API. The parameters are important and you should probably not change most of them. The ones you may want to change are:

- `instance_type`, specify the instance you want
- `source_dir`, specify your script directory. Try to use global variables as much as possible

In [None]:
from sagemaker.pytorch import PyTorch

instance_type = "ml.p3.2xlarge"

# one of ['none', 'sectors', 'secondary_tags', 'subsector']
relabeled_columns = "none"

# Runs with relabelled columns are logged under a separate experiment name.
experiment_name = "pl-deep-deployment" if relabeled_columns == 'none' else "pl-relabling"
# experiment_name = 'zero_shot_testing'

# Other run names used so far: "ENDPOINT_TESTING", "all_tags_final"
run_name = "model_small_first_release_data"

# Passed to the training script as command line arguments.
# Alternative backbone: model_name / tokenizer_name "xlm-roberta-base" with
# output_length 768.
hyperparameters = dict(
    tracking_uri=MLFLOW_SERVER,
    experiment_name=experiment_name,
    max_len=128,
    epochs=5,
    model_name="nreimers/mMiniLMv2-L6-H384-distilled-from-XLMR-Large",
    tokenizer_name="nreimers/mMiniLMv2-L6-H384-distilled-from-XLMR-Large",
    output_length=384,
    dropout=0.2,
    learning_rate=1e-4,  # same float as 10e-5
    weight_decay=1e-2,
    instance_type=instance_type,
    f_beta=0.8,
    nb_repetitions=1,
    run_name=run_name,
    train_batch_size=64,
    val_batch_size=128,
    n_freezed_layers=1,
    relabeled_columns=relabeled_columns,
)

# NOTE(review): the `sess` object created earlier is not passed here
# (sagemaker_session=...), and `job_name` is given again to fit() -- confirm
# both are intended.
estimator = PyTorch(
    entry_point="train_mlflow.py",
    source_dir="../../../scripts/training/selim/multiclass-lightning/MultitaskAllInOne",
    role=role,
    instance_type=instance_type,
    instance_count=1,
    framework_version="1.8",
    py_version="py3",
    hyperparameters=hyperparameters,
    output_path=str(DEV_BUCKET / "models/"),
    code_location=str(input_path),
    job_name=job_name,
    #     train_instance_count=2,
    #     train_instance_type="ml.c4.xlarge",
)


In [None]:
# Both input channels point to the same prefix, the one train.pickle and
# val.pickle were written to.
fit_arguments = {channel: str(input_path) for channel in ('train', 'test')}

In [None]:
# Launch the training job under the name chosen above, feeding it the
# 'train' and 'test' channels.
estimator.fit(inputs=fit_arguments, job_name=job_name)