# Import

These requirements are only necessary if you launch this notebook from a SageMaker instance.

In [None]:
"""!pip install mlflow
!pip install pytorch-lightning
!pip install transformers
!pip install tqdm
!pip install sagemaker

!pip install s3fs
!pip install smdebug"""

In [None]:
# Add the directory three levels above the working directory to the import path.
# Presumably this is what makes the `deep` package (imported in a later cell)
# importable — TODO confirm.
import sys
sys.path.append('../../../')

import os
import sys  # NOTE(review): redundant — `sys` is already imported above

import sagemaker
import pandas as pd
from ast import literal_eval

Local constants (data locations, MLflow server, paths, etc.) are imported below: use them.

In [None]:
# NOTE(review): star imports hide where names come from. Later cells use
# DEV_BUCKET, SAGEMAKER_ROLE, SAGEMAKER_ROLE_ARN, MLFLOW_SERVER and
# formatted_time, none of which are defined in this notebook, so they are
# presumably provided by these two modules — confirm and prefer explicit imports.
from deep.constants import *
from deep.utils import *

In [None]:
%load_ext autoreload
%autoreload 2

# Data

You can use any data you want. We advise loading it as a `pandas` DataFrame.

In [None]:
def custom_eval(x):
    """Parse a serialized label cell into a Python object.

    Parameters
    ----------
    x : object
        Cell value: typically the string form of a Python literal
        (e.g. ``"['a', 'b']"``), an already-parsed list, or a missing value.

    Returns
    -------
    object
        ``{}`` when ``str(x)`` is ``'nan'`` or ``'[None]'``; ``x`` itself when
        it is a list (or a list subclass); otherwise ``literal_eval(x)``.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``literal_eval`` when ``x`` is not a valid literal.
    """
    # Missing values (NaN) and the '[None]' placeholder both map to an empty dict.
    if str(x) in ("nan", "[None]"):
        return {}
    # isinstance instead of an exact type comparison: list subclasses are
    # returned as-is rather than being handed to literal_eval, which rejects them.
    if isinstance(x, list):
        return x
    return literal_eval(x)

In [None]:
# CCA data
# NOTE: this alternative loader is disabled — it sits inside a string literal,
# so it is evaluated as a plain string and never executed.
"""DATA_PATH = os.path.join("..", "..", "..", "data", "frameworks_data", "development_cca")
experiment_name = 'pl-cca'
train_val_df = pd.read_csv(
    os.path.join(DATA_PATH, "train_val_data.csv.gz"), compression="gzip"
).drop_duplicates()
test_df = pd.read_csv(os.path.join(DATA_PATH, "test_data.csv.gz"), compression="gzip")
"""

# deployed data version: dataset directory and experiment name
DATA_PATH = os.path.join("..", "..", "..", "data", "frameworks_data", "data_v0.8")
experiment_name = "classification_v0_8"

# Columns retained from the raw file ('analysis_framework_id' is currently left out).
kept_cols = ["en", "fr", "es", "pt", "entry_id", "project_id", "nlp_tags", "original_language"]

# Read the gzip-compressed CSV and drop exact duplicate rows.
dataset_file = os.path.join(DATA_PATH, "hum_data_v0.8.csv.gz")
all_data = pd.read_csv(dataset_file, compression="gzip")
all_data = all_data.drop_duplicates()

# Keep only rows marked "unprotected", restrict to the retained columns and
# expose the tags column under the name "target".
unprotected_mask = all_data["confidentiality"] == "unprotected"
all_data = all_data.loc[unprotected_mask, kept_cols].rename(columns={"nlp_tags": "target"})

# subsectors
# NOTE: disabled — the loader below is wrapped in a string literal and is never executed.
"""DATA_PATH = os.path.join(
    '..', '..', '..', "data", "frameworks_data", 'subsectors', 'training_data'
)
experiment_name = 'pl-subsectors'
train_val_df = pd.read_csv(os.path.join(DATA_PATH, 'train_subsectors.csv')).drop_duplicates()
test_df = pd.read_csv(os.path.join(DATA_PATH, 'test_subsectors.csv'))[['entry_id', 'excerpt', 'target']]
train_val_df.shape[0], test_df.shape[0]"""

# livelihoods
# NOTE: disabled as well. This string literal is the final expression of the
# cell, so the notebook displays its text as the cell output.
"""DATA_PATH = os.path.join(
    "..", "..", "..", "data", "frameworks_data", "livelihoods_subsectors"
)
experiment_name = "pl-livelihoods"
train_val_df = pd.read_csv(
    os.path.join(DATA_PATH, "livelihoods_subsectors_labeled.csv"), lineterminator="\n"
).drop_duplicates()
test_df = pd.read_csv(
    os.path.join(
        DATA_PATH, "livelihoods_en_subsectors_unlabeled.csv"
    ), lineterminator="\n"
)"""

In [None]:
"""hum_mapping_sheet = pd.read_csv(os.path.join(DATA_PATH, "hum_mapping_sheet_nov2022.csv"))

relevant_cols = [
    "Original first level",
    "Original second level",
    "NLP Type",
    "NLP first level",
    "NLP second level",
    "NLP third level",
]

livelihood_mapping = hum_mapping_sheet[
    hum_mapping_sheet["Original first level"] == "livelihoods"
][relevant_cols].drop_duplicates()

livelihoods_mapping_dict = {
    f"subsectors->{row['Original first level']}->{row['Original second level']}": f"subsectors->{row['NLP first level']}->{row['NLP second level']}"
    for i, row in livelihood_mapping.iterrows()
}

train_val_df['target'] = train_val_df['target'].apply(
    lambda x: [livelihoods_mapping_dict[item.lower()] for item in custom_eval(x)]
)

train_val_df["target"] = train_val_df["target"].apply(
    lambda x: str([item for item in custom_eval(x) if "secondary" not in item])
)
"""

## Sagemaker Prep

### Session

Configure SageMaker

In [None]:
# SageMaker session whose default bucket is DEV_BUCKET.
# NOTE(review): `sess` is not passed to the estimator in the visible cells — confirm it is needed.
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)
# Role passed to the estimator (`role=role`) further down.
role = SAGEMAKER_ROLE
# NOTE(review): role_arn and tracking_uri are not referenced again in the visible
# cells (the hyperparameters read MLFLOW_SERVER directly) — confirm they are needed.
role_arn = SAGEMAKER_ROLE_ARN
tracking_uri = MLFLOW_SERVER

### Bucket upload

You need to upload data to an S3 bucket. 




In [None]:
# Echo the MLflow server constant as this cell's output.
# NOTE(review): the value ends up in the saved notebook output — clear it before
# sharing if the address is sensitive.
MLFLOW_SERVER

In [None]:
# Set to True to work on a random subset; later cells also use this flag to
# lower the number of epochs and the per-project entry threshold.
sample = False
# Size and seed of the optional subset. A fixed seed makes reruns pick the same rows.
sample_size = 5_000
sample_seed = 42

if sample:
    # Cap at the number of available rows: DataFrame.sample raises a ValueError
    # when n exceeds the number of rows and replace is False.
    all_data = all_data.sample(
        n=min(sample_size, len(all_data)), random_state=sample_seed
    )

print(all_data.shape)

job_name = f"pytorch-{formatted_time()}-entry-classification"  # change it as you prefer
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name  # Do not change this

# Serialize the training frame to the job's input location.
train_path = str(input_path / 'train.pickle')
all_data.to_pickle(train_path, protocol=4)  # protocol 4 is necessary, since SageMaker uses python 3.6

### Estimator Definition

In [None]:
# GPU instances
# Stored under its own name so the CPU list below no longer overwrites it.
gpu_instances = [
    'ml.p2.xlarge',
    'ml.p3.2xlarge',
]

# CPU instances
cpu_instances = [
    'ml.c4.2xlarge',
    'ml.c4.4xlarge',
    'ml.c5n.2xlarge',
]

# Backward compatibility: `instances` previously ended up holding the CPU list.
instances = list(cpu_instances)

# https://aws.amazon.com/sagemaker/pricing/instance-types

The hyperparameters are passed as command line arguments to the training script. 

You can add/change them as you like. It's important to keep the `tracking_uri` and the `experiment_name` which are used by MLFlow.

The class `PyTorch` is part of the `SageMaker` python API. The parameters are important and you should probably not change most of them. The ones you may want to change are:

- `instance_type`, specify the instance you want
- `source_dir`, specify your script directory. Try to use global variables as much as possible

In [None]:
from sagemaker.pytorch import PyTorch

# Instance used for the training job, and the run name (same as the experiment name).
instance_type = "ml.p3.2xlarge"
run_name = experiment_name  # "ENDPOINT_TESTING"# "all_tags_final"

# Settings that are reduced when working on the sampled dataset.
if sample:
    n_epochs, min_entries_per_proj = 1, 30
else:
    n_epochs, min_entries_per_proj = 3, 1_000

# Checkpoint used for both the model and its tokenizer.
# Alternative: "xlm-roberta-base" for both, with "output_length": 768.
backbone_name = "nreimers/mMiniLMv2-L6-H384-distilled-from-XLMR-Large"
backbone_output_length = 384

# Passed to the training script as command-line arguments; `tracking_uri` and
# `experiment_name` are the entries MLflow relies on.
hyperparameters = {
    "tracking_uri": MLFLOW_SERVER,
    "experiment_name": experiment_name,
    "max_len": 150,
    "delete_long_excerpts": "false",
    "apply_preprocessing": "false",
    "explainability": "false",
    # "true" if not all([sample_test, sample_train]) else "false",
    "predictions_on_test_set": "false",
    "epochs": n_epochs,
    "model_name": backbone_name,
    "tokenizer_name": backbone_name,
    "output_length": backbone_output_length,
    "dropout": 0.2,
    "learning_rate": 1e-4,
    "weight_decay": 5e-3,
    "instance_type": instance_type,
    "f_beta": 1,
    "nb_repetitions": 1,
    "run_name": run_name,
    "train_batch_size": 64,
    "val_batch_size": 128,
    "n_freezed_layers": 1,
    "loss_gamma": 2,
    "proportions_pow": 1,
    "min_entries_per_proj": min_entries_per_proj,
    "relabling_min_ratio": 1,
    "apply_relabling": "true",
}

# Directory holding the training entry point, and the S3 locations used by the job.
training_source_dir = "../../../scripts/training/selim/entry_classification/MultitaskWithRelabling"
models_output_path = str(DEV_BUCKET / "models/")
job_input_location = str(input_path)

estimator = PyTorch(
    entry_point="train.py",
    source_dir=training_source_dir,
    output_path=models_output_path,
    code_location=job_input_location,
    instance_type=instance_type,
    instance_count=1,
    role=role,
    framework_version="1.8",
    py_version="py3",
    hyperparameters=hyperparameters,
    job_name=job_name,
    debugger_hook_config=False,
    #     train_instance_count=2,
    #     train_instance_type="ml.c4.xlarge",
)

# Both channels point at the same input location.
fit_arguments = {"train": job_input_location, "test": job_input_location}

In [None]:
# Fit the estimator: start the training job under `job_name`, using the
# "train" and "test" inputs declared in `fit_arguments`.
estimator.fit(fit_arguments, job_name=job_name)