# Import

These packages are required if you launch this notebook from a SageMaker instance.

In [None]:
# Disabled install cell: the commands below are wrapped in a string literal,
# so nothing is installed when this cell runs (the string is only echoed).
# NOTE(review): if this cell is re-enabled, consider `%pip install` with pinned
# versions so the packages are installed into the kernel's environment.
"""!pip install mlflow
!pip install pytorch-lightning
!pip install transformers
!pip install tqdm
!pip install sagemaker

!pip install s3fs
!pip install smdebug"""

In [None]:
import sys
sys.path.append('../../../')

import os
import sys
import logging
import argparse
from pathlib import Path
from typing import Any, Dict, Optional

In [None]:
from tqdm.auto import tqdm

import torchmetrics
from torchmetrics.functional import accuracy, f1, auroc

import sagemaker
from sagemaker import get_execution_role
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.core.decorators import auto_move_data
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import MLFlowLogger


import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc

import pandas as pd

Local constants (data locations, MLflow server, paths, etc.) are imported below: use them rather than hard-coding values.

In [None]:
from deep.constants import *
from deep.utils import *

In [None]:
%load_ext autoreload
%autoreload 2

# Data

You can use whatever data you want. We recommend loading it as a `pandas` DataFrame.

In [None]:
# Folder holding the v0.7.1 framework datasets, relative to this notebook.
DATA_PATH = os.path.join(
    '..', '..', '..', "data", "frameworks_data", 'data_v0.7.1'
)

# Earlier versions of this cell also read `full_dataset.csv`,
# `generated_text.csv` and `full_dataset_with_translations.csv` from DATA_PATH.
# Those loads were disabled; they are recorded here as plain comments instead
# of string literals so the cell no longer echoes dead code as its output.

# Table written to `train.pickle` further down.
tot_df = pd.read_csv(os.path.join(DATA_PATH, 'prim_tags.csv'))

# Only the `excerpt` column of the test file is kept, so parse just that
# column instead of loading every column and slicing afterwards.
test_df = pd.read_csv(
    os.path.join(DATA_PATH, 'test_v0.7.1.csv'), usecols=['excerpt']
)

In [None]:
# Display the column labels of `tot_df` as the cell output.
tot_df.columns

In [None]:
# Disabled cell: the code is held in a string literal and is never executed.
# It would stack the `translation_fr` / `translation_en` / `translation_es`
# columns of `full_df` into a single `excerpt` column (`augmented_data`).
# NOTE(review): it refers to `train_df`, `val_df` and `full_df`, none of which
# is defined by the live cells visible in this notebook.
"""train_val_ids = pd.concat([train_df, val_df]).entry_id.unique()
train_val_df = full_df[full_df.entry_id.isin(train_val_ids)]

fr_df = train_val_df[['entry_id', 'translation_fr']]\
        .rename(columns={'translation_fr':'excerpt'}).dropna()
en_df = train_val_df[['entry_id', 'translation_en']]\
        .rename(columns={'translation_en':'excerpt'}).dropna()
es_df = train_val_df[['entry_id', 'translation_es']]\
        .rename(columns={'translation_es':'excerpt'}).dropna()

augmented_data = pd.concat([en_df, fr_df, es_df])


print('fr:', fr_df.shape[0], 'en:', en_df.shape[0], 'es:', es_df.shape[0])"""

In [None]:
# Disabled cell: the function below lives inside a string literal, so
# `preprocess_columns` is not actually defined.
# As written it would parse a stringified list and drop the 'NOT_MAPPED' and
# 'None' entries, returning [] for a 'nan' value.
# NOTE(review): it calls `literal_eval`, whose import is itself inside a later
# disabled cell; import it first if this is re-enabled.
"""def preprocess_columns(tmp_col):
    if str(tmp_col) == 'nan':
        return []
    evaluated_column = literal_eval(tmp_col)
    cleaned_column = [item for item in evaluated_column if str(item)!='NOT_MAPPED' and str(item)!='None']
    return cleaned_column"""

In [None]:
# Disabled cell (string literal, never executed).
# It would import `literal_eval` and apply `preprocess_columns` to each listed
# tag column of `train_val_df`. The `flatten` helper it declares is not called
# anywhere inside this cell.
"""from ast import literal_eval
columns = [
     'sectors',
     'severity',
     'age',
     'gender',
     'subpillars_1d', 
     'specific_needs_groups',
     'subpillars_2d', 
    ]
def flatten(t):
    return [item for sublist in t for item in sublist]
for column in columns:
    train_val_df[column] = train_val_df[column].apply(preprocess_columns)
"""

In [None]:
# Disabled cell (string literal, never executed).
# It would right-join the augmented excerpts with `train_val_df` on `entry_id`,
# after dropping the original excerpt and translation columns from the latter.
"""augmented_data = pd.merge(
    right=train_val_df.drop(columns=[
        'excerpt', 'translation_en', 'translation_fr', 'translation_es']
                           ),
    left=augmented_data[['entry_id', 'excerpt']],
    on='entry_id',
    how='right'
)"""

In [None]:
# Disabled cell (string literal, never executed): would rebuild `tot_df` by
# concatenating `train_val_df` and `augmented_data`.
"""tot_df = pd.concat([train_val_df, augmented_data])"""

In [None]:
# Disabled cell (string literal, never executed).
# It would add a `column_present` column listing, per row, the tag columns
# whose value has a length greater than 2.
# NOTE(review): re-enabling it would rebind the global `columns`, which the
# live cells further down also assign and use.
"""columns = [
     'sectors',
     'severity',
     'age',
     'gender',
     'subpillars_1d', 
     'specific_needs_groups',
     'subpillars_2d', 
    ]
def flatten(t):
    return [item for sublist in t for item in sublist]
tot_df['column_present'] = tot_df.apply(
    lambda x: [column for column in columns if len(x[column])>2], axis=1
               )"""

In [None]:
# Disabled cell (string literal, never executed): would write `tot_df` to
# `train_val_df.csv` under DATA_PATH.
"""tot_df.to_csv(os.path.join(DATA_PATH, 'train_val_df.csv'))"""

In [None]:
# Disabled cell (string literal, never executed).
# It would right-join the augmented excerpts with `original_df` (minus its
# `excerpt` column) on `entry_id`.
# NOTE(review): `original_df` is only assigned inside another disabled string.
"""augmented_data = pd.merge(
    right=original_df.drop(columns=['excerpt']),
    left=augmented_data[['entry_id', 'excerpt']],
    on='entry_id',
    how='right'
)"""

In [None]:
# Disabled cell (string literal, never executed): would rebuild `tot_df` by
# concatenating `original_df` and `augmented_data`.
"""tot_df = pd.concat([original_df, augmented_data])"""

In [None]:
# Disabled cell (string literal): `literal_eval` is NOT imported by this line.
"""from ast import literal_eval"""

In [None]:
# Disabled cell (string literal, never executed).
# It would derive `pillars_1d` / `pillars_2d` from the stringified subpillar
# lists by keeping the text before '->' in every entry.
"""tot_df['pillars_1d'] = tot_df.subpillars_1d.apply(
    lambda item: [x.split('->')[0] for x in literal_eval(item)]
)
tot_df['pillars_2d'] = tot_df.subpillars_2d.apply(
    lambda item: [x.split('->')[0] for x in literal_eval(item)]
)"""

In [None]:
# Columns kept in the table that is uploaded for training.
# Order matters: the hyperparameters cell further down passes `columns[2:]`
# as `training_names`, so the first two entries must stay the non-target
# columns.
columns = [
    'excerpt',
    'entry_id',
    'sectors',
    'present_prim_tags',
    'pillars_2d',
    'pillars_1d',
    'impact_capresp_humcond',
    'need_intervention_risk',
    'context_covid',
    'displacement_shockevent',
    'access_infcom_casualities',
]

# Keep only the listed columns, in the listed order.
tot_df = tot_df.loc[:, columns]

## Sagemaker Prep

### Session

Configure SageMaker

In [None]:
# SageMaker session whose default bucket is the project's DEV_BUCKET.
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)

# Short local aliases for project-level constants (presumably provided by the
# star imports from `deep.constants` / `deep.utils` — confirm).
role, role_arn, tracking_uri = SAGEMAKER_ROLE, SAGEMAKER_ROLE_ARN, MLFLOW_SERVER

### Bucket upload

You need to upload data to an S3 bucket. 




In [None]:
# Display the MLFLOW_SERVER constant as the cell output.
# NOTE(review): if this value is an internal address, clear the output before
# sharing the notebook.
MLFLOW_SERVER

In [None]:
sample = False  # Set to True to train on a random subset and make the computations faster.
SAMPLE_SIZE = 20_000  # upper bound on the number of rows kept when `sample` is True
SAMPLE_SEED = 42  # fixed seed so the subset is identical across runs

if sample:
    # DataFrame.sample raises ValueError when n exceeds the number of rows
    # (sampling without replacement), so cap n at the frame length.
    tot_df = tot_df.sample(
        n=min(SAMPLE_SIZE, len(tot_df)), random_state=SAMPLE_SEED
    )

job_name = f"pytorch-{formatted_time()}-all-models"  # change it as you prefer
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name  # Do not change this

# Both pickles are written under the job-specific input folder.
train_path = str(input_path / 'train.pickle')
val_path = str(input_path / 'val.pickle')

# Note that `val.pickle` receives `test_df` (the excerpt-only test table).
tot_df.to_pickle(train_path, protocol=4)  # protocol 4 is necessary, since SageMaker uses python 3.6
test_df.to_pickle(val_path, protocol=4)

### Estimator Definition

In [None]:
# GPU instance types, listed for reference.
instances = ['ml.p2.xlarge', 'ml.p3.2xlarge']

The hyperparameters are passed as command line arguments to the training script. 

You can add/change them as you like. It's important to keep the `tracking_uri` and the `experiment_name` which are used by MLFlow.

The class `PyTorch` is part of the `SageMaker` python API. The parameters are important and you should probably not change most of them. The ones you may want to change are:

- `instance_type`, specify the instance you want
- `source_dir`, specify your script directory. Try to use global variables as much as possible

In [None]:
from sagemaker.pytorch import PyTorch

# Instance type for the training job; it is also forwarded to the training
# script as a hyperparameter.
instance_type = 'ml.p3.2xlarge'

# Values handed to the training script. `training_names` is the
# comma-separated list of every entry of `columns` after the first two.
hyperparameters = dict(
    tracking_uri=MLFLOW_SERVER,
    experiment_name="pl-all-models-experiments",
    max_len=512,
    epochs=5,
    model_name='microsoft/xtremedistil-l6-h256-uncased',
    tokenizer_name='microsoft/xtremedistil-l6-h256-uncased',
    dropout_rate=0.4,
    output_length=256,
    training_names=','.join(columns[2:]),
    instance_type=instance_type,
    beta_f1=0.8,
)

# NOTE(review): `job_name` is not among the documented estimator constructor
# arguments (`base_job_name` is) — confirm it has any effect here. The same
# name is also passed to `estimator.fit`.
estimator = PyTorch(
    entry_point='train_mlflow.py',
    source_dir='../../../scripts/training/selim/multiclass-lightning',
    role=role,
    instance_type=instance_type,
    instance_count=1,
    framework_version="1.8",
    py_version="py36",
    hyperparameters=hyperparameters,
    output_path=str(DEV_BUCKET/'models/'),
    code_location=str(input_path),
    job_name=job_name,
)

In [None]:
# Both input channels point at the same job-specific input folder.
fit_arguments = {channel: str(input_path) for channel in ('train', 'test')}

In [None]:
# Launch the training job with the channels defined above, under the
# explicit job name.
estimator.fit(inputs=fit_arguments, job_name=job_name)