# Import

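These packages are required if you launch this notebook from a SageMaker instance. The install commands in the next cell are wrapped in a string literal, so they do not run as-is; remove the triple quotes to execute them.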

In [1]:
"""!pip install mlflow
!pip install pytorch-lightning
!pip install transformers
!pip install tqdm
!pip install sagemaker
!pip install s3fs"""

'!pip install mlflow\n!pip install pytorch-lightning\n!pip install transformers\n!pip install tqdm\n!pip install sagemaker\n!pip install s3fs'

In [2]:
import sys
# Add the directory three levels above this notebook to the import path
# (presumably the repository root, so that the `deep` package imported
# further down resolves; TODO confirm).
sys.path.append('../../../')

import os
import sys
import logging
import argparse
from pathlib import Path
from ast import literal_eval
from collections import Counter
from typing import Any, Dict, Optional

In [3]:
from tqdm.auto import tqdm

import torchmetrics
from torchmetrics.functional import accuracy, f1, auroc

import sagemaker
from sagemaker import get_execution_role
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.core.decorators import auto_move_data
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import MLFlowLogger


import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix


import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc


Local constants (data locations, MLFlow server, paths, etc.) are provided by `deep.constants`: use them instead of hard-coding values.

In [4]:
from deep.constants import *
from deep.utils import *

  and should_run_async(code)


In [5]:
%load_ext autoreload
%autoreload 2

# Data

You can use whatever data you want. We advise loading it as a `pandas` DataFrame.

In [6]:
# Location of the entries CSV, expressed relative to this notebook's directory.
DATA_PATH = os.path.join(
    '..', '..', '..',
    "data",
    "frameworks_data",
    "data_v0.6",
    "generated_entries",
    "entries_df.csv",
)
#VAL_PATH = os.path.join('..', '..', '..', "data", "frameworks_data", "data_v0.4.4", "data_v0.4.4_val.csv")

# The first CSV column becomes the index; rows containing any missing value are dropped.
df = pd.read_csv(DATA_PATH, index_col=0).dropna()

# Seed the shuffle so the train/validation split is identical on every
# "Restart Kernel -> Run All" (an unseeded split changes on each run).
train_df, val_df = train_test_split(df, random_state=42)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [7]:
# Display the training split (pandas truncates the rendering of long frames).
train_df

  and should_run_async(code)


Unnamed: 0,id,excerpt,language
137401,287487,A raíz de la pandemia por covid-19 la salud me...,es
195421,389587,No existe información de contraste de estos in...,es
134780,290891,Internews media partners continue to address m...,en
14045,64297,Con respecto a: Las capturas de ciudadanos ven...,es
39188,96415,Drones and helicopters usually reserved for ch...,en
...,...,...,...
184363,309176,(15-21 Nov 2020)Under the Global Fund multicou...,en
113870,266787,"UNICEF data shows that some 40,000 children wo...",en
127944,264692,"[3rd Feb 2021, Bangladesh] Thirteen more peopl...",en
32949,21148,There are severe shortages of basic items such...,en


## Sagemaker Prep

### Session

Configure SageMaker

In [8]:
# SageMaker session that uses the dev bucket as its default bucket.
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)

# Execution role and its ARN, both taken from the imported constants.
role, role_arn = SAGEMAKER_ROLE, SAGEMAKER_ROLE_ARN

  and should_run_async(code)


### Bucket upload

You need to upload the training and validation data to an S3 bucket; the training job reads its inputs from there.




In [9]:
sample = False  # Set to True to work on a small subset and make the computations faster.

if sample:
    # Cap the sample size at the size of each split: asking for more rows than
    # exist raises a ValueError. The seed makes re-runs pick the same rows.
    train_df = train_df.sample(n=min(1000, len(train_df)), random_state=42)
    val_df = val_df.sample(n=min(1000, len(val_df)), random_state=42)

job_name = f"pytorch-{formatted_time()}-subpillars-model"  # change it as you prefer
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name  # Do not change this

# Both splits are stored under the same job-specific prefix.
train_path = str(input_path / 'train.pickle')
val_path = str(input_path / 'val.pickle')


train_df.to_pickle(train_path, protocol=4)  # protocol 4 is necessary, since SageMaker uses python 3.6
val_df.to_pickle(val_path, protocol=4)

### Estimator Definition

In [10]:
# GPU instance types
instances = ['ml.p2.xlarge', 'ml.p3.2xlarge']

In [11]:
# Display the bucket object that the paths above are built from.
DEV_BUCKET

S3Path('s3://sagemaker-deep-experiments-dev')

The hyperparameters are passed as command line arguments to the training script. 

You can add or change them as you like. It's important to keep `tracking_uri` and `experiment_name`, which are used by MLFlow.

The class `PyTorch` is part of the `SageMaker` python API. The parameters are important and you should probably not change most of them. The ones you may want to change are:

- `instance_type`, specify the instance you want
- `source_dir`, specify your script directory. Try to use global variables as much as possible

In [12]:
from sagemaker.pytorch import PyTorch


# Passed to the training script as command-line arguments.
# `tracking_uri` and `experiment_name` are the ones MLFlow uses.
hyperparameters = dict(
    tracking_uri=MLFLOW_SERVER,
    experiment_name='all_languages_subpillars',
    batch_size=4,
)

estimator = PyTorch(
    # What to run
    source_dir='../../../scripts/training/selim/data-augmentation',
    entry_point='augment_data.py',
    hyperparameters=hyperparameters,
    # How to run it
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    framework_version='1.8',
    py_version='py36',
    # Where the code and the outputs are stored
    code_location=str(input_path),
    output_path=str(DEV_BUCKET / 'models/'),
    job_name=job_name,
#     train_instance_count=2,
#     train_instance_type="ml.c4.xlarge",
)

In [13]:
# The 'train' and 'test' channels both point at the same input prefix.
fit_arguments = {channel: str(input_path) for channel in ('train', 'test')}

In [14]:
# Launch the training job on the input channels, under the chosen job name.
estimator.fit(inputs=fit_arguments, job_name=job_name)

2021-09-02 14:00:30 Starting - Starting the training job...
2021-09-02 14:00:54 Starting - Launching requested ML instancesProfilerReport-1630591227: InProgress
.........
2021-09-02 14:02:35 Starting - Preparing the instances for training.........
2021-09-02 14:04:19 Downloading - Downloading input data
2021-09-02 14:04:19 Training - Downloading the training image...........................[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-09-02 14:09:05,121 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-09-02 14:09:05,145 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-09-02 14:09:05,383 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-09-02 14:09:06,230 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m


2021-09-02 14:09:57 Training - Training image download completed. Training in progress.[34m  Downloading boto3-1.17.108-py2.py3-none-any.whl (131 kB)
  Downloading boto3-1.17.107-py2.py3-none-any.whl (131 kB)
  Downloading boto3-1.17.106-py2.py3-none-any.whl (131 kB)[0m
[34mCollecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.7-py3-none-any.whl (63 kB)[0m
[34mCollecting smmap<5,>=3.0.1
  Downloading smmap-4.0.0-py2.py3-none-any.whl (24 kB)[0m
[34mCollecting tensorboard-data-server<0.7.0,>=0.6.0
  Downloading tensorboard_data_server-0.6.1-py3-none-manylinux2010_x86_64.whl (4.9 MB)[0m
[34mCollecting google-auth<2,>=1.6.3
  Downloading google_auth-1.35.0-py2.py3-none-any.whl (152 kB)[0m
[34mCollecting markdown>=2.6.8
  Downloading Markdown-3.3.4-py3-none-any.whl (97 kB)[0m
[34mCollecting google-auth-oauthlib<0.5,>=0.4.1
  Downloading google_auth_oauthlib-0.4.6-py2.py3-none-any.whl (18 kB)[0m
[34mCollecting tensorboard-plugin-wit>=1.6.0
  Downloading tensorboard_plugin_wit-1.

[34m      Successfully uninstalled six-1.16.0
  Attempting uninstall: typing-extensions
    Found existing installation: typing-extensions 3.10.0.0
    Uninstalling typing-extensions-3.10.0.0:
      Successfully uninstalled typing-extensions-3.10.0.0
  Attempting uninstall: numpy
    Found existing installation: numpy 1.19.1[0m
[34m    Uninstalling numpy-1.19.1:
      Successfully uninstalled numpy-1.19.1[0m
[34m  Attempting uninstall: botocore
    Found existing installation: botocore 1.20.110
    Uninstalling botocore-1.20.110:
      Successfully uninstalled botocore-1.20.110[0m
[34m  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.51.0
    Uninstalling tqdm-4.51.0:
      Successfully uninstalled tqdm-4.51.0[0m
[34m  Attempting uninstall: h5py
    Found existing installation: h5py 2.8.0
    Uninstalling h5py-2.8.0:[0m
[34m      Successfully uninstalled h5py-2.8.0[0m
[34m  Attempting uninstall: boto3
    Found existing installation: boto3 1.17.110
    U

[34m[2021-09-02 14:10:54.687 algo-1:85 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2021-09-02 14:10:54.723 algo-1:85 INFO profiler_config_parser.py:102] User has disabled profiler.[0m
[34m[2021-09-02 14:10:54.724 algo-1:85 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.[0m
[34m[2021-09-02 14:10:54.725 algo-1:85 INFO hook.py:201] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.[0m
[34m[2021-09-02 14:10:54.725 algo-1:85 INFO hook.py:255] Saving to /opt/ml/output/tensors[0m
[34m[2021-09-02 14:10:54.725 algo-1:85 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.[0m
[34m[2021-09-02 14:10:54.727 algo-1:85 INFO hook.py:594] name:embed_tokens.weight count_params:33280512[0m
[34m[2021-09-02 14:10:54.727 algo-1:85 INFO hook.py:594] name:layers.0.self_attn.k_proj.weight count_params:262144[0m
[34m[202


2021-09-02 16:01:07 Stopping - Stopping the training job

KeyboardInterrupt: 