Using kernel `conda_pytorch_latest_p36`

# Import

In [1]:
import sys
sys.path.append('../../../')

In [2]:
from pathlib import Path
import os
import random
import json
from datetime import datetime

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
import torch
import sagemaker
from sagemaker import get_execution_role
import boto3

In [4]:
from deep.constants import *
from deep.utils import *

In [5]:
%load_ext autoreload
%autoreload 2

## Data

In [6]:
train = pd.read_csv(FRAMEWORKS_PATH / 'data_v0.3_train.csv')
val = pd.read_csv(FRAMEWORKS_PATH / 'data_v0.3_val.csv')
test = pd.read_csv(FRAMEWORKS_PATH / 'data_v0.3_test.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [7]:
columns_to_keep = ['excerpt'] + DIMENSION_CLASSES

In [8]:
def process_for_sector(df, sector):
    df = df[['excerpt', sector]]
        
    return df

In [9]:
train_df = process_for_sector(train, 'Humanitarian Conditions')
test_df = process_for_sector(test, 'Humanitarian Conditions')

## Sagemaker Prep

### Session

In [10]:
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)
role = 'AmazonSageMaker-ExecutionRole-20210519T102514'

### Bucket upload

In [11]:
sample = True

if sample:
    train_df = train_df.sample(1000)
    test_df = test_df.sample(1000)

    
job_name = f"pytorch-training-{formatted_time()}"
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name

s3_train_data = str(input_path / 'train_df.pickle')
s3_validation_data = str(input_path / 'test_df.pickle')

train_df.to_pickle(s3_train_data, protocol=4)
test_df.to_pickle(s3_validation_data, protocol=4)

### Estimator Definition

In [12]:
instances = [
    'ml.p2.xlarge',
    'ml.p3.2xlarge'
]

In [13]:
metric_definitions=[
    {'Name': 'loss', 'Regex': "'loss': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'learning_rate', 'Regex': "'learning_rate': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_loss', 'Regex': "'eval_loss': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_accuracy', 'Regex': "'eval_accuracy': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_f1', 'Regex': "'eval_f1': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_precision', 'Regex': "'eval_precision': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_recall', 'Regex': "'eval_recall': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_stupid_metric', 'Regex': "'eval_stupid_metric': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_runtime', 'Regex': "'eval_runtime': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_samples_per_second', 'Regex': "'eval_samples_per_second': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'epoch', 'Regex': "'epoch': ([0-9]+(.|e\-)[0-9]+),?"}]

In [14]:
# # set True if you need spot instance
# use_spot = True
# train_max_run_secs =   2*24 * 60 * 60
# spot_wait_sec =  5 * 60
# max_wait_time_secs = train_max_run_secs +  spot_wait_sec

# if not use_spot:
#     max_wait_time_secs = None
    
# # During local mode, no spot.., use smaller dataset
# if instance_type == 'local':
#     use_spot = False
#     max_wait_time_secs = 0
#     wait = True
#     # Use smaller dataset to run locally
#     inputs = inputs_sample


In [15]:
from sagemaker.pytorch import PyTorch

hyperparameters={'epochs': 1,
                 'train_batch_size': 32,
                 'model_name': 'distilbert-base-uncased'
                 }

estimator = PyTorch(
    entry_point='train.py',
    source_dir=str(SCRIPTS_TRAINING_PATH / 'stefano/pytorch_estimator_base'),
    output_path=str(DEV_BUCKET / 'models/'),
    code_location=str(input_path),
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    framework_version='1.8',
    py_version='py36',
    hyperparameters = hyperparameters,
    metric_definitions=metric_definitions,
    job_name=job_name,
#     train_instance_count=2,
#     train_instance_type="ml.c4.xlarge",
)

In [16]:
fit_arguments = {
    'train': str(input_path),
    'test': str(input_path)
}

In [17]:
estimator.fit(fit_arguments, job_name=job_name)

2021-05-31 07:53:10 Starting - Starting the training job...
2021-05-31 07:53:33 Starting - Launching requested ML instancesProfilerReport-1622447587: InProgress
......
2021-05-31 07:54:33 Starting - Preparing the instances for training......
2021-05-31 07:55:54 Downloading - Downloading input data...
2021-05-31 07:56:14 Training - Downloading the training image...............
2021-05-31 07:58:59 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-05-31 07:59:00,502 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-05-31 07:59:00,525 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-05-31 07:59:00,533 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-05-31 07:59:00,872 sagemaker-training-

[34m2021-05-31 07:59:24,823 - filelock - INFO - Lock 139830359412576 released on /root/.cache/huggingface/transformers/9c169103d7e5a73936dd2b627e42851bec0831212b677c637033ee4bce9ab5ee.126183e36667471617ae2f0835fab707baa54b731f991507ebbb55ea85adb12a.lock[0m
[34mSome weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias'][0m
[34m- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).[0m
[34m- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForS

[34m{'loss': 0.6801, 'learning_rate': 2.5e-06, 'epoch': 0.78}[0m
[34m{'loss': 0.7009, 'learning_rate': 2.6e-06, 'epoch': 0.81}[0m
[34m{'loss': 0.6959, 'learning_rate': 2.7e-06, 'epoch': 0.84}[0m
[34m{'loss': 0.6941, 'learning_rate': 2.8000000000000003e-06, 'epoch': 0.88}[0m
[34m{'loss': 0.7043, 'learning_rate': 2.9e-06, 'epoch': 0.91}[0m
[34m{'loss': 0.695, 'learning_rate': 3e-06, 'epoch': 0.94}[0m
[34m{'loss': 0.6843, 'learning_rate': 3.1e-06, 'epoch': 0.97}[0m
[34m{'loss': 0.6808, 'learning_rate': 3.2000000000000003e-06, 'epoch': 1.0}[0m
[34m{'eval_loss': 0.6882852911949158, 'eval_accuracy': 0.561, 'eval_f1': 0.5990867579908675, 'eval_precision': 0.5273311897106109, 'eval_recall': 0.693446088794926, 'eval_stupid_metric': 1.0, 'eval_runtime': 4.6093, 'eval_samples_per_second': 216.952, 'epoch': 1.0}[0m
[34m{'train_runtime': 22.6518, 'train_samples_per_second': 1.413, 'epoch': 1.0}[0m
[34m***** Eval results *****[0m
[34m#015Downloading:   0%|          | 0.00/442 


2021-05-31 08:00:15 Uploading - Uploading generated training model
2021-05-31 08:00:45 Completed - Training job completed
Training seconds: 296
Billable seconds: 296


In [18]:
from sagemaker import TrainingJobAnalytics

# Captured metrics can be accessed as a Pandas dataframe
df = TrainingJobAnalytics(training_job_name=estimator.latest_training_job.name).dataframe()
df.head(10)

Unnamed: 0,timestamp,metric_name,value
0,0.0,loss,0.697225
1,0.0,learning_rate,3.35
2,0.0,eval_loss,0.688285
3,0.0,eval_accuracy,0.561
4,0.0,eval_f1,0.599087
5,0.0,eval_precision,0.527331
6,0.0,eval_recall,0.693446
7,0.0,eval_stupid_metric,1.0
8,0.0,eval_runtime,4.6093
9,0.0,eval_samples_per_second,216.952


In [19]:
df

Unnamed: 0,timestamp,metric_name,value
0,0.0,loss,0.697225
1,0.0,learning_rate,3.35
2,0.0,eval_loss,0.688285
3,0.0,eval_accuracy,0.561
4,0.0,eval_f1,0.599087
5,0.0,eval_precision,0.527331
6,0.0,eval_recall,0.693446
7,0.0,eval_stupid_metric,1.0
8,0.0,eval_runtime,4.6093
9,0.0,eval_samples_per_second,216.952
