Using kernel `conda_pytorch_latest_p36`

In [1]:
import sys
sys.path.append('../../../')

In [2]:
from pathlib import Path
import os
import random

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
import torch

In [4]:
from deep.constants import *

In [5]:
%load_ext autoreload
%autoreload 2

## Data

In [6]:
data = FRAMEWORKS_PATH

In [7]:
afexportable = pd.read_csv(data /'afexportable_of_af_of_projects_of_interest.csv')
all_afs = pd.read_csv(data /'all_afs.csv')

proj_interest = pd.read_csv(data / 'projects_of_interest.csv')
entr_proj_interest = pd.read_csv(data / 'entries_of_projects_of_interest.csv')
exp_proj_interest = pd.read_csv(data / 'exportdata_of_entries_of_projects_of_interest.csv')
wid_proj_interest = pd.read_csv(data / 'widgets_of_afs_of_interest.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [8]:
train = pd.read_csv(FRAMEWORKS_PATH / 'data_v0.3_train.csv')
val = pd.read_csv(FRAMEWORKS_PATH / 'data_v0.3_val.csv')
test = pd.read_csv(FRAMEWORKS_PATH / 'data_v0.3_test.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [9]:
columns_to_keep = ['excerpt'] + DIMENSION_CLASSES

In [10]:
def process_for_sector(df, sector):
    df = df[['excerpt', sector]]
        
    return df

In [11]:
train_df = process_for_sector(train, 'Humanitarian Conditions')
test_df = process_for_sector(test, 'Humanitarian Conditions')

In [12]:
train_df.sample(1000).to_pickle('train_df.pickle', protocol=4)
test_df.sample(1000).to_pickle('test_df.pickle', protocol=4)

## Sagemaker Prep

In [13]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3

default_bucket = 'deep-experiments-sagemaker-bucket'
sess = sagemaker.Session(default_bucket=default_bucket)

role = 'AmazonSageMaker-ExecutionRole-20210519T102514'
print(
    role
)  # This is the role that SageMaker would use to leverage AWS resources (S3, CloudWatch) on your behalf

bucket = SAGEMAKER_BUCKET
prefix = "huggingface/first"  # Replace with the prefix under which you want to store the data if needed


AmazonSageMaker-ExecutionRole-20210519T102514


### Bucket upload

In [14]:
bucket_path = 'test1/data'
train_channel = bucket_path + "/train_df.pickle"
validation_channel = bucket_path + "/test_df.pickle"

sess.upload_data(path="train_df.pickle", bucket=SAGEMAKER_BUCKET, key_prefix=bucket_path)
sess.upload_data(path="test_df.pickle", bucket=SAGEMAKER_BUCKET, key_prefix=bucket_path)

s3_train_data = f"s3://{SAGEMAKER_BUCKET}/{train_channel}"
s3_validation_data = f"s3://{SAGEMAKER_BUCKET}/{validation_channel}"

s3_output_location = f"s3://{SAGEMAKER_BUCKET}/{bucket_path}/output"

In [15]:
instances = [
    'ml.p2.xlarge',
    'ml.p3.2xlarge'
]

In [16]:
metric_definitions=[
    {'Name': 'loss', 'Regex': "'loss': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'learning_rate', 'Regex': "'learning_rate': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_loss', 'Regex': "'eval_loss': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_accuracy', 'Regex': "'eval_accuracy': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_f1', 'Regex': "'eval_f1': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_precision', 'Regex': "'eval_precision': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_recall', 'Regex': "'eval_recall': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_runtime', 'Regex': "'eval_runtime': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_samples_per_second', 'Regex': "'eval_samples_per_second': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'epoch', 'Regex': "'epoch': ([0-9]+(.|e\-)[0-9]+),?"}]

In [None]:
# set True if you need spot instance
use_spot = True
train_max_run_secs =   2*24 * 60 * 60
spot_wait_sec =  5 * 60
max_wait_time_secs = train_max_run_secs +  spot_wait_sec

if not use_spot:
    max_wait_time_secs = None
    
# During local mode, no spot.., use smaller dataset
if instance_type == 'local':
    use_spot = False
    max_wait_time_secs = 0
    wait = True
    # Use smaller dataset to run locally
    inputs = inputs_sample


In [17]:
from sagemaker.pytorch import PyTorch

hyperparameters={'epochs': 4,
                 'train_batch_size': 32,
                 'model_name': 'distilbert-base-uncased'
                 }

estimator = PyTorch(
    entry_point='train.py',
    source_dir=str(SCRIPTS_TRAINING_PATH / 'stefano/pytorch_estimator_base'),
    output_path=f's3://{SAGEMAKER_BUCKET}/models/',
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    framework_version='1.8',
    py_version='py36',
    hyperparameters = hyperparameters,
    metric_definitions=metric_definitions
#     train_instance_count=2,
#     train_instance_type="ml.c4.xlarge",
)

In [18]:
fit_arguments = {
    'train': f's3://{SAGEMAKER_BUCKET}/{bucket_path}',
    'test': f's3://{SAGEMAKER_BUCKET}/{bucket_path}'
}

In [19]:
estimator.fit(fit_arguments)

2021-05-27 08:20:44 Starting - Starting the training job...
2021-05-27 08:21:10 Starting - Launching requested ML instancesProfilerReport-1622103642: InProgress
......
2021-05-27 08:22:10 Starting - Preparing the instances for training.........
2021-05-27 08:23:51 Downloading - Downloading input data
2021-05-27 08:23:51 Training - Downloading the training image....................[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-05-27 08:27:25,292 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-05-27 08:27:25,318 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-05-27 08:27:25,327 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-05-27 08:27:25,795 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/opt/conda

[34m2021-05-27 08:27:45,693 - __main__ - INFO -  loaded train_dataset length is: (1000, 2)[0m
[34m2021-05-27 08:27:45,693 - __main__ - INFO -  loaded test_dataset length is: (1000, 2)[0m
[34m2021-05-27 08:27:45,889 - filelock - INFO - Lock 140223429300856 acquired on /root/.cache/huggingface/transformers/0e1bbfda7f63a99bb52e3915dcf10c3c92122b827d92eb2d34ce94ee79ba486c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock[0m
[34m2021-05-27 08:27:45,941 - filelock - INFO - Lock 140223429300856 released on /root/.cache/huggingface/transformers/0e1bbfda7f63a99bb52e3915dcf10c3c92122b827d92eb2d34ce94ee79ba486c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock[0m
[34m2021-05-27 08:27:45,977 - filelock - INFO - Lock 140223429300856 acquired on /root/.cache/huggingface/transformers/75abb59d7a06f4f640158a9bfcde005264e59e8d566781ab1415b139d2e4c603.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4.lock[0m
[34m2021-05-27 08:27:46,027 - f

[34m{'loss': 0.69, 'learning_rate': 6.000000000000001e-07, 'epoch': 0.19}[0m
[34m{'loss': 0.6903, 'learning_rate': 7.000000000000001e-07, 'epoch': 0.22}[0m
[34m{'loss': 0.6891, 'learning_rate': 8.000000000000001e-07, 'epoch': 0.25}[0m
[34m{'loss': 0.6858, 'learning_rate': 9e-07, 'epoch': 0.28}[0m
[34m{'loss': 0.6786, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.31}[0m
[34m{'loss': 0.6897, 'learning_rate': 1.1e-06, 'epoch': 0.34}[0m
[34m{'loss': 0.6946, 'learning_rate': 1.2000000000000002e-06, 'epoch': 0.38}[0m
[34m{'loss': 0.6874, 'learning_rate': 1.3e-06, 'epoch': 0.41}[0m
[34m{'loss': 0.6997, 'learning_rate': 1.4000000000000001e-06, 'epoch': 0.44}[0m
[34m{'loss': 0.7027, 'learning_rate': 1.5e-06, 'epoch': 0.47}[0m
[34m{'loss': 0.6908, 'learning_rate': 1.6000000000000001e-06, 'epoch': 0.5}[0m
[34m{'loss': 0.6998, 'learning_rate': 1.7000000000000002e-06, 'epoch': 0.53}[0m
[34m{'loss': 0.6765, 'learning_rate': 1.8e-06, 'epoch': 0.56}[0m
[34m{'loss': 0.6

[34m{'loss': 0.4976, 'learning_rate': 1.1500000000000002e-05, 'epoch': 3.59}[0m
[34m{'loss': 0.5648, 'learning_rate': 1.16e-05, 'epoch': 3.62}[0m
[34m{'loss': 0.5483, 'learning_rate': 1.1700000000000001e-05, 'epoch': 3.66}[0m
[34m{'loss': 0.4571, 'learning_rate': 1.18e-05, 'epoch': 3.69}[0m
[34m{'loss': 0.6299, 'learning_rate': 1.19e-05, 'epoch': 3.72}[0m
[34m{'loss': 0.4893, 'learning_rate': 1.2e-05, 'epoch': 3.75}[0m
[34m{'loss': 0.5084, 'learning_rate': 1.2100000000000001e-05, 'epoch': 3.78}[0m
[34m{'loss': 0.4941, 'learning_rate': 1.22e-05, 'epoch': 3.81}[0m
[34m{'loss': 0.4271, 'learning_rate': 1.23e-05, 'epoch': 3.84}[0m
[34m{'loss': 0.6889, 'learning_rate': 1.24e-05, 'epoch': 3.88}[0m
[34m{'loss': 0.5439, 'learning_rate': 1.25e-05, 'epoch': 3.91}[0m
[34m{'loss': 0.56, 'learning_rate': 1.2600000000000001e-05, 'epoch': 3.94}[0m
[34m{'loss': 0.4209, 'learning_rate': 1.27e-05, 'epoch': 3.97}[0m
[34m{'loss': 0.9936, 'learning_rate': 1.2800000000000001e-05, 


2021-05-27 08:29:52 Uploading - Uploading generated training model
2021-05-27 08:30:32 Completed - Training job completed
ProfilerReport-1622103642: NoIssuesFound
Training seconds: 407
Billable seconds: 407


In [20]:
from sagemaker import TrainingJobAnalytics

# Captured metrics can be accessed as a Pandas dataframe
df = TrainingJobAnalytics(training_job_name=estimator.latest_training_job.name).dataframe()
df.head(10)

Unnamed: 0,timestamp,metric_name,value
0,0.0,loss,0.677442
1,60.0,loss,0.66984
2,0.0,learning_rate,5.025
3,60.0,learning_rate,2.86
4,0.0,eval_loss,0.67667
5,60.0,eval_loss,0.60749
6,0.0,eval_accuracy,0.585
7,60.0,eval_accuracy,0.6655
8,0.0,eval_f1,0.474391
9,60.0,eval_f1,0.648575


In [21]:
df

Unnamed: 0,timestamp,metric_name,value
0,0.0,loss,0.677442
1,60.0,loss,0.66984
2,0.0,learning_rate,5.025
3,60.0,learning_rate,2.86
4,0.0,eval_loss,0.67667
5,60.0,eval_loss,0.60749
6,0.0,eval_accuracy,0.585
7,60.0,eval_accuracy,0.6655
8,0.0,eval_f1,0.474391
9,60.0,eval_f1,0.648575
