Using kernel `conda_pytorch_latest_p36`

In [1]:
import sys
sys.path.append('../../../')

In [2]:
from pathlib import Path
import os
import random

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
import torch

In [4]:
from deep.constants import *

In [5]:
%load_ext autoreload
%autoreload 2

## Data

In [6]:
data = FRAMEWORKS_PATH

In [7]:
afexportable = pd.read_csv(data /'afexportable_of_af_of_projects_of_interest.csv')
all_afs = pd.read_csv(data /'all_afs.csv')

proj_interest = pd.read_csv(data / 'projects_of_interest.csv')
entr_proj_interest = pd.read_csv(data / 'entries_of_projects_of_interest.csv')
exp_proj_interest = pd.read_csv(data / 'exportdata_of_entries_of_projects_of_interest.csv')
wid_proj_interest = pd.read_csv(data / 'widgets_of_afs_of_interest.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [8]:
train = pd.read_csv(FRAMEWORKS_PATH / 'data_v0.3_train.csv')
val = pd.read_csv(FRAMEWORKS_PATH / 'data_v0.3_val.csv')
test = pd.read_csv(FRAMEWORKS_PATH / 'data_v0.3_test.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [9]:
columns_to_keep = ['excerpt'] + DIMENSION_CLASSES

In [10]:
def process_for_sector(df, sector):
    df = df[['excerpt', sector]]
        
    return df

In [11]:
train_df = process_for_sector(train, 'Humanitarian Conditions')
test_df = process_for_sector(test, 'Humanitarian Conditions')

In [12]:
train_df.sample(1000).to_pickle('train_df.pickle', protocol=4)
test_df.sample(1000).to_pickle('test_df.pickle', protocol=4)

## Sagemaker Prep

In [13]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3

default_bucket = 'deep-experiments-sagemaker-bucket'
sess = sagemaker.Session(default_bucket=default_bucket)

role = 'AmazonSageMaker-ExecutionRole-20210519T102514'
print(
    role
)  # This is the role that SageMaker would use to leverage AWS resources (S3, CloudWatch) on your behalf

bucket = SAGEMAKER_BUCKET
prefix = "huggingface/first"  # Replace with the prefix under which you want to store the data if needed


AmazonSageMaker-ExecutionRole-20210519T102514


### Bucket upload

In [14]:
bucket_path = 'test1/data'
train_channel = bucket_path + "/train_df.pickle"
validation_channel = bucket_path + "/test_df.pickle"

sess.upload_data(path="train_df.pickle", bucket=SAGEMAKER_BUCKET, key_prefix=bucket_path)
sess.upload_data(path="test_df.pickle", bucket=SAGEMAKER_BUCKET, key_prefix=bucket_path)

s3_train_data = f"s3://{SAGEMAKER_BUCKET}/{train_channel}"
s3_validation_data = f"s3://{SAGEMAKER_BUCKET}/{validation_channel}"

s3_output_location = f"s3://{SAGEMAKER_BUCKET}/{bucket_path}/output"

In [15]:
instances = [
    'ml.p2.xlarge',
    'ml.p3.2xlarge'
]

In [16]:
metric_definitions=[
    {'Name': 'loss', 'Regex': "'loss': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'learning_rate', 'Regex': "'learning_rate': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_loss', 'Regex': "'eval_loss': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_accuracy', 'Regex': "'eval_accuracy': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_f1', 'Regex': "'eval_f1': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_precision', 'Regex': "'eval_precision': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_recall', 'Regex': "'eval_recall': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_runtime', 'Regex': "'eval_runtime': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_samples_per_second', 'Regex': "'eval_samples_per_second': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'epoch', 'Regex': "'epoch': ([0-9]+(.|e\-)[0-9]+),?"}]

In [17]:
from sagemaker.pytorch import PyTorch

hyperparameters={'epochs': 1,
                 'train_batch_size': 32,
                 'model_name': 'distilbert-base-uncased'
                 }

estimator = PyTorch(
    entry_point='train.py',
    source_dir=str(SCRIPTS_TRAINING_PATH / 'stefano/pytorch_estimator_base'),
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    framework_version='1.8',
    py_version='py36',
    hyperparameters = hyperparameters,
    metric_definitions=metric_definitions
#     train_instance_count=2,
#     train_instance_type="ml.c4.xlarge",
)

In [18]:
fit_arguments = {
    'train': f's3://{SAGEMAKER_BUCKET}/{bucket_path}',
    'test': f's3://{SAGEMAKER_BUCKET}/{bucket_path}'
}

In [19]:
estimator.fit(fit_arguments)

2021-05-26 13:34:08 Starting - Starting the training job...
2021-05-26 13:34:31 Starting - Launching requested ML instancesProfilerReport-1622036046: InProgress
......
2021-05-26 13:35:31 Starting - Preparing the instances for training.........
2021-05-26 13:37:12 Downloading - Downloading input data...
2021-05-26 13:37:52 Training - Downloading the training image............
2021-05-26 13:39:58 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-05-26 13:39:59,165 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-05-26 13:39:59,189 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-05-26 13:39:59,197 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-05-26 13:39:59,615 sagemaker-training

[34mSuccessfully installed absl-py-0.12.0 aiohttp-3.7.4.post0 async-timeout-3.0.1 cachetools-4.2.2 filelock-3.0.12 google-auth-1.30.1 google-auth-oauthlib-0.4.4 grpcio-1.38.0 huggingface-hub-0.0.8 idna-ssl-1.1.0 markdown-3.3.4 multidict-5.1.0 oauthlib-3.1.0 pyDeprecate-0.3.0 pyasn1-modules-0.2.8 pytorch-lightning-1.3.2 regex-2021.4.4 requests-oauthlib-1.3.0 sacremoses-0.0.45 tensorboard-2.4.1 tensorboard-plugin-wit-1.8.0 tokenizers-0.10.3 torchmetrics-0.3.2 transformers-4.6.1 yarl-1.6.3[0m
[0m
[34m2021-05-26 13:40:14,062 sagemaker-training-toolkit INFO     Invoking user script
[0m
[34mTraining Env:
[0m
[34m{
    "additional_framework_parameters": {},
    "channel_input_dirs": {
        "test": "/opt/ml/input/data/test",
        "train": "/opt/ml/input/data/train"
    },
    "current_host": "algo-1",
    "framework_module": "sagemaker_pytorch_container.training:main",
    "hosts": [
        "algo-1"
    ],
    "hyperparameters": {
        "train_batch_size": 32,
        "model_n

[34m{'loss': 0.7023, 'learning_rate': 6.000000000000001e-07, 'epoch': 0.19}[0m
[34m{'loss': 0.7023, 'learning_rate': 7.000000000000001e-07, 'epoch': 0.22}[0m
[34m{'loss': 0.7105, 'learning_rate': 8.000000000000001e-07, 'epoch': 0.25}[0m
[34m{'loss': 0.7045, 'learning_rate': 9e-07, 'epoch': 0.28}[0m
[34m{'loss': 0.6919, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.31}[0m
[34m{'loss': 0.6914, 'learning_rate': 1.1e-06, 'epoch': 0.34}[0m
[34m{'loss': 0.7104, 'learning_rate': 1.2000000000000002e-06, 'epoch': 0.38}[0m
[34m{'loss': 0.7016, 'learning_rate': 1.3e-06, 'epoch': 0.41}[0m
[34m{'loss': 0.6949, 'learning_rate': 1.4000000000000001e-06, 'epoch': 0.44}[0m
[34m{'loss': 0.6902, 'learning_rate': 1.5e-06, 'epoch': 0.47}[0m
[34m{'loss': 0.7148, 'learning_rate': 1.6000000000000001e-06, 'epoch': 0.5}[0m
[34m{'loss': 0.6915, 'learning_rate': 1.7000000000000002e-06, 'epoch': 0.53}[0m
[34m{'loss': 0.698, 'learning_rate': 1.8e-06, 'epoch': 0.56}[0m
[34m{'loss': 0.


2021-05-26 13:41:44 Completed - Training job completed
Training seconds: 279
Billable seconds: 279


In [20]:
estimator.model_data

's3://sagemaker-us-east-1-961104659532/pytorch-training-2021-05-26-13-34-05-285/output/model.tar.gz'

In [21]:
from sagemaker import TrainingJobAnalytics

# Captured metrics can be accessed as a Pandas dataframe
df = TrainingJobAnalytics(training_job_name=estimator.latest_training_job.name).dataframe()
df.head(10)

Unnamed: 0,timestamp,metric_name,value
0,0.0,loss,0.694025
1,0.0,learning_rate,2.275
2,0.0,eval_loss,0.690179
3,0.0,eval_accuracy,0.54
4,0.0,eval_f1,0.215017
5,0.0,eval_precision,0.473684
6,0.0,eval_recall,0.139073
7,0.0,eval_runtime,4.6122
8,0.0,eval_samples_per_second,216.815
9,0.0,epoch,0.768


In [22]:
df

Unnamed: 0,timestamp,metric_name,value
0,0.0,loss,0.694025
1,0.0,learning_rate,2.275
2,0.0,eval_loss,0.690179
3,0.0,eval_accuracy,0.54
4,0.0,eval_f1,0.215017
5,0.0,eval_precision,0.473684
6,0.0,eval_recall,0.139073
7,0.0,eval_runtime,4.6122
8,0.0,eval_samples_per_second,216.815
9,0.0,epoch,0.768
