Using kernel `conda_pytorch_latest_p36`

In [1]:
import sys
sys.path.append('../../../')

In [2]:
from pathlib import Path
import os
import random

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
import torch

In [4]:
from deep.constants import *

In [5]:
classes = [
    'Agricolture',
    'Cross',
    'Education',
    'Food Security',
    'Health',
    'Livelihoods',
    'Logistics',
    'Nutrition',
    'Protection',
    'Shelter',
    'Wash'
]

## Data

### Data preparation

In [6]:
data = IMMAP_PATH / 'final_data/en/'

train = pd.read_csv(data / 'sentences_en_train.csv')
test = pd.read_csv(data / 'sentences_en_test.csv')

In [7]:
train

Unnamed: 0,doc_id,sentence_id,sentence_text,is_relevant,sector_ids
0,51787,38,Climate Change and Agriculture: Subsistence Fa...,0,[]
1,51787,44,"Bohorquez-Penuela, C., & Otero-Cortes, A (2020).",0,[]
2,51787,45,Blame it on the Rain: The Effects of Weather S...,0,[]
3,51787,49,Increasing frequency of extreme El Ni o events...,0,[]
4,51787,53,What do we learn from the weather?,0,[]
...,...,...,...,...,...
191932,34512,112,Emergency shelters in India had reportedly bee...,0,[]
191933,34512,116,In these situations humanitarian operations ma...,0,[]
191934,34512,120,Governments of EU member states and several Af...,0,[]
191935,34512,124,You can find an overview of all ACAPS resource...,0,[]


In [8]:
def process_for_sector(df, sector, train):
    relevant_train = df[df.is_relevant == 1]
    relevant_train.sector_ids = relevant_train.sector_ids.apply(eval)
    relevant_train = relevant_train[relevant_train.sector_ids.apply(len) > 0]
    
    positive_train = relevant_train[relevant_train.sector_ids.apply(lambda x: sector in x)]
    negative_train = relevant_train[relevant_train.sector_ids.apply(lambda x: sector not in x)]

    positive_train.sector_ids = 1
    negative_train.sector_ids = 0
    train_df = pd.concat([positive_train, negative_train])
    train_df = train_df.sample(frac=1).reset_index(drop=True)
    
    train_df['is_valid'] = False if train else True
        
    return train_df

In [9]:
train_df = process_for_sector(train, 4, True)
test_df = process_for_sector(test, 4, False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

In [10]:
train_df

Unnamed: 0,doc_id,sentence_id,sentence_text,is_relevant,sector_ids,is_valid
0,40704,111,engagement of CP/social workers in coordinatio...,1,0,False
1,35141,32,Restaurants will remain closed for the time be...,1,0,False
2,39662,97,General waste workers and cleaners appear to b...,1,1,False
3,34594,160,Firewood (reported at 61 collective shelters) ...,1,0,False
4,34696,5,Official figures show that cases have increase...,1,1,False
...,...,...,...,...,...,...
22967,41137,26,"A total of 10,801 COVID-19 tests have been con...",1,1,False
22968,46567,12,This is the highest recorded price since WFP s...,1,0,False
22969,49047,35,Many require additional support from household...,1,0,False
22970,34808,192,"During the reporting period, one case was inve...",1,1,False


In [11]:
train_df.to_pickle('train_df.pickle')
test_df.to_pickle('test_df.pickle')

## Sagemaker Prep

In [12]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3

sess = sagemaker.Session()

role = get_execution_role()
print(
    role
)  # This is the role that SageMaker would use to leverage AWS resources (S3, CloudWatch) on your behalf

bucket = SAGEMAKER_BUCKET
prefix = "huggingface/first"  # Replace with the prefix under which you want to store the data if needed


arn:aws:iam::961104659532:role/stefano-test-terraform


### Bucket upload

In [39]:
bucket_path = 'test1/data'
train_channel = bucket_path + "/train_df.pickle"
validation_channel = bucket_path + "/test_df.pickle"

sess.upload_data(path="train_df.pickle", bucket=SAGEMAKER_BUCKET, key_prefix=bucket_path)
sess.upload_data(path="test_df.pickle", bucket=SAGEMAKER_BUCKET, key_prefix=bucket_path)

s3_train_data = f"s3://{SAGEMAKER_BUCKET}/{train_channel}"
s3_validation_data = f"s3://{SAGEMAKER_BUCKET}/{validation_channel}"

s3_output_location = f"s3://{SAGEMAKER_BUCKET}/{bucket_path}/output"

In [40]:
from sagemaker.huggingface import HuggingFace


# hyperparameters, which are passed into the training job
hyperparameters={'epochs': 1,
                 'per_device_train_batch_size': 32,
                 'model_name': 'distilbert-base-uncased'
                 }

# create the Estimator
huggingface_estimator = HuggingFace(
        entry_point='train.py',
        source_dir=str(SCRIPTS_MODELS_PATH / 'stefano'),
        instance_type='ml.p3.2xlarge',
        instance_count=1,
        role=role,
        transformers_version='4.4',
        pytorch_version='1.6',
        py_version='py36',
        hyperparameters = hyperparameters
)

In [41]:
fit_arguments = {
    'train': f's3://{SAGEMAKER_BUCKET}/{bucket_path}',
    'test': f's3://{SAGEMAKER_BUCKET}/{bucket_path}'
}

In [None]:
huggingface_estimator.fit(fit_arguments)

2021-05-20 12:10:26 Starting - Starting the training job...
2021-05-20 12:10:51 Starting - Launching requested ML instancesProfilerReport-1621512626: InProgress
......
2021-05-20 12:11:52 Starting - Preparing the instances for training.........
2021-05-20 12:13:23 Downloading - Downloading input data...
2021-05-20 12:13:52 Training - Downloading the training image..................
2021-05-20 12:16:58 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-05-20 12:16:59,254 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-05-20 12:16:59,277 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-05-20 12:17:00,710 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-05-20 12:17:01,316 sagemaker-tr

[34m[2021-05-20 12:17:26.714 algo-1:25 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2021-05-20 12:17:26.963 algo-1:25 INFO profiler_config_parser.py:102] User has disabled profiler.[0m
[34m[2021-05-20 12:17:26.963 algo-1:25 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.[0m
[34m[2021-05-20 12:17:26.964 algo-1:25 INFO hook.py:199] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.[0m
[34m[2021-05-20 12:17:27.077 algo-1:25 INFO hook.py:253] Saving to /opt/ml/output/tensors[0m
[34m[2021-05-20 12:17:27.078 algo-1:25 INFO state_store.py:67] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.[0m
[34m[2021-05-20 12:17:27.289 algo-1:25 INFO hook.py:550] name:distilbert.embeddings.word_embeddings.weight count_params:23440896[0m
[34m[2021-05-20 12:17:27.289 algo-1:25 INFO hook.py:550] name:distilbert.embeddings.position_embeddings

[34m{'loss': 0.4246, 'learning_rate': 5e-05, 'epoch': 0.7}[0m
[34m{'eval_loss': 0.3309085965156555, 'eval_accuracy': 0.8730213351686167, 'eval_f1': 0.8422402736212057, 'eval_precision': 0.9128822984244671, 'eval_recall': 0.7817460317460317, 'eval_runtime': 12.8369, 'eval_samples_per_second': 226.379, 'epoch': 1.0}[0m
[34m{'train_runtime': 348.5822, 'train_samples_per_second': 2.06, 'epoch': 1.0}[0m

2021-05-20 12:23:35 Uploading - Uploading generated training model[34m***** Eval results *****[0m
[34m#015Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]#015Downloading: 100%|██████████| 232k/232k [00:00<00:00, 41.6MB/s][0m
[34m#015Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]#015Downloading: 100%|██████████| 466k/466k [00:00<00:00, 41.0MB/s][0m
[34m#015Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]#015Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 31.8kB/s][0m
[34m#015Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]#015Downloading: 100


2021-05-20 12:25:55 Completed - Training job completed


In [None]:
1

## Train

In [None]:
region_name = boto3.Session().region_name

In [None]:
container = sagemaker.amazon.amazon_estimator.get_image_uri(region_name, "blazingtext", "latest")
print("Using SageMaker BlazingText container: {} ({})".format(container, region_name))

In [None]:
bt_model = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.c4.4xlarge",
    volume_size=30,
    max_run=360000,
    input_mode="File",
    output_path=s3_output_location,
    hyperparameters={
        "mode": "supervised",
        "epochs": 4,
        "min_count": 2,
        "learning_rate": 0.05,
        "vector_dim": 10,
#         "early_stopping": False,
        "patience": 4,
#         "min_epochs": 5,
#         "word_ngrams": 2,
    },
)

In [None]:
# Create a train data channel with S3_data_type as 'AugmentedManifestFile' and attribute names.
# train_data = sagemaker.session.s3_input(
#     your_augmented_manifest_file,
#     distribution='FullyReplicated',
#     content_type='application/x-recordio',
#     s3_data_type='AugmentedManifestFile',
#     attribute_names=['source-ref', 'annotations'],
#     input_mode='Pipe',
#     record_wrapping='RecordIO'
# )

In [None]:
train_data = sagemaker.inputs.TrainingInput(
    s3_train_data,
    distribution="FullyReplicated",
    content_type="text/plain",
    s3_data_type="AugmentedManifestFile",
    input_mode='Pipe',
    attribute_names=['source', 'label'],
    record_wrapping='RecordIO',
)
validation_data = sagemaker.inputs.TrainingInput(
    s3_validation_data,
    distribution="FullyReplicated",
    content_type="text/plain",
    s3_data_type="AugmentedManifestFile",
    input_mode='Pipe',
    attribute_names=['source', 'label'],
    record_wrapping='RecordIO',
)
data_channels = {"train": train_data, "validation": validation_data}

In [None]:
bt_model.fit(inputs=data_channels, logs=True)

In [None]:
1

In [None]:
from sagemaker.serializers import JSONSerializer

text_classifier = bt_model.deploy(
    initial_instance_count=1, instance_type="ml.m4.xlarge", serializer=JSONSerializer()
)


In [None]:
sentences = list(test_df.sentence_text)

# using the same nltk tokenizer that we used during data preparation for training
tokenized_sentences = [" ".join(nltk.word_tokenize(sent)) for sent in sentences]

payload = {"instances": tokenized_sentences, "configuration": {"k": 5}}

response = text_classifier.predict(payload)

predictions = json.loads(response)
print(json.dumps(predictions, indent=2))

In [None]:
t = test_df.copy()

In [None]:
a = []
thresh = 0.05
for pred in predictions:
    labels = [int(x.replace('__label__', '')) for x, y in zip(pred['label'], pred['prob']) if y > thresh]
    a.append(labels)

In [None]:
t['preds'] = a

In [None]:
indexes = []
recalls = []
precisions = []
f1_scores = []

for i, class_ in enumerate(classes):
    class_preds = [1 if i in x else 0 for x in t.preds]
    class_targets = [1 if i in x else 0 for x in t.sector_ids]

    indexes.append(class_)
    precisions.append(precision_score(class_targets, class_preds))
    recalls.append(recall_score(class_targets, class_preds))    
    f1_scores.append(f1_score(class_targets, class_preds))        


all_metrics = pd.DataFrame(
    {
        'class': indexes,
        'recall': recalls,
        'precision': precisions,
        'f1_score': f1_scores
    }
).set_index('class', drop=True)

In [None]:
all_metrics.plot(
    figsize=(20, 10), xticks=range(12), yticks=[x/10 for x in range(11)], ylim=(0, 1), grid=True
)

In [None]:
t