Using kernel `conda_pytorch_latest_p36`

In [1]:
import torch

In [2]:
from pathlib import Path
import os
import random

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report

In [4]:
from random import shuffle
import multiprocessing
from multiprocessing import Pool
import csv
import nltk
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

# Fetch the Punkt tokenizer data used by nltk.word_tokenize in the cells below
# (the call only reports "already up-to-date" when the package is cached).
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Human-readable sector names. The evaluation cell pairs every name with its list
# position via enumerate() and looks that position up in the `sector_ids` / `preds`
# lists, so the ORDER of this list is significant.
# NOTE(review): presumably the positions equal the dataset's 0-based sector ids — confirm.
# NOTE(review): 'Agricolture' looks like a misspelling of 'Agriculture'; it is left
# untouched here because the string is used as a displayed label.
classes = [
    'Agricolture',
    'Cross',
    'Education',
    'Food Security',
    'Health',
    'Livelihoods',
    'Logistics',
    'Nutrition',
    'Protection',
    'Shelter',
    'Wash'
]

## SageMaker Prep

In [6]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3

# SageMaker session and the IAM role that SageMaker uses to leverage AWS resources
# (S3, CloudWatch) on the user's behalf.
sess = sagemaker.Session()
role = get_execution_role()

# S3 destination: bucket name plus the key prefix under which data is stored
# (replace the prefix if the data should live somewhere else).
bucket = 'sagemaker-stefano'
prefix = "huggingface/first"

print(role)
print(bucket)


arn:aws:iam::519887312542:role/AmazonSageMakerFullAccessRole
sagemaker-stefano


In [12]:
from sagemaker.huggingface import HuggingFace


# hyperparameters, which are passed into the training job
# Hyperparameters, which are passed into the training job.
hyperparameters = dict(
    epochs=1,
    per_device_train_batch_size=32,
    model_name_or_path='distilbert-base-uncased',
)

# Estimator describing the training job: entry script, hardware and framework versions.
huggingface_estimator = HuggingFace(
    entry_point='train.py',
    source_dir='./',
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    transformers_version='4.4',
    pytorch_version='1.6',
    py_version='py36',
    hyperparameters=hyperparameters,
)

In [None]:
# Launch the Hugging Face training job with a 'train' and a 'test' input channel.
# NOTE(review): both channels point at an IMDB sample dataset in another bucket, not at
# the data prepared in the "Data" section of this notebook — confirm this is intended.
huggingface_estimator.fit(
  {'train': 's3://sagemaker-us-east-1-558105141721/samples/datasets/imdb/train',
   'test': 's3://sagemaker-us-east-1-558105141721/samples/datasets/imdb/test'},
)

In [None]:
1

## Data

### Data preparation

In [None]:
# Directory holding the sentence-level train/test CSV splits
# (presumably the English subset, judging by the `en` path segment — confirm).
data = Path('data_prep/final_data/en/')
# Raw entries table; it is not referenced again in the cells visible in this notebook.
raw = pd.read_csv('data_prep/data/entries_raw.csv')

# Train / test sentence tables that are fed to `preprocessing` below.
train = pd.read_csv(data / 'sentences_en_train.csv')
test = pd.read_csv(data / 'sentences_en_test.csv')

In [None]:
def preprocessing(df, train, tokenizer=None):
    """Keep the relevant, labelled sentences of ``df`` and build training records.

    Args:
        df: DataFrame with the columns ``is_relevant``, ``sector_ids`` (each value
            the string form of a list, e.g. ``"[1, 4]"``) and ``sentence_text``.
        train: Unused; kept so existing callers keep working.
        tokenizer: Callable turning a string into a list of tokens. Defaults to
            ``nltk.word_tokenize``.

    Returns:
        A new DataFrame (``df`` itself is never modified) restricted to rows with
        ``is_relevant == 1`` and at least one sector id. ``sector_ids`` is parsed
        into real lists and two columns are added: ``tokenized_sentence`` (tokens
        of the lower-cased sentence) and ``file_content``
        (``{"source": <space-joined tokens>, "label": <sector_ids>}``).

    Raises:
        ValueError, SyntaxError: if a ``sector_ids`` value is not a valid Python literal.
    """
    import ast  # local import keeps this cell independent of the import cells

    if tokenizer is None:
        tokenizer = nltk.word_tokenize

    # Explicit copies: the column assignments below must not write into a slice of
    # `df` (which triggers pandas' SettingWithCopyWarning).
    relevant = df[df.is_relevant == 1].copy()
    # literal_eval only parses literals, so CSV content can never execute code.
    relevant['sector_ids'] = relevant.sector_ids.apply(ast.literal_eval)
    relevant = relevant[relevant.sector_ids.apply(len) > 0].copy()

    relevant['tokenized_sentence'] = relevant.sentence_text.apply(
        lambda text: tokenizer(text.lower())
    )
    # Built with a plain comprehension instead of a row-wise apply so that an empty
    # selection still yields a (empty) `file_content` column.
    relevant['file_content'] = [
        {"source": ' '.join(tokens), "label": labels}
        for tokens, labels in zip(relevant.tokenized_sentence, relevant.sector_ids)
    ]

    return relevant

In [None]:
# Run the same preprocessing over both splits.
train_df, test_df = preprocessing(train, True), preprocessing(test, False)

In [None]:
# Dump every record as one JSON object per line (JSON-lines files).
with open('immap.train', 'w') as f:
    f.writelines(json.dumps(record) + '\n' for record in train_df.file_content)

with open('immap.validation', 'w') as f:
    f.writelines(json.dumps(record) + '\n' for record in test_df.file_content)

### Bucket upload

In [None]:
# S3 key prefixes under which each local file is uploaded.
train_channel = prefix + "/train/immap.train"
validation_channel = prefix + "/validation/immap.validation"

# upload_data() stores a file at <key_prefix>/<file name> and returns the S3 URI of
# that object. The training channels in this notebook use
# s3_data_type="AugmentedManifestFile", which needs the URI of the manifest object
# itself rather than of a prefix, so keep the returned URIs instead of rebuilding
# them from the key prefix.
s3_train_data = sess.upload_data(path="immap.train", bucket=bucket, key_prefix=train_channel)
s3_validation_data = sess.upload_data(
    path="immap.validation", bucket=bucket, key_prefix=validation_channel
)

# Location where training output is written.
s3_output_location = "s3://{}/{}/output".format(bucket, prefix)

## Train

In [None]:
# Region of the default boto3 session; used below to look up the regional
# BlazingText container image.
region_name = boto3.Session().region_name

In [None]:
# Resolve the region-specific BlazingText training image. image_uris.retrieve is the
# SageMaker SDK v2 API; amazon_estimator.get_image_uri is its deprecated v1 name.
container = sagemaker.image_uris.retrieve(
    framework="blazingtext", region=region_name, version="latest"
)
print("Using SageMaker BlazingText container: {} ({})".format(container, region_name))

In [None]:
# Generic estimator wrapping the BlazingText container, configured for supervised mode.
bt_model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.c4.4xlarge",
    volume_size=30,
    max_run=360000,
    input_mode="File",
    output_path=s3_output_location,
    hyperparameters=dict(
        mode="supervised",
        epochs=4,
        min_count=2,
        learning_rate=0.05,
        vector_dim=10,
        patience=4,
        # Currently disabled, kept for reference:
        #   early_stopping=False, min_epochs=5, word_ngrams=2
    ),
)

In [None]:
# Create a train data channel with S3_data_type as 'AugmentedManifestFile' and attribute names.
# train_data = sagemaker.session.s3_input(
#     your_augmented_manifest_file,
#     distribution='FullyReplicated',
#     content_type='application/x-recordio',
#     s3_data_type='AugmentedManifestFile',
#     attribute_names=['source-ref', 'annotations'],
#     input_mode='Pipe',
#     record_wrapping='RecordIO'
# )

In [None]:
# Both channels read an augmented manifest in Pipe mode with identical settings;
# only the S3 location differs, so build them from a single template.
data_channels = {
    channel: sagemaker.inputs.TrainingInput(
        s3_uri,
        distribution="FullyReplicated",
        content_type="text/plain",
        s3_data_type="AugmentedManifestFile",
        input_mode='Pipe',
        attribute_names=['source', 'label'],
        record_wrapping='RecordIO',
    )
    for channel, s3_uri in (
        ("train", s3_train_data),
        ("validation", s3_validation_data),
    )
}
train_data = data_channels["train"]
validation_data = data_channels["validation"]

In [None]:
# Start the BlazingText training job on the train/validation channels with log output enabled.
bt_model.fit(inputs=data_channels, logs=True)

In [None]:
1

In [None]:
from sagemaker.serializers import JSONSerializer

# Deploy the trained model to a single-instance endpoint; request payloads are
# serialized as JSON.
text_classifier = bt_model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    serializer=JSONSerializer(),
)


In [None]:
sentences = list(test_df.sentence_text)

# Mirror the training-time preparation exactly: lower-case first, then tokenize with
# nltk and join the tokens with spaces. The training records were lower-cased, so
# sending cased text would feed the model tokens it was never trained on.
tokenized_sentences = [" ".join(nltk.word_tokenize(sent.lower())) for sent in sentences]

# "k" asks the endpoint for the 5 most probable labels per sentence.
payload = {"instances": tokenized_sentences, "configuration": {"k": 5}}

response = text_classifier.predict(payload)

predictions = json.loads(response)
print(json.dumps(predictions, indent=2))

In [None]:
# Work on a copy so that adding the prediction column below leaves test_df unchanged.
t = test_df.copy()

In [None]:
# For every sentence keep the integer ids of the labels whose predicted probability
# exceeds `thresh` (labels arrive as '__label__<id>').
thresh = 0.05
a = [
    [
        int(label.replace('__label__', ''))
        for label, prob in zip(pred['label'], pred['prob'])
        if prob > thresh
    ]
    for pred in predictions
]

In [None]:
# Attach the thresholded predictions positionally; this relies on `a` having one
# entry per row of `t`, in the same order.
t['preds'] = a

In [None]:
# Per-class binary precision / recall / F1: a sentence counts as positive for a class
# when the class position occurs in its list of labels.
indexes = list(classes)
precisions, recalls, f1_scores = [], [], []

for class_id in range(len(indexes)):
    predicted = [int(class_id in labels) for labels in t.preds]
    expected = [int(class_id in labels) for labels in t.sector_ids]

    precisions.append(precision_score(expected, predicted))
    recalls.append(recall_score(expected, predicted))
    f1_scores.append(f1_score(expected, predicted))

# One row per class, indexed by the class name.
all_metrics = pd.DataFrame(
    {'recall': recalls, 'precision': precisions, 'f1_score': f1_scores},
    index=pd.Index(indexes, name='class'),
)

In [None]:
# One x tick per class row; deriving the count from the frame avoids a tick with no
# class behind it (a fixed range(12) is one more than the 11 classes).
all_metrics.plot(
    figsize=(20, 10),
    xticks=range(len(all_metrics)),
    yticks=[x/10 for x in range(11)],
    ylim=(0, 1),
    grid=True,
)

In [None]:
# Display the test frame together with its `preds` column.
t