Using kernel `conda_pytorch_latest_p36`

We take inspiration from [this paper](https://arxiv.org/pdf/2104.14690.pdf)

In [1]:
# !pip install cloudpathlib
# !pip install s3fs
# !pip install transformers
# !pip install pytorch-lightning

# Import

In [2]:
import sys
sys.path.append('../../../')

In [3]:
from pathlib import Path
import os
import random
import json
from datetime import datetime

In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
import torch
from torch import nn

import sagemaker
from sagemaker import get_execution_role
import boto3
import torch.nn as nn
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader,Dataset
from tqdm import tqdm

import boto3
import pytorch_lightning as pl
import torch.nn as nn
from transformers import (
    AutoModelForSequenceClassification,
    BertForMaskedLM,
    DistilBertForMaskedLM,
    AutoModel,
    AutoTokenizer,
    AdamW,
    get_linear_schedule_with_warmup,
    Trainer,
    TrainingArguments
)
from torch.utils.data import DataLoader,Dataset

In [5]:
from deep.constants import *
from deep.utils import *

In [6]:
%load_ext autoreload
%autoreload 2

## Data

In [7]:
def preprocessing(df):
    df = df.copy()
    df['sectors'] = df['sectors'].apply(eval)    
    df['pillars'] = df['pillars'].apply(eval)
    df['pillars'] = df['pillars'].apply(lambda x: list(set(x)))
    df['subpillars'] = df['subpillars'].apply(eval)
    return df

In [8]:
column = 'pillars'
classes=PILLARS
text_column = 'excerpt'
text_column_mlm = 'excerpt_mlm'
label_column = 'labels'

In [9]:
train = preprocessing(pd.read_csv(LATEST_DATA_PATH / 'data_v0.4.4_train.csv', index_col=0))
val = preprocessing(pd.read_csv(LATEST_DATA_PATH / 'data_v0.4.4_val.csv', index_col=0))
test = preprocessing(pd.read_csv(LATEST_DATA_PATH / 'data_v0.4.4_test.csv', index_col=0))

In [10]:
classes = PILLARS
len_longest_class =  max([len(x.split()) for x in classes])
append_string = [
    f'[SEP] Is this sentence about {c}? [MASK]'
    for c in classes
]
# append_string = ' '.join(append_string)

In [11]:
def preprocess_mlm(df):
    df = df.copy()
    df = df[df[column].apply(len)>0]
    df = df[df['language']=='en']
    df['append'] = [append_string for _ in range(df.shape[0])]
    df[label_column] = [[1 if y in x else 0 for y in classes] for x in df[column]]
    df = df.set_index(['entry_id',text_column])[['append', label_column]].apply(pd.Series.explode).reset_index()
    df[text_column_mlm] = df[text_column] + df['append']
    return df

In [12]:
train_df = preprocess_mlm(train)
val_df = preprocess_mlm(val)
test_df = preprocess_mlm(test)

## Sagemaker Prep

### Session

In [13]:
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)
role = 'AmazonSageMaker-ExecutionRole-20210519T102514'

### Bucket upload

In [14]:
sample = False

if sample:
    df = df.sample(100)
    
job_name = f"pytorch-{formatted_time()}-entailment-masked-en"
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name

s3_train_data = str(input_path / 'train_df.pickle')
s3_val_data = str(input_path / 'val_df.pickle')
s3_test_data = str(input_path / 'test_df.pickle')


train_df.to_pickle(s3_train_data, protocol=4)
val_df.to_pickle(s3_val_data, protocol=4)
test_df.to_pickle(s3_test_data, protocol=4)

### Estimator Definition

In [15]:
instances = [
    'ml.p2.xlarge',
    'ml.p3.2xlarge'
]

In [16]:
input_path

S3Path('s3://sagemaker-deep-experiments-dev/training/input_data/pytorch-2021-06-09-14-03-18-647-entailment-masked-en')

In [17]:
from sagemaker.pytorch import PyTorch

hyperparameters={
    'epochs': 3,
    'train-batch-size': 16,
    'eval-batch-size': 16,
    'learning_rate': 5e-5,
    'model_name': 'distilbert-base-uncased',
    'text_column_mlm': text_column_mlm,
    'label_column': label_column,
}

estimator = PyTorch(
    entry_point='train.py',
    source_dir=str(SCRIPTS_TRAINING_PATH / 'stefano/huggingface_lmbff'),
    output_path=str(DEV_BUCKET / 'models/'),
    code_location=str(input_path),
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    framework_version='1.8',
    py_version='py36',
    hyperparameters = hyperparameters,
    job_name=job_name,
#     train_instance_count=2,
#     train_instance_type="ml.c4.xlarge",
)

In [18]:
fit_arguments = {
    'train': str(input_path),
    'test': str(input_path)
}

In [19]:
estimator.fit(fit_arguments, job_name=job_name, wait=False)

In [20]:
from sagemaker import TrainingJobAnalytics

# Captured metrics can be accessed as a Pandas dataframe
df = TrainingJobAnalytics(training_job_name=estimator.latest_training_job.name).dataframe()
df.head(10)

KeyError: 'MetricDefinitions'

In [None]:
df