Using kernel `conda_pytorch_latest_p36`

We take inspiration from [this paper](https://arxiv.org/pdf/2104.14690.pdf)

In [1]:
# !pip install cloudpathlib
# !pip install s3fs
# !pip install transformers
# !pip install pytorch-lightning

# Import

In [2]:
import sys
sys.path.append('../../../')

In [3]:
from pathlib import Path
import os
import random
import json
from datetime import datetime

In [234]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
import torch
from torch import nn

import sagemaker
from sagemaker import get_execution_role
import boto3
import torch.nn as nn
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader,Dataset
from tqdm import tqdm
from langdetect import detect

import boto3
import pytorch_lightning as pl
import torch.nn as nn
from transformers import (
    AutoModelForSequenceClassification,
    BertForMaskedLM,
    DistilBertForMaskedLM,
    AutoModel,
    AutoTokenizer,
    AdamW,
    get_linear_schedule_with_warmup,
)
from torch.utils.data import DataLoader,Dataset

In [5]:
from deep.constants import *
from deep.utils import *

In [6]:
%load_ext autoreload
%autoreload 2

## Data

In [7]:
def preprocessing(df):
    """Return a copy of ``df`` with the stringified tag columns parsed into lists.

    The ``sectors``, ``pillars`` and ``subpillars`` columns are expected to hold
    Python literals serialised as strings (for example ``"['a', 'b']"``). They
    are parsed with ``ast.literal_eval`` instead of ``eval`` so that the content
    of a CSV cell can never execute code.

    Duplicate entries in ``pillars`` are removed while keeping the order in
    which they first appear, so the result is the same on every run.

    Args:
        df: DataFrame with string-valued ``sectors``, ``pillars`` and
            ``subpillars`` columns.

    Returns:
        A new DataFrame; the input frame is left untouched.

    Raises:
        ValueError, SyntaxError: if a cell is not a valid Python literal.
    """
    # Local import keeps this cell self-contained.
    from ast import literal_eval

    df = df.copy()
    for tag_col in ('sectors', 'pillars', 'subpillars'):
        df[tag_col] = df[tag_col].apply(literal_eval)

    # dict.fromkeys de-duplicates and preserves first-seen order, whereas
    # list(set(...)) may order string elements differently between runs.
    df['pillars'] = df['pillars'].apply(lambda tags: list(dict.fromkeys(tags)))
    return df

In [298]:
# Tag column used as the prediction target, and the class names for it.
classes = PILLARS
column = "pillars"

# DataFrame columns holding the input text and the per-class target flags.
text_column, label_column = "excerpt", "labels"

In [299]:
# Dataset export to load; bump here once instead of editing every file name.
DATA_VERSION = 'v0.4.4'


def load_split(split, version=DATA_VERSION):
    """Read one pre-split CSV and parse its tag columns.

    Args:
        split: Split name used in the file name (``'train'``, ``'val'`` or
            ``'test'``).
        version: Dataset version string used in the file name.

    Returns:
        The DataFrame returned by ``preprocessing`` for that CSV.
    """
    csv_path = LATEST_DATA_PATH / f'data_{version}_{split}.csv'
    return preprocessing(pd.read_csv(csv_path, index_col=0))


train, val, test = (load_split(split) for split in ('train', 'val', 'test'))

In [293]:
def preprocess_mlm(df, source_column=None, class_names=None):
    """Add a multi-hot ``labels`` column to ``df`` and return the same frame.

    For every row, ``labels`` is a list with one entry per class: ``1`` when the
    class appears in the row's ``source_column`` value, ``0`` otherwise.

    The frame is modified in place (and also returned). A later cell builds the
    dataset from ``train`` itself, so that side effect is kept on purpose.

    Args:
        df: DataFrame whose ``source_column`` holds a collection of class names
            per row.
        source_column: Column to read the tags from. Defaults to the
            notebook-level ``column`` setting.
        class_names: Ordered class names defining the positions in ``labels``.
            Defaults to the notebook-level ``classes`` setting.

    Returns:
        ``df`` with the ``labels`` column added.
    """
    # Fall back to the notebook-level configuration, resolved at call time.
    if source_column is None:
        source_column = column
    if class_names is None:
        class_names = classes
    class_names = list(class_names)

    df['labels'] = [
        [int(name in tags) for name in class_names]
        for tags in df[source_column]
    ]
    return df

In [300]:
# Attach the multi-hot `labels` column to each split.
train_df, val_df, test_df = map(preprocess_mlm, (train, val, test))

# Model

In [301]:
classes = PILLARS

# Largest number of whitespace-separated words in any class name.
len_longest_class = max(len(name.split()) for name in classes)

# One yes/no question per class, each ending in a [MASK] slot, joined into a
# single prompt that is appended to every excerpt.
append_string = ' '.join(
    f'[SEP] Is this sentence about {name}? [MASK]' for name in classes
)

In [302]:
# Checkpoint shared by the tokenizer and the masked-LM model.
model_name = "distilbert-base-uncased"
model = DistilBertForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [304]:
# Vocabulary id of the token 'yes'; the dataset class below writes this id at
# the [MASK] position of every class that applies.
tokenizer.convert_tokens_to_ids('yes')

2748

In [322]:
class LMBFFDataset(Dataset):
    """Prompt-based masked-LM dataset: excerpt followed by a yes/no prompt.

    Every item is the excerpt text followed by ``append_string``, a prompt that
    must contain exactly one mask token per class. The item's ``labels`` tensor
    is ``-100`` everywhere except at the mask positions, which hold the id of
    the ``yes`` token (class applies) or the ``no`` token (class does not).

    Differences from a plain "tokenize the concatenated string" approach:

    * Only the excerpt is truncated when the sequence is too long. The prompt,
      which carries the mask tokens, is always kept whole.
    * Every returned tensor is one-dimensional, so default batching yields
      ``(batch, seq_len)`` for ``input_ids``, ``attention_mask`` and ``labels``.
    * The excerpt and the prompt are tokenized separately and then joined.
      NOTE(review): this should match tokenizing the concatenated string when
      the prompt starts with a special token such as ``[SEP]`` — confirm if a
      different prompt format is used.

    Args:
        tokenizer: Tokenizer exposing ``encode``, ``convert_tokens_to_ids``,
            ``build_inputs_with_special_tokens``, ``num_special_tokens_to_add``,
            ``mask_token_id``, ``pad_token_id`` and ``model_max_length``.
        df: DataFrame with the text and label columns.
        text_col: Name of the column holding the excerpt strings.
        label_col: Name of the column holding one 0/1 flag per class.
        append_string: Prompt appended to every excerpt.
        max_length: Fixed sequence length to truncate/pad to. Defaults to the
            tokenizer's ``model_max_length``; when that is missing or
            effectively unbounded, sequences are neither truncated nor padded.

    Raises:
        ValueError: if the prompt alone does not fit into ``max_length``.
    """

    # ``model_max_length`` values above this are treated as "no length limit".
    _UNBOUNDED = 10 ** 6

    def __init__(self, tokenizer, df, text_col, label_col, append_string, max_length=None):
        self.tokenizer = tokenizer
        self.df = df
        # Kept for backward compatibility: excerpt and prompt as one string.
        self.texts = df[text_col].values + append_string
        self.raw_texts = df[text_col].values
        # The prompt is identical for every row, so tokenize it only once.
        self.prompt_ids = list(tokenizer.encode(append_string, add_special_tokens=False))
        self.max_length = self._resolve_max_length(max_length)
        self.text_budget = None
        if self.max_length is not None:
            # Tokens left for the excerpt after the prompt and special tokens.
            self.text_budget = (
                self.max_length
                - len(self.prompt_ids)
                - tokenizer.num_special_tokens_to_add(pair=False)
            )
            if self.text_budget < 0:
                raise ValueError(
                    f'The prompt needs {len(self.prompt_ids)} tokens and does not fit '
                    f'into max_length={self.max_length}.'
                )
        self.labels = self.compute_labels(df[label_col])

    def _resolve_max_length(self, max_length):
        """Return the fixed sequence length, or None when there is no usable limit."""
        if max_length is None:
            max_length = getattr(self.tokenizer, 'model_max_length', None)
        if max_length is None or max_length > self._UNBOUNDED:
            return None
        return int(max_length)

    def compute_labels(self, labels):
        """Map per-class 0/1 flags to the token ids of ``no`` / ``yes``."""
        yes_token = self.tokenizer.convert_tokens_to_ids('yes')
        no_token = self.tokenizer.convert_tokens_to_ids('no')
        labels = [[yes_token if y else no_token for y in label] for label in labels]
        return torch.tensor(labels)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``."""
        text_ids = list(self.tokenizer.encode(self.raw_texts[idx], add_special_tokens=False))
        if self.max_length is not None:
            # Truncate the excerpt only, so no mask token is ever cut off.
            text_ids = text_ids[:self.text_budget]

        input_ids = list(
            self.tokenizer.build_inputs_with_special_tokens(text_ids + self.prompt_ids)
        )
        attention_mask = [1] * len(input_ids)

        if self.max_length is not None:
            n_pad = self.max_length - len(input_ids)
            input_ids = input_ids + [self.tokenizer.pad_token_id] * n_pad
            attention_mask = attention_mask + [0] * n_pad

        inputs = {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
        }
        inputs['labels'] = self.mlm_labels(inputs, self.labels[idx])
        return inputs

    def mlm_labels(self, inputs, label):
        """Build the label tensor for one item.

        Args:
            inputs: Mapping whose ``input_ids`` is a 1-D tensor of token ids.
            label: One target token id per mask token, in order of appearance.

        Returns:
            Tensor shaped like ``input_ids``: ``-100`` everywhere except the
            mask positions, which hold the values of ``label``.

        Raises:
            ValueError: if the number of mask tokens differs from ``len(label)``.
        """
        input_ids = inputs['input_ids']
        # Use the instance's tokenizer, not a notebook-level global.
        is_mask = input_ids == self.tokenizer.mask_token_id
        n_masks = int(is_mask.sum())
        if n_masks != len(label):
            raise ValueError(
                f'Found {n_masks} mask tokens but {len(label)} labels; the prompt '
                'must contain exactly one mask token per class.'
            )
        labels = torch.full_like(input_ids, -100)
        labels[is_mask] = torch.as_tensor(label, dtype=labels.dtype)
        return labels

In [323]:
# Build the training dataset from the frame returned by preprocess_mlm (the one
# carrying the `labels` column) and from the configured text column, rather
# than relying on `train` having been modified in place and on a hard-coded
# column name.
dataset = LMBFFDataset(tokenizer, train_df, text_column, label_column, append_string)
batch_size = 16

In [324]:
# Shuffled mini-batches over the training dataset, loaded in the main process.
train_dataloader = DataLoader(
    dataset, shuffle=True, batch_size=batch_size, pin_memory=False, num_workers=0
)

In [326]:
# AdamW over all model parameters.
optimizer = AdamW(params=model.parameters(), lr=5e-5)

In [327]:
# from_pretrained() returns the model in evaluation mode; switch to training
# mode so dropout is active while gradients are computed.
model.train()

# Smoke test: run a single optimisation step on the first batch, then stop.
for batch in train_dataloader:
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()

    optimizer.step()
    optimizer.zero_grad()
    break

In [328]:
# Display the model output (loss and logits) of the training step above.
outputs

MaskedLMOutput(loss=tensor(10.3498, grad_fn=<NllLossBackward>), logits=tensor([[[ -5.8382,  -5.8248,  -5.8095,  ...,  -5.1800,  -4.9848,  -3.1095],
         [ -6.4971,  -6.6986,  -6.6697,  ...,  -4.6652,  -5.9208,  -2.0425],
         [ -8.4311,  -8.2863,  -8.8108,  ...,  -7.7188,  -7.1976,  -5.6503],
         ...,
         [ -3.6936,  -3.7121,  -4.0092,  ...,  -3.1034,  -4.7977,  -3.5473],
         [ -6.0738,  -5.9762,  -5.8656,  ...,  -5.8428,  -5.8106,  -3.3913],
         [ -4.3260,  -4.6040,  -4.5952,  ...,  -4.1918,  -4.5281,  -1.8303]],

        [[ -5.7506,  -5.7316,  -5.7304,  ...,  -5.1290,  -4.8876,  -3.1849],
         [ -4.6835,  -5.1957,  -5.0777,  ...,  -2.4479,  -4.2939,  -2.6147],
         [ -3.6053,  -3.6985,  -4.1453,  ...,  -2.6684,  -3.8221,  -2.7757],
         ...,
         [ -5.3343,  -5.5377,  -5.1127,  ...,  -4.7392,  -4.4485,  -2.1090],
         [ -4.6459,  -4.8420,  -4.5672,  ...,  -3.7292,  -3.9857,  -2.5699],
         [ -5.3769,  -5.5618,  -5.2627,  ...,  -4.92

In [85]:
# NOTE(review): `input_` is not assigned in any cell visible in this notebook —
# confirm where it comes from before re-running on a fresh kernel.
input_['input_ids'].shape

torch.Size([1, 1002])

In [None]:
# Look up the token string stored under vocabulary id 4650.
tokenizer.convert_ids_to_tokens(4650)

In [None]:
# Look up the vocabulary id the tokenizer assigns to the token 'humanitarian'.
tokenizer.convert_tokens_to_ids('humanitarian')

In [None]:
# Forward pass on `input_`; this rebinds `outputs` from the training step.
# NOTE(review): `input_` is not assigned in any cell visible in this notebook.
outputs = model(**input_)

In [None]:
# Logits of the first sequence in the batch, detached from the autograd graph.
out = outputs.logits[0].detach()

In [None]:
# Highest-scoring vocabulary token at each position of the sequence.
tokenizer.convert_ids_to_tokens(out.argmax(dim=1))

## Sagemaker Prep

### Session

In [None]:
# Role name handed to the estimator below.
role = "AmazonSageMaker-ExecutionRole-20210519T102514"
# SageMaker session that uses the dev bucket as its default bucket.
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)

### Bucket upload

In [None]:
# Set to True to upload only a 100-row random sample instead of the full frame.
sample = False

# NOTE(review): `df` is not assigned in any cell above this one; the only
# visible assignment is the training-metrics frame created near the end of the
# notebook. Confirm which frame is meant to be uploaded here.
if sample:
    df = df.sample(100)
    
# Job name and the bucket prefix under which this job's input data is stored.
# NOTE(review): `formatted_time` is presumably provided by the star imports
# from `deep` — confirm.
job_name = f"pytorch-{formatted_time()}-fastai-entail-english"
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name

# Destination of the pickled frame, as a string path.
s3_data = str(input_path / 'df.pickle')


# Serialise the frame to that location using pickle protocol 4.
df.to_pickle(s3_data, protocol=4)

### Estimator Definition

In [None]:
# Instance types under consideration; the estimator below hard-codes
# 'ml.p3.2xlarge'.
instances = ['ml.p2.xlarge', 'ml.p3.2xlarge']

In [None]:
# Display the input prefix built from the dev bucket and the job name.
input_path

In [None]:
from sagemaker.pytorch import PyTorch

# Settings handed to the estimator as hyperparameters.
hyperparameters = dict(
    epochs=3,
    train_batch_size=64,
    learning_rate=0.02,
    text_col=text_column,
    label_col=label_column,
    multi_category=0,
)

estimator = PyTorch(
    # Training code
    entry_point='train.py',
    source_dir=str(SCRIPTS_TRAINING_PATH / 'stefano/multiclass-fastai'),
    code_location=str(input_path),
    # Runtime
    framework_version='1.8',
    py_version='py36',
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    # Job configuration
    hyperparameters=hyperparameters,
    output_path=str(DEV_BUCKET / 'models/'),
    # NOTE(review): confirm the estimator constructor honours `job_name`; the
    # same name is also passed to `fit` below.
    job_name=job_name,
)

In [None]:
# Both channels point at the same input prefix.
fit_arguments = dict.fromkeys(('train', 'test'), str(input_path))

In [None]:
# Submit the training job; wait=False returns without blocking on completion.
estimator.fit(inputs=fit_arguments, wait=False, job_name=job_name)

In [None]:
from sagemaker import TrainingJobAnalytics

# Fetch the metrics captured for the most recent training job as a DataFrame
# and preview the first ten rows.
latest_job_name = estimator.latest_training_job.name
df = TrainingJobAnalytics(training_job_name=latest_job_name).dataframe()
df.head(10)

In [None]:
# Display the full metrics frame assigned in the previous cell.
df