Using kernel `conda_pytorch_latest_p36`

In [None]:
!pip install cloudpathlib

In [None]:
!pip install transformers

In [None]:
!pip install pytorch-lightning

# Import

In [1]:
import sys
sys.path.append('../../../')

In [2]:
from pathlib import Path
import os
import random
import json
from datetime import datetime

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
import torch
import sagemaker
from sagemaker import get_execution_role
import boto3
import pytorch_lightning as pl
import torch.nn as nn
from transformers import (
    AutoModelForSequenceClassification,
    AutoModel,
    AutoTokenizer,
    AdamW,
    get_linear_schedule_with_warmup,
)
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader,Dataset
from tqdm import tqdm

In [4]:
from deep.constants import *
from deep.utils import *

In [5]:
%load_ext autoreload
%autoreload 2

## Data

In [6]:
# Read the three pre-split v0.3 CSV files, in the order train, val, test.
train, val, test = (
    pd.read_csv(FRAMEWORKS_PATH / csv_name)
    for csv_name in (
        'data_v0.3_train.csv',
        'data_v0.3_val.csv',
        'data_v0.3_test.csv',
    )
)

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
def process(df, classes=None):
    """Convert a raw split into the two-column frame consumed by the datasets.

    Args:
        df: DataFrame with an ``excerpt`` column and a ``dimension_ids`` column
            whose cells are strings holding Python literals (presumably lists
            of label ids -- confirm against the CSV).
        classes: Optional ordered collection of label ids, forwarded to
            ``MultiLabelBinarizer(classes=...)``. With the default ``None`` the
            classes are inferred from ``df`` alone, exactly as before.

    Returns:
        A new DataFrame with a ``texts`` column (the excerpts) and a ``labels``
        column holding one multi-hot row per excerpt. The input is not modified.

    Note:
        When this is called once per split with ``classes=None``, every split
        fits its own binarizer, so the label columns only line up if all splits
        contain the same set of ids. Pass the same ``classes`` to every call to
        guarantee a shared layout.
    """
    # Local import keeps this cell self-contained.
    from ast import literal_eval

    df = df.copy()
    # literal_eval only accepts Python literals, so a malformed or malicious
    # cell cannot execute arbitrary code the way eval() would.
    df['dimension_ids'] = df['dimension_ids'].apply(literal_eval)

    mlb = MultiLabelBinarizer(classes=classes)
    labels = mlb.fit_transform(list(df['dimension_ids']))
    df['labels'] = list(labels)

    df = df[['excerpt', 'labels']]
    df = df.rename(columns={'excerpt': 'texts'})

    return df

In [8]:
# Run the same preprocessing on each split, in the order train, val, test.
# NOTE(review): every call is independent, so each split's label columns come
# from its own binarizer inside process() -- confirm the splits share all ids.
train_df, val_df, test_df = (process(split) for split in (train, val, test))

In [9]:
# Checkpoint name used for the tokenizer here; DistilClassifier also reads this
# global when it loads its encoder.
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [10]:
class TransformerDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer in
            this notebook).
        df: Frame-like object with a ``texts`` column and a ``labels`` column.
        max_token_len: Optional length passed to the tokenizer as
            ``max_length``. With the default ``None`` no ``max_length`` is
            passed, which is the previous behaviour.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (flattened
    tokenizer outputs) and ``label`` (a float tensor built from the row's labels).
    """

    def __init__(self, tokenizer, df, max_token_len=None):
        self.tokenizer = tokenizer
        # Materialise as lists so __getitem__ is purely positional. Indexing
        # the Series with [] is label-based and fails on frames whose index is
        # not 0..n-1 (for instance after DataFrame.sample()).
        self.labels = list(df['labels'])
        self.texts = list(df['texts'])
        self.max_token_len = max_token_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Only pass max_length when it was requested, so the default call is
        # identical to the original one.
        length_kwargs = {}
        if self.max_token_len is not None:
            length_kwargs['max_length'] = self.max_token_len

        encoded = self.tokenizer.encode_plus(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            return_tensors='pt',
            **length_kwargs
        )
        return {
            # flatten() drops the leading batch dimension added by return_tensors='pt'.
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'label': torch.tensor(self.labels[idx], dtype=torch.float),
        }
    

In [11]:
class DataModule(pl.LightningDataModule):
    """Lightning data module that wraps the three splits in ``TransformerDataset``s.

    With ``sample=True`` every split is cut down to its first 100 rows.
    ``max_token_len`` is only stored on the instance; nothing in this class
    forwards it to the datasets.
    """

    def __init__(self, train_df, val_df, test_df, tokenizer, batch_size=16, max_token_len=200, sample=False):
        super().__init__()
        if sample:
            # Keep just the head of each split for quick experiments.
            train_df, val_df, test_df = (
                frame.iloc[:100] for frame in (train_df, val_df, test_df)
            )
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_token_len = max_token_len

    def setup(self, stage=None):
        """Create one dataset per split; ``stage`` is accepted but not used."""
        def build(frame):
            return TransformerDataset(tokenizer=self.tokenizer, df=frame)

        self.train_dataset = build(self.train_df)
        self.val_dataset = build(self.val_df)
        self.test_dataset = build(self.test_df)

    def train_dataloader(self):
        """Loader over the training dataset; the only one that shuffles."""
        return DataLoader(self.train_dataset, shuffle=True, batch_size=self.batch_size)

    def val_dataloader(self):
        """Loader over the validation dataset."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        """Loader over the test dataset."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size)


In [12]:
# Multi-label classifier: a pretrained encoder (checkpoint named by the global
# ``model_name``) followed by a single linear layer.
class DistilClassifier(pl.LightningModule):
    """Encoder + linear head trained with ``BCEWithLogitsLoss``.

    Args:
        n_classes: Number of output logits (one per label).
        steps_per_epoch: Optimizer steps in one epoch. When given, the learning
            rate schedule spans ``steps_per_epoch * n_epochs`` steps with a
            warm-up of ``steps_per_epoch // 3``. When ``None`` the fixed
            schedule of the original code (500 warm-up, 10000 total) is used.
        n_epochs: Number of epochs the schedule should cover; only used
            together with ``steps_per_epoch``.
        lr: Peak learning rate handed to ``AdamW``.
    """

    # Schedule lengths used when steps_per_epoch is unknown (original constants).
    DEFAULT_WARMUP_STEPS = 500
    DEFAULT_TOTAL_STEPS = 10000

    def __init__(self, n_classes=10, steps_per_epoch=None, n_epochs=1, lr=2e-5):
        super().__init__()

        self.model = AutoModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.model.config.hidden_size, n_classes)
        self.steps_per_epoch = steps_per_epoch
        self.n_epochs = n_epochs
        self.lr = lr
        self.criterion = nn.BCEWithLogitsLoss()

    def forward(self, batch):
        """Return raw logits for a batch dict with ``input_ids`` and ``attention_mask``."""
        encoded = self.model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
        )
        # Hidden state of the first token of every sequence is the sentence summary.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        return self.classifier(first_token_state)

    def _loss_and_logits(self, batch, log_name):
        """Run the forward pass, compute the loss and log it under ``log_name``."""
        labels = batch['label']
        logits = self(batch)
        loss = self.criterion(logits, labels)
        self.log(log_name, loss, prog_bar=True, logger=True)
        return loss, logits, labels

    def training_step(self, batch, batch_idx):
        """Return the loss together with the logits and labels of the batch."""
        loss, logits, labels = self._loss_and_logits(batch, 'train_loss')
        return {"loss": loss, "predictions": logits, "labels": labels}

    def validation_step(self, batch, batch_idx):
        """Log and return the validation loss of the batch."""
        loss, _, _ = self._loss_and_logits(batch, 'val_loss')
        return loss

    def test_step(self, batch, batch_idx):
        """Log and return the test loss of the batch."""
        loss, _, _ = self._loss_and_logits(batch, 'test_loss')
        return loss

    def configure_optimizers(self):
        """Create AdamW plus a linear warm-up/decay schedule stepped per batch."""
        optimizer = AdamW(self.parameters(), lr=self.lr)

        if self.steps_per_epoch:
            total_steps = self.steps_per_epoch * self.n_epochs
            warmup_steps = self.steps_per_epoch // 3
        else:
            warmup_steps = self.DEFAULT_WARMUP_STEPS
            total_steps = self.DEFAULT_TOTAL_STEPS

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps,
        )

        # The schedule counts optimizer steps. Lightning steps a bare scheduler
        # once per epoch, which would leave the learning rate at 0 for the whole
        # first epoch, so request per-step updates explicitly.
        return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]

In [13]:
# Build the data module on the first 100 rows of each split (sample=True) and
# create its datasets right away.
data = DataModule(train_df, val_df, test_df, tokenizer, sample=True)
data.setup()

In [14]:
# Default construction: n_classes=10, steps_per_epoch=None, n_epochs=1, lr=2e-5.
model = DistilClassifier()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
# Train for exactly two epochs. Use one GPU when CUDA is available and fall
# back to CPU otherwise, so the cell also runs on a CPU-only instance.
trainer = pl.Trainer(
    gpus=1 if torch.cuda.is_available() else 0,
    min_epochs=2,
    max_epochs=2,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [17]:
# Fit the classifier; the data module supplies the train and validation loaders.
trainer.fit(model, datamodule=data)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type              | Params
-------------------------------------------------
0 | model      | DistilBertModel   | 66.4 M
1 | classifier | Linear            | 7.7 K 
2 | criterion  | BCEWithLogitsLoss | 0     
-------------------------------------------------
66.4 M    Trainable params
0         Non-trainable params
66.4 M    Total params
265.482   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]



In [18]:
# Run inference over the test dataloader; the result is concatenated with
# torch.cat in the next cell.
preds = trainer.predict(model, dataloaders=data.test_dataloader())

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 1it [00:00, ?it/s]

In [21]:
# Concatenate the per-batch predictions and inspect the overall shape.
torch.cat(preds).shape

torch.Size([100, 10])

In [None]:
# Manual inference loop over the test set.
dl = data.test_dataloader()
model.eval()  # disable dropout so the logits are deterministic
with torch.no_grad():
    preds = []
    for batch in tqdm(dl):
        # forward() takes the whole batch dict (not separate tensors), and the
        # tensors must live on the same device as the model.
        batch = {key: value.to(model.device) for key, value in batch.items()}
        preds.append(model(batch).cpu())

In [None]:
# Concatenate the per-batch outputs into a single tensor.
# NOTE(review): this rebinds `preds` from a list to a tensor, so re-running
# this cell on its own likely fails -- confirm.
preds = torch.cat(preds)

In [None]:
# Display the concatenated predictions.
preds

In [None]:
# Smoke test: push a single training batch through the model.
dl = data.train_dataloader()
d = next(iter(dl))
# forward() expects the batch dict itself; move its tensors to the model's
# device first so the call works whether the model sits on CPU or GPU.
output = model({key: value.to(model.device) for key, value in d.items()})

## Sagemaker Prep

### Session

In [None]:
# SageMaker session whose default bucket is the dev bucket.
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)
# NOTE(review): the execution role name is hard-coded while the imported
# `get_execution_role` is never called -- confirm this is intended.
role = 'AmazonSageMaker-ExecutionRole-20210519T102514'

### Bucket upload

In [None]:
# Set to True to upload a random subset instead of the full splits.
sample = False
SAMPLE_SIZE = 1000
SAMPLE_SEED = 42

if sample:
    # Seeded so the subset is reproducible, and clamped so a split with fewer
    # than SAMPLE_SIZE rows does not make DataFrame.sample raise.
    train_df = train_df.sample(min(SAMPLE_SIZE, len(train_df)), random_state=SAMPLE_SEED)
    test_df = test_df.sample(min(SAMPLE_SIZE, len(test_df)), random_state=SAMPLE_SEED)

# One S3 prefix per job; the job name carries a timestamp.
job_name = f"pytorch-training-{formatted_time()}"
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name

s3_train_data = str(input_path / 'train_df.pickle')
# NOTE: despite its name, this variable points at the *test* split.
s3_validation_data = str(input_path / 'test_df.pickle')

train_df.to_pickle(s3_train_data, protocol=4)
test_df.to_pickle(s3_validation_data, protocol=4)

### Estimator Definition

In [None]:
# Candidate GPU instance types. The estimator cell hard-codes 'ml.p3.2xlarge'
# rather than reading this list.
instances = [
    'ml.p2.xlarge',
    'ml.p3.2xlarge'
]

In [None]:
# Metrics to scrape from the training log. Every metric is expected to appear
# as "'<name>': <number>", optionally followed by a comma.
_METRIC_NAMES = [
    'loss',
    'learning_rate',
    'eval_loss',
    'eval_accuracy',
    'eval_f1',
    'eval_precision',
    'eval_recall',
    'eval_stupid_metric',
    'eval_runtime',
    'eval_samples_per_second',
    'epoch',
]

# One capture group holding the whole number: digits, an optional fractional
# part and an optional exponent. The previous pattern used an unescaped "."
# and an either/or between the fraction and the exponent, so "2.5e-05" was
# captured as "2.5" and plain integers did not match at all.
_METRIC_VALUE = r"([0-9]+(?:\.[0-9]+)?(?:[eE][-+]?[0-9]+)?)"

metric_definitions = [
    {'Name': name, 'Regex': f"'{name}': {_METRIC_VALUE},?"}
    for name in _METRIC_NAMES
]

In [None]:
# # set True if you need spot instance
# use_spot = True
# train_max_run_secs =   2*24 * 60 * 60
# spot_wait_sec =  5 * 60
# max_wait_time_secs = train_max_run_secs +  spot_wait_sec

# if not use_spot:
#     max_wait_time_secs = None
    
# # During local mode, no spot.., use smaller dataset
# if instance_type == 'local':
#     use_spot = False
#     max_wait_time_secs = 0
#     wait = True
#     # Use smaller dataset to run locally
#     inputs = inputs_sample


In [None]:
from sagemaker.pytorch import PyTorch

# Values handed to the estimator through its `hyperparameters` argument.
hyperparameters = dict(
    epochs=2,
    train_batch_size=32,
    model_name='distilbert-base-uncased',
)

# PyTorch estimator: one ml.p3.2xlarge instance running train.py from the
# example training-scripts folder. Code is staged under the job's input prefix
# and model artefacts are written below the dev bucket's models/ prefix.
estimator = PyTorch(
    # What to run
    entry_point='train.py',
    source_dir=str(SCRIPTS_TRAINING_PATH / 'stefano/pytorch_estimator_example'),
    framework_version='1.8',
    py_version='py36',
    hyperparameters=hyperparameters,
    # Where to run it
    role=role,
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    # Where things are stored
    code_location=str(input_path),
    output_path=str(DEV_BUCKET / 'models/'),
    # Bookkeeping
    metric_definitions=metric_definitions,
    job_name=job_name,
)

In [None]:
# Input channels for the training job. Both point at the same S3 prefix, which
# is where train_df.pickle and test_df.pickle were written.
fit_arguments = {
    'train': str(input_path),
    'test': str(input_path)
}

In [None]:
# Start the training job under `job_name` without blocking the notebook (wait=False).
estimator.fit(fit_arguments, job_name=job_name, wait=False)

In [None]:
from sagemaker import TrainingJobAnalytics

# Captured metrics can be accessed as a Pandas dataframe
# NOTE(review): the job was launched with wait=False, so metrics may not be
# available yet when this cell runs -- confirm.
df = TrainingJobAnalytics(training_job_name=estimator.latest_training_job.name).dataframe()
df.head(10)

In [None]:
# Display the full metrics frame.
df