<a href="https://colab.research.google.com/github/xiaoyufan/nbme/blob/main/baseline_deberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NBME Baseline DeBERTa

## Configurations

In [None]:
class Config:
  """Notebook-wide settings, read as class attributes (e.g. ``Config.epochs``)."""

  # --- Runtime ---
  # 'tpu' enables the torch_xla code paths; any other value takes the
  # CUDA/CPU path.
  device_name = 'gpu'
  tpu_cores = 1
  # 'dev' makes the notebook subsample the train/validation frames.
  mode = 'dev'

  # --- Input / output directories ---
  input_dir = '/content/drive/MyDrive/CS7150 Deep Learning Project/Dataset/Preprocessed'
  output_dir = '/content/drive/MyDrive/CS7150 Deep Learning Project/Dataset/Output'

  # --- Model and tokenization ---
  model = 'microsoft/deberta-base'
  # Passed to the tokenizer as max_length together with padding='max_length'.
  sequence_max_length = 466

  # --- Optimization ---
  batch_size = 16
  epochs = 1
  learning_rate = 1e-4

## Packages

In [None]:
!pip install ipython-autotime
%load_ext autotime



In [None]:
!pip install transformers
!pip install tokenizers

!pip install --upgrade git+https://github.com/xiaoyufan/nbme.git@e8aadf2b3fe5faa8bf1b3e884a4d2f4d7fbd026b

In [None]:
import numpy as np
import os
import pandas as pd
import torch
import torch.nn as nn

from ast import literal_eval
from nbme_utils.location import locations_to_spans, spans_to_locations
from nbme_utils.prediction import logits_to_spans
from transformers import AutoConfig, AutoModel, AutoTokenizer, PreTrainedTokenizer
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

## TPU

In [None]:
# Fail early with a setup hint when the notebook is configured for TPU but the
# Colab runtime does not expose a TPU address.
if Config.device_name == 'tpu':
  # .get() so that a missing variable produces the hint below instead of a bare
  # KeyError; an explicit raise (rather than assert) also survives `python -O`.
  if not os.environ.get('COLAB_TPU_ADDR'):
    raise RuntimeError('Make sure to select TPU from Edit > Notebook settings > Hardware accelerator')

In [None]:
!pip install cloud-tpu-client==0.10 torch==1.11.0 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-1.11-cp37-cp37m-linux_x86_64.whl

In [None]:
if Config.device_name == 'tpu':
  import torch_xla.core.xla_model as xm
  import torch_xla.distributed.parallel_loader as pl
  import torch_xla.distributed.xla_multiprocessing as xmp
  
  from torch.utils.data.distributed import DistributedSampler

## Tokenizer

In [None]:
# Load the tokenizer that matches the configured checkpoint, then store a copy
# under the output directory.
tokenizer = AutoTokenizer.from_pretrained(Config.model)
tokenizer.save_pretrained(f'{Config.output_dir}/tokenizer')

## Data Loading

### Defining Dataset

In [None]:
# Read the three CSV splits from the configured input directory.
train = pd.read_csv(f'{Config.input_dir}/train.csv')
valid = pd.read_csv(f'{Config.input_dir}/validate.csv')
test = pd.read_csv(f'{Config.input_dir}/test.csv')
# Last expression of the cell: shows the (rows, columns) of each split.
train.shape, valid.shape, test.shape

In [None]:
# In 'dev' mode, work on a small fixed-seed subsample so iterations are quick.
if Config.mode == 'dev':
  # Cap n at the frame length: DataFrame.sample raises ValueError when asked
  # for more rows than exist (sampling is without replacement by default).
  train = train.sample(n=min(400, len(train)), random_state=0).reset_index(drop=True)
  valid = valid.sample(n=min(100, len(valid)), random_state=0).reset_index(drop=True)
train.shape, valid.shape, test.shape

In [None]:
train.head()

In [None]:
test.head()

In [None]:
def generate_labels(encoded: PreTrainedTokenizer, sample: pd.DataFrame):
  """Build one label per token for a single encoded sample.

  Label values:
    -1  token whose sequence id is not 0 (special tokens, padding, and the
        second sequence of the pair); these carry no annotation.
     1  first-sequence token whose character offsets lie entirely inside one
        of the annotated spans.
     0  every other first-sequence token.

  NOTE(review): `encoded` is indexed by 'input_ids', 'sequence_ids' and
  'offset_mapping', i.e. it is used like the tokenizer's output rather than a
  tokenizer, and `sample` is indexed by the 'location' column like a single
  row. The annotations look inaccurate; confirm before relying on them.

  Args:
    encoded: mapping holding 'input_ids', 'sequence_ids' and 'offset_mapping'.
    sample: row whose 'location' entry is a Python-literal string that
      `locations_to_spans` can convert into (start, end) pairs.

  Returns:
    A 1-D float tensor with one entry per input id.
  """
  token_count = len(encoded['input_ids'])
  labels = torch.zeros(token_count)

  annotated_spans = locations_to_spans(literal_eval(sample['location']))

  per_token = zip(encoded['sequence_ids'], encoded['offset_mapping'])
  for position, (sequence_id, char_range) in enumerate(per_token):
    # Only the first sequence (id 0) is labelled; anything else is marked -1
    # so it can be told apart from a real negative.
    if sequence_id != 0:
      labels[position] = -1
      continue

    char_start, char_end = char_range

    # Positive only when the token is fully contained in some annotated span.
    fully_inside = any(
        span_start <= char_start and char_end <= span_end
        for span_start, span_end in annotated_spans)
    if fully_inside:
      labels[position] = 1

  return labels

class NBMEDataset(Dataset):
  """Map-style dataset that tokenizes one (pn_history, feature_text) pair per row.

  Each item is a dict of long tensors built from the tokenizer output plus a
  'sequence_ids' entry. Unless `testing` is set, the item is a tuple whose
  second element is the per-token label tensor from `generate_labels`.
  """

  def __init__(self, data: pd.DataFrame, tokenizer: AutoTokenizer, config: Config,
               testing: bool = False):
    """Store the frame, the tokenizer, the settings and the test-mode flag."""
    self.data, self.tokenizer = data, tokenizer
    self.config, self.testing = config, testing

  def __len__(self):
    """Number of rows in the underlying frame."""
    return len(self.data)

  def __getitem__(self, idx: int):
    """Tokenize row `idx`; return features, or (features, labels) when training."""
    row = self.data.iloc[idx]

    # TODO: Compute max length of sequences
    tokenizer_kwargs = dict(
      max_length=self.config.sequence_max_length,
      padding='max_length',
      return_offsets_mapping=True,
    )
    encoded = self.tokenizer(row['pn_history'], row['feature_text'], **tokenizer_kwargs)

    # NOTE(review): sequence ids are stored as float16 here (None entries
    # presumably become NaN) and are cast to long below with every other
    # field; confirm that downstream consumers expect this.
    sequence_ids = np.array(encoded.sequence_ids())
    encoded['sequence_ids'] = sequence_ids.astype('float16')

    features = {}
    for key, value in encoded.items():
      features[key] = torch.tensor(value, dtype=torch.long)

    if not self.testing:
      return features, generate_labels(encoded, row)
    return features

In [None]:
# Wrap each split; only the test split is built in testing mode, so its items
# carry features without labels.
train_dataset = NBMEDataset(train, tokenizer, Config)
valid_dataset = NBMEDataset(valid, tokenizer, Config)
test_dataset = NBMEDataset(test, tokenizer, Config, testing=True)

## Model

In [None]:
class NBMEDebertaBaseline(nn.Module):
  """Pretrained backbone (`Config.model`) followed by a per-token linear head.

  `forward` returns raw, unnormalized logits. The training loss in this
  notebook is `BCEWithLogitsLoss`, which applies the sigmoid itself, so
  squashing the scores here as well would apply the sigmoid twice.
  """

  def __init__(self):
    super().__init__()

    config = AutoConfig.from_pretrained(Config.model, output_hidden_states=True)
    self.model = AutoModel.from_pretrained(Config.model, config=config)

    # One score per token.
    self.fc = nn.Linear(config.hidden_size, 1)

  def forward(self, input_ids, attention_mask):
    """Score every token of the batch.

    Args:
      input_ids: token ids, assumed (batch, seq_len) - TODO confirm.
      attention_mask: mask passed through to the backbone, same shape.

    Returns:
      Raw logits with the trailing singleton dimension removed.
    """
    outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
    # outputs[0]: first element of the backbone output, fed to the linear head.
    logits = self.fc(outputs[0])
    # NOTE(review): no sigmoid here on purpose (see class docstring). Confirm
    # that logits_to_spans expects raw logits rather than probabilities.
    return logits.squeeze(-1)

In [None]:
# Single module-level model instance shared by the training and testing cells.
model = NBMEDebertaBaseline()

## Training

In [None]:
# Element-wise loss; the reduction is done in loss_fn so ignored positions can
# be excluded first.
criterion = nn.BCEWithLogitsLoss(reduction="none")

def loss_fn(y_pred, y_true):
  """Mean binary cross-entropy over the positions that carry a real label.

  Positions whose target is negative (generate_labels writes -1 for special
  tokens, padding and the second sequence) are excluded from the average.

  Args:
    y_pred: raw logits.
    y_true: targets of the same shape; values >= 0 are labels, < 0 are ignored.

  Returns:
    A scalar tensor. It is 0 when every position is ignored.
  """
  loss = criterion(y_pred, y_true)
  keep = y_true >= 0
  # Static-shape masking (no masked_select, no Python branch) keeps the graph
  # shape fixed, and the clamp avoids 0/0 when everything is ignored.
  kept_loss = torch.where(keep, loss, torch.zeros_like(loss))
  return kept_loss.sum() / keep.sum().clamp(min=1)

### Training and Validation Loops

In [None]:
def train_fn(model, device, data_loader):
  """Run one pass over `data_loader`, updating `model` with AdamW.

  NOTE(review): a fresh optimizer is created on every call, so AdamW's running
  statistics do not carry over between epochs.

  Args:
    model: module called as model(input_ids, attention_mask).
    device: device the batches are moved to.
    data_loader: yields (features, labels) pairs; features is a dict with
      'input_ids' and 'attention_mask'.

  Returns:
    List with the detached loss tensor of every step.
  """
  # valid_fn switches the model to eval mode; switch back before training.
  model.train()

  optimizer = torch.optim.AdamW(model.parameters(), lr=Config.learning_rate)

  all_loss = []

  for x, y_true in tqdm(data_loader):
    optimizer.zero_grad()

    input_ids = x['input_ids'].to(device)
    attention_mask = x['attention_mask'].to(device)
    y_true = y_true.to(device)

    logits = model(input_ids, attention_mask)
    loss = loss_fn(logits, y_true)
    # Detach so the stored history does not keep each step's autograd graph alive.
    all_loss.append(loss.detach())

    loss.backward()

    if Config.device_name == 'tpu':
      xm.optimizer_step(optimizer)
    else:
      optimizer.step()

  return all_loss

def valid_fn(model, device, data_loader):
  """Run the model over `data_loader` without gradients and print result sizes.

  Collects the model outputs, token offsets, sequence ids and labels of every
  batch, converts the outputs to spans with `logits_to_spans`, and prints the
  logits shape, the number of predicted span entries and the labels shape.

  Args:
    model: module called as model(input_ids, attention_mask).
    device: device the input tensors are moved to.
    data_loader: yields (features, labels) pairs; features must include
      'input_ids', 'attention_mask', 'offset_mapping' and 'sequence_ids'.
  """
  model.eval()

  all_labels = []
  all_logits = []
  all_offsets = []
  all_sequence_ids = []

  # No gradients are needed here; this keeps autograd graphs from being built.
  with torch.no_grad():
    for x, y_true in tqdm(data_loader):
      input_ids = x['input_ids'].to(device)
      attention_mask = x['attention_mask'].to(device)

      logits = model(input_ids, attention_mask)

      # .cpu() first: the loader may already have placed every tensor of the
      # batch on an accelerator, where .numpy() is not available.
      all_labels.append(y_true.cpu().numpy())
      all_logits.append(logits.detach().cpu().numpy())
      all_offsets.append(x['offset_mapping'].cpu().numpy())
      all_sequence_ids.append(x['sequence_ids'].cpu().numpy())

  all_labels = np.concatenate(all_labels, axis=0)
  all_logits = np.concatenate(all_logits, axis=0)
  all_offsets = np.concatenate(all_offsets, axis=0)
  all_sequence_ids = np.concatenate(all_sequence_ids, axis=0)

  pred_spans = logits_to_spans(all_logits, all_offsets, all_sequence_ids)
  # pred_spans is reported by length; presumably it holds one entry per sample
  # (test_fn iterates it that way) - TODO confirm.
  print(all_logits.shape, len(pred_spans), all_labels.shape)

### Running on GPU/CPU

In [None]:
def run_non_tpu():
  """Train and validate the module-level model on CUDA when available, else CPU."""
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  print(f'Using {device}')

  model.to(device)

  train_loader = DataLoader(
      train_dataset,
      batch_size=Config.batch_size,
      shuffle=True)
  valid_loader = DataLoader(
      valid_dataset,
      batch_size=Config.batch_size,
      shuffle=False)

  for epoch in range(Config.epochs):
    train_fn(model, device, train_loader)

    # Same as run_tpu: validation needs no gradients, so skip building graphs.
    with torch.no_grad():
      valid_fn(model, device, valid_loader)

# Only the non-TPU configuration takes this path.
if Config.device_name != 'tpu':
  run_non_tpu()

In [None]:
def run_tpu(index):
  """Per-process entry point for TPU training, launched through xmp.spawn.

  Args:
    index: process index supplied by xmp.spawn; used only for logging.
  """
  device = xm.xla_device()
  print(f'Process {index} is using {xm.xla_real_devices([str(device)])[0]}')

  model.to(device)

  # Each replica reads its own shard of the data.
  train_sampler = DistributedSampler(
    train_dataset,
    num_replicas=xm.xrt_world_size(),
    rank=xm.get_ordinal(),
    shuffle=True)
  valid_sampler = DistributedSampler(
    valid_dataset,
    num_replicas=xm.xrt_world_size(),
    rank=xm.get_ordinal(),
    shuffle=False)

  train_loader = DataLoader(
    train_dataset,
    batch_size=Config.batch_size,
    sampler=train_sampler,
    drop_last=True)
  # NOTE(review): drop_last=True also discards the final partial validation
  # batch, so those samples are never evaluated - confirm this is intended.
  valid_loader = DataLoader(
    valid_dataset,
    batch_size=Config.batch_size,
    sampler=valid_sampler,
    drop_last=True)

  train_loader = pl.MpDeviceLoader(train_loader, device)
  valid_loader = pl.MpDeviceLoader(valid_loader, device)

  for epoch in range(Config.epochs):
    # Without set_epoch the sampler reuses the same shuffle order every epoch.
    train_sampler.set_epoch(epoch)
    train_fn(model, device, train_loader)

    with torch.no_grad():
      valid_fn(model, device, valid_loader)

  # Barrier to prevent master from exiting before workers connect.
  xm.rendezvous('init')

# Only the TPU configuration takes this path.
if Config.device_name == 'tpu':
  xmp.spawn(run_tpu, args=(), nprocs=Config.tpu_cores, start_method='fork')

## Testing

In [None]:
def test_fn(model, data_loader):
  """Predict locations for every batch of an unlabelled loader.

  Args:
    model: module called as model(input_ids, attention_mask).
    data_loader: yields feature dicts with 'input_ids', 'attention_mask',
      'offset_mapping' and 'sequence_ids' (no labels).

  Returns:
    A list with the result of `spans_to_locations` for each entry that
    `logits_to_spans` produces.
  """
  # Send inputs to wherever the model's weights live. The torch_xla module is
  # only imported for the TPU configuration, so it cannot be relied on here.
  device = next(model.parameters()).device

  model.eval()

  all_logits = []
  all_offsets = []
  all_sequence_ids = []

  for x in tqdm(data_loader):
    input_ids = x['input_ids'].to(device)
    attention_mask = x['attention_mask'].to(device)

    with torch.no_grad():
      logits = model(input_ids, attention_mask)

    # .cpu() before .numpy() so batches that already sit on an accelerator work too.
    all_logits.append(logits.detach().cpu().numpy())
    all_offsets.append(x['offset_mapping'].cpu().numpy())
    all_sequence_ids.append(x['sequence_ids'].cpu().numpy())

  all_logits = np.concatenate(all_logits, axis=0)
  all_offsets = np.concatenate(all_offsets, axis=0)
  all_sequence_ids = np.concatenate(all_sequence_ids, axis=0)

  all_spans = logits_to_spans(all_logits, all_offsets, all_sequence_ids)
  locations = [spans_to_locations(spans) for spans in all_spans]
  return locations

In [None]:
# test_fn requires a loader as its second argument; keep the test rows in
# their original order so predictions line up with the rows of `test`.
test_loader = DataLoader(test_dataset, batch_size=Config.batch_size, shuffle=False)
test_fn(model, test_loader)