<a href="https://colab.research.google.com/github/xiaoyufan/nbme/blob/main/baseline_deberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NBME Baseline DeBERTa

## Configurations

In [1]:
class Config:
  batch_size = 8
  device_name = 'gpu'
  epochs = 1
  input_dir = '/content/drive/MyDrive/CS7150 Deep Learning Project/Dataset/Preprocessed'
  learning_rate = 1e-4
  mode = 'train' # train/dev
  model = 'microsoft/deberta-base'
  output_dir = '/content/drive/MyDrive/CS7150 Deep Learning Project/Output'
  seed = 42
  sequence_max_length = 416
  tpu_cores = 1

## Packages

In [2]:
!pip install ipython-autotime
%load_ext autotime

Collecting ipython-autotime
  Downloading ipython_autotime-0.3.1-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: ipython-autotime
Successfully installed ipython-autotime-0.3.1
time: 130 µs (started: 2022-05-01 15:29:34 +00:00)


In [3]:
!pip install transformers
!pip install tokenizers

!pip install --upgrade git+https://github.com/xiaoyufan/nbme.git@3e8153a7bdd4dd8a1d0a29d60bf319bbc1865591

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 15.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 69.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 73.9 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 7.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 68.9 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    F

In [4]:
import numpy as np
import os
import pandas as pd
import spacy
import torch
import torch.nn as nn

from ast import literal_eval
from nbme_utils.location import locations_to_spans, spans_to_locations, generate_labels
from nbme_utils.prediction import logits_to_spans
from nbme_utils.scoring import span_micro_f1
from transformers import AutoConfig, AutoModel, AutoTokenizer, PreTrainedTokenizer
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

time: 6.9 s (started: 2022-05-01 15:29:49 +00:00)


## TPU

In [5]:
if Config.device_name == 'tpu':
  assert os.environ['COLAB_TPU_ADDR'], 'Make sure to select TPU from Edit > Notebook settings > Hardware accelerator'

time: 1.49 ms (started: 2022-05-01 15:29:56 +00:00)


In [6]:
!pip install cloud-tpu-client==0.10 torch==1.11.0 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-1.11-cp37-cp37m-linux_x86_64.whl

Collecting torch-xla==1.11
  Downloading https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-1.11-cp37-cp37m-linux_x86_64.whl (152.9 MB)
[K     |████████████████████████████████| 152.9 MB 1.2 MB/s 
[?25hCollecting cloud-tpu-client==0.10
  Downloading cloud_tpu_client-0.10-py3-none-any.whl (7.4 kB)
Collecting google-api-python-client==1.8.0
  Downloading google_api_python_client-1.8.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 4.7 MB/s 
Installing collected packages: google-api-python-client, torch-xla, cloud-tpu-client
  Attempting uninstall: google-api-python-client
    Found existing installation: google-api-python-client 1.12.11
    Uninstalling google-api-python-client-1.12.11:
      Successfully uninstalled google-api-python-client-1.12.11
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
earthengine-api 0.

time: 18.6 s (started: 2022-05-01 15:29:56 +00:00)


In [7]:
if Config.device_name == 'tpu':
  import torch_xla.core.xla_model as xm
  import torch_xla.distributed.parallel_loader as pl
  import torch_xla.distributed.xla_multiprocessing as xmp
  
  from torch.utils.data.distributed import DistributedSampler

time: 2.33 ms (started: 2022-05-01 15:30:15 +00:00)


## Tokenizer

In [8]:
tokenizer = AutoTokenizer.from_pretrained(Config.model)
tokenizer.save_pretrained(f'{Config.output_dir}/tokenizer')

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/474 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

('/content/drive/MyDrive/CS7150 Deep Learning Project/Output/tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/CS7150 Deep Learning Project/Output/tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/CS7150 Deep Learning Project/Output/tokenizer/vocab.json',
 '/content/drive/MyDrive/CS7150 Deep Learning Project/Output/tokenizer/merges.txt',
 '/content/drive/MyDrive/CS7150 Deep Learning Project/Output/tokenizer/added_tokens.json',
 '/content/drive/MyDrive/CS7150 Deep Learning Project/Output/tokenizer/tokenizer.json')

time: 10.1 s (started: 2022-05-01 15:30:15 +00:00)


## Data Loading

In [9]:
train = pd.read_csv(f'{Config.input_dir}/train.csv')
valid = pd.read_csv(f'{Config.input_dir}/validate.csv')
test = pd.read_csv(f'{Config.input_dir}/test.csv')

train.shape, valid.shape, test.shape

((11342, 8), (2958, 8), (5, 6))

time: 1.58 s (started: 2022-05-01 15:30:25 +00:00)


In [10]:
if Config.mode == 'dev':
  train = train.sample(n=80, random_state=Config.seed).reset_index(drop=True)
  valid = valid.sample(n=20, random_state=Config.seed).reset_index(drop=True)
  
train.shape, valid.shape, test.shape

((11342, 8), (2958, 8), (5, 6))

time: 4.59 ms (started: 2022-05-01 15:30:27 +00:00)


In [11]:
train.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,pn_history,feature_text
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724'],HPI: 17yo M presents with palpitations. Patien...,Family-history-of-MI-OR-Family-history-of-myoc...
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693'],HPI: 17yo M presents with palpitations. Patien...,Family-history-of-thyroid-disorder
2,00016_002,0,16,2,['chest pressure'],['203 217'],HPI: 17yo M presents with palpitations. Patien...,Chest-pressure
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']",HPI: 17yo M presents with palpitations. Patien...,Intermittent-symptoms
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258'],HPI: 17yo M presents with palpitations. Patien...,Lightheaded


time: 16.1 ms (started: 2022-05-01 15:30:27 +00:00)


In [13]:
test.head()

Unnamed: 0,id,case_num,pn_num,feature_num,pn_history,feature_text
0,00016_000,0,16,0,HPI: 17yo M presents with palpitations. Patien...,Family-history-of-MI-OR-Family-history-of-myoc...
1,00016_001,0,16,1,HPI: 17yo M presents with palpitations. Patien...,Family-history-of-thyroid-disorder
2,00016_002,0,16,2,HPI: 17yo M presents with palpitations. Patien...,Chest-pressure
3,00016_003,0,16,3,HPI: 17yo M presents with palpitations. Patien...,Intermittent-symptoms
4,00016_004,0,16,4,HPI: 17yo M presents with palpitations. Patien...,Lightheaded


time: 7.16 ms (started: 2022-05-01 15:30:27 +00:00)


### Dataset

In [14]:
class NBMEDataset(Dataset):
  def __init__(self, data: pd.DataFrame, tokenizer: AutoTokenizer, config: Config,
               testing: bool = False):
    self.data = data
    self.tokenizer = tokenizer
    self.config = config
    self.testing = testing

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx: int):
    sample = self.data.iloc[idx]

    encoded = self.tokenizer(
      sample['pn_history'],
      sample['feature_text'],
      # TODO: Compute max length of sequences
      max_length=self.config.sequence_max_length,
      padding='max_length',
      truncation='only_second',
      return_offsets_mapping=True,
    )
    encoded['sequence_ids'] = encoded.sequence_ids()

    x = encoded.copy()
    x['input_ids'] = np.array(x['input_ids'])
    x['attention_mask'] = np.array(x['attention_mask'])
    x['offset_mapping'] = np.array(x['offset_mapping'])
    x['sequence_ids'] = np.array(x['sequence_ids']).astype('float16')

    if self.testing:
      return x

    location_spans = locations_to_spans(literal_eval(sample['location']))
    labels = generate_labels(location_spans, encoded['sequence_ids'],
                             encoded['offset_mapping'])
    labels = np.array(labels).astype('float32')
    return x, labels

time: 18.8 ms (started: 2022-05-01 15:30:27 +00:00)


In [15]:
train_dataset = NBMEDataset(train, tokenizer, Config)
valid_dataset = NBMEDataset(valid, tokenizer, Config)
test_dataset = NBMEDataset(test, tokenizer, Config, testing=True)

time: 1.33 ms (started: 2022-05-01 15:30:27 +00:00)


### Data Loader

In [16]:
def get_data_loader(config: Config, dataset, training: bool):
  if config.device_name == 'tpu':
    sampler = DistributedSampler(
      dataset,
      num_replicas=xm.xrt_world_size(),
      rank=xm.get_ordinal(),
      shuffle=training)
    return DataLoader(
      dataset,
      batch_size=Config.batch_size,
      sampler=sampler,
      drop_last=True)

  return DataLoader(
      dataset,
      batch_size=config.batch_size,
    shuffle=training)

time: 3.22 ms (started: 2022-05-01 15:30:27 +00:00)


## Model

In [17]:
class NBMEDebertaBaseline(nn.Module):
  def __init__(self):
    super().__init__()

    config = AutoConfig.from_pretrained(Config.model)
    self.model = AutoModel.from_pretrained(Config.model, config=config)

    self.fc1 = nn.Linear(config.hidden_size, 1)

  def forward(self, input_ids, attention_mask):
    hidden_states = self.model(input_ids=input_ids, attention_mask=attention_mask)
    logits = self.fc1(hidden_states[0])
    logits = logits.squeeze(-1)
    return logits

time: 5.59 ms (started: 2022-05-01 15:30:27 +00:00)


In [18]:
model = NBMEDebertaBaseline()

Downloading:   0%|          | 0.00/533M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


time: 11.6 s (started: 2022-05-01 15:30:27 +00:00)


## Training

### Loss Function

In [19]:
# This loss combines a Sigmoid layer and the BCELoss in one single class.
# This version is more numerically stable than using a plain Sigmoid followed
# by a BCELoss as, by combining the operations into one layer, we take advantage
# of the log-sum-exp trick for numerical stability.
criterion = nn.BCEWithLogitsLoss(reduction='none')

def loss_fn(logits, labels):
  loss = criterion(logits, labels)
  loss = torch.masked_select(loss, labels >= 0)
  loss = loss.mean()
  return loss

time: 2.62 ms (started: 2022-05-01 15:30:38 +00:00)


### Train / Valid Functions

In [20]:
def train_fn(model, device, data_loader):
  optimizer = torch.optim.AdamW(model.parameters(), lr=Config.learning_rate)

  all_loss = []

  for idx, (x, labels) in enumerate(tqdm(data_loader)):
    optimizer.zero_grad()

    input_ids = x['input_ids'].to(device)
    attention_mask = x['attention_mask'].to(device)
    labels = labels.to(device)

    logits = model(input_ids, attention_mask)
    loss = loss_fn(logits, labels)
    all_loss.append(loss.item() * input_ids.size(0))

    loss.backward()
    
    if Config.device_name == 'tpu':
      xm.optimizer_step(optimizer)
    else:
      optimizer.step()

  all_loss = np.mean(all_loss)
  return all_loss

def test_fn(model, device, data_loader):
  model.eval()

  true_spans = []
  all_logits = []
  all_offsets = []
  all_sequence_ids = []
  all_loss = []
  
  for x, labels in tqdm(data_loader):
    true_spans.append(labels)
    
    input_ids = x['input_ids'].to(device)
    attention_mask = x['attention_mask'].to(device)
    labels = labels.to(device)

    with torch.no_grad():
      logits = model(input_ids, attention_mask)
    loss = loss_fn(logits, labels)
    all_loss.append(loss.item() * input_ids.size(0))

    all_logits.append(logits.detach().cpu().numpy())
    all_offsets.append(x['offset_mapping'].numpy())
    all_sequence_ids.append(x['sequence_ids'].numpy())
  
  all_logits = np.concatenate(all_logits, axis=0)
  all_offsets = np.concatenate(all_offsets, axis=0)
  all_sequence_ids = np.concatenate(all_sequence_ids, axis=0)
  true_spans = np.concatenate(true_spans, axis=0)

  pred_spans = logits_to_spans(all_logits, all_offsets, all_sequence_ids)
  print(len(pred_spans))
  print('pred_spans:', pred_spans)
  true_spans = logits_to_spans(true_spans, all_offsets, all_sequence_ids)
  score = span_micro_f1(pred_spans, true_spans)
  all_loss = np.mean(all_loss)
  return all_loss, score

time: 121 ms (started: 2022-05-01 15:30:38 +00:00)


### Train Loop

In [22]:
def train_loop_non_tpu():
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  print(f'Device: {device}')

  model.to(device)

  train_loader = get_data_loader(Config, train_dataset, True)

  for epoch in range(Config.epochs):
    print(f'Epoch: {epoch}')
    
    train_loss = train_fn(model, device, train_loader)
    print(f'Train loss: {train_loss}')

    torch.save(model.state_dict(), f'{Config.output_dir}/models/model.pth')

def train_loop_tpu(index):
  device = xm.xla_device()
  print(f'Process {index} is using {xm.xla_real_devices([str(device)])[0]}')

  model.to(device)

  train_loader = get_data_loader(Config, train_dataset, True)
  train_loader = pl.MpDeviceLoader(train_loader, device)

  for epoch in range(Config.epochs):
    train_fn(model, device, train_loader)

  # Barrier to prevent master from exiting before workers connect.
  xm.rendezvous('init')

if Config.device_name == 'tpu':
  xmp.spawn(train_loop_tpu, args=(), nprocs=Config.tpu_cores, start_method='fork')
else:
  train_loop_non_tpu()

Device: cuda
Epoch: 0


  0%|          | 0/1418 [00:00<?, ?it/s]

Train loss: 0.1635013900325162
time: 8min 6s (started: 2022-05-01 15:33:31 +00:00)


In [23]:
def valid_non_tpu():
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  print(f'Device: {device}')

  model.load_state_dict(torch.load(f'{Config.output_dir}/models/model.pth', map_location=device))

  valid_loader = get_data_loader(Config, valid_dataset, False)

  valid_loss = test_fn(model, device, valid_loader)
  print(f'Valid loss: {valid_loss}')

valid_non_tpu()

Device: cuda


  0%|          | 0/370 [00:00<?, ?it/s]

2958
pred_spans: [[], [(431, 443)], [], [], [], [], [(395, 401)], [], [], [], [(43, 45)], [(0, 2)], [], [], [(531, 543)], [], [], [], [], [], [(228, 235)], [], [], [], [], [], [], [], [], [], [], [(340, 344)], [(532, 535)], [(230, 237)], [(396, 399)], [(85, 88)], [(40, 48)], [], [], [], [], [], [], [(269, 276)], [], [], [], [], [], [(28, 36)], [(0, 5)], [], [], [(748, 761)], [], [], [(368, 374)], [(520, 524)], [(700, 703)], [], [], [], [(50, 52)], [], [], [(932, 936)], [], [], [], [], [], [], [], [], [(9, 11)], [(63, 76)], [], [], [(900, 904)], [(930, 942)], [], [], [], [], [(306, 308)], [(556, 563)], [], [(84, 91)], [], [], [], [], [(361, 369)], [], [], [], [(174, 186)], [], [(85, 92)], [], [], [(23, 30)], [(6, 12)], [], [(684, 688)], [(659, 663)], [], [], [], [], [(541, 544)], [], [], [], [(169, 173)], [], [], [], [(608, 621)], [], [], [(293, 300)], [], [(506, 508)], [(314, 321)], [(730, 734)], [], [(17, 19)], [(0, 2)], [], [(471, 472)], [(490, 499)], [], [], [], [], [(130, 134)], []

## Inference

In [24]:
def infer_fn():
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  print(f'Device: {device}')

  model.load_state_dict(torch.load(f'{Config.output_dir}/models/model.pth', map_location=device))

  test_loader = get_data_loader(Config, test_dataset, False)

infer_fn()

Device: cuda
time: 593 ms (started: 2022-05-01 15:42:44 +00:00)
