<a href="https://colab.research.google.com/github/xiaoyufan/nbme/blob/main/baseline_deberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NBME Baseline DeBERTa

## Configurations

In [1]:
class Config:
  """Notebook-wide settings, read as class attributes (never instantiated here)."""

  # --- Run control ---
  # 'dev' makes the data-loading section subsample the train/validation splits.
  mode = 'dev'
  # 'tpu' makes the device section require a Colab TPU and use the XLA device.
  device_name = 'tpu'

  # --- Model / tokenization ---
  # Checkpoint name passed to the tokenizer, config and model loaders.
  model = 'microsoft/deberta-base'
  # Every encoded (note, feature) pair is padded to this many tokens.
  sequence_max_length = 466

  # --- Optimisation ---
  batch_size = 16
  epochs = 1

  # --- Storage (Google Drive paths mounted in Colab) ---
  input_dir = '/content/drive/MyDrive/CS7150 Deep Learning Project/Dataset/Preprocessed'
  output_dir = '/content/drive/MyDrive/CS7150 Deep Learning Project/Dataset/Output'

## Packages

In [2]:
# Hugging Face libraries used for the tokenizer and the DeBERTa backbone.
# NOTE(review): neither install is version-pinned, so reruns may pick up newer releases.
!pip install transformers
!pip install tokenizers

# Project helper package installed straight from GitHub (presumably the source of the
# `nbme_utils` imports below - confirm). --force-reinstall replaces any copy already installed.
!pip install --force-reinstall git+https://github.com/xiaoyufan/nbme.git

# TPU support: pinned cloud-tpu-client / torch, plus a torch_xla 1.11 wheel built for CPython 3.7.
!pip install cloud-tpu-client==0.10 torch==1.11.0 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-1.11-cp37-cp37m-linux_x86_64.whl

Collecting git+https://github.com/xiaoyufan/nbme.git
  Cloning https://github.com/xiaoyufan/nbme.git to /tmp/pip-req-build-_fvm2tmc
  Running command git clone -q https://github.com/xiaoyufan/nbme.git /tmp/pip-req-build-_fvm2tmc
Building wheels for collected packages: nbme
  Building wheel for nbme (setup.py) ... [?25l[?25hdone
  Created wheel for nbme: filename=nbme-1.0-py3-none-any.whl size=3183 sha256=9da37f64863e218154e64da1b6e46a8d04b52734e6e2ce74839f19df2d2a13e3
  Stored in directory: /tmp/pip-ephem-wheel-cache-av005i2w/wheels/e2/dc/dd/b061e30220e414c0a096e4fbaddd405c1b526d1453d9a447e3
Successfully built nbme
Installing collected packages: nbme
  Attempting uninstall: nbme
    Found existing installation: nbme 1.0
    Uninstalling nbme-1.0:
      Successfully uninstalled nbme-1.0
Successfully installed nbme-1.0
Collecting torch-xla==1.11
  Using cached https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-1.11-cp37-cp37m-linux_x86_64.whl (152.9 MB)


In [3]:
from ast import literal_eval
from typing import Any, Mapping

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from nbme_utils.location import locations_to_spans, spans_to_locations
from nbme_utils.prediction import logits_to_spans
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm
from transformers import AutoConfig, AutoModel, AutoTokenizer, PreTrainedTokenizer

## Device

In [4]:
import os

# Default to CUDA when it is available and to the CPU otherwise; replaced below for TPU runs.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if Config.device_name == 'tpu':
  # Use .get(): indexing os.environ directly raises a bare KeyError when the variable is
  # absent, which would hide the hint in the assertion message.
  assert os.environ.get('COLAB_TPU_ADDR'), 'Make sure to select TPU from Edit > Notebook settings > Hardware accelerator'

  # Imported lazily so the notebook still runs on machines without torch_xla installed.
  import torch_xla
  import torch_xla.core.xla_model as xm

  DEVICE = xm.xla_device()

# Display the selected device as the cell output.
DEVICE



device(type='xla', index=1)

## Tokenizer

In [5]:
# Load the tokenizer that belongs to the configured checkpoint ...
tokenizer = AutoTokenizer.from_pretrained(Config.model)
# ... and store a copy under the output directory (the saved file paths are shown as the cell output).
tokenizer.save_pretrained(f'{Config.output_dir}/tokenizer')

('/content/drive/MyDrive/CS7150 Deep Learning Project/Dataset/Output/tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/CS7150 Deep Learning Project/Dataset/Output/tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/CS7150 Deep Learning Project/Dataset/Output/tokenizer/vocab.json',
 '/content/drive/MyDrive/CS7150 Deep Learning Project/Dataset/Output/tokenizer/merges.txt',
 '/content/drive/MyDrive/CS7150 Deep Learning Project/Dataset/Output/tokenizer/added_tokens.json',
 '/content/drive/MyDrive/CS7150 Deep Learning Project/Dataset/Output/tokenizer/tokenizer.json')

## Data Loading

### Defining Dataset

In [6]:
# Read the three CSV splits from the configured input directory.
train = pd.read_csv(f'{Config.input_dir}/train.csv')
valid = pd.read_csv(f'{Config.input_dir}/validate.csv')
test = pd.read_csv(f'{Config.input_dir}/test.csv')
# Show the (rows, columns) shape of each split.
train.shape, valid.shape, test.shape

((11342, 8), (2958, 8), (5, 6))

In [7]:
# In 'dev' mode, shrink the train/validation splits to small fixed-seed samples.
# NOTE: `train` and `valid` are reassigned here, so the full splits are only available
# again after re-reading the CSV files.
if Config.mode == 'dev':
  train = train.sample(n=800, random_state=0).reset_index(drop=True)
  valid = valid.sample(n=200, random_state=0).reset_index(drop=True)
# Show the resulting shapes (the test split is never subsampled).
train.shape, valid.shape, test.shape

((800, 8), (200, 8), (5, 6))

In [8]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,pn_history,feature_text
0,00225_008,0,225,8,"['drinks coffee', 'drinks energy drinks']","['461 467;483 489', '461 467;506 519']",17 y/o previously healthy male here with heart...,Caffeine-use
1,81979_803,8,81979,803,[],[],67yo woman presents with 3 weeks of difficulty...,Auditory-hallucination-once
2,31800_311,3,31800,311,[],[],HPI: Mr. Hamilton is a 35 year old male who pr...,No-blood-in-stool
3,20158_216,2,20158,216,['44'],['22 24'],"Ms Dolores Montgmery, 44 yo F c/o Irregular me...",44-year
4,21591_201,2,21591,201,[],[],Dolores Montgomery is a 44 year old female wit...,Last-Pap-smear-I-year-ago


In [9]:
# Preview the first rows of the test split.
test.head()

Unnamed: 0,id,case_num,pn_num,feature_num,pn_history,feature_text
0,00016_000,0,16,0,HPI: 17yo M presents with palpitations. Patien...,Family-history-of-MI-OR-Family-history-of-myoc...
1,00016_001,0,16,1,HPI: 17yo M presents with palpitations. Patien...,Family-history-of-thyroid-disorder
2,00016_002,0,16,2,HPI: 17yo M presents with palpitations. Patien...,Chest-pressure
3,00016_003,0,16,3,HPI: 17yo M presents with palpitations. Patien...,Intermittent-symptoms
4,00016_004,0,16,4,HPI: 17yo M presents with palpitations. Patien...,Lightheaded


In [10]:
def generate_labels(encoded: Mapping[str, Any], sample: pd.Series):
  """Build one label per token for a tokenized (patient note, feature) pair.

  Args:
    encoded: Tokenizer output for the pair. Must provide 'input_ids',
      'sequence_ids' and 'offset_mapping', each with one entry per token.
    sample: The data row the pair came from. sample['location'] is the string
      form of a Python list of location strings; it is parsed with
      literal_eval and converted by locations_to_spans.

  Returns:
    A 1-D float tensor with one value per token:
      -1 for every token whose sequence id is not 0 (special/padding tokens
         and tokens of the second sequence),
       1 for first-sequence tokens whose character offsets lie entirely
         inside one annotated span,
       0 for all other first-sequence tokens.
  """
  labels = torch.zeros(len(encoded['input_ids']))

  # assumes locations_to_spans returns a re-iterable collection of (start, end)
  # character pairs - TODO confirm against nbme_utils.
  location_spans = locations_to_spans(literal_eval(sample['location']))

  for idx, (seq_id, offsets) in enumerate(zip(encoded['sequence_ids'],
                                              encoded['offset_mapping'])):
    # Labels come from the patient note, which is encoded as sequence 0.
    # Any other sequence id (None, NaN or a negative sentinel for special tokens,
    # 1 for the feature text) compares unequal to 0 and is marked as ignored.
    if seq_id != 0:
      labels[idx] = -1
      continue

    subtoken_start, subtoken_end = offsets

    # Generator instead of a list: stops at the first enclosing span and avoids
    # allocating a temporary list for every token.
    if any(location_start <= subtoken_start and subtoken_end <= location_end
           for location_start, location_end in location_spans):
      labels[idx] = 1

  return labels

class NBMEDataset(Dataset):
  """Dataset that tokenizes one (patient note, feature text) pair per row.

  Each item is a dict of long tensors holding everything the tokenizer
  returned plus a 'sequence_ids' entry. Unless `testing` is set, the item
  also carries the per-token labels produced by generate_labels.
  """

  def __init__(self, data: pd.DataFrame, tokenizer: AutoTokenizer, config: Config,
               testing: bool = False):
    """Store the inputs; nothing is tokenized until an item is requested.

    Args:
      data: Rows with at least 'pn_history' and 'feature_text' columns
        (plus 'location' when `testing` is False).
      tokenizer: Tokenizer called on each pair.
      config: Provides `sequence_max_length`.
      testing: When True, items are returned without labels.
    """
    self.data = data
    self.tokenizer = tokenizer
    self.config = config
    self.testing = testing

  def __len__(self):
    """Return the number of rows in the wrapped frame."""
    return len(self.data)

  def __getitem__(self, idx: int):
    """Return `x` when testing, otherwise the tuple `(x, y_true)`, for row `idx`."""
    sample = self.data.iloc[idx]

    # NOTE(review): no `truncation` argument is passed, so a pair longer than
    # sequence_max_length would presumably not be cut down - confirm that the
    # configured length covers the data.
    encoded = self.tokenizer(
      sample['pn_history'],
      sample['feature_text'],
      # TODO: Compute max length of sequences
      max_length=self.config.sequence_max_length,
      padding='max_length',
      return_offsets_mapping=True,
    )

    # sequence_ids() reports None for special/padding tokens. The earlier
    # approach converted those to float16 NaN and then cast to torch.long,
    # and the integer a NaN casts to is platform-dependent. Use an explicit
    # -1 sentinel so the ids are well-defined and never collide with the
    # real sequence ids 0 and 1.
    # NOTE(review): confirm nbme_utils.logits_to_spans only treats id 0 as
    # patient-note tokens.
    encoded['sequence_ids'] = [
      -1 if seq_id is None else seq_id for seq_id in encoded.sequence_ids()
    ]

    x = {k: torch.tensor(v, dtype=torch.long) for k, v in encoded.items()}

    if self.testing:
      return x

    y_true = generate_labels(encoded, sample)
    return x, y_true

In [11]:
# Wrap each split in an NBMEDataset. The test split is built with testing=True,
# so its items come back without labels.
train_dataset = NBMEDataset(train, tokenizer, Config)
valid_dataset = NBMEDataset(valid, tokenizer, Config)
test_dataset = NBMEDataset(test, tokenizer, Config, testing=True)

### Dataset Loader

In [12]:
# Every split is batched with the configured batch size; only the training
# loader shuffles, so validation and test batches keep the row order.
train_loader = DataLoader(train_dataset, batch_size=Config.batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=Config.batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=Config.batch_size, shuffle=False)

In [13]:
# Pull a single batch from the training loader (presumably a sanity check of the
# dataset/collation pipeline); the batch stays available as `x` and `y_true`.
x, y_true = next(iter(train_loader))

## Model

In [14]:
class NBMEDebertaBaseline(nn.Module):
  """Pretrained backbone (Config.model) followed by a one-unit linear head per token."""

  def __init__(self):
    """Load the pretrained backbone and create the linear head."""
    super().__init__()

    backbone_config = AutoConfig.from_pretrained(
        Config.model, output_hidden_states=True)
    self.model = AutoModel.from_pretrained(Config.model, config=backbone_config)

    # Maps each token's hidden vector to a single score.
    self.fc = nn.Linear(backbone_config.hidden_size, 1)

  def forward(self, input_ids, attention_mask):
    """Return a sigmoid-squashed score for every token.

    The head is applied to the first element of the backbone output
    (presumably the last-layer hidden states - confirm), so the result
    keeps the leading dimensions of that tensor and ends in a size-1 axis.
    """
    backbone_output = self.model(input_ids=input_ids,
                                 attention_mask=attention_mask)
    token_scores = self.fc(backbone_output[0])
    return torch.sigmoid(token_scores)

In [15]:
# Build the model and move its parameters to the selected device.
model = NBMEDebertaBaseline().to(DEVICE)

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


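## Training

No training loop has been added to this notebook yet: the model created above goes straight to the testing section, so its linear head still has randomly initialized weights.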

## Testing

In [16]:
def test_fn(model, device):
  model.eval()

  all_logits = []
  all_offsets = []
  all_sequence_ids = []
  
  for batch in tqdm(test_loader):
      x = batch

      input_ids = x['input_ids'].to(device)
      attention_mask = x['attention_mask'].to(device)
  
      logits = model(input_ids, attention_mask)

      all_logits.append(logits.detach().cpu().numpy())
      all_offsets.append(x['offset_mapping'].numpy())
      all_sequence_ids.append(x['sequence_ids'].numpy())
  
  all_logits = np.concatenate(all_logits, axis=0)
  all_offsets = np.concatenate(all_offsets, axis=0)
  all_sequence_ids = np.concatenate(all_sequence_ids, axis=0)

  all_spans = logits_to_spans(all_logits, all_offsets, all_sequence_ids)
  locations = [spans_to_locations(spans) for spans in all_spans]
  return locations

In [17]:
# Predict locations for the test split and display them.
# NOTE: no training code is visible between the model's creation and this call,
# so these predictions presumably come from an untrained classification head.
test_fn(model, DEVICE)

  0%|          | 0/1 [00:00<?, ?it/s]

[['4; 11',
  '29; 38',
  '39; 55',
  '66; 69',
  '82; 94',
  '125; 134',
  '154; 156',
  '163; 183',
  '188; 193',
  '198; 208',
  '221; 226',
  '235; 246',
  '254; 258',
  '267; 272',
  '277; 281',
  '295; 303',
  '353; 354',
  '364; 371',
  '385; 392',
  '426; 430',
  '446; 447',
  '454; 460',
  '464; 475',
  '481; 491',
  '604; 612',
  '623; 625',
  '768; 774',
  '899; 917',
  '930; 938'],
 ['4; 11',
  '32; 38',
  '39; 55',
  '66; 69',
  '91; 94',
  '125; 134',
  '154; 156',
  '163; 183',
  '188; 193',
  '198; 208',
  '221; 226',
  '235; 246',
  '254; 258',
  '267; 272',
  '277; 281',
  '295; 303',
  '353; 354',
  '364; 371',
  '385; 392',
  '426; 430',
  '454; 460',
  '464; 475',
  '481; 485',
  '487; 491',
  '595; 601',
  '604; 612',
  '623; 625',
  '899; 917',
  '930; 938'],
 ['4; 11',
  '32; 38',
  '39; 55',
  '91; 94',
  '125; 134',
  '154; 156',
  '163; 183',
  '188; 193',
  '198; 208',
  '221; 226',
  '235; 246',
  '254; 258',
  '267; 272',
  '277; 281',
  '295; 303',
  '353;