**Train the BioNER model on N2C2 2018 Track 2 dataset using Clinical-BERT. Save to /model**

**Chosen as BERT baseline**
**BERT-CRF final model performed better**

**Data versions**
- v1 = Sentence-level input + Overlap filtering + max seq len(~192)
- (BEST)v2 = reduced max seq length to ~100

**Model versions**
- v1 = Bio_Discharge_Summary_BERT(data=v1)
- v2 = Bio_Discharge_Summary_BERT(data=v1) trained with weights, scheduler
- v3 = reduced max seq length 128 , 150 epoch, 16 batch, 2e-5 lr(val= 70)
- (BEST)v4 = reduced max seq length 128 , 150 epoch, 32 batch, 3e-5 lr, dropout = 0.1(val= 70)

Note: a run with max seq length ~350 was stopped at the 80th epoch because F1 was only 0.40; much of the input was discarded once word-piece tokenization pushed sequences past the max seq length.


In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Initialize Parameters


In [None]:
# Colab only: mount Google Drive at /content/gdrive so the project folders
# referenced by PARENT_DIR below are reachable from this runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!ls '/content/gdrive/My Drive/projects/biomedical_ner/model'

v1  v3	v4


In [None]:
# Data / model versions selected for this run (see the version notes at the top).
DATA_VER = "v2"
MODEL_VER = "v4"

# Project root on the mounted Drive; every other location is derived from it.
PARENT_DIR = "/content/gdrive/My Drive/projects/biomedical_ner"
DATA_DIR = f"{PARENT_DIR}/data/{DATA_VER}"
MODEL_DIR = f"{PARENT_DIR}/model/{MODEL_VER}"
OUTPUT_DIR = f"{PARENT_DIR}/output/{MODEL_VER}"

# Per-split folders holding one CSV per document.
TRAIN_DIR = f"{DATA_DIR}/train"
VAL_DIR = f"{DATA_DIR}/val"

# Files written when a checkpoint is saved.
MODEL_PATH = f"{MODEL_DIR}/pytorch_model.bin"
CONFIG_PATH = f"{MODEL_DIR}/config.json"
VOCAB_PATH = f"{MODEL_DIR}/vocab.txt"

# Hugging Face hub identifier of the pretrained encoder.
BERT_VARIANT = "emilyalsentzer/Bio_Discharge_Summary_BERT"

In [None]:
import os

# Make sure the checkpoint and output folders exist. exist_ok=True makes the
# call idempotent and avoids the check-then-create race of testing
# os.path.exists() first.
for directory in (MODEL_DIR, OUTPUT_DIR):
    os.makedirs(directory, exist_ok=True)

In [None]:
# --- Optimisation ---
epochs = 100
batch_size = 16
lr = 3e-5
max_grad_norm = 1.0      # threshold passed to gradient-norm clipping in the training loop

# --- Input shaping ---
max_len = 272            # tried 384
pad_label = "X"          # tag assigned to padded positions

# --- Model ---
dropout = 0.1            # passed as hidden_dropout_prob when the model config is built
full_finetuning = True   # True: optimise every parameter; False: classifier head only

# Requirements Installation

In [None]:
!pip install seqeval
!pip install transformers

Collecting seqeval
[?25l  Downloading https://files.pythonhosted.org/packages/9d/2d/233c79d5b4e5ab1dbf111242299153f3caddddbb691219f363ad55ce783d/seqeval-1.2.2.tar.gz (43kB)
[K     |███████▌                        | 10kB 26.0MB/s eta 0:00:01[K     |███████████████                 | 20kB 30.6MB/s eta 0:00:01[K     |██████████████████████▌         | 30kB 36.0MB/s eta 0:00:01[K     |██████████████████████████████  | 40kB 34.2MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 8.2MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-cp36-none-any.whl size=16171 sha256=daa7106640a393197838a1cc60ce8db2da897d724072cf83d616b952487d510c
  Stored in directory: /root/.cache/pip/wheels/52/df/1b/45d75646c37428f7e626214704a0e35bd3cfc32eda37e59e5f
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Collecting transform

In [None]:
import pandas as pd
import math
import numpy as np
from seqeval.metrics import f1_score
from seqeval.metrics import classification_report,accuracy_score,f1_score
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss

import torch
import os
from tqdm import tqdm,trange
from torch.optim import Adam
from torch.utils.data import DataLoader, SequentialSampler, Dataset, ConcatDataset
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import AutoConfig, AutoModelForTokenClassification, AutoTokenizer, AdamW, BertTokenizer, BertForTokenClassification
from transformers import get_linear_schedule_with_warmup
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Check library version
!pip list | grep -E 'transformers|torch|Keras'

Keras                         2.4.3          
Keras-Preprocessing           1.1.2          
torch                         1.7.0+cu101    
torchsummary                  1.5.1          
torchtext                     0.3.1          
torchvision                   0.8.1+cu101    
transformers                  3.5.1          


# Setup Mapping

In [None]:
# Label vocabulary: BIO tags for the three entity types, the outside tag,
# the padding tag "X" and the two BERT special-token tags.
tag2idx = {
    'B-Drug': 0, 'I-Drug': 1,
    'B-Reason': 2, 'I-Reason': 3,
    'B-ADE': 4, 'I-ADE': 5,
    'O': 6,
    'X': 7,
    '[CLS]': 8,
    '[SEP]': 9,
}
# Inverse lookup (index -> tag string), used when decoding predictions.
tag2name = {index: tag for tag, index in tag2idx.items()}
# class_weights = torch.tensor([5.667039548812603, 30.35792759051186, 24.878964599959076, 28.26208740120874, 99.69946699466995, 116.96344396344396, 0.11770158405624111, 0, 9.980995772277634, 9.980995772277634])

# Setup GPU

In [None]:
# Count visible GPUs and pick the compute device: CUDA when available, CPU otherwise.
n_gpu = torch.cuda.device_count()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
n_gpu

1

# Prepare Data- Load, Concatenate, Tokenize

In [None]:
!ls '$TRAIN_DIR' | wc -l

265


In [None]:
!ls '$VAL_DIR' | wc -l

38


In [None]:
class ClinicalDataset(Dataset):
    """Sentence-level token-classification dataset built from one annotated CSV.

    The CSV is read without a header row as four columns
    (patientID, sentenceID, token, tag). Rows are grouped into sentences by
    ``sentenceID``, every word is split into word pieces with ``tokenizer``,
    and each sentence is stored as fixed-length tensors of ``max_seq_len``
    positions.

    Args:
        file: CSV file name inside ``path``.
        path: Directory containing ``file``.
        max_seq_len: Number of positions per example, including the
            ``[CLS]``/``[SEP]`` markers.
        tag2idx: Mapping from tag string to integer label id. It must contain
            every tag in the file plus ``'[CLS]'``, ``'[SEP]'`` and
            ``pad_label``.
        tokenizer: Object exposing ``tokenize(word)`` and
            ``convert_tokens_to_ids(tokens)``.
        pad_label: Tag written on padded positions (default ``"X"``).

    Attributes set by ``prepare_data``:
        Sentences: int64 tensor (num_sentences, max_seq_len) of token ids,
            zero-padded on the right.
        label_data: int64 tensor of the same shape holding label ids.
        attention_masks: int64 tensor of the same shape; 1 on real tokens and
            0 on padding.
    """

    def __init__(self, file, path, max_seq_len, tag2idx, tokenizer, pad_label="X"):
        self.max_seq_len = max_seq_len
        self.path = os.path.join(path, file)
        # Force token/tag to str so purely numeric tokens (e.g. dosages) are not
        # type-inferred as numbers, which the tokenizer could not split.
        self.df = pd.read_csv(
            self.path,
            names=['patientID', 'sentenceID', 'token', 'tag'],
            keep_default_na=False,
            dtype={'token': str, 'tag': str},
        )
        self.tag2idx = tag2idx
        self.tokenizer = tokenizer
        self.pad_label = pad_label
        # Convert tokens to indices
        self.prepare_data()

    def prepare_data(self):
        """Tokenize every sentence and build the padded id/label/mask tensors."""
        sentences, labels = self.get_sentences(self.df)
        tokenized_texts, word_piece_labels = self.tokenize_text(sentences, labels)

        n_sentences = len(tokenized_texts)
        # Token ids are right-padded with 0; labels with the pad tag's id.
        input_ids = np.zeros((n_sentences, self.max_seq_len), dtype=np.int64)
        tags = np.full((n_sentences, self.max_seq_len),
                       self.tag2idx[self.pad_label], dtype=np.int64)
        attention_masks = np.zeros((n_sentences, self.max_seq_len), dtype=np.int64)

        for row, (pieces, piece_labels) in enumerate(zip(tokenized_texts, word_piece_labels)):
            if len(pieces) > self.max_seq_len:
                # Truncate the body but keep the closing [SEP], so over-long
                # sentences still end with the separator.
                pieces = pieces[:self.max_seq_len - 1] + ['[SEP]']
                piece_labels = piece_labels[:self.max_seq_len - 1] + ['[SEP]']
            ids = self.tokenizer.convert_tokens_to_ids(pieces)
            length = len(ids)
            input_ids[row, :length] = ids
            tags[row, :length] = [self.tag2idx[lab] for lab in piece_labels]
            # Mask comes from the real length rather than from "id > 0".
            attention_masks[row, :length] = 1

        self.Sentences = torch.from_numpy(input_ids)
        self.label_data = torch.from_numpy(tags)
        self.attention_masks = torch.from_numpy(attention_masks)

    def get_sentences(self, data):
        """Group rows by ``sentenceID`` into parallel token and tag lists.

        Returns:
            (sentences, labels): two lists with one inner list per sentence,
            in ascending ``sentenceID`` order and original row order within
            each sentence.
        """
        sentences, labels = [], []
        for _, group in data.groupby("sentenceID"):
            sentences.append(group["token"].tolist())
            labels.append(group["tag"].tolist())
        return sentences, labels

    def tokenize_text(self, sentences, labels):
        """Split words into word pieces and spread each word's tag over them.

        Each sentence is wrapped in ``[CLS]`` / ``[SEP]`` (used both as token
        and as label). For a ``B-<type>`` word only the first piece keeps the
        ``B-`` tag and later pieces get ``I-<type>``; any other tag is copied
        to every piece.

        Returns:
            (tokenized_texts, word_piece_labels): lists of equal-length
            token/label lists, one pair per sentence.
        """
        tokenized_texts = []
        word_piece_labels = []
        for word_list, label in zip(sentences, labels):
            temp_token = ['[CLS]']
            temp_label = ['[CLS]']

            for word, lab in zip(word_list, label):
                for m, token in enumerate(self.tokenizer.tokenize(word)):
                    temp_token.append(token)
                    if lab.startswith('B') and m > 0:
                        # Continuation piece of a word that begins an entity.
                        temp_label.append('I-' + lab.split('-', 1)[1])
                    else:
                        temp_label.append(lab)

            temp_token.append('[SEP]')
            temp_label.append('[SEP]')

            tokenized_texts.append(temp_token)
            word_piece_labels.append(temp_label)

        return tokenized_texts, word_piece_labels

    def __len__(self):
        """Number of sentences in the document."""
        return len(self.Sentences)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label_ids)`` for sentence ``idx``."""
        return self.Sentences[idx], self.attention_masks[idx], self.label_data[idx]

In [None]:
# Tokenizer
# Loaded from the same hub identifier (BERT_VARIANT) that is used to build the model.
tokenizer = AutoTokenizer.from_pretrained(BERT_VARIANT)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=385.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




In [None]:
# TRAIN DATASET
# One ClinicalDataset per document CSV. The listing is sorted so the document
# order does not depend on the filesystem's enumeration order.
train_datasets = [
    ClinicalDataset(doc, TRAIN_DIR, max_len, tag2idx, tokenizer)
    for doc in sorted(os.listdir(TRAIN_DIR))
]

# concatenate CSV data
train_dataset = ConcatDataset(train_datasets)

# Shuffle every epoch: a sequential sampler would feed sentences in the same
# document order each time, giving highly correlated mini-batches.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
train_sampler = train_dataloader.sampler  # RandomSampler created by shuffle=True

In [None]:
# Sanity check: number of training sentences vs. number of mini-batches per epoch.
print(f'Dataset length - {len(train_dataset)}, Dataloader length - {len(train_dataloader)}')

Dataset length - 8121, Dataloader length - 508


In [None]:
# VAL DATASET
# Build one ClinicalDataset per validation document, then chain them together.
val_datasets = [
    ClinicalDataset(name, VAL_DIR, max_len, tag2idx, tokenizer)
    for name in os.listdir(VAL_DIR)
]
val_dataset = ConcatDataset(val_datasets)

# Validation is read in a fixed, sequential order.
val_sampler = SequentialSampler(val_dataset)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, sampler=val_sampler)

# Train Model

In [None]:
# Configuration of the pretrained encoder, extended with the size of our label
# set and the dropout chosen in the hyper-parameter cell.
config = AutoConfig.from_pretrained(BERT_VARIANT, num_labels=len(tag2idx), hidden_dropout_prob=dropout, finetuning_task="ClinicalNER")
# from_pretrained loads the published encoder weights (only the token
# classification head starts untrained). from_config would build the same
# architecture with randomly initialised weights, i.e. train from scratch.
model = AutoModelForTokenClassification.from_pretrained(BERT_VARIANT, config=config)

In [None]:
# Move the model to the device selected in the "Setup GPU" cell, so the
# notebook also runs when no GPU is available (model.cuda() would raise there)
# and the model always sits on the same device the batches are sent to.
model.to(device);
# loss_weights = torch.FloatTensor(class_weights).cuda()

In [None]:
if full_finetuning:
    # Fine tune model all layer parameters
    param_optimizer = list(model.named_parameters())
    # Biases and LayerNorm weights are excluded from weight decay. PyTorch
    # names the LayerNorm parameters "LayerNorm.weight"/"LayerNorm.bias"
    # (the TensorFlow-style "gamma"/"beta" never match here).
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        # The per-group key must be 'weight_decay': AdamW ignores unknown keys
        # such as 'weight_decay_rate' and falls back to its default of 0.0.
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    # Only fine tune classifier parameters
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=1e-8) # (default=1e-6)

In [None]:
# Scheduler
# Total number of training steps is number of batches * number of epochs.
# scheduler.step() is called once per batch, so the count must use the
# dataloader length (batches), not the dataset length (examples); otherwise
# the linear decay is stretched by a factor of batch_size.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler: linear decay to 0 with no warm-up.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [None]:
# Fine-tuning loop: one training pass and one validation pass per epoch.
# The checkpoint (weights, config, vocabulary) is overwritten whenever the
# validation entity-level F1 improves.
print("\n***** Running training *****")
print("  Num examples = %d"%(len(train_dataset)))
print("  Batch size = %d"%(batch_size))

# Per-epoch mean losses, kept for the learning-curve plot.
loss_values, val_loss_values = [], []
best_f1 = float("-inf")
# Tags that do not correspond to real word pieces and are excluded from scoring.
invalid_tags = set(["X", "[CLS]", "[SEP]"])

for _ in trange(epochs, desc="Epoch"):
    # ---------------- TRAINING STEP ----------------
    model.train()
    tr_loss = 0.0
    for step, batch in enumerate(train_dataloader):
        # add batch to the compute device
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        # clear any previously calculated gradients
        model.zero_grad()

        # forward pass; with labels supplied the first output is the loss
        # (a class-weighted CrossEntropyLoss variant was experimented with here
        # previously and has been removed as dead code)
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]

        # backward pass
        loss.backward()

        # track train loss
        tr_loss += loss.item()

        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)

        # update parameters and advance the learning-rate schedule
        optimizer.step()
        scheduler.step()

    # Each batch loss is already a mean, so the epoch figure is the mean over
    # batches (dividing by the number of examples under-reports it by ~batch_size).
    train_loss = tr_loss / max(len(train_dataloader), 1)
    loss_values.append(train_loss)
    print("Train loss: {}".format(train_loss))

    # ---------------- VALIDATION STEP ----------------
    model.eval()
    val_loss = 0.0
    predictions, true_labels = [], []

    for batch in val_dataloader:
        # add batch to the compute device
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            # forward pass
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)

        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Accumulate the loss for this batch of validation sentences.
        val_loss += outputs[0].item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = val_loss / max(len(val_dataloader), 1)
    val_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))

    # Keep only positions whose gold tag is a real tag. A predicted
    # padding/special tag at such a position cannot be an entity, so it is
    # scored as "O" instead of being passed to seqeval as an unknown tag.
    pred_tags = [
        [tag2name[p_i] if tag2name[p_i] not in invalid_tags else "O"
         for p_i, l_i in zip(p, l) if tag2name[l_i] not in invalid_tags]
        for p, l in zip(predictions, true_labels)
    ]
    valid_tags = [[tag2name[l_i] for l_i in l if tag2name[l_i] not in invalid_tags] for l in true_labels]

    report = classification_report(valid_tags, pred_tags,digits=4)
    print("***** Eval results *****")
    print("\n%s"%(report))
    f1 = f1_score(valid_tags, pred_tags)
    print("F1 score: %f"%(f1))
    print("Accuracy score: %f"%(accuracy_score(valid_tags, pred_tags)))

    # SAVE MODEL
    if f1 > best_f1:
        best_f1 = f1
        print('Saving model for BEST f1 - ', best_f1)
        # Unwrap a DataParallel-style wrapper, if any, before saving.
        savemodel = model.module if hasattr(model, 'module') else model
        torch.save(savemodel.state_dict(), MODEL_PATH)
        savemodel.config.to_json_file(CONFIG_PATH)
        tokenizer.save_vocabulary(MODEL_DIR)

    print()

Epoch:   0%|          | 0/100 [00:00<?, ?it/s]


***** Running training *****
  Num examples = 8121
  Batch size = 16
Train loss: 0.013464989623837215
Validation loss: 0.00861716721204771


  _warn_prf(average, modifier, msg_start, len(result))


***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.0000    0.0000    0.0000       103
        Drug     0.2021    0.3084    0.2441      2043
      Reason     0.2060    0.1230    0.1541       447

   micro avg     0.2024    0.2642    0.2292      2593
   macro avg     0.1360    0.1438    0.1327      2593
weighted avg     0.1947    0.2642    0.2189      2593

F1 score: 0.229174
Accuracy score: 0.961824
Saving model for BEST f1 -  0.22917363666778187


Epoch:   1%|          | 1/100 [07:04<11:41:12, 424.97s/it]


Train loss: 0.009158228743800626
Validation loss: 0.008042696476096851
***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.0000    0.0000    0.0000       103
        Drug     0.2043    0.3514    0.2584      2043
      Reason     0.2520    0.1409    0.1808       447

   micro avg     0.2074    0.3012    0.2457      2593
   macro avg     0.1521    0.1641    0.1464      2593
weighted avg     0.2044    0.3012    0.2347      2593

F1 score: 0.245675
Accuracy score: 0.963138
Saving model for BEST f1 -  0.24567474048442905


Epoch:   2%|▏         | 2/100 [14:10<11:34:34, 425.25s/it]


Train loss: 0.008206785914529193
Validation loss: 0.007837432802793512
***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.0000    0.0000    0.0000       103
        Drug     0.2356    0.4356    0.3058      2043
      Reason     0.2574    0.1365    0.1784       447

   micro avg     0.2369    0.3668    0.2879      2593
   macro avg     0.1643    0.1907    0.1614      2593
weighted avg     0.2300    0.3668    0.2717      2593

F1 score: 0.287876
Accuracy score: 0.963358
Saving model for BEST f1 -  0.28787649462691084


Epoch:   3%|▎         | 3/100 [21:15<11:27:20, 425.16s/it]


Train loss: 0.007626741156881115
Validation loss: 0.007912627666605889
***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.2000    0.0097    0.0185       103
        Drug     0.2387    0.4821    0.3193      2043
      Reason     0.1870    0.1477    0.1650       447

   micro avg     0.2346    0.4057    0.2973      2593
   macro avg     0.2086    0.2132    0.1676      2593
weighted avg     0.2283    0.4057    0.2808      2593

F1 score: 0.297301
Accuracy score: 0.962147
Saving model for BEST f1 -  0.29730111629221423


Epoch:   4%|▍         | 4/100 [28:21<11:20:22, 425.23s/it]


Train loss: 0.007144841066253119
Validation loss: 0.008119916241621294
***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.0625    0.0097    0.0168       103
        Drug     0.2447    0.5443    0.3376      2043
      Reason     0.1865    0.1611    0.1729       447

   micro avg     0.2395    0.4570    0.3143      2593
   macro avg     0.1646    0.2384    0.1758      2593
weighted avg     0.2274    0.4570    0.2964      2593

F1 score: 0.314324
Accuracy score: 0.960367
Saving model for BEST f1 -  0.31432360742705573


Epoch:   5%|▌         | 5/100 [35:26<11:13:16, 425.23s/it]


Train loss: 0.006643779838149843
Validation loss: 0.008999924229220345
***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.0282    0.0194    0.0230       103
        Drug     0.2467    0.6779    0.3617      2043
      Reason     0.1167    0.1767    0.1406       447

   micro avg     0.2304    0.5654    0.3274      2593
   macro avg     0.1305    0.2914    0.1751      2593
weighted avg     0.2156    0.5654    0.3101      2593

F1 score: 0.327378
Accuracy score: 0.954073
Saving model for BEST f1 -  0.32737829388119694


Epoch:   6%|▌         | 6/100 [42:31<11:05:54, 425.05s/it]


Train loss: 0.006016246355598804
Validation loss: 0.007906227575286667
***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.0208    0.0097    0.0132       103
        Drug     0.3069    0.6642    0.4198      2043
      Reason     0.1129    0.1879    0.1411       447

   micro avg     0.2766    0.5561    0.3694      2593
   macro avg     0.1469    0.2873    0.1914      2593
weighted avg     0.2621    0.5561    0.3556      2593

F1 score: 0.369412
Accuracy score: 0.962057
Saving model for BEST f1 -  0.36941206609453053


Epoch:   7%|▋         | 7/100 [49:35<10:58:37, 424.92s/it]


Train loss: 0.005330377207711801
Validation loss: 0.007448857544988064
***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.0235    0.0194    0.0213       103
        Drug     0.3421    0.6662    0.4521      2043
      Reason     0.1118    0.2103    0.1460       447

   micro avg     0.2971    0.5619    0.3887      2593
   macro avg     0.1591    0.2986    0.2064      2593
weighted avg     0.2898    0.5619    0.3822      2593

F1 score: 0.388689
Accuracy score: 0.964802
Saving model for BEST f1 -  0.3886888088568761


Epoch:   8%|▊         | 8/100 [56:39<10:51:08, 424.66s/it]


Train loss: 0.004750108472069903
Validation loss: 0.006677878139705265
***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.0417    0.0388    0.0402       103
        Drug     0.4398    0.6334    0.5192      2043
      Reason     0.1693    0.1924    0.1801       447

   micro avg     0.3903    0.5337    0.4509      2593
   macro avg     0.2169    0.2882    0.2465      2593
weighted avg     0.3774    0.5337    0.4417      2593

F1 score: 0.450888
Accuracy score: 0.971666
Saving model for BEST f1 -  0.4508877667372536


Epoch:   9%|▉         | 9/100 [1:03:43<10:43:41, 424.41s/it]


Train loss: 0.004176493647215246
Validation loss: 0.006703661096748461
***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.0980    0.0971    0.0976       103
        Drug     0.4849    0.6750    0.5644      2043
      Reason     0.1600    0.2148    0.1834       447

   micro avg     0.4188    0.5727    0.4838      2593
   macro avg     0.2476    0.3289    0.2818      2593
weighted avg     0.4135    0.5727    0.4801      2593

F1 score: 0.483792
Accuracy score: 0.972397
Saving model for BEST f1 -  0.48379214855839714


Epoch:  10%|█         | 10/100 [1:10:48<10:36:40, 424.45s/it]


Train loss: 0.0034619544427029604
Validation loss: 0.006775490479087207
***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.0504    0.0680    0.0579       103
        Drug     0.5248    0.7053    0.6018      2043
      Reason     0.1367    0.2841    0.1846       447

   micro avg     0.4130    0.6074    0.4916      2593
   macro avg     0.2373    0.3525    0.2814      2593
weighted avg     0.4390    0.6074    0.5083      2593

F1 score: 0.491650
Accuracy score: 0.971303
Saving model for BEST f1 -  0.4916497580771032


In [None]:
# print('Saving model for BEST loss - ', best_val_loss)
# savemodel = model.module if hasattr(model, 'module') else model
# torch.save(savemodel.state_dict(), MODEL_PATH)
# savemodel.config.to_json_file(CONFIG_PATH)
# tokenizer.save_vocabulary(MODEL_DIR)

In [None]:
!ls '$MODEL_DIR'

# Analyse

In [None]:
# Seaborn theme: dark grid with enlarged fonts, drawn on a wide canvas.
sns.set(style='darkgrid', font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Learning curve: per-epoch training vs. validation loss.
fig, ax = plt.subplots()
ax.plot(loss_values, 'b-o', label="training loss")
ax.plot(val_loss_values, 'r-o', label="validation loss")

ax.set_title("Learning curve")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.legend()

# Persist the figure next to the other run outputs, then render it inline.
fig.savefig(OUTPUT_DIR + "/loss.png")

plt.show()

In [None]:
!ls '$OUTPUT_DIR'