**Train the BioNER model on N2C2 2018 Track 2 dataset using Clinical-BERT. Save to /model**

**Final training notebook for BERT-CRF model**


**Data versions**
- v2 = reduced max seq length to ~100

**Model versions**
- v5 = train v4 model params + CRF layer
- v6 = v5 but 384 max_seq_len

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Initialize Parameters


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
!ls '/content/gdrive/My Drive/projects/biomedical_ner/model'

v4  v5	v6  v7


In [4]:
DATA_VER = "v2"
MODEL_VER = "v5"
PARENT_DIR = "/content/gdrive/My Drive/projects/biomedical_ner"
DATA_DIR = PARENT_DIR + "/data/" + DATA_VER
MODEL_DIR = PARENT_DIR + "/model/" + MODEL_VER
TRAIN_DIR = DATA_DIR + "/train"
VAL_DIR = DATA_DIR + "/val"
OUTPUT_DIR = PARENT_DIR + "/output/" + MODEL_VER

MODEL_PATH = MODEL_DIR + "/pytorch_model.pt"
CONFIG_PATH = MODEL_DIR + "/config.json"
VOCAB_PATH = MODEL_DIR + "/vocab.txt"
BERT_VARIANT = "emilyalsentzer/Bio_Discharge_Summary_BERT"

In [5]:
import os
if not os.path.exists(MODEL_DIR):
  os.makedirs(MODEL_DIR)
if not os.path.exists(OUTPUT_DIR):
  os.makedirs(OUTPUT_DIR)

In [6]:
batch_size = 16
max_len = 272 # try 384
epochs = 30
lr = 3e-5
pad_label = "X"
max_grad_norm = 1.0
full_finetuning = True
dropout = 0.1

# Requirements Installation

In [7]:
!pip install seqeval
!pip install transformers



In [8]:
import pandas as pd
import math
import numpy as np
from seqeval.metrics import f1_score
from seqeval.metrics import classification_report,accuracy_score,f1_score
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss

import torch
import os
from tqdm import tqdm,trange
from torch.optim import Adam
from torch.utils.data import DataLoader, SequentialSampler, Dataset, ConcatDataset
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import AutoConfig, AutoModel, AutoTokenizer, AdamW, BertTokenizer, BertForTokenClassification
from transformers import get_linear_schedule_with_warmup
import matplotlib.pyplot as plt
import seaborn as sns
import torch.nn as nn
# from pytorch_pretrained_bert.optimization import BertAdam

In [9]:
# Check library version
!pip list | grep -E 'transformers|torch|Keras'

Keras                         2.4.3          
Keras-Preprocessing           1.1.2          
torch                         1.7.0+cu101    
torchsummary                  1.5.1          
torchtext                     0.3.1          
torchvision                   0.8.1+cu101    
transformers                  4.0.0          


# Setup Mapping

In [10]:
tag2idx = {'B-Drug': 0,
          'I-Drug': 1,
          'B-Reason': 2,
          'I-Reason': 3,
          'B-ADE': 4,
          'I-ADE': 5,
          'O': 6,
          'X': 7,
          '[CLS]': 8,
          '[SEP]': 9
          }
tag2name = {tag2idx[key] : key for key in tag2idx}
# class_weights = torch.tensor([5.667039548812603, 30.35792759051186, 24.878964599959076, 28.26208740120874, 99.69946699466995, 116.96344396344396, 0.11770158405624111, 0, 9.980995772277634, 9.980995772277634])

# Setup GPU

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
n_gpu

1

# Prepare Data- Load, Concatenate, Tokenize

In [12]:
!ls '$TRAIN_DIR' | wc -l

265


In [13]:
!ls '$VAL_DIR' | wc -l

38


In [14]:
class ClinicalDataset(Dataset):
    def __init__(self, file, path, max_seq_len, tag2idx, tokenizer):
        self.max_seq_len = max_seq_len;
        self.path = os.path.join(path, file)
        self.df = pd.read_csv(self.path, names=['patientID', 'sentenceID', 'token', 'tag'], keep_default_na=False)
        self.tag2idx = tag2idx
        self.tokenizer = tokenizer
        # Convert Tokens to indices
        self.prepare_data()

    def prepare_data(self):
        sentences, labels = self.get_sentences(self.df)
        tokenized_texts, word_piece_labels = self.tokenize_text(sentences, labels)
        # print(tokenized_texts)
        # print(word_piece_labels)

        # Make text token into id
        input_ids = pad_sequences([self.tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                                  maxlen=self.max_seq_len, dtype="long", truncating="post", padding="post")

        # Make label into id, pad with "X" meaning others/wrong
        tags = pad_sequences([[tag2idx[l] for l in lab] for lab in word_piece_labels],
                             maxlen=self.max_seq_len, value=self.tag2idx[pad_label],
                             padding="post", dtype="long", truncating="post")

        # For fine tune of predict, with token mask is 1,pad token is 0
        attention_masks = [[int(i > 0) for i in ii] for ii in input_ids]

        self.Sentences = torch.tensor(input_ids)
        self.label_data = torch.tensor(tags)
        self.attention_masks = torch.tensor(attention_masks)

    def get_sentences(self, data):
        agg_func = lambda s: [(w, t) for w, t in zip(s["token"].values.tolist(), s["tag"].values.tolist())]
        grouped = data.groupby("sentenceID").apply(agg_func)
        tokenstags = [s for s in grouped]
        sentences = [[s[0] for s in sent] for sent in tokenstags]
        labels = [[s[1] for s in sent] for sent in tokenstags]
        return sentences, labels

    def tokenize_text(self, sentences, labels):
        tokenized_texts = []
        word_piece_labels = []
        i_inc = 0
        for word_list, label in (zip(sentences,labels)):
            temp_label = []
            temp_token = []

            # Add [CLS] at the front
            temp_label.append('[CLS]')
            temp_token.append('[CLS]')

            for word,lab in zip(word_list,label):
                token_list = self.tokenizer.tokenize(word)
                for m,token in enumerate(token_list):
                    temp_token.append(token)
                    if lab.startswith('B'):
                        if m==0:
                            temp_label.append(lab)
                        else:
                            temp_label.append('I-'+lab.split('-')[1])
                    else:
                        temp_label.append(lab)

            # Add [SEP] at the end
            temp_token.append('[SEP]')
            temp_label.append('[SEP]')

            tokenized_texts.append(temp_token)
            word_piece_labels.append(temp_label)

        return tokenized_texts, word_piece_labels

    def __len__(self):
        return len(self.Sentences)

    def __getitem__(self, idx):
        return self.Sentences[idx], self.attention_masks[idx], self.label_data[idx]

In [15]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(BERT_VARIANT)

In [16]:
# TRAIN DATASET
train_datasets = []

for doc in os.listdir(TRAIN_DIR):
    train_datasets.append(ClinicalDataset(doc, TRAIN_DIR, max_len, tag2idx, tokenizer))

# concatenate CSV data
train_dataset = ConcatDataset(train_datasets)

train_sampler = SequentialSampler(train_dataset)

train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size) # drop_last=True

In [17]:
print(f'Dataset length - {len(train_dataset)}, Dataloader length - {len(train_dataloader)}')

Dataset length - 8121, Dataloader length - 508


In [18]:
# VAL DATASET
val_datasets = []

for doc in os.listdir(VAL_DIR):
    val_datasets.append(ClinicalDataset(doc, VAL_DIR, max_len, tag2idx, tokenizer))

# concatenate CSV data
val_dataset = ConcatDataset(val_datasets)

val_sampler = SequentialSampler(val_dataset)

val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=batch_size)

# Train Model

In [19]:
# torch.cuda.empty_cache()

In [20]:
def log_sum_exp_1vec(vec):  # shape(1,m)
    max_score = vec[0, np.argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

def log_sum_exp_mat(log_M, axis=-1):  # shape(n,m)
    return torch.max(log_M, axis)[0]+torch.log(torch.exp(log_M-torch.max(log_M, axis)[0][:, None]).sum(axis))

def log_sum_exp_batch(log_Tensor, axis=-1): # shape (batch_size,n,m)
    return torch.max(log_Tensor, axis)[0]+torch.log(torch.exp(log_Tensor-torch.max(log_Tensor, axis)[0].view(log_Tensor.shape[0],-1,1)).sum(axis))


class BERT_CRF_NER(nn.Module):

    def __init__(self, bert_model, start_label_id, stop_label_id, num_labels, max_seq_length, batch_size, device):
        super(BERT_CRF_NER, self).__init__()
        self.hidden_size = 768
        self.start_label_id = start_label_id
        self.stop_label_id = stop_label_id
        self.num_labels = num_labels
        # self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        self.device=device

        # use pretrainded BertModel 
        self.bert = bert_model
        self.dropout = torch.nn.Dropout(0.2)
        # Maps the output of the bert into label space.
        self.hidden2label = nn.Linear(self.hidden_size, self.num_labels)

        # Matrix of transition parameters.  Entry i,j is the score of transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.num_labels, self.num_labels))

        # These two statements enforce the constraint that we never transfer *to* the start tag(or label),
        # and we never transfer *from* the stop label (the model would probably learn this anyway,
        # so this enforcement is likely unimportant)
        self.transitions.data[start_label_id, :] = -10000
        self.transitions.data[:, stop_label_id] = -10000

        nn.init.xavier_uniform_(self.hidden2label.weight)
        nn.init.constant_(self.hidden2label.bias, 0.0)
        # self.apply(self.init_bert_weights)

    def init_bert_weights(self, module):
        """ Initialize the weights.
        """
        if isinstance(module, (nn.Linear, nn.Embedding)): 
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, BertLayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def _forward_alg(self, feats):
        '''
        this also called alpha-recursion or forward recursion, to calculate log_prob of all barX 
        '''
        
        # T = self.max_seq_length
        T = feats.shape[1]
        batch_size = feats.shape[0]
        
        # alpha_recursion,forward, alpha(zt)=p(zt,bar_x_1:t)
        log_alpha = torch.Tensor(batch_size, 1, self.num_labels).fill_(-10000.).to(self.device)
        # normal_alpha_0 : alpha[0]=Ot[0]*self.PIs
        # self.start_label has all of the score. it is log,0 is p=1
        log_alpha[:, 0, self.start_label_id] = 0
        
        # feats: sentances -> word embedding -> lstm -> MLP -> feats
        # feats is the probability of emission, feat.shape=(1,tag_size)
        for t in range(1, T):
            log_alpha = (log_sum_exp_batch(self.transitions + log_alpha, axis=-1) + feats[:, t]).unsqueeze(1)

        # log_prob of all barX
        log_prob_all_barX = log_sum_exp_batch(log_alpha)
        return log_prob_all_barX

    def _get_bert_features(self, input_ids, segment_ids, input_mask):
        '''
        sentances -> word embedding -> lstm -> MLP -> feats
        '''
        bert_seq_out, last_hidden = self.bert(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
        bert_seq_out = self.dropout(bert_seq_out)
        bert_feats = self.hidden2label(bert_seq_out)
        return bert_feats

    def _score_sentence(self, feats, label_ids):
        ''' 
        Gives the score of a provided label sequence
        p(X=w1:t,Zt=tag1:t)=...p(Zt=tag_t|Zt-1=tag_t-1)p(xt|Zt=tag_t)...
        '''
        
        # T = self.max_seq_length
        T = feats.shape[1]
        batch_size = feats.shape[0]

        batch_transitions = self.transitions.expand(batch_size,self.num_labels,self.num_labels)
        batch_transitions = batch_transitions.flatten(1)

        score = torch.zeros((feats.shape[0],1)).to(device)
        # the 0th node is start_label->start_word,the probability of them=1. so t begin with 1.
        for t in range(1, T):
            score = score + \
                batch_transitions.gather(-1, (label_ids[:, t]*self.num_labels+label_ids[:, t-1]).view(-1,1)) \
                    + feats[:, t].gather(-1, label_ids[:, t].view(-1,1)).view(-1,1)
        return score

    def _viterbi_decode(self, feats):
        '''
        Max-Product Algorithm or viterbi algorithm, argmax(p(z_0:t|x_0:t))
        '''
        
        # T = self.max_seq_length
        T = feats.shape[1]
        batch_size = feats.shape[0]

        # batch_transitions=self.transitions.expand(batch_size,self.num_labels,self.num_labels)

        log_delta = torch.Tensor(batch_size, 1, self.num_labels).fill_(-10000.).to(self.device)
        log_delta[:, 0, self.start_label_id] = 0
        
        # psi is for the vaule of the last latent that make P(this_latent) maximum.
        psi = torch.zeros((batch_size, T, self.num_labels), dtype=torch.long).to(self.device)  # psi[0]=0000 useless
        for t in range(1, T):
            # delta[t][k]=max_z1:t-1( p(x1,x2,...,xt,z1,z2,...,zt-1,zt=k|theta) )
            # delta[t] is the max prob of the path from  z_t-1 to z_t[k]
            log_delta, psi[:, t] = torch.max(self.transitions + log_delta, -1)
            # psi[t][k]=argmax_z1:t-1( p(x1,x2,...,xt,z1,z2,...,zt-1,zt=k|theta) )
            # psi[t][k] is the path choosed from z_t-1 to z_t[k],the value is the z_state(is k) index of z_t-1
            log_delta = (log_delta + feats[:, t]).unsqueeze(1)

        # trace back
        path = torch.zeros((batch_size, T), dtype=torch.long).to(self.device)

        # max p(z1:t,all_x|theta)
        max_logLL_allz_allx, path[:, -1] = torch.max(log_delta.squeeze(), -1)

        for t in range(T-2, -1, -1):
            # choose the state of z_t according the state choosed of z_t+1.
            path[:, t] = psi[:, t+1].gather(-1,path[:, t+1].view(-1,1)).squeeze()

        return max_logLL_allz_allx, path

    def neg_log_likelihood(self, input_ids, segment_ids, input_mask, label_ids):
        bert_feats = self._get_bert_features(input_ids, segment_ids, input_mask)
        forward_score = self._forward_alg(bert_feats)
        # p(X=w1:t,Zt=tag1:t)=...p(Zt=tag_t|Zt-1=tag_t-1)p(xt|Zt=tag_t)...
        gold_score = self._score_sentence(bert_feats, label_ids)
        # - log[ p(X=w1:t,Zt=tag1:t)/p(X=w1:t) ] = - log[ p(Zt=tag1:t|X=w1:t) ]
        return torch.mean(forward_score - gold_score)

    # this forward is just for predict, not for train
    # dont confuse this with _forward_alg above.
    def forward(self, input_ids, segment_ids, input_mask):
        # Get the emission scores from the BiLSTM
        bert_feats = self._get_bert_features(input_ids, segment_ids, input_mask)

        # Find the best path, given the features.
        score, label_seq_ids = self._viterbi_decode(bert_feats)
        gold_score = self._score_sentence(bert_feats, label_seq_ids)

        return torch.mean(gold_score - score), label_seq_ids


#bert_model = BertModel.from_pretrained(bert_model_scale)

bert_model = AutoModel.from_pretrained(BERT_VARIANT, output_hidden_states=False, return_dict = False)
start_label_id = tag2idx["[CLS]"]
stop_label_id = tag2idx["[SEP]"]

model = BERT_CRF_NER(bert_model, start_label_id, stop_label_id, len(tag2idx), max_len, batch_size, device)


In [21]:
model.cuda();
# loss_weights = torch.FloatTensor(class_weights).cuda()

In [22]:
# Prepare optimizer
param_optimizer = list(model.named_parameters())
weight_decay_finetune = 1e-5
weight_decay_crf_fc = 5e-6
lr0_crf_fc = 8e-5
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
new_param = ['transitions', 'hidden2label.weight', 'hidden2label.bias']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) \
        and not any(nd in n for nd in new_param)], 'weight_decay': weight_decay_finetune},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) \
        and not any(nd in n for nd in new_param)], 'weight_decay': 0.0},
    {'params': [p for n, p in param_optimizer if n in ('transitions','hidden2label.weight')] \
        , 'lr':lr0_crf_fc, 'weight_decay': weight_decay_crf_fc},
    {'params': [p for n, p in param_optimizer if n == 'hidden2label.bias'] \
        , 'lr':lr0_crf_fc, 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=1e-8)

# def warmup_linear(x, warmup=0.002):
#     if x < warmup:
#         return x/warmup
#     return 1.0 - x

In [23]:
# if full_finetuning:
#     # Fine tune model all layer parameters
#     param_optimizer = list(model.named_parameters())
#     no_decay = ['bias', 'gamma', 'beta']
#     optimizer_grouped_parameters = [
#         {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
#          'weight_decay_rate': 0.01},
#         {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
#          'weight_decay_rate': 0.0}
#     ]
# else:
#     # Only fine tune classifier parameters
#     param_optimizer = list(model.classifier.named_parameters()) 
#     optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
# optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=1e-8) # (default=1e-6)

In [24]:
# Scheduler
# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataset) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [None]:
print("\n***** Running training *****")
print("  Num examples = %d"%(len(train_dataset)))
print("  Batch size = %d"%(batch_size))
loss_values, val_loss_values = [], []
best_f1 = float("-inf")
invalid_tags = set(["X", "[CLS]", "[SEP]"])
for epoch in trange(epochs,desc="Epoch"):
    model.train();
    tr_loss = 0
    # nb_tr_examples = 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        # clear any previously calculated gradients
        model.zero_grad()

        loss = model.neg_log_likelihood(b_input_ids, None, b_input_mask, b_labels)

        # forward pass
        # outputs = model(b_input_ids, token_type_ids=None, 
        #                 attention_mask=b_input_mask, labels = b_labels)
        # loss = outputs[0]
        

        # Custom loss calculation
        # outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels = None)
        # logits = outputs[0]
        # loss = None
        # attention_mask = b_input_mask
        # labels = b_labels

        # loss_fct = CrossEntropyLoss(weight=loss_weights)
        # if attention_mask is not None:
        #     active_loss = attention_mask.view(-1) == 1
        #     active_logits = logits.view(-1, len(tag2idx))
        #     active_labels = torch.where(active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels))
        #     loss = loss_fct(active_logits, active_labels)
        # else:
        #     loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        # backward pass
        loss.backward()
        
        # track train loss
        tr_loss += loss.item()
        # nb_tr_examples += b_input_ids.size(0)
        
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        
        # update parameters
        optimizer.step()
        scheduler.step()
        

    # print and store train loss
    train_loss = (tr_loss / len(train_dataset))
    loss_values.append(train_loss)
    print("Train loss: {}".format(train_loss))

    # VALIDATION STEP
    model.eval();
    val_loss = 0
    # nb_eval_examples = 0
    predictions , true_labels = [], []

    for batch in val_dataloader:
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
          # forward pass
          # outputs = model(b_input_ids, token_type_ids=None,
          # attention_mask=b_input_mask, labels=b_labels)
          _, predicted_label_seq_ids = model(b_input_ids, None, b_input_mask)

        # Move logits and labels to CPU
        # logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predicted_label_ids = predicted_label_seq_ids.to('cpu').numpy()

        # Calculate the loss for this batch of test sentences.
        val_loss += loss.item()
        # nb_eval_examples += b_input_ids.size(0)
        # predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        predictions.extend(predicted_label_ids)
        true_labels.extend(label_ids)
        
    eval_loss = (val_loss / len(val_dataset))
    val_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))

    # pred_tags = [[tag2name[p_i] for p_i, l_i in zip(p, l) if (tag2name[l_i] != "X" and tag2name[l_i] != "[CLS]" and tag2name[l_i] != "[SEP]")] for p, l in zip(predictions, true_labels)]
    pred_tags = [[tag2name[p_i] for p_i, l_i in zip(p, l) if tag2name[l_i] not in invalid_tags] for p, l in zip(predictions, true_labels)]
    # valid_tags = [[tag2name[l_i] for l_i in l if (tag2name[l_i] != "X" and tag2name[l_i] != "[CLS]" and tag2name[l_i] != "[SEP]")] for l in true_labels]
    valid_tags = [[tag2name[l_i] for l_i in l if tag2name[l_i] not in invalid_tags] for l in true_labels]

    report = classification_report(valid_tags, pred_tags,digits=4)
    print("***** Eval results *****")
    print("\n%s"%(report))
    f1 = f1_score(valid_tags, pred_tags)
    print("F1 score: %f"%(f1))
    print("Accuracy score: %f"%(accuracy_score(valid_tags, pred_tags)))

    # SAVE MODEL
    if f1 > best_f1:
      best_f1 = f1
      print('Saving model for BEST f1 - ', best_f1)
      torch.save({'epoch': epoch, 'model_state': model.state_dict(), 'valid_f1': best_f1}, MODEL_PATH)
      tokenizer.save_vocabulary(MODEL_DIR)
      # savemodel = model.module if hasattr(model, 'module') else model
      # torch.save(savemodel.state_dict(), MODEL_PATH)
      # savemodel.config.to_json_file(CONFIG_PATH)
      


    
    print()

Epoch:   0%|          | 0/30 [00:00<?, ?it/s]


***** Running training *****
  Num examples = 8121
  Batch size = 16
Train loss: 613.3536532528514
Validation loss: 629.2907700614072




***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.1778    0.0777    0.1081       103
        Drug     0.9008    0.9070    0.9039      2043
      Reason     0.5027    0.6286    0.5586       447

   micro avg     0.8050    0.8261    0.8154      2593
   macro avg     0.5271    0.5378    0.5236      2593
weighted avg     0.8035    0.8261    0.8128      2593

F1 score: 0.815379
Accuracy score: 0.984810
Saving model for BEST f1 -  0.8153787590407309


Epoch:   3%|▎         | 1/30 [09:51<4:45:45, 591.21s/it]


Train loss: 610.2353538472787
Validation loss: 626.441165565154
***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.3690    0.3010    0.3316       103
        Drug     0.9145    0.9055    0.9100      2043
      Reason     0.5801    0.6398    0.6085       447

   micro avg     0.8335    0.8357    0.8346      2593
   macro avg     0.6212    0.6154    0.6167      2593
weighted avg     0.8352    0.8357    0.8350      2593

F1 score: 0.834585
Accuracy score: 0.986014
Saving model for BEST f1 -  0.8345850182938571


Epoch:   7%|▋         | 2/30 [19:56<4:37:54, 595.51s/it]


Train loss: 607.2534104530307
Validation loss: 623.2900512593671
***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.3265    0.3107    0.3184       103
        Drug     0.8936    0.9212    0.9072      2043
      Reason     0.4778    0.6734    0.5590       447

   micro avg     0.7816    0.8542    0.8163      2593
   macro avg     0.5660    0.6351    0.5949      2593
weighted avg     0.7994    0.8542    0.8238      2593



Epoch:  10%|█         | 3/30 [29:55<4:28:26, 596.54s/it]

F1 score: 0.816289
Accuracy score: 0.983320

Train loss: 604.0196493971071
Validation loss: 619.950454699209
***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.3217    0.3592    0.3394       103
        Drug     0.8842    0.9232    0.9033      2043
      Reason     0.5050    0.6801    0.5796       447

   micro avg     0.7814    0.8589    0.8183      2593
   macro avg     0.5703    0.6542    0.6074      2593
weighted avg     0.7965    0.8589    0.8251      2593



Epoch:  13%|█▎        | 4/30 [39:55<4:18:53, 597.44s/it]

F1 score: 0.818299
Accuracy score: 0.983871

Train loss: 600.5668678354151
Validation loss: 616.2025623178602
***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.3468    0.4175    0.3789       103
        Drug     0.8879    0.9232    0.9052      2043
      Reason     0.5320    0.6689    0.5927       447

   micro avg     0.7929    0.8592    0.8247      2593
   macro avg     0.5889    0.6698    0.6256      2593
weighted avg     0.8051    0.8592    0.8304      2593



Epoch:  17%|█▋        | 5/30 [49:56<4:09:24, 598.59s/it]

F1 score: 0.824727
Accuracy score: 0.984324

Train loss: 596.9208279099903
Validation loss: 612.4746923136969
***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.3577    0.4272    0.3894       103
        Drug     0.9032    0.9139    0.9085      2043
      Reason     0.5731    0.6398    0.6047       447

   micro avg     0.8170    0.8473    0.8319      2593
   macro avg     0.6114    0.6603    0.6342      2593
weighted avg     0.8247    0.8473    0.8355      2593



Epoch:  20%|██        | 6/30 [59:57<3:59:44, 599.35s/it]

F1 score: 0.831882
Accuracy score: 0.985438

Train loss: 593.0654325735362
Validation loss: 608.437340627602
***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.3800    0.3689    0.3744       103
        Drug     0.9074    0.9109    0.9091      2043
      Reason     0.6096    0.6219    0.6157       447

   micro avg     0.8351    0.8396    0.8373      2593
   macro avg     0.6323    0.6339    0.6331      2593
weighted avg     0.8351    0.8396    0.8373      2593

F1 score: 0.837308
Accuracy score: 0.986597
Saving model for BEST f1 -  0.8373076923076924


Epoch:  23%|██▎       | 7/30 [1:10:00<3:50:11, 600.50s/it]


Train loss: 589.0425648853089
Validation loss: 604.0624512125312
***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.4144    0.4466    0.4299       103
        Drug     0.8965    0.9119    0.9041      2043
      Reason     0.6164    0.6577    0.6364       447

   micro avg     0.8263    0.8496    0.8378      2593
   macro avg     0.6424    0.6721    0.6568      2593
weighted avg     0.8291    0.8496    0.8391      2593

F1 score: 0.837802
Accuracy score: 0.986072
Saving model for BEST f1 -  0.8378018634721429


Epoch:  27%|██▋       | 8/30 [1:20:06<3:40:45, 602.07s/it]


Train loss: 584.7956064668375
Validation loss: 599.764190648418
***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.4184    0.3981    0.4080       103
        Drug     0.9052    0.9158    0.9105      2043
      Reason     0.6211    0.6309    0.6260       447

   micro avg     0.8377    0.8461    0.8419      2593
   macro avg     0.6482    0.6482    0.6481      2593
weighted avg     0.8369    0.8461    0.8415      2593

F1 score: 0.841903
Accuracy score: 0.986500
Saving model for BEST f1 -  0.841903300076746


Epoch:  30%|███       | 9/30 [1:30:09<3:30:46, 602.20s/it]


Train loss: 580.3393622629602
Validation loss: 595.0070221430059
***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.4087    0.4563    0.4312       103
        Drug     0.8991    0.9202    0.9095      2043
      Reason     0.5912    0.6600    0.6237       447

   micro avg     0.8214    0.8569    0.8388      2593
   macro avg     0.6330    0.6788    0.6548      2593
weighted avg     0.8265    0.8569    0.8413      2593



Epoch:  33%|███▎      | 10/30 [1:40:10<3:20:42, 602.11s/it]

F1 score: 0.838807
Accuracy score: 0.985956

Train loss: 575.7025233817033
Validation loss: 589.9684280027061
***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.4375    0.4078    0.4221       103
        Drug     0.8940    0.9202    0.9069      2043
      Reason     0.6190    0.6398    0.6293       447

   micro avg     0.8298    0.8515    0.8405      2593
   macro avg     0.6502    0.6559    0.6528      2593
weighted avg     0.8284    0.8515    0.8398      2593



Epoch:  37%|███▋      | 11/30 [1:50:10<3:10:25, 601.34s/it]

F1 score: 0.840502
Accuracy score: 0.986390

Train loss: 570.9192069797823
Validation loss: 584.6872170326811
***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.3984    0.4757    0.4336       103
        Drug     0.8952    0.9280    0.9113      2043
      Reason     0.5992    0.6555    0.6261       447

   micro avg     0.8198    0.8631    0.8409      2593
   macro avg     0.6309    0.6864    0.6570      2593
weighted avg     0.8244    0.8631    0.8432      2593



Epoch:  40%|████      | 12/30 [2:00:08<3:00:04, 600.27s/it]

F1 score: 0.840879
Accuracy score: 0.986312

Train loss: 565.9539300237233
Validation loss: 580.0818230901332
***** Eval results *****

              precision    recall  f1-score   support

         ADE     0.3945    0.4175    0.4057       103
        Drug     0.9082    0.9099    0.9090      2043
      Reason     0.6188    0.6644    0.6408       447

   micro avg     0.8342    0.8481    0.8411      2593
   macro avg     0.6405    0.6639    0.6518      2593
weighted avg     0.8379    0.8481    0.8428      2593



Epoch:  43%|████▎     | 13/30 [2:10:08<2:50:05, 600.35s/it]

F1 score: 0.841079
Accuracy score: 0.986111



In [None]:
# print('Saving model for BEST loss - ', best_val_loss)
# savemodel = model.module if hasattr(model, 'module') else model
# torch.save(savemodel.state_dict(), MODEL_PATH)
# savemodel.config.to_json_file(CONFIG_PATH)
# tokenizer.save_vocabulary(MODEL_DIR)

In [None]:
!ls '$MODEL_DIR'

# Analyse

In [None]:
# Use plot styling from seaborn.
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curve.
plt.plot(loss_values, 'b-o', label="training loss")
plt.plot(val_loss_values, 'r-o', label="validation loss")

# Label the plot.
plt.title("Learning curve")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

plt.savefig(OUTPUT_DIR + "/loss.png")

plt.show()

In [None]:
!ls '$OUTPUT_DIR'