**Test the BioNER model on the N2C2 2018 Track 2 dataset using a trained Clinical-BERT model. Results are saved to /output.**

**Data versions**
- v1 = Sentence-level input + Overlap filtering + max seq len(~192)
- **(BEST)v2** = v1 + reduced max seq length to ~100

**Model versions**

BERT-LR
- v1 = Bio_Discharge_Summary_BERT(data=v1)
- v2 = Bio_Discharge_Summary_BERT(data=v1) trained with weights, scheduler
- v3 = reduced max seq length 128 , 150 epoch, 16 batch, 2e-5 lr(val= 70)
- v4 = reduced max seq length 128(272) , 150 epoch, 32 batch, 3e-5 lr, dropout = 0.1 (val= 70, test(strict)= 73, test(lenient)= 84)

Note: a run with max seq length ~350 was stopped because the F1 was only 0.40 at the 80th epoch (attributed to information lost via clipping).

BERT-CRF
- **(BEST)v5** = BERT-CRF with max seq length 128(272) (data=v2) (val= 84, test(strict)= 85, test(lenient)= 90)
- v6 = BERT-CRF with max seq length 128(384) (data=v2)


# Initialize Parameters

In [None]:
# Mount Google Drive at /content/gdrive (Colab only); the project directory
# used below lives under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Versions of the preprocessed data and of the trained model being evaluated.
DATA_VER = "v2"
MODEL_VER = "v5"

# Project root on the mounted Drive and the version-specific directories.
PARENT_DIR = "/content/gdrive/My Drive/projects/biomedical_ner"
TEST_DIR = f"{PARENT_DIR}/data/{DATA_VER}/test"
MODEL_DIR = f"{PARENT_DIR}/model/{MODEL_VER}"
OUTPUT_DIR = f"{PARENT_DIR}/output/{MODEL_VER}"

# Files read from the model directory.
MODEL_PATH = f"{MODEL_DIR}/pytorch_model.pt"
VOCAB_PATH = f"{MODEL_DIR}/vocab.txt"

# Files written to the output directory.
PREDICTIONS_PATH = f"{OUTPUT_DIR}/predictions.csv"
REPORT_PATH = f"{OUTPUT_DIR}/result.txt"
CONFUSION_MATRIX_PATH = f"{OUTPUT_DIR}/confusion_matrix.csv"
SCORES_PATH = f"{OUTPUT_DIR}/scores.csv"

# Hugging Face identifier of the pretrained encoder.
BERT_VARIANT = "emilyalsentzer/Bio_Discharge_Summary_BERT"

In [None]:
import os

# Create the output directory (and any missing parents). exist_ok=True makes
# the call idempotent and avoids the check-then-create race of os.path.exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# Number of sentences per batch handed to the evaluation DataLoader.
batch_size = 16
# Length every word-piece sequence is padded/truncated to by ClinicalDataset.
max_len = 272 # tried 384
# Tag used to pad label sequences up to max_len.
pad_label = "X"

# Requirements Installation

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

!pip install seqeval
!pip install transformers



# Imports

In [None]:
import pandas as pd
import math
import numpy as np
from seqeval.metrics import classification_report,accuracy_score,f1_score
import torch.nn.functional as F
import torch
import os
from tqdm import tqdm,trange
from torch.optim import Adam
from torch.utils.data import DataLoader, SequentialSampler, Dataset, ConcatDataset
from keras.preprocessing.sequence import pad_sequences
from transformers import AutoModel, BertTokenizer, AutoConfig, AutoModelForTokenClassification, AutoTokenizer, BertForTokenClassification
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import torch.nn as nn

In [None]:
# Check library version
!pip list | grep -E 'transformers|torch|Keras'

Keras                         2.4.3          
Keras-Preprocessing           1.1.2          
torch                         1.7.0+cu101    
torchsummary                  1.5.1          
torchtext                     0.3.1          
torchvision                   0.8.1+cu101    
transformers                  4.0.0          


# Setup Mapping

In [None]:
# Tag vocabulary: BIO tags for the three entity types, the outside tag "O",
# the padding tag "X" and the two special-token tags.
tag2idx = {
    'B-Drug': 0,
    'I-Drug': 1,
    'B-Reason': 2,
    'I-Reason': 3,
    'B-ADE': 4,
    'I-ADE': 5,
    'O': 6,
    'X': 7,
    '[CLS]': 8,
    '[SEP]': 9,
}
# Reverse lookup (label id -> tag string), used to decode label ids.
tag2name = {idx: tag for tag, idx in tag2idx.items()}

# Setup GPU

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Number of CUDA devices reported by PyTorch; the bare expression below
# displays it as the cell output.
n_gpu = torch.cuda.device_count()
n_gpu

1

# Prepare Data

In [None]:
!ls '$TEST_DIR' | wc -l

202


In [None]:
class ClinicalDataset(Dataset):
    """Sentence-level token-classification dataset built from one annotated CSV.

    The CSV is read without a header row into the columns ``patientID``,
    ``sentenceID``, ``token`` and ``tag``. Tokens are grouped into sentences by
    ``sentenceID``, split into word pieces with ``tokenizer``, wrapped in
    ``[CLS]``/``[SEP]`` and padded/truncated to ``max_seq_len``.

    Each item is an ``(input_ids, attention_mask, label_ids)`` tuple of tensors.

    Note:
        ``prepare_data`` relies on two module-level names that must exist
        before an instance is created: ``pad_label`` (the tag used to pad label
        sequences) and ``y_text`` (a list that receives the padded word-piece
        text of every sentence).

    Args:
        file: Name of the CSV file inside ``path``.
        path: Directory containing ``file``.
        max_seq_len: Length every sequence is padded/truncated to.
        tag2idx: Mapping from tag string to label id.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
    """

    def __init__(self, file, path, max_seq_len, tag2idx, tokenizer):
        self.max_seq_len = max_seq_len
        self.path = os.path.join(path, file)
        # keep_default_na=False keeps tokens such as "NA" or "null" as strings
        # instead of letting pandas turn them into NaN.
        self.df = pd.read_csv(self.path, names=['patientID', 'sentenceID', 'token', 'tag'], keep_default_na=False)
        self.tag2idx = tag2idx
        self.tokenizer = tokenizer
        # Convert tokens and tags to padded id tensors.
        self.prepare_data()

    def prepare_data(self):
        """Tokenize every sentence and build the padded id/label/mask tensors."""
        sentences, labels = self.get_sentences(self.df)
        tokenized_texts, word_piece_labels = self.tokenize_text(sentences, labels)

        # Word pieces -> vocabulary ids, padded with pad_sequences' default value 0.
        input_ids = pad_sequences([self.tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                                  maxlen=self.max_seq_len, dtype="long", truncating="post", padding="post")

        # Tags -> label ids, padded with the id of `pad_label`. The instance's
        # own mapping is used (not the module-level one) and an unknown tag
        # raises KeyError naming the tag instead of silently becoming None.
        tags = pad_sequences([[self.tag2idx[tag] for tag in sent_tags] for sent_tags in word_piece_labels],
                             maxlen=self.max_seq_len, value=self.tag2idx[pad_label],
                             padding="post", dtype="long", truncating="post")

        # Padded word-piece text, kept for later analysis.
        input_text = pad_sequences(tokenized_texts,
                                   maxlen=self.max_seq_len, value="[PAD]",
                                   padding="post", dtype=object, truncating="post")

        # Side effect: appends to the module-level `y_text` list.
        y_text.extend(input_text)

        # Attention mask: 1 for every id > 0, 0 for padding positions
        # (assumes the tokenizer never maps a real token to id 0).
        attention_masks = (input_ids > 0).astype("int64")

        self.Sentences = torch.tensor(input_ids)
        self.label_data = torch.tensor(tags)
        self.attention_masks = torch.tensor(attention_masks)

    def get_sentences(self, data):
        """Group rows by ``sentenceID`` and return ``(sentences, labels)``.

        Both results are lists with one inner list per sentence (tokens and
        tags respectively), ordered by sorted ``sentenceID``.
        """
        sentences = []
        labels = []
        for _, group in data.groupby("sentenceID"):
            sentences.append(group["token"].tolist())
            labels.append(group["tag"].tolist())
        return sentences, labels

    def tokenize_text(self, sentences, labels):
        """Split words into word pieces and propagate the tags to every piece.

        The first piece of a word keeps the word's tag. Later pieces of a
        ``B-*`` word get the matching ``I-*`` tag; for any other tag all
        pieces repeat the word's tag. Each sentence is wrapped in
        ``[CLS]`` / ``[SEP]`` (used both as tokens and as their tags).

        Returns:
            ``(tokenized_texts, word_piece_labels)``: parallel lists of lists.
        """
        tokenized_texts = []
        word_piece_labels = []
        for word_list, label in zip(sentences, labels):
            # Add [CLS] at the front
            temp_token = ['[CLS]']
            temp_label = ['[CLS]']

            for word, lab in zip(word_list, label):
                for piece_idx, piece in enumerate(self.tokenizer.tokenize(word)):
                    temp_token.append(piece)
                    if piece_idx > 0 and lab.startswith('B'):
                        # Continuation piece of an entity-opening word.
                        temp_label.append('I-' + lab.split('-')[1])
                    else:
                        temp_label.append(lab)

            # Add [SEP] at the end
            temp_token.append('[SEP]')
            temp_label.append('[SEP]')

            tokenized_texts.append(temp_token)
            word_piece_labels.append(temp_label)

        return tokenized_texts, word_piece_labels

    def __len__(self):
        """Return the number of sentences in this file."""
        return len(self.Sentences)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label_ids)`` for sentence ``idx``."""
        return self.Sentences[idx], self.attention_masks[idx], self.label_data[idx]

In [None]:
# Tokenizer
# Build the tokenizer from the vocab.txt stored in the trained model's directory.
tokenizer = BertTokenizer(vocab_file=VOCAB_PATH)

In [None]:
# TEST DATASET
# `y_text` must exist before any ClinicalDataset is built: each dataset
# appends the padded word-piece text of its sentences to this list.
y_text = []

# os.listdir returns entries in arbitrary order; sorting keeps the sentence
# order (and therefore the row order of the saved predictions) reproducible.
test_datasets = [
    ClinicalDataset(doc, TEST_DIR, max_len, tag2idx, tokenizer)
    for doc in sorted(os.listdir(TEST_DIR))
]

# concatenate CSV data
test_dataset = ConcatDataset(test_datasets)

# Sequential sampling keeps batches in dataset order.
test_sampler = SequentialSampler(test_dataset)

test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size) # drop_last=True

In [None]:
# Sanity check: number of sentences and number of batches in the test loader.
print(f'Dataset length - {len(test_dataset)}, Dataloader length - {len(test_dataloader)}')

Dataset length - 6115, Dataloader length - 383


# Load Model

[ref](https://github.com/Louis-udm/NER-BERT-CRF/blob/master/NER_BERT_CRF.py)

In [None]:
def log_sum_exp_1vec(vec):  # shape(1,m)
    """Return ``log(sum(exp(vec)))`` over all entries of ``vec`` as a 0-dim tensor.

    The maximum is subtracted before exponentiating for numerical stability.
    It is taken with ``torch.max`` rather than ``np.argmax`` so the function
    also works for CUDA tensors and tensors that require grad.
    """
    max_score = torch.max(vec)
    return max_score + torch.log(torch.sum(torch.exp(vec - max_score)))

def log_sum_exp_mat(log_M, axis=-1):  # shape(n,m)
    """Return the numerically stable log-sum-exp of ``log_M`` along ``axis``.

    The per-slice maximum is kept as a broadcastable dimension
    (``keepdim=True``), so the function is correct for any ``axis`` and not
    only for the last one.
    """
    max_vals = torch.max(log_M, axis, keepdim=True)[0]
    return max_vals.squeeze(axis) + torch.log(torch.exp(log_M - max_vals).sum(axis))

def log_sum_exp_batch(log_Tensor, axis=-1): # shape (batch_size,n,m)
    """Return the numerically stable log-sum-exp of ``log_Tensor`` along ``axis``.

    The reduced dimension is removed from the result, e.g. a
    ``(batch_size, n, m)`` input reduced over the last axis gives
    ``(batch_size, n)``. The maximum is kept broadcastable with
    ``keepdim=True`` so any ``axis`` is handled correctly.
    """
    max_vals = torch.max(log_Tensor, axis, keepdim=True)[0]
    return max_vals.squeeze(axis) + torch.log(torch.exp(log_Tensor - max_vals).sum(axis))


class BERT_CRF_NER(nn.Module):
    """BERT encoder followed by a linear emission layer and a linear-chain CRF.

    ``forward`` runs Viterbi decoding and is meant for prediction only;
    ``neg_log_likelihood`` is the training loss. Position 0 of every sequence
    is treated as the start label, so all recursions begin at ``t = 1``.
    The attention mask is only passed to the encoder: the CRF recursions run
    over all ``T`` positions, padding included.

    Args:
        bert_model: Encoder called as ``bert_model(input_ids,
            token_type_ids=..., attention_mask=...)``; its first output must be
            the per-token hidden states.
        start_label_id: Label id of the start tag.
        stop_label_id: Label id of the stop tag.
        num_labels: Size of the label set.
        max_seq_length: Unused; kept so existing callers keep working
            (the sequence length is read from the input instead).
        batch_size: Stored on the instance; the actual batch size is read from
            the input tensors.
        device: Device on which the CRF work tensors are created.
    """

    def __init__(self, bert_model, start_label_id, stop_label_id, num_labels, max_seq_length, batch_size, device):
        super(BERT_CRF_NER, self).__init__()
        # Use the encoder's configured width when it exposes one; 768 is the
        # fallback for encoders without a `config.hidden_size`.
        self.hidden_size = getattr(getattr(bert_model, "config", None), "hidden_size", 768)
        self.start_label_id = start_label_id
        self.stop_label_id = stop_label_id
        self.num_labels = num_labels
        self.batch_size = batch_size
        self.device = device

        # Pretrained encoder
        self.bert = bert_model
        self.dropout = torch.nn.Dropout(0.2)
        # Maps the output of the encoder into label space.
        self.hidden2label = nn.Linear(self.hidden_size, self.num_labels)

        # Matrix of transition parameters.  Entry i,j is the score of transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.num_labels, self.num_labels))

        # These two statements enforce the constraint that we never transfer *to* the start label
        # and never transfer *from* the stop label.
        self.transitions.data[start_label_id, :] = -10000
        self.transitions.data[:, stop_label_id] = -10000

        nn.init.xavier_uniform_(self.hidden2label.weight)
        nn.init.constant_(self.hidden2label.bias, 0.0)

    def init_bert_weights(self, module):
        """Initialize the weights of ``module`` (for use with ``self.apply``).

        Linear/embedding weights are drawn from a normal distribution whose
        std is the encoder config's ``initializer_range`` (0.02 when the
        encoder has no such setting); LayerNorm is reset to weight 1 / bias 0;
        linear biases are zeroed.
        """
        initializer_range = getattr(getattr(self.bert, "config", None), "initializer_range", 0.02)
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=initializer_range)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def _forward_alg(self, feats):
        """Forward (alpha) recursion over the emission scores ``feats``.

        Args:
            feats: Emission scores of shape ``(batch, T, num_labels)``.

        Returns:
            Tensor of shape ``(batch, 1)`` with the log-sum-exp of the final
            alpha values.
        """
        T = feats.shape[1]
        batch_size = feats.shape[0]

        # All of the initial score mass sits on the start label (log 1 = 0).
        log_alpha = torch.full((batch_size, 1, self.num_labels), -10000., device=self.device)
        log_alpha[:, 0, self.start_label_id] = 0

        for t in range(1, T):
            # (num_labels, num_labels) + (batch, 1, num_labels): sum out the previous label.
            log_alpha = (log_sum_exp_batch(self.transitions + log_alpha, axis=-1) + feats[:, t]).unsqueeze(1)

        log_prob_all_barX = log_sum_exp_batch(log_alpha)
        return log_prob_all_barX

    def _get_bert_features(self, input_ids, segment_ids, input_mask):
        """Return emission scores: encoder hidden states -> dropout -> linear layer."""
        outputs = self.bert(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
        # Index instead of tuple-unpacking so the first output is taken
        # regardless of how many outputs the encoder returns.
        bert_seq_out = outputs[0]
        bert_seq_out = self.dropout(bert_seq_out)
        bert_feats = self.hidden2label(bert_seq_out)
        return bert_feats

    def _score_sentence(self, feats, label_ids):
        """Return the unnormalized score of the label sequences ``label_ids``.

        Sums, for ``t = 1 .. T-1``, the transition score from label ``t-1`` to
        label ``t`` and the emission score of label ``t``.

        Args:
            feats: Emission scores of shape ``(batch, T, num_labels)``.
            label_ids: Long tensor of shape ``(batch, T)``.

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        T = feats.shape[1]
        batch_size = feats.shape[0]

        # Flatten the transition matrix so entry [to, from] sits at index to * num_labels + from.
        batch_transitions = self.transitions.expand(batch_size, self.num_labels, self.num_labels)
        batch_transitions = batch_transitions.flatten(1)

        # Created on the model's own device rather than a module-level global.
        score = torch.zeros((batch_size, 1), device=self.device)
        # Position 0 is the start label, so scoring begins at t = 1.
        for t in range(1, T):
            score = score + \
                batch_transitions.gather(-1, (label_ids[:, t]*self.num_labels+label_ids[:, t-1]).view(-1, 1)) \
                    + feats[:, t].gather(-1, label_ids[:, t].view(-1, 1)).view(-1, 1)
        return score

    def _viterbi_decode(self, feats):
        """Viterbi (max-product) decoding of the best label sequence.

        Args:
            feats: Emission scores of shape ``(batch, T, num_labels)``.

        Returns:
            ``(score, path)``: the best path score of shape ``(batch,)`` and
            the best label ids of shape ``(batch, T)``.
        """
        T = feats.shape[1]
        batch_size = feats.shape[0]

        log_delta = torch.full((batch_size, 1, self.num_labels), -10000., device=self.device)
        log_delta[:, 0, self.start_label_id] = 0

        # psi[:, t, k] is the best previous label when position t has label k (psi[:, 0] is unused).
        psi = torch.zeros((batch_size, T, self.num_labels), dtype=torch.long).to(self.device)
        for t in range(1, T):
            # Best score of reaching each label at t, and the previous label achieving it.
            log_delta, psi[:, t] = torch.max(self.transitions + log_delta, -1)
            log_delta = (log_delta + feats[:, t]).unsqueeze(1)

        # trace back
        path = torch.zeros((batch_size, T), dtype=torch.long).to(self.device)

        # Squeeze only the singleton middle dimension so a batch of one keeps its batch axis.
        max_logLL_allz_allx, path[:, -1] = torch.max(log_delta.squeeze(1), -1)

        for t in range(T-2, -1, -1):
            # Choose the label at t given the label already chosen at t+1.
            path[:, t] = psi[:, t+1].gather(-1, path[:, t+1].view(-1, 1)).squeeze(-1)

        return max_logLL_allz_allx, path

    def neg_log_likelihood(self, input_ids, segment_ids, input_mask, label_ids):
        """Return the mean negative log-likelihood of ``label_ids`` (training loss)."""
        bert_feats = self._get_bert_features(input_ids, segment_ids, input_mask)
        forward_score = self._forward_alg(bert_feats)
        gold_score = self._score_sentence(bert_feats, label_ids)
        # -log p(labels | input) = log-partition - gold path score
        return torch.mean(forward_score - gold_score)

    # this forward is just for predict, not for train
    # dont confuse this with _forward_alg above.
    def forward(self, input_ids, segment_ids, input_mask):
        """Predict labels: return ``(score, label_seq_ids)`` from Viterbi decoding."""
        # Get the emission scores from the encoder
        bert_feats = self._get_bert_features(input_ids, segment_ids, input_mask)

        # Find the best path, given the features.
        score, label_seq_ids = self._viterbi_decode(bert_feats)
        return score, label_seq_ids

In [None]:
# return_dict=False makes the encoder return a plain tuple of outputs.
bert_model = AutoModel.from_pretrained(BERT_VARIANT, output_hidden_states=False, return_dict=False)
start_label_id = tag2idx["[CLS]"]
stop_label_id = tag2idx["[SEP]"]

model = BERT_CRF_NER(bert_model, start_label_id, stop_label_id, len(tag2idx), max_len, batch_size, device)

# LOAD TRAINED MODEL
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
checkpoint = torch.load(MODEL_PATH, map_location='cpu')
epoch = checkpoint['epoch']
valid_f1_prev = checkpoint['valid_f1']
pretrained_dict = checkpoint['model_state']
net_state_dict = model.state_dict()
pretrained_dict_selected = {k: v for k, v in pretrained_dict.items() if k in net_state_dict}

# Only matching keys are loaded. Report mismatches so that a partially
# initialised model is not evaluated unnoticed.
missing_keys = sorted(set(net_state_dict) - set(pretrained_dict))
skipped_keys = sorted(set(pretrained_dict) - set(net_state_dict))
if missing_keys:
    print('WARNING: parameters not found in checkpoint (left at initial values):', missing_keys)
if skipped_keys:
    print('WARNING: checkpoint entries not used by the model:', skipped_keys)

net_state_dict.update(pretrained_dict_selected)
model.load_state_dict(net_state_dict)

print('Loaded the pretrain  NER_BERT_CRF  model, epoch:', epoch, 'valid f1:', valid_f1_prev)
# Move to the device selected earlier (works on CPU-only hosts too, unlike model.cuda()).
model.to(device);

Loaded the pretrain  NER_BERT_CRF  model, epoch: 13 valid f1: 0.8475551294343241


In [None]:
!ls '$MODEL_DIR'

pytorch_model.pt  vocab.txt


# Test Model

In [None]:
# Switch to evaluation mode (disables the model's dropout layer) before inference.
model.eval();

In [None]:
# Per-sentence gold / predicted tag sequences, with padding and the
# [CLS]/[SEP] positions removed.
y_true = []
y_pred = []
# Stays empty: the model returns decoded label ids only, so no per-token
# confidence is collected.
y_confidence = []

probs = []
out = []
print("***** Running evaluation *****")
print("  Num examples = {}".format(len(test_dataset)))
print("  Batch size = {}".format(batch_size))
for step, batch in enumerate(test_dataloader):
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, label_ids = batch

    # The model's forward pass returns (score, decoded label ids).
    with torch.no_grad():
        _, predicted_label_seq_ids = model(input_ids, None, input_mask)

    # Move everything to numpy for plain Python iteration.
    label_ids = label_ids.to('cpu').numpy()
    input_mask = input_mask.to('cpu').numpy()
    predicted_label_ids = predicted_label_seq_ids.to('cpu').numpy()

    for gold_row, pred_row, mask_row in zip(label_ids, predicted_label_ids, input_mask):
        sentence_true = []
        sentence_pred = []
        for gold_id, pred_id, is_token in zip(gold_row, pred_row, mask_row):
            # The first masked-out position marks the start of padding.
            if not is_token:
                break
            gold_tag = tag2name[gold_id]
            # Positions whose gold tag is a special token are not scored.
            if gold_tag == "[CLS]" or gold_tag == "[SEP]":
                continue
            sentence_true.append(gold_tag)
            sentence_pred.append(tag2name[pred_id])

        y_true.append(sentence_true)
        y_pred.append(sentence_pred)

***** Running evaluation *****
  Num examples = 6115
  Batch size = 16


# Predictions

In [None]:
# One row per sentence: the gold tag list next to the predicted tag list.
result = pd.DataFrame().assign(actual=y_true, predicted=y_pred)
# result["confidence"] = y_confidence

result.to_csv(PREDICTIONS_PATH, sep=",", encoding="utf-8", index=False)
result.head()

Unnamed: 0,actual,predicted
0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[O, O, O, O, B-Drug, I-Drug, I-Drug, I-Drug, O...","[O, O, O, O, B-Drug, I-Drug, I-Drug, I-Drug, O..."
3,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, B-Reason, I-..."
4,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


# Analysis

In [None]:
# UTILS
# Flatten the per-sentence tag lists into single token-level sequences.
y_true_total = [tag for sentence in y_true for tag in sentence]
y_pred_total = [tag for sentence in y_pred for tag in sentence]

# Strip padding and special tokens from the stored word-piece text, then flatten it.
y_text = [[piece for piece in sentence if piece not in ("[PAD]", "[CLS]", "[SEP]")] for sentence in y_text]
y_text_total = [piece for sentence in y_text for piece in sentence]
# y_confidence_total = [item for sublist in y_confidence for item in sublist]


def get_cleaned_label(label: str):
    """Return the entity type of a BIO tag ("B-Drug" -> "Drug").

    Labels without a hyphen (e.g. "O") are returned unchanged. For a label
    with several hyphens, only the part between the first two is returned.
    """
    return label.split("-")[1] if "-" in label else label


# Tag sets used by the lenient confusion-matrix pass:
# entity-opening (B-) tags and entity-continuation (I-) tags.
interested_b = {"B-Drug", "B-Reason", "B-ADE"}
interested_i = {"I-Drug", "I-Reason", "I-ADE"}
# Total number of flattened gold tags.
n = len(y_true_total)

**Classification Report**

In [None]:
from seqeval.scheme import IOB2

# Get acc , recall, F1 result report(strict)
report_strict = classification_report(y_true, y_pred, mode='strict', scheme=IOB2, digits=4)

# Compute each metric once, with the same settings as the report above, so the
# console output and the saved file can never disagree.
strict_f1 = f1_score(y_true, y_pred, mode='strict', scheme=IOB2)
token_accuracy = accuracy_score(y_true, y_pred)

print("***** Eval results(Strict) *****")
print("\n%s"%(report_strict))
print("F1 score: %f"%(strict_f1))
print("Accuracy score: %f"%(token_accuracy))

# Save the report into file
with open(REPORT_PATH, "w", encoding="utf-8") as writer:
    writer.write("F1 score(Strict):\n")
    writer.write(str(strict_f1))
    writer.write("\n\nAccuracy score:\n")
    writer.write(str(token_accuracy))
    writer.write("\n\n")
    writer.write(report_strict)

***** Eval results(Strict) *****

              precision    recall  f1-score   support

         ADE     0.5214    0.3626    0.4277       604
        Drug     0.9228    0.9315    0.9271     10569
      Reason     0.6725    0.5836    0.6249      2519

   micro avg     0.8689    0.8424    0.8554     13692
   macro avg     0.7056    0.6259    0.6599     13692
weighted avg     0.8590    0.8424    0.8495     13692

F1 score: 0.855416
Accuracy score: 0.986052


**Confusion Matrix**

In [None]:
# Lenient, entity-level confusion matrix: conf_mat[gold][predicted].
# A gold entity (a B- tag plus the I- tags that follow it) counts as correct
# as soon as ANY of its tokens is predicted with the right entity type.
conf_mat = {"Drug": {"Drug": 0, "Reason": 0, "ADE": 0, "O": 0},
            "Reason": {"Drug": 0, "Reason": 0, "ADE": 0, "O": 0},
            "ADE": {"Drug": 0, "Reason": 0, "ADE": 0, "O": 0},
            "O": {"Drug": 0, "Reason": 0, "ADE": 0, "O": 0}}

i = 0
while i < n:
    if y_true_total[i] in interested_b:

        # If first label matches just mark TP and move ahead till the end of I- tags
        if get_cleaned_label(y_pred_total[i]) == get_cleaned_label(y_true_total[i]):
            conf_mat[get_cleaned_label(y_true_total[i])][get_cleaned_label(y_true_total[i])] += 1
            i += 1
            while i < n and y_true_total[i] in interested_i:
                i += 1

        else:
            wrong_pred = get_cleaned_label(y_pred_total[i])
            # A prediction that is not a matrix column (e.g. "X", "[CLS]",
            # "[SEP]") is counted as a miss instead of raising KeyError later.
            if wrong_pred not in conf_mat:
                wrong_pred = "O"

            i += 1
            # start checking I- tags till they either exhaust or atleast cleaned label matches
            while i < n and y_true_total[i] in interested_i and get_cleaned_label(y_pred_total[i]) != get_cleaned_label(y_true_total[i]):
                # fetching what the wrongly predicted entity is
                if get_cleaned_label(y_pred_total[i]) in conf_mat:
                    wrong_pred = get_cleaned_label(y_pred_total[i])
                i += 1

            # The scan above stops at i == n at the latest, so the end-of-data
            # test must be >=; with > the lookup below would raise IndexError
            # whenever the last entity of the data is never matched.
            if i >= n:
                # marking the false negative of true tag
                conf_mat[get_cleaned_label(y_true_total[i-1])][wrong_pred] += 1
                break

            if y_true_total[i] not in interested_i:
                # marking the false negative of true tag
                conf_mat[get_cleaned_label(y_true_total[i-1])][wrong_pred] += 1

            else:
                # lenient marking if atleast something matches and moving ahead till the end of I- tags
                conf_mat[get_cleaned_label(y_true_total[i])][get_cleaned_label(y_true_total[i])] += 1
                while i < n and y_true_total[i] in interested_i:
                    i += 1

    else:
        # Gold tag does not open an entity: count the token only when the
        # prediction is "O" or opens an entity (predicted I- tags are skipped).
        if y_pred_total[i].startswith('B') or y_pred_total[i].startswith('O'):
            conf_mat[get_cleaned_label(y_true_total[i])][get_cleaned_label(y_pred_total[i])] += 1
        i += 1


# NOTE: this name shadows sklearn's `confusion_matrix` function imported at the
# top of the notebook; it is kept unchanged here.
confusion_matrix = pd.DataFrame.from_dict(conf_mat, orient='index')
# The saved matrix keeps only the entity rows; the "O" column stays.
confusion_matrix.drop("O", inplace=True)
confusion_matrix.to_csv(CONFUSION_MATRIX_PATH, sep=",", encoding="utf-8")

for key in conf_mat:
    print(key, conf_mat[key])
confusion_matrix

Drug {'Drug': 10155, 'Reason': 3, 'ADE': 0, 'O': 411}
Reason {'Drug': 11, 'Reason': 1732, 'ADE': 30, 'O': 746}
ADE {'Drug': 1, 'Reason': 33, 'ADE': 288, 'O': 282}
O {'Drug': 482, 'Reason': 491, 'ADE': 136, 'O': 736306}


Unnamed: 0,Drug,Reason,ADE,O
Drug,10155,3,0,411
Reason,11,1732,30,746
ADE,1,33,288,282


**Entity Count**

In [None]:
def get_counts(conf_mat):
    """Return the row total (number of gold items) for every row of ``conf_mat``.

    The three entity types are always present in the result (0 when the matrix
    has no such row); any other row of ``conf_mat`` is included as well.
    """
    counts = dict.fromkeys(("Drug", "Reason", "ADE"), 0)
    counts.update((label, sum(row.values())) for label, row in conf_mat.items())
    return counts

print('LENIENT =')
# Gold counts per confusion-matrix row.
counts = get_counts(conf_mat)
# {'Drug': 10569, 'Reason': 2519, 'ADE': 604}
for label, total in counts.items():
    print(label, total)

LENIENT =
Drug 10569
Reason 2519
ADE 604
O 737415


**Entity F1**

In [None]:
def get_entity_F1(conf_mat):
    """Compute per-entity TP/TN/FP/FN, precision, recall and F1.

    Args:
        conf_mat: Nested dict ``conf_mat[gold][predicted] -> count`` that
            contains at least the rows "Drug", "Reason" and "ADE".

    Returns:
        Dict mapping each of the three entity types to a dict with the keys
        "TP", "TN", "FP", "FN", "P", "R" and "F1". A ratio whose denominator
        is zero (e.g. precision when the entity was never predicted) is
        reported as 0.0 instead of raising ZeroDivisionError.
    """

    def _ratio(numerator, denominator):
        # Safe division: an undefined ratio is reported as 0.0.
        return numerator / denominator if denominator else 0.0

    stats = {}
    for key in ("Drug", "Reason", "ADE"):
        # Diagonal cell: gold and prediction agree.
        tp = conf_mat[key][key]
        # Rest of the gold row: the entity was predicted as something else.
        fn = sum(count for pred, count in conf_mat[key].items() if pred != key)
        # Rest of the predicted column: something else was predicted as this entity.
        fp = sum(row[key] for gold, row in conf_mat.items() if gold != key)
        # Everything outside both the gold row and the predicted column.
        tn = sum(count
                 for gold, row in conf_mat.items() if gold != key
                 for pred, count in row.items() if pred != key)

        precision = _ratio(tp, tp + fp)
        recall = _ratio(tp, tp + fn)
        f1 = _ratio(2 * precision * recall, precision + recall)
        stats[key] = {"TP": tp, "TN": tn, "FP": fp, "FN": fn,
                      "P": precision, "R": recall, "F1": f1}
    return stats

# Per-entity statistics derived from the lenient confusion matrix,
# saved as a CSV with one row per entity type.
scores = get_entity_F1(conf_mat)
scoresDf = pd.DataFrame.from_dict(scores, orient='index')
scoresDf.to_csv(SCORES_PATH, sep=",", encoding="utf-8")
# SOTA= Drug- 0.954 (P- 0.956, R- 0.952), Reason- 0.676 (P- 0.757, R- 0.611), ADE- 0.462 (P- 0.649, R- 0.358)
print('LENIENT =')
scoresDf

LENIENT =


Unnamed: 0,TP,TN,FP,FN,P,R,F1
Drug,10155,740044,494,414,0.953611,0.960829,0.957206
Reason,1732,748061,527,787,0.766711,0.687574,0.72499
ADE,288,750337,166,316,0.634361,0.476821,0.544423


**Micro, Macro, Weighted-average F1(lenient)**

In [None]:
def get_micro_f1(scores):
    """Return the micro-averaged F1 over all entity types in ``scores``.

    TP, FP and FN are summed over every entity first; precision, recall and
    F1 are then computed from the pooled counts. Returns 0.0 when the pooled
    counts leave precision, recall or F1 undefined.

    Args:
        scores: Dict mapping entity type to a dict with "TP", "FP" and "FN"
            counts (other keys are ignored; a missing count is treated as 0).
    """
    tp = sum(entity.get("TP", 0) for entity in scores.values())
    fp = sum(entity.get("FP", 0) for entity in scores.values())
    fn = sum(entity.get("FN", 0) for entity in scores.values())

    # Precision divides by everything predicted, recall by everything gold.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall == 0:
        return 0.0
    return (2 * precision * recall) / (precision + recall)

def get_macro_f1(scores):
    """Return the unweighted mean of the per-entity F1 values in ``scores``.

    The mean is taken over however many entity types ``scores`` contains
    (not a fixed count of three). Returns 0.0 for an empty ``scores``.
    """
    if not scores:
        return 0.0
    return sum(entity["F1"] for entity in scores.values()) / len(scores)

def get_weighted_f1(scores, counts):
    """Return the support-weighted mean of the per-entity F1 values.

    Args:
        scores: Dict mapping entity type to a dict containing "F1".
        counts: Dict mapping entity type to its gold count; entries for keys
            that are not in ``scores`` are ignored.

    Returns:
        ``sum(F1 * count) / sum(count)`` over the entities in ``scores``,
        or 0.0 when the total count is zero.
    """
    total_entities = sum(counts[key] for key in scores)
    if not total_entities:
        return 0.0
    weighted_sum = sum(scores[key]["F1"] * counts[key] for key in scores)
    return weighted_sum / total_entities

# Aggregate the lenient per-entity scores; `scores` holds only the three
# entity types, so the "O" row does not take part in these averages.
print('LENIENT =')
print('Micro F1- ', get_micro_f1(scores))
print('Macro F1- ', get_macro_f1(scores))
print('Weighted F1- ', get_weighted_f1(scores, counts))

LENIENT =
Micro F1-  0.9000517483551416
Macro F1-  0.7422063738498205
Weighted F1-  0.8962746239996473


**Confidence Plot**

In [None]:
# confidence = pd.DataFrame()
# confidence['input'] = y_text_total
# confidence['actual'] = y_true_total
# confidence['pred'] = y_pred_total

# confidence['correct'] = np.where((confidence['actual'] == confidence['pred']), y_confidence_total , np.NaN)
# confidence['wrong'] = np.where((confidence['actual'] != confidence['pred']), y_confidence_total, np.NaN)
# confidence['wrong_actual'] = np.where((confidence['actual'] != confidence['pred']), confidence['actual'], np.NaN)
# confidence['wrong_pred'] = np.where((confidence['actual'] != confidence['pred']), confidence['pred'], np.NaN)

# confidence['correctADE'] = np.where((confidence['actual'] == confidence['pred']) & (confidence['actual']=='ADE') , y_confidence_total , np.NaN)
# confidence['wrongADE'] = np.where((confidence['actual'] != confidence['pred']) & (confidence['actual']=='ADE'), y_confidence_total, np.NaN)

# confidence['correctReason'] = np.where((confidence['actual'] == confidence['pred']) & (confidence['actual']=='Reason') , y_confidence_total , np.NaN)
# confidence['wrongReason'] = np.where((confidence['actual'] != confidence['pred']) & (confidence['actual']=='Reason'), y_confidence_total, np.NaN)

# confidence['correctDrug'] = np.where((confidence['actual'] == confidence['pred']) & (confidence['actual']=='Drug') , y_confidence_total , np.NaN)
# confidence['wrongDrug'] = np.where((confidence['actual'] != confidence['pred']) & (confidence['actual']=='Drug'), y_confidence_total, np.NaN)

# CONFIDENCE_PATH = OUTPUT_DIR + "/confidence.csv"
# confidence.to_csv(CONFIDENCE_PATH, columns = ['input', 'wrong_actual', 'wrong_pred'])

In [None]:
# confidence.plot(y=['wrong'], style=['ro'], figsize=(72,48))

In [None]:
# confidence.plot(y=['correct'], style=['go'], figsize=(72,48))

In [None]:
!ls '$OUTPUT_DIR'

confusion_matrix.csv  predictions.csv  result.txt  strict_confusion_matrix.csv
loss.png	      result.gdoc      scores.csv
