In [1]:
import torch
import json
import numpy as np
import transformers
import pandas as pd
import pickle as pkl
from torch import nn
from tqdm import tqdm
from os.path import join
from importlib import reload
import multiprocessing as mp
from collections import Counter
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, confusion_matrix
from transformers import (BertPreTrainedModel, BertModel, AdamW, get_linear_schedule_with_warmup, 
                          RobertaPreTrainedModel, RobertaModel,
                          AutoTokenizer, AutoModel, AutoConfig)
from transformers import (WEIGHTS_NAME,
                          BertConfig, BertForTokenClassification, BertTokenizer,
                          XLMConfig, XLMForTokenClassification, XLMTokenizer,
                          DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer,
                          RobertaConfig, RobertaForTokenClassification, RobertaTokenizer)


from MIMICSocialDeter import MIMICSocialDeter

In [2]:
# Load the preprocessed corpus and the label -> id lookup tables.
# JSON text is UTF-8, so the encoding is passed explicitly instead of relying
# on the platform's default text encoding (which is not UTF-8 everywhere).

# data file (indexed by split name further down, e.g. data['train'])
with open('preprocessed_data_version_2.json', encoding='utf-8') as f:
    data = json.load(f)

# label2id for both types of labels (SBDH and UMLS)
with open('label2id_sbdh.json', encoding='utf-8') as f:
    label2id_sbdh = json.load(f)
with open('label2id_umls.json', encoding='utf-8') as f:
    label2id_umls = json.load(f)

# number of SBDH classes = number of entries in the SBDH mapping;
# it is used below as `num_labels` when the model config is created
unique_labels = list(label2id_sbdh.values())
num_labels_sbdh = len(unique_labels)
print(num_labels_sbdh)

15


In [3]:
# The tokenizer (text -> input ids), the model configuration and the model to
# be fine-tuned are all created from the same pre-trained checkpoint name.

# pre-trained checkpoint shared by tokenizer, config and model
model_name = 'bert-base-uncased'

# num_labels in the config sets the number of outputs of the classification layer
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name, num_labels=num_labels_sbdh)
model = BertForTokenClassification.from_pretrained(
    model_name,
    config=config,
)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [4]:
# Dataset object for the training split; it receives the tokenizer and both
# label maps and serves tokenized data instances through __getitem__.
train_dataset = MIMICSocialDeter(
    list_data=data['train'], tokenizer=tokenizer,
    label2id_sbdh=label2id_sbdh, label2id_umls=label2id_umls,
    max_length=512,
)

# Dataloader over the training dataset: batches of 8 instances, reshuffled on
# every pass, loaded by a single worker process.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True, num_workers=1)

In [5]:
# Training hyper-parameters.
args = dict(
    weight_decay=0.0,
    learning_rate=2e-5,
    epochs=1,
    gradient_accumulation_steps=1,
    adam_epsilon=1e-8,
)

# Total number of optimizer steps over the whole run; the first 20% of them
# are used as learning-rate warm-up.
steps_per_epoch = len(train_dataloader) // args['gradient_accumulation_steps']
args['t_total'] = steps_per_epoch * args['epochs']
args['warmup_steps'] = int(0.20 * args['t_total'])

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
# Parameter-specific updates: biases and LayerNorm weights are excluded from
# weight decay, every other parameter uses args['weight_decay'].
no_decay = ['bias', 'LayerNorm.weight']

# split the parameters in a single pass over the model
decay_params, no_decay_params = [], []
for param_name, param in model.named_parameters():
    if any(nd in param_name for nd in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': args['weight_decay']},
    {'params': no_decay_params, 'weight_decay': 0.0},
]

# torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
# Both groups set 'weight_decay' explicitly, so the optimizer-level default
# for weight decay never applies.
optimizer = torch.optim.AdamW(
    optimizer_grouped_parameters,
    lr=args['learning_rate'],
    eps=args['adam_epsilon'],
)

# Linear warm-up for args['warmup_steps'] steps, then linear decay until
# args['t_total'] steps have been taken.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=args['warmup_steps'],
    num_training_steps=args['t_total'],
)

In [7]:
# Objective function: cross entropy averaged over the target positions.
# The two arguments below are the library defaults, written out so that the
# reduction and the skipped target value (-100) are visible at a glance.
loss_fct = CrossEntropyLoss(reduction='mean', ignore_index=-100)

In [8]:
# Move the model to the device chosen earlier in the notebook (CUDA when
# available, otherwise CPU). As the last expression of the cell, the returned
# module is echoed, which is why the model structure is printed below.
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [9]:
def evaluate(model, data_loader):
    """Collect token-level gold labels and predictions for every batch.

    The model is switched to eval mode and run without gradient tracking.
    Each batch must provide 'input_ids', 'attention_mask' and 'labels_sbdh'
    (the same key the training loop uses for its targets). Predictions are
    the arg-max over the last (class) axis of the logits.

    Positions are dropped from the result when the attention mask is 0 or
    the gold label equals -100 (the default `ignore_index` of
    CrossEntropyLoss), so padding does not count towards the metrics.

    Assumes logits are (batch, seq_len, num_labels) and that labels and
    attention mask are both (batch, seq_len) - TODO confirm against the
    dataset class.

    Args:
        model: token-classification model; called with `input_ids` and
            `attention_mask`, its first output must be the logits.
        data_loader: iterable of batches (dicts of tensors).

    Returns:
        dict with two flat, equally long lists of ints:
        'actual' (gold label ids) and 'preds' (predicted label ids).
    """
    ignore_index = -100  # CrossEntropyLoss default for "no target here"

    model.eval()

    # Run on whatever device the model already lives on rather than relying
    # on a notebook-level global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    # dict to store results
    dict_result = {
        'actual': [],
        'preds': []
    }

    # iterate over batches
    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_batch = {
                'input_ids': batch['input_ids'].to(model_device),
                'attention_mask': batch['attention_mask'].to(model_device)
            }
            outputs = model(**input_batch)

            # class axis is the last one; reduce over it, not over tokens
            preds = outputs[0].detach().argmax(dim=-1).cpu()
            labels = batch['labels_sbdh'].cpu()

            # keep real (non-padding) tokens that carry a usable gold label
            keep = batch['attention_mask'].cpu().bool() & (labels != ignore_index)

            dict_result['actual'] += labels[keep].tolist()
            dict_result['preds'] += preds[keep].tolist()

    return dict_result

In [10]:
# printing performance
def get_performance(actual_, preds_, dict_mapping):
    """Print an evaluation summary for gold labels versus predictions.

    Printed in order: the classification report, the confusion matrix (as a
    DataFrame), the label frequency counts of both inputs, and the label
    mapping that was passed in.

    Args:
        actual_: sequence of gold labels.
        preds_: sequence of predicted labels, parallel to `actual_`.
        dict_mapping: label mapping, printed as-is for reference.

    Returns:
        None; everything is written to stdout.
    """
    divider = '--' * 10

    print(classification_report(actual_, preds_))
    print(divider)

    print('Confusion matrix')
    matrix_frame = pd.DataFrame(confusion_matrix(actual_, preds_))
    print(matrix_frame)
    print(divider)

    # how often each label occurs in the gold data and in the predictions
    for caption, labels in (('Actual counter:', actual_), ('Prediction counter:', preds_)):
        print(caption, Counter(labels))
    print('Mapping:', dict_mapping)

In [26]:
# Fine-tuning loop: one optimizer + scheduler step per batch, followed by an
# evaluation pass over the training data at the end of every epoch.
for each_epoch in range(args['epochs']):
    # evaluate() switches the model to eval mode, so training mode has to be
    # re-enabled at the start of every epoch
    model.train()
    for batch in tqdm(train_dataloader):
        model.zero_grad()

        # unroll input features and move them to the training device
        input_batch = {
            'input_ids': batch['input_ids'],
            'attention_mask': batch['attention_mask']
        }
        input_batch = {k: v.to(device) for k, v in input_batch.items()}

        # forward pass
        outputs = model(**input_batch)

        # compute loss: CrossEntropyLoss expects the class axis at position 1,
        # so the last two axes of the logits are swapped before the call
        loss = loss_fct(outputs['logits'].permute(0, 2, 1), batch['labels_sbdh'].to(device))

        # backpropagation and parameter update
        loss.backward()
        optimizer.step()
        scheduler.step()  # Update learning rate schedule

    # evaluate after one sweep over training data
    # (uses the dataloader defined above; `train_loader` was never defined)
    dict_train = evaluate(
        model=model,
        data_loader=train_dataloader
    )

    # print performance against the SBDH label map
    # (the map loaded above is `label2id_sbdh`; `label2id` was never defined)
    get_performance(
        actual_=dict_train['actual'],
        preds_=dict_train['preds'],
        dict_mapping=label2id_sbdh
    )
    # no unconditional `break` here: the loop runs for all args['epochs']
    # epochs, matching the step count the LR schedule was built with



tensor(2.8165, grad_fn=<NllLoss2DBackward0>)


  0%|                                       | 1/4055 [01:04<72:08:00, 64.06s/it]

tensor(2.8123, grad_fn=<NllLoss2DBackward0>)


  0%|                                       | 2/4055 [02:06<71:20:33, 63.37s/it]

tensor(2.7716, grad_fn=<NllLoss2DBackward0>)


  0%|                                       | 3/4055 [03:20<75:17:31, 66.89s/it]


KeyboardInterrupt: 