## load library

In [None]:
!pip install datasets
!pip install transformers

In [1]:
import pandas as pd
import numpy as np

from torch.utils.data import Dataset
from torch.utils.data import SequentialSampler,RandomSampler
from torch import nn
import torch
from torch.utils.data.dataloader import DataLoader
from torch.utils.data import RandomSampler
from torch.utils.data import DataLoader,RandomSampler
from torch.optim import AdamW

import datasets
from datasets import load_dataset,load_from_disk,load_metric,DatasetDict,Dataset,Features,Value,concatenate_datasets,Sequence,ClassLabel
from transformers import get_linear_schedule_with_warmup
from transformers import AutoTokenizer,AutoModel

import math
from tqdm import tqdm,notebook
import functools
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

##hyperparameter

In [None]:
max_len = 128
batch_size = 24
device = torch.device('cuda')

num_epochs = 1
learning_rate = 2e-4
hidden_size = 768
hidden_dropout_prob = 0.4
num_choices = 4

train_log_interval = 100
patience = 2
counter = 0
best_val_acc = 0
best_val_loss = np.inf

## download dataset

In [2]:
dataset = datasets.load_dataset('medmcqa')

Using custom data configuration default
Reusing dataset med_mcqa (/root/.cache/huggingface/datasets/med_mcqa/default/1.1.0/f2fdfa9ccfbf9d148c0639e6afe3379f3c7e95c4d52d5e68ec1156e5004bd880)


  0%|          | 0/3 [00:00<?, ?it/s]

##prepare dataset

In [3]:
dataset_dict = {}

question_list = []
opa_list = []
opb_list = []
opc_list = []
opd_list = []
cop_list = []
exp_list = []

for a in np.random.choice(dataset['validation'],800):

    
    if a['exp'] != None:
        
        question_list.append(a['question'])
        opa_list.append(a['opa'])
        opb_list.append(a['opb'])
        opc_list.append(a['opc'])
        opd_list.append(a['opd'])
        cop_list.append(a['cop'])
        exp_list.append(a['exp'])

In [4]:
data_dict = {'question':question_list,'opa':opa_list,'opb':opb_list, 'opc':opc_list, 'opd':opd_list, 'cop':cop_list, 'exp':exp_list}
    
f = Features({'question': Value(dtype='string', id=None),
 'opa': Value(dtype='string', id=None),'opb': Value(dtype='string', id=None),'opc': Value(dtype='string', id=None),'opd': Value(dtype='string', id=None),
 'cop': ClassLabel(num_classes=4, names=['a', 'b', 'c', 'd'], id=None),'exp': Value(dtype='string', id=None)})
    
val_datasets = DatasetDict({'val': Dataset.from_dict(data_dict, features=f)})

In [None]:
dataset_dict = {}

question_list = []
opa_list = []
opb_list = []
opc_list = []
opd_list = []
cop_list = []
exp_list = []

for a in dataset['test']:

    
    if a['exp'] != None:
        
        question_list.append(a['question'])
        opa_list.append(a['opa'])
        opb_list.append(a['opb'])
        opc_list.append(a['opc'])
        opd_list.append(a['opd'])
        cop_list.append(a['cop'])
        exp_list.append(a['exp'])

In [None]:
data_dict = {'question':question_list,'opa':opa_list,'opb':opb_list, 'opc':opc_list, 'opd':opd_list, 'cop':cop_list, 'exp':exp_list}
    
f = Features({'question': Value(dtype='string', id=None),
 'opa': Value(dtype='string', id=None),'opb': Value(dtype='string', id=None),'opc': Value(dtype='string', id=None),'opd': Value(dtype='string', id=None),
 'cop': ClassLabel(num_classes=4, names=['a', 'b', 'c', 'd'], id=None),'exp': Value(dtype='string', id=None)})
    
test_datasets = DatasetDict({'test': Dataset.from_dict(data_dict, features=f)})

In [6]:
dataset_dict = {}

question_list = []
opa_list = []
opb_list = []
opc_list = []
opd_list = []
cop_list = []
exp_list = []

for a in np.random.choice(dataset['train'],6000):

    
    if a['exp'] != None:
        
        question_list.append(a['question'])
        opa_list.append(a['opa'])
        opb_list.append(a['opb'])
        opc_list.append(a['opc'])
        opd_list.append(a['opd'])
        cop_list.append(a['cop'])
        exp_list.append(a['exp'])

In [7]:
data_dict = {'question':question_list,'opa':opa_list,'opb':opb_list, 'opc':opc_list, 'opd':opd_list, 'cop':cop_list, 'exp':exp_list}
    
f = Features({'question': Value(dtype='string', id=None),
 'opa': Value(dtype='string', id=None),'opb': Value(dtype='string', id=None),'opc': Value(dtype='string', id=None),'opd': Value(dtype='string', id=None),
 'cop': ClassLabel(num_classes=4, names=['a', 'b', 'c', 'd'], id=None),'exp': Value(dtype='string', id=None)})
    
train_datasets = DatasetDict({'train': Dataset.from_dict(data_dict, features=f)})

In [5]:
## dataset class

class MedMCQADataset(Dataset):

  def __init__(self,
               dataset):
#     self.dataset = dataset['train'] if training == True else dataset['test']
    #self.dataset = pd.read_csv(csv_path)
    self.dataset = dataset

  def __len__(self):
    return len(self.dataset)
  
  def __getitem__(self,idx):
    #context = self.dataset.loc[idx,'exp']
    context = self.dataset[idx]['exp']
    #question = self.dataset.loc[idx,'question']
    question = self.dataset[idx]['question']
    #options = self.dataset.loc[idx,['opa', 'opb', 'opc', 'opd']].values
    #options = np.array([self.dataset[idx]['opa'],self.dataset[idx]['opb'],self.dataset[idx]['opc'],self.dataset[idx]['opd']],dtype=np.object)
    options = [self.dataset[idx]['opa'],self.dataset[idx]['opb'],self.dataset[idx]['opc'],self.dataset[idx]['opd']]
    #label = self.dataset.loc[idx,'cop'] - 1
    label = self.dataset[idx]['cop']
    return (context,question,options,label)

In [None]:
#convert batch dataset function

def process_batch(batch,tokenizer,max_len=32):

    expanded_batch = []

    labels = []
    
    for context,question,options,label in batch:
        question_option_pairs = [question+' '+option for option in options]
        contexts = [context]*len(options)
        labels.append(label)
        expanded_batch.extend(zip(contexts,question_option_pairs))

    tokenized_batch = tokenizer.batch_encode_plus(expanded_batch,truncation=True,padding="max_length",max_length=max_len,return_tensors="pt")
    
    return tokenized_batch,torch.tensor(labels)

## prepare pretrained model

In [10]:
model_name_or_path = 'bert-base-uncased'

model = AutoModel.from_pretrained(model_name_or_path)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
#dropout and linear layer

dropout = nn.Dropout(hidden_dropout_prob).to(device)
#dropout = nn.Dropout(hidden_dropout_prob)

linear = nn.Linear(in_features=hidden_size,out_features=1).to(device)

criterion = nn.CrossEntropyLoss()

## train model

In [None]:
#prepare train dataset

train_dataset = MedMCQADataset(train_datasets['train'])
val_dataset = MedMCQADataset(val_datasets['val'])
#test_dataset = MedMCQADataset(test_datasets['test'])

train_sampler = RandomSampler(train_dataset)
eval_sampler = SequentialSampler(val_dataset)


model_collate_fn = functools.partial(process_batch,tokenizer=tokenizer,max_len=max_len)

train_dataloader = DataLoader(train_dataset,batch_size=batch_size,sampler = train_sampler, collate_fn = model_collate_fn)
val_dataloader = DataLoader(val_dataset,batch_size=batch_size,sampler=eval_sampler,collate_fn=model_collate_fn)
#test_dataloader = DataLoader(test_dataset,batch_size=batch_size,sampler=eval_sampler,collate_fn=model_collate_fn)

In [None]:
#optimizer and scheduler

optimizer = AdamW(model.parameters(),lr = learning_rate, eps = 1e-8)
scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=100,
        num_training_steps=(num_epochs + 1) * math.ceil(len(train_dataset) / batch_size),
    )

In [17]:
#training

model.to(device)

for epoch in notebook.tqdm(range(num_epochs)):
    # train loop
    model.train()

    train_loss = 0
    train_acc = 0
    loss_value = 0

    for idx, (inputs,targets) in notebook.tqdm(enumerate(train_dataloader)):
        
        optimizer.zero_grad()
        
        inputs = inputs.to(device)
        output = model(**inputs)

        pooled_output = output[1]
        pooled_output = dropout(pooled_output)
        logits = linear(pooled_output)
        reshaped_logits = logits.view(-1,num_choices)

        targets = targets.to(device)
        
        loss = criterion(reshaped_logits, targets)

        loss.backward()
        
        optimizer.step()
        scheduler.step()

        predictions = torch.argmax(reshaped_logits,axis=-1)        
        correct_predictions = torch.sum(predictions==targets)
        accuracy = correct_predictions.cpu().detach().numpy()/predictions.size()[0]

        loss_value += loss
        train_acc += accuracy

        if (idx + 1) % train_log_interval == 0:
            train_loss = loss_value / train_log_interval
            avg_train_acc = train_acc / train_log_interval
            
            current_lr = scheduler.get_last_lr()

            print(
                f"Epoch[{epoch}/{num_epochs}]({idx + 1}/{len(train_dataloader)}) || "
                f"training loss {train_loss:4.4} || training accuracy {avg_train_acc:4.2%} || lr {current_lr}||"
            )

            loss_value = 0
            train_acc = 0
        
        torch.cuda.empty_cache()

    #scheduler.step()

    # val loop
    with torch.no_grad():

        print("Calculating validation results...")
        
        model.eval()
        
        val_loss_items = []

        val_acc = 0
        
        for idx,(inputs,labels) in notebook.tqdm(enumerate(val_dataloader)):

            inputs = inputs.to(device)
            labels = labels.to(device)

            outs = model(**inputs)

            pooled_output = outs[1]
            pooled_output = dropout(pooled_output)
            logits = linear(pooled_output)
            reshaped_logits = logits.view(-1,num_choices)

            preds = torch.argmax(reshaped_logits, dim=-1)

            correct_predictions = torch.sum(preds==labels)

            accuracy = correct_predictions.cpu().detach().numpy()/preds.size()[0]
            
            val_acc += accuracy
            
            loss_item = criterion(reshaped_logits, labels).item()

            val_loss_items.append(loss_item)

        val_loss = np.sum(val_loss_items) / len(val_dataloader)

        avg_val_acc = val_acc / (idx+1)

        # Callback1: validation accuracy가 향상될수록 모델을 저장합니다.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
        if avg_val_acc > best_val_acc:
            print("New best model for val accuracy! saving the model..")
            torch.save(model.state_dict(), f"result_{epoch:03}_accuracy_{avg_val_acc:4.2%}.ckpt")
            best_val_acc = avg_val_acc
            counter = 0
        else:
            counter += 1
        # Callback2: patience 횟수 동안 성능 향상이 없을 경우 학습을 종료시킵니다.
        if counter > patience:
            print("Early Stopping...")
            break
        
        
        print(
            f"[Val] acc : {avg_val_acc:4.2%}, loss: {val_loss:4.2} ||"
            f"best acc : {best_val_acc:4.2%}, best loss: {best_val_loss:4.2}"
        )

  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Epoch[0/1](100/222) || training loss 1.05 || training accuracy 54.29% || lr [0.0002]||
Epoch[0/1](200/222) || training loss 0.7832 || training accuracy 70.29% || lr [0.0001418604651162791]||
Calculating validation results...


0it [00:00, ?it/s]

New best model for val accuracy! saving the model..
[Val] acc : 72.75%, loss: 0.68 ||best acc : 72.75%, best loss: 0.68


## Test model

In [10]:
#trained model load

model_name_or_path = 'bert-base-uncased'

model = AutoModel.from_pretrained(model_name_or_path)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

model.load_state_dict(torch.load('/content/drive/MyDrive/result_000_accuracy_72.75%.ckpt'))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [None]:
#test dataset

dataset_dict = {}

question_list = []
opa_list = []
opb_list = []
opc_list = []
opd_list = []
cop_list = []
exp_list = []

for a in np.random.choice(dataset['validation'],50):

    
    if a['exp'] != None:
        
        question_list.append(a['question'])
        opa_list.append(a['opa'])
        opb_list.append(a['opb'])
        opc_list.append(a['opc'])
        opd_list.append(a['opd'])
        cop_list.append(a['cop'])
        exp_list.append(a['exp'])
  
data_dict = {'question':question_list,'opa':opa_list,'opb':opb_list, 'opc':opc_list, 'opd':opd_list, 'cop':cop_list, 'exp':exp_list}
    
f = Features({'question': Value(dtype='string', id=None),
 'opa': Value(dtype='string', id=None),'opb': Value(dtype='string', id=None),'opc': Value(dtype='string', id=None),'opd': Value(dtype='string', id=None),
 'cop': ClassLabel(num_classes=4, names=['a', 'b', 'c', 'd'], id=None),'exp': Value(dtype='string', id=None)})
    
val_datasets = DatasetDict({'val': Dataset.from_dict(data_dict, features=f)})

In [14]:
#prepare inference dataset

inference_dataset = MedMCQADataset(val_datasets['val'])

eval_sampler = SequentialSampler(inference_dataset)

model_collate_fn = functools.partial(
  process_batch,
  tokenizer=tokenizer,
  max_len=max_len
  )

inference_dataloader = DataLoader(inference_dataset,
                            batch_size=batch_size,
                            sampler=eval_sampler,
                            collate_fn=model_collate_fn)

In [21]:
#inference

device = torch.device('cuda')

model.to(device)

model.eval()

# 모델이 테스트 데이터셋을 예측하고 결과를 저장합니다.
all_predictions = []

for inputs,labels in notebook.tqdm(inference_dataloader):

    with torch.no_grad():

        inputs = inputs.to(device)

        pred = model(**inputs)

        pooled_output = pred[1]
        pooled_output = dropout(pooled_output)
        logits = linear(pooled_output)
        reshaped_logits = logits.view(-1,num_choices)

        top_choices = torch.argmax(reshaped_logits, dim=-1)

        all_predictions.extend(top_choices.cpu().numpy())

print(all_predictions)

  0%|          | 0/1 [00:00<?, ?it/s]

[2, 0, 3, 1, 3, 0, 3, 3, 0, 2, 3, 0, 1, 0, 1, 2, 0, 0, 0, 3, 0]


In [22]:
labels

tensor([0, 0, 1, 3, 1, 2, 0, 2, 3, 2, 0, 2, 2, 1, 3, 2, 0, 0, 2, 1, 2])

In [29]:
correct = torch.sum(torch.tensor(all_predictions) == labels)

accuracy = correct/len(labels)

accuracy.item() * 100

23.80952388048172