In [1]:
import shutil
import os

# Remove the directory if already exist 
dir_name = 'neural_medical_qa'
if os.path.exists(dir_name):
    shutil.rmtree(dir_name)

#clone the repo from github
!git clone https://github.com/trduc97/neural_medical_qa.git
%cd neural_medical_qa
# install the requirement
!pip install -r requirements.txt

Cloning into 'neural_medical_qa'...
remote: Enumerating objects: 123, done.[K
remote: Counting objects: 100% (123/123), done.[K
remote: Compressing objects: 100% (116/116), done.[K
remote: Total 123 (delta 57), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (123/123), 1.78 MiB | 5.75 MiB/s, done.
Resolving deltas: 100% (57/57), done.
/kaggle/working/neural_medical_qa


In [6]:
import json
from datasets import Dataset, DatasetDict, load_dataset
from sklearn.model_selection import train_test_split
import pandas as pd

def load_bioasq_pubmedqa(bioasq_kaggle_path = '/kaggle/input/bioasq-training-12b/training12b_new.json', 
                         pubmed_kaggle_path='/kaggle/input/pubmed-qa/pubmed_qa_pga_labeled.parquet'):
    # Load the JSON file
    with open(bioasq_kaggle_path,'r') as f:
        bioasq_data=json.load(f)
    # Extract yes/no questions directly
    bioasq_yesno = [{
            'id':question['id'],
            'question':question['body'],
            'final_decision':question['exact_answer'],
            'long_answer':question['ideal_answer'], 
            'documents':question['documents']
        }
        for question in bioasq_data['questions'] if question['type'] == 'yesno']
    # Convert the list of yes/no questions to a Pandas DataFrame
    bioasq_df = pd.DataFrame(bioasq_yesno)

    # Convert the DataFrame to a Hugging Face Dataset
    bioasq_dataset = Dataset.from_pandas(bioasq_df)
    # Create a DatasetDict with the 'train' split
    bioasq_data=DatasetDict({'train': bioasq_dataset})

    # Read from parquet and translate to a dataset object
    pubmed_df=pd.read_parquet(pubmed_kaggle_path)
    dataset=Dataset.from_pandas(pubmed_df,preserve_index=False)
    #Setting into similar format as from huggingface
    pubmedqa_data = DatasetDict({'train': dataset})
    
    # Load the pubmedqa dataset
    #pubmedqa_data=load_dataset("pubmed_qa","pqa_labeled") # unstable connection

    #Encoding decisions 
    def decision_encode(question):
        labels_map = {'no': 0, 'maybe': 1, 'yes': 2}
        question['decision_encoded'] = labels_map[question['final_decision']]
        return question

    pubmedqa_data=pubmedqa_data.map(decision_encode)
    bioasq_data=pubmedqa_data.map(decision_encode)

    return bioasq_data, pubmedqa_data


def pubmed_train_test_split(datasetdict,train_size=0.75, 
                         strat_col='decision_encoded'):
    #Convert dataset to pandas DataFrame
    df = pd.DataFrame(datasetdict['train'])
    test_size=(1-train_size)
    # Define the stratification column
    stratify_col=strat_col

    #Split like normal
    train_df, test_df = train_test_split(
        df,
        test_size=test_size,
        stratify=df[stratify_col],
        random_state=42)
    # Convert DataFrames back to Dataset
    train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
    test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

    return train_dataset, test_dataset


def train_val_test_split(datasetdict,train_size=0.75, 
                         val_test_ratio=0.6,
                         strat_col='decision_encoded'):
    #Convert dataset to pandas DataFrame
    df = pd.DataFrame(datasetdict['train'])
    test_size=(1-train_size)
    # Define the stratification column
    stratify_col=strat_col

    #First, split for the train set
    train_df, val_test_df = train_test_split(
        df,
        test_size=test_size,
        stratify=df[stratify_col],
        random_state=42)
    #Then, split the remaining into validation and test
    val_df, test_df = train_test_split(
        val_test_df,
        test_size=val_test_ratio,
        stratify=val_test_df[stratify_col],
        random_state=42)
    # Convert DataFrames back to Dataset
    train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
    val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
    test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

    return train_dataset, val_dataset, test_dataset

In [7]:
from import_datasets import load_bioasq_pubmedqa, train_test_split

bioasq, pubmedqa = load_bioasq_pubmedqa()

# Display the first few samples of the PubMedQA dataset
print(pubmedqa['train'].to_pandas().head())

responses = pubmedqa['train']['final_decision']
# Counting the occurrences of each value
yes_count = responses.count('yes')
no_count = responses.count('no')
maybe_count = responses.count('maybe')

# Display the counts
print(f"Yes: {yes_count}")
print(f"No: {no_count}")
print(f"Maybe: {maybe_count}")

pubmedqa_train, pubmedqa_test = pubmed_train_test_split(pubmedqa)
print(f"Train size: {len(pubmedqa_train)}")
print(f"Test size: {len(pubmedqa_test)}")

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

      pubid                                           question  \
0  21645374  Do mitochondria play a role in remodelling lac...   
1  16418930  Landolt C and snellen e acuity: differences in...   
2   9488747  Syncope during bathing in infants, a pediatric...   
3  17208539  Are the long-term results of the transanal pul...   
4  10808977  Can tailored interventions increase mammograph...   

                                             context  \
0  {'contexts': ['Programmed cell death (PCD) is ...   
1  {'contexts': ['Assessment of visual acuity dep...   
2  {'contexts': ['Apparent life-threatening event...   
3  {'contexts': ['The transanal endorectal pull-t...   
4  {'contexts': ['Telephone counseling and tailor...   

                                         long_answer final_decision  \
0  Results depicted mitochondrial dynamics in viv...            yes   
1  Using the charts described, there was only a s...             no   
2  "Aquagenic maladies" could be a pediatric form... 

In [8]:
from collections import defaultdict
# Initialize a defaultdict to hold the bucket counts
length_buckets = defaultdict(int)

# Define the bucket size
bucket_size = 128

# Loop through each string in the list
for s in pubmedqa_train['long_answer']:
    # Determine the bucket for the current string length
    bucket = (len(s) // bucket_size) * bucket_size
    # Increment the count for the appropriate bucket
    length_buckets[bucket] += 1

# Display the counts for each bucket
for bucket, count in sorted(length_buckets.items()):
    print(f"Length {bucket} - {bucket + bucket_size - 1}: {count} strings")

Length 0 - 127: 66 strings
Length 128 - 255: 326 strings
Length 256 - 383: 244 strings
Length 384 - 511: 82 strings
Length 512 - 639: 25 strings
Length 640 - 767: 4 strings
Length 768 - 895: 3 strings


In [None]:
bioasq, pubmedqa_artificial = load_bioasq_pubmedqa(pubmed_kaggle_path='/kaggle/input/pubmed-qa/pubmed_qa_pga_artificial.parquet')

In [None]:
from datasets import DatasetDict, Dataset
from sklearn.model_selection import train_test_split

df_artificial=pubmedqa_artificial['train'].to_pandas()
df_sample, _=train_test_split(df_artificial, test_size=0.95, random_state=42, stratify=df_artificial['decision_encoded'])   
df_sample=df_sample[['pubid', 'question', 'context', 'long_answer', 'final_decision', 'decision_encoded']]
data_art=Dataset.from_pandas(df_sample,preserve_index=False)

In [None]:
# Convert back to datasets
pubmedqa_arti = DatasetDict({'train': data_art})
pubmedqa_art_train,pubmedqa_art_val, pubmedqa_art_test = pubmed_train_test_split(pubmedqa_arti)

In [39]:
import os
import torch
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel, GPT2Tokenizer, GPT2Model
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score

class QAModel(nn.Module):
    def __init__(self, model, classes=3, dropout_prob=0.5):
        super(QAModel, self).__init__()
        self.bert = model
        self.dropout1 = nn.Dropout(dropout_prob)
        self.linear1 = nn.Linear(model.config.hidden_size, 128)
        self.dropout2 = nn.Dropout(dropout_prob)
        self.linear2 = nn.Linear(128, classes)  # number of classes may vary between BioASQ (2 classes) and PubMedQA (3 classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # CLS token
        cls_output = self.dropout1(cls_output)  # Apply first dropout
        cls_output = self.linear1(cls_output)  # Apply first linear layer
        cls_output = self.dropout2(cls_output)  # Apply second dropout
        logits = self.linear2(cls_output)  # Apply second linear layer
        return logits

class Trainandtest:

    def __init__(self, df_train, df_test, stratify_col='decision_encoded'):
        self.train_data = df_train
        self.test_data = df_test
        self.loss_fn = nn.CrossEntropyLoss()
        self.stratify_col = stratify_col
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.results={}

    def initialize_tokenizer(self, model_name, source):
        if 'GPT' in model_name:
            tokenizer = GPT2Tokenizer.from_pretrained(source)
            if tokenizer.pad_token is None:
                tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            return tokenizer
        elif 'BioLinkBERT' in model_name:
            return AutoTokenizer.from_pretrained(source)
        else:
            return BertTokenizer.from_pretrained(source)

    def encode_data(self, df):
        inputs = self.tokenizer(
            text=df['question'], 
            text_pair=df['long_answer'], 
            padding=True, 
            truncation=True, 
            return_tensors='pt', 
            max_length=128*4
        )
        labels = torch.tensor(df[self.stratify_col])
        return inputs, labels

    def create_dataloader(self, inputs, labels, batch_size):
        dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
        return DataLoader(dataset, batch_size=batch_size, shuffle=True)

    def import_model(self, model_name, source, tokenizer):
        if 'GPT' in model_name:
            model = GPT2Model.from_pretrained(source)
            model.resize_token_embeddings(len(tokenizer))
            model = QAModel(model)
        elif 'BioLinkBERT' in model_name or 'LinkBERT' in model_name:
            model = AutoModel.from_pretrained(source)
            model = QAModel(model)
        else:
            model = BertModel.from_pretrained(source)
            model = QAModel(model)
        return model
    def model_compile(self, model_name, source, batch_size=64):
        batch_size = 16 if 'GPT' in model_name else batch_size
        tokenizer = self.initialize_tokenizer(model_name, source)
        train_inputs, train_labels = self.encode_data(df_train)
        test_inputs, test_labels = self.encode_data(df_test)
        self.train_loader = self.create_dataloader(train_inputs, train_labels, batch_size)
        self.test_loader = self.create_dataloader(test_inputs, test_labels, batch_size)
        
        self.model = self.import_model(model_name, source, tokenizer).to(self.device) 
        self.optimizer = optim.AdamW(self.model.parameters(), lr=2e-5)

    
    def training(self, epochs=10):
        
        self.model.train()
        for epoch in range(epochs):
            total_loss = 0
            all_preds = []
            all_labels = []
        
            for batch in self.train_loader:
                b_input_ids, b_attention_mask, b_labels = [t.to(self.device) for t in batch]
                self.optimizer.zero_grad()
            
                outputs = self.model(b_input_ids, b_attention_mask)
                loss = self.loss_fn(outputs, b_labels)
                loss.backward()
                self.optimizer.step()
            
                total_loss += loss.item()
            
                preds = outputs.detach().cpu().numpy()
                label_ids = b_labels.to('cpu').numpy()
            
                all_preds.append(preds)
                all_labels.append(label_ids)
        
            avg_loss = total_loss / len(self.train_loader)
            all_preds = np.concatenate(all_preds, axis=0)
            all_labels = np.concatenate(all_labels, axis=0)
            avg_f1_score = self.calculate_f1_score(all_preds, all_labels)
        
            print(f"Epoch {epoch+1}, Loss: {avg_loss}, F1 Score: {avg_f1_score}")
        
        self.save_model()

    def save_model(self):
        os.makedirs('/kaggle/working/models', exist_ok=True)
        model_path = f'/kaggle/working/models/{self.name}_model.pth'
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
        }, model_path)
        print(f"Model saved to {model_path}")

    def load_model(self, model_path):
        checkpoint = torch.load(model_path)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        print(f"Model loaded from {model_path}")

    def calculate_f1_score(self, preds, labels):
        preds_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels.flatten()
        return f1_score(labels_flat, preds_flat, average='weighted')

    def evaluate(self, dataloader):
        self.model.eval()
        total_loss = 0
        predictions, true_labels = [], []
    
        with torch.no_grad():
            for batch in dataloader:
                b_input_ids, b_attention_mask, b_labels = [t.to(self.device) for t in batch]
                outputs = self.model(b_input_ids, b_attention_mask)
                logits = outputs.detach().cpu().numpy()
                label_ids = b_labels.cpu().numpy()
                predictions.extend(np.argmax(logits, axis=1))
                true_labels.extend(label_ids)
    
        accuracy = accuracy_score(true_labels, predictions)
        precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')
    
        return accuracy, precision, recall, f1
        

    def val(self, load_model_path=None):
        if load_model_path:
            self.load_model(load_model_path)
                
        test_accuracy, test_precision, test_recall, test_f1 = self.evaluate(self.test_loader)
        print(f"Test - Accuracy: {test_accuracy}, Precision: {test_precision}, Recall: {test_recall}, F1-Score: {test_f1}")
    
        return {
            'test': {
                'accuracy': test_accuracy,
                'precision': test_precision,
                'recall': test_recall,
                'f1': test_f1
            }
        }

In [30]:
models = [
    {
        'model_name': 'BERT',
        'source': 'bert-base-uncased',
    },
    {
        'model_name': 'GPT',
        'source': 'gpt2',
    },
    {
        'model_name': 'ColBERT',
        'source': 'colbert-ir/colbertv2.0',
    },
    {
        'model_name': 'LinkBERT',
        'source': 'michiyasunaga/LinkBERT-base',
    },
    {
        'model_name': 'BiomedNLP',
        'source': 'microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract',
    },
    {
        'model_name': 'BioLinkBERT',
        'source': 'michiyasunaga/BioLinkBERT-base',
    },

]

In [45]:
tokenizer = BertTokenizer.from_pretrained(models[0]['source'])

In [41]:
trainer = Trainandtest(pubmedqa_train, pubmedqa_test)

for model in models:
    model_name=model['model_name'],
    source=model['source'],
    trainer.model_compile(model_name,source)
    # Train the model
    trainer.training(epochs=1)
    
    # test the model
    test_result = trainer.val()
    trainer.results[model['model_name']] = test_result

OSError: Incorrect path_or_model_id: '('bert-base-uncased',)'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [22]:
df = pd.DataFrame({
    'Model': test_results.keys(),
    'Accuracy': [test_results[model]['test']['accuracy'] for model in test_results],
    'Precision': [test_results[model]['test']['precision'] for model in test_results],
    'Recall': [test_results[model]['test']['recall'] for model in test_results],
    'F1 Score': [test_results[model]['test']['f1'] for model in test_results]})
print(df)

         Model  Accuracy  Precision  Recall  F1 Score
0         BERT     0.732   0.712725   0.732  0.721290
1          GPT     0.552   0.304704   0.552  0.392660
2      ColBERT     0.724   0.699425   0.724  0.708513
3    BiomedNLP     0.772   0.781622   0.772  0.771634
4  BioLinkBERT     0.804   0.768103   0.804  0.781757
