# 1. Importing

In [2]:
import shutil
import os

# Remove the directory if already exist 
dir_name = 'neural_medical_qa'
if os.path.exists(dir_name):
    shutil.rmtree(dir_name)

#clone the repo from github
!git clone https://github.com/trduc97/neural_medical_qa.git
%cd neural_medical_qa
# install the requirement
!pip install -r requirements.txt

Cloning into 'neural_medical_qa'...
remote: Enumerating objects: 132, done.[K
remote: Counting objects: 100% (132/132), done.[K
remote: Compressing objects: 100% (125/125), done.[K
remote: Total 132 (delta 62), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (132/132), 2.28 MiB | 10.02 MiB/s, done.
Resolving deltas: 100% (62/62), done.
/kaggle/working/neural_medical_qa/neural_medical_qa


## 1.2. Importing data

### 1.2.1. Importing pubmedqa and bioasq

In [4]:
from import_datasets import load_bioasq_pubmedqa, train_test_split

bioasq, pubmedqa = load_bioasq_pubmedqa()

# Display the first few samples of the PubMedQA dataset
print(pubmedqa['train'].to_pandas().head())

responses = pubmedqa['train']['final_decision']
# Counting the occurrences of each value
yes_count = responses.count('yes')
no_count = responses.count('no')
maybe_count = responses.count('maybe')

# Display the counts
print(f"Yes: {yes_count}")
print(f"No: {no_count}")
print(f"Maybe: {maybe_count}")

pubmedqa_train, pubmedqa_test = pubmed_train_test_split(pubmedqa)
print(f"Train size: {len(pubmedqa_train)}")
print(f"Test size: {len(pubmedqa_test)}")

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

      pubid                                           question  \
0  21645374  Do mitochondria play a role in remodelling lac...   
1  16418930  Landolt C and snellen e acuity: differences in...   
2   9488747  Syncope during bathing in infants, a pediatric...   
3  17208539  Are the long-term results of the transanal pul...   
4  10808977  Can tailored interventions increase mammograph...   

                                             context  \
0  {'contexts': ['Programmed cell death (PCD) is ...   
1  {'contexts': ['Assessment of visual acuity dep...   
2  {'contexts': ['Apparent life-threatening event...   
3  {'contexts': ['The transanal endorectal pull-t...   
4  {'contexts': ['Telephone counseling and tailor...   

                                         long_answer final_decision  \
0  Results depicted mitochondrial dynamics in viv...            yes   
1  Using the charts described, there was only a s...             no   
2  "Aquagenic maladies" could be a pediatric form... 

In [3]:
import json
from datasets import Dataset, DatasetDict, load_dataset
from sklearn.model_selection import train_test_split
import pandas as pd

def load_bioasq_pubmedqa(bioasq_kaggle_path = '/kaggle/input/bioasq-training-12b/training12b_new.json', 
                         pubmed_kaggle_path='/kaggle/input/pubmed-qa/pubmed_qa_pga_labeled.parquet'):
    # Load the JSON file
    with open(bioasq_kaggle_path,'r') as f:
        bioasq_data=json.load(f)
    # Extract yes/no questions directly
    bioasq_yesno = [{
            'id':question['id'],
            'question':question['body'],
            'final_decision':question['exact_answer'],
            'long_answer':question['ideal_answer'], 
            'documents':question['documents']
        }
        for question in bioasq_data['questions'] if question['type'] == 'yesno']
    # Convert the list of yes/no questions to a Pandas DataFrame
    bioasq_df = pd.DataFrame(bioasq_yesno)

    # Convert the DataFrame to a Hugging Face Dataset
    bioasq_dataset = Dataset.from_pandas(bioasq_df)
    # Create a DatasetDict with the 'train' split
    bioasq_data=DatasetDict({'train': bioasq_dataset})

    # Read from parquet and translate to a dataset object
    pubmed_df=pd.read_parquet(pubmed_kaggle_path)
    dataset=Dataset.from_pandas(pubmed_df,preserve_index=False)
    #Setting into similar format as from huggingface
    pubmedqa_data = DatasetDict({'train': dataset})
    
    # Load the pubmedqa dataset
    #pubmedqa_data=load_dataset("pubmed_qa","pqa_labeled") # unstable connection

    #Encoding decisions 
    def decision_encode(question):
        labels_map = {'no': 0, 'maybe': 1, 'yes': 2}
        question['decision_encoded'] = labels_map[question['final_decision']]
        return question

    pubmedqa_data=pubmedqa_data.map(decision_encode)
    bioasq_data=pubmedqa_data.map(decision_encode)

    return bioasq_data, pubmedqa_data


def pubmed_train_test_split(datasetdict,train_size=0.75, 
                         strat_col='decision_encoded'):
    #Convert dataset to pandas DataFrame
    df = pd.DataFrame(datasetdict['train'])
    test_size=(1-train_size)
    # Define the stratification column
    stratify_col=strat_col

    #Split like normal
    train_df, test_df = train_test_split(
        df,
        test_size=test_size,
        stratify=df[stratify_col],
        random_state=42)
    # Convert DataFrames back to Dataset
    train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
    test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

    return train_dataset, test_dataset

In [5]:
from collections import defaultdict
# Initialize a defaultdict to hold the bucket counts
length_buckets = defaultdict(int)

# Define the bucket size
bucket_size = 128

# Loop through each string in the list
for s in pubmedqa_train['long_answer']:
    # Determine the bucket for the current string length
    bucket = (len(s) // bucket_size) * bucket_size
    # Increment the count for the appropriate bucket
    length_buckets[bucket] += 1

# Display the counts for each bucket
for bucket, count in sorted(length_buckets.items()):
    print(f"Length {bucket} - {bucket + bucket_size - 1}: {count} strings")

Length 0 - 127: 66 strings
Length 128 - 255: 326 strings
Length 256 - 383: 244 strings
Length 384 - 511: 82 strings
Length 512 - 639: 25 strings
Length 640 - 767: 4 strings
Length 768 - 895: 3 strings


In [26]:
pubmedqa_artificial_df

Unnamed: 0,question,long_answer,pubid,final_decision,context,decision_encoded
0,Are group 2 innate lymphoid cells ( ILC2s ) in...,"As ILC2s are elevated in patients with CRSwNP,...",25429730,yes,{'contexts': ['Chronic rhinosinusitis (CRS) is...,2
1,Does vagus nerve contribute to the development...,Neuronal signals via the hepatic vagus nerve c...,25433161,yes,{'contexts': ['Phosphatidylethanolamine N-meth...,2
2,Does psammaplin A induce Sirtuin 1-dependent a...,PsA significantly inhibited MCF-7/adr cells pr...,25445714,yes,{'contexts': ['Psammaplin A (PsA) is a natural...,2
3,Is methylation of the FGFR2 gene associated wi...,We identified a novel biologically plausible c...,25431941,yes,{'contexts': ['This study examined links betwe...,2
4,Do tumor-infiltrating immune cell profiles and...,Breast cancer immune cell subpopulation profil...,25432519,yes,{'contexts': ['Tumor microenvironment immunity...,2
...,...,...,...,...,...,...
211264,Is urine production rate related to behavioura...,During active sleep (state 2F) hourly fetal ur...,8217974,yes,{'contexts': ['To investigate the relation bet...,2
211265,Does evaluation of the use of general practice...,General practice registers can provide a suita...,8204319,yes,{'contexts': ['This study set out to show how ...,2
211266,Does intracoronary angiotensin-converting enzy...,Intracoronary enalaprilat resulted in an impro...,8205673,yes,{'contexts': ['There is increasing recognition...,2
211267,Does transfusion significantly increase the ri...,The choice between splenectomy and splenic rep...,8215873,yes,{'contexts': ['To determine if splenectomy res...,2


### 1.2.2. Importing artificial pubmedqa dataset

In [6]:
bioasq, pubmedqa_artificial = load_bioasq_pubmedqa(pubmed_kaggle_path='/kaggle/input/pubmed-qa/pubmed_qa_pga_artificial.parquet')

Map:   0%|          | 0/211269 [00:00<?, ? examples/s]

Map:   0%|          | 0/211269 [00:00<?, ? examples/s]

In [27]:
stratified_sample_df = pubmedqa_artificial_df.groupby('final_decision', group_keys=False).apply(lambda x: x.sample(min(len(x), 1000 // len(pubmedqa_artificial_df['final_decision'].unique())), random_state=42))

  stratified_sample_df = pubmedqa_artificial_df.groupby('final_decision', group_keys=False).apply(lambda x: x.sample(min(len(x), 1000 // len(pubmedqa_artificial_df['final_decision'].unique())), random_state=42))


In [36]:
from datasets import load_dataset, concatenate_datasets, DatasetDict

# Convert to pandas DataFrame to handle schema mismatch
pubmedqa_df = pubmedqa_train.to_pandas()
pubmedqa_artificial_df = pubmedqa_artificial['train'].to_pandas()
# Ensure both DataFrames have the same columns
common_columns = list(set(pubmedqa_df.columns).intersection(set(pubmedqa_artificial_df.columns)))

pubmedqa_df = pubmedqa_df[common_columns]
pubmedqa_artificial_df = pubmedqa_artificial_df[common_columns]

# Take 1000 rows from artificial
pubmedqa_artificial_sample =  pubmedqa_artificial_df.groupby('final_decision', group_keys=False).apply(lambda x: x.sample(min(len(x), 1000 // len(pubmedqa_artificial_df['final_decision'].unique())), random_state=42))

# Step 3: Combine the samples to create pubmed_mix
combined_df = pd.concat([pubmedqa_df, pubmedqa_artificial_sample], ignore_index=True)

# Step 4: Shuffle the combined DataFrame to mix the rows
pubmed_mix_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Step 5: Convert back to DatasetDict format
pubmed_mix = Dataset.from_pandas(pubmed_mix_df)

# Create DatasetDict
#pubmedqa_mix = DatasetDict({'train': pubmed_mix})


DatasetDict({
    train: Dataset({
        features: ['question', 'long_answer', 'pubid', 'final_decision', 'context', 'decision_encoded'],
        num_rows: 1750
    })
})


  pubmedqa_artificial_sample =  pubmedqa_artificial_df.groupby('final_decision', group_keys=False).apply(lambda x: x.sample(min(len(x), 1000 // len(pubmedqa_artificial_df['final_decision'].unique())), random_state=42))


In [37]:
responses = pubmed_mix['final_decision']
# Counting the occurrences of each value
yes_count = responses.count('yes')
no_count = responses.count('no')
maybe_count = responses.count('maybe')

# Display the counts
print(f"Yes: {yes_count}")
print(f"No: {no_count}")
print(f"Maybe: {maybe_count}")

Yes: 914
No: 754
Maybe: 82


In [61]:
import os
import torch
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel, GPT2Tokenizer, GPT2Model
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score
import gc

class QAModel(nn.Module):
    def __init__(self, model, classes=3, dropout_prob=0.5):
        super(QAModel, self).__init__()
        self.bert = model
        self.dropout1 = nn.Dropout(dropout_prob)
        self.linear1 = nn.Linear(model.config.hidden_size, 128)
        self.dropout2 = nn.Dropout(dropout_prob)
        self.linear2 = nn.Linear(128, classes)  # number of classes may vary between BioASQ (2 classes) and PubMedQA (3 classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # CLS token
        cls_output = self.dropout1(cls_output)  # Apply first dropout
        cls_output = self.linear1(cls_output)  # Apply first linear layer
        cls_output = self.dropout2(cls_output)  # Apply second dropout
        logits = self.linear2(cls_output)  # Apply second linear layer
        return logits

class Trainandtest:

    def __init__(self, df_train, df_test, stratify_col='decision_encoded'):
        self.train_data = df_train
        self.test_data = df_test
        self.loss_fn = nn.CrossEntropyLoss()
        self.stratify_col = stratify_col
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.results={}

    def initialize_tokenizer(self, model_name, source):
        if isinstance(source, tuple):
                source = source[0]
        if 'GPT' in model_name:
            tokenizer = GPT2Tokenizer.from_pretrained(source)
            if tokenizer.pad_token is None:
                tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            return tokenizer
        elif 'BioLinkBERT' in model_name or 'LinkBERT' in model_name:
            return AutoTokenizer.from_pretrained(source)
        else:
            return BertTokenizer.from_pretrained(source)

    def encode_data(self, df, tokenizer):
        inputs = tokenizer(
            text=df['question'], 
            text_pair=df['long_answer'], 
            padding=True, 
            truncation=True, 
            return_tensors='pt', 
            max_length=128*4
        )
        labels = torch.tensor(df[self.stratify_col])
        return inputs, labels

    def create_dataloader(self, inputs, labels, batch_size):
        dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
        return DataLoader(dataset, batch_size=batch_size, shuffle=True)

    def import_model(self, QAModel, model_name, source, tokenizer):
        if isinstance(source, tuple):
            source = source[0]
        if 'GPT' in model_name:
            model = GPT2Model.from_pretrained(source)
            model.resize_token_embeddings(len(tokenizer))
            model = QAModel(model)
        elif 'BioLinkBERT' in model_name or 'LinkBERT' in model_name:
            model = AutoModel.from_pretrained(source)
            model = QAModel(model)
        else:
            model = BertModel.from_pretrained(source)
            model = QAModel(model)
        return model
    def model_compile(self, QAModel, model_name, source, batch_size=64, adamw=True):
        batch_size = 16 if 'GPT' in model_name else batch_size
        tokenizer = self.initialize_tokenizer(model_name, source)
        train_inputs, train_labels = self.encode_data(self.train_data, tokenizer)
        test_inputs, test_labels = self.encode_data(self.test_data, tokenizer)
        self.train_loader = self.create_dataloader(train_inputs, train_labels, batch_size)
        self.test_loader = self.create_dataloader(test_inputs, test_labels, batch_size)
        
        self.model = self.import_model(QAModel, model_name, source, tokenizer).to(self.device) 
        if adamw:
            self.optimizer = optim.AdamW(self.model.parameters(), lr=2e-5)
        else: 
            self.optimizer = optim.Adam(self.model.parameters(), lr=2e-5)
    
    def training(self, model_name, epochs=10):
        if isinstance(model_name, tuple):
            model_name = model_name[0]        
        self.model.train()
        for epoch in range(epochs):
            total_loss = 0
            all_preds = []
            all_labels = []
        
            for batch in self.train_loader:
                b_input_ids, b_attention_mask, b_labels = [t.to(self.device) for t in batch]
                self.optimizer.zero_grad()
            
                outputs = self.model(b_input_ids, b_attention_mask)
                loss = self.loss_fn(outputs, b_labels)
                loss.backward()
                self.optimizer.step()
                
                total_loss += loss.item()
            
                preds = outputs.detach().cpu().numpy()
                label_ids = b_labels.to('cpu').numpy()
                del b_input_ids 
                del b_attention_mask 
                del b_labels
                gc.collect()
                torch.cuda.empty_cache()
                all_preds.append(preds)
                all_labels.append(label_ids)
        
            avg_loss = total_loss / len(self.train_loader)
            all_preds = np.concatenate(all_preds, axis=0)
            all_labels = np.concatenate(all_labels, axis=0)
            avg_f1_score = self.calculate_f1_score(all_preds, all_labels)
        
            print(f"Epoch {epoch+1}, Loss: {avg_loss}, F1 Score: {avg_f1_score}")
        
        self.save_model(model_name)

    def save_model(self, model_name):
        os.makedirs('/kaggle/working/models', exist_ok=True)
        model_path = f'/kaggle/working/models/{model_name}_model.pth'
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
        }, model_path)
        print(f"Model saved to {model_path}")

    def load_model(self, model_path):
        checkpoint = torch.load(model_path)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        print(f"Model loaded from {model_path}")

    def calculate_f1_score(self, preds, labels):
        preds_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels.flatten()
        return f1_score(labels_flat, preds_flat, average='weighted')

    def evaluate(self, dataloader):
        self.model.eval()
        total_loss = 0
        predictions, true_labels = [], []
    
        with torch.no_grad():
            for batch in dataloader:
                b_input_ids, b_attention_mask, b_labels = [t.to(self.device) for t in batch]
                outputs = self.model(b_input_ids, b_attention_mask)
                logits = outputs.detach().cpu().numpy()
                label_ids = b_labels.cpu().numpy()
                predictions.extend(np.argmax(logits, axis=1))
                true_labels.extend(label_ids)
                del b_input_ids 
                del b_attention_mask 
                del b_labels
                gc.collect()
                torch.cuda.empty_cache()
    
        accuracy = accuracy_score(true_labels, predictions)
        precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')
    
        return accuracy, precision, recall, f1
        

    def val(self, load_model_path=None):
        if load_model_path:
            self.load_model(load_model_path)
                
        test_accuracy, test_precision, test_recall, test_f1 = self.evaluate(self.test_loader)
        print(f"Test - Accuracy: {test_accuracy}, Precision: {test_precision}, Recall: {test_recall}, F1-Score: {test_f1}")
    
        return {
            'test': {
                'accuracy': test_accuracy,
                'precision': test_precision,
                'recall': test_recall,
                'f1': test_f1
            }
        }

In [62]:
def result_convert(result_dict):
    df = pd.DataFrame({
        'Model': result_dict.keys(),
        'Accuracy': [result_dict[model]['test']['accuracy'] for model in result_dict],
        'Precision': [result_dict[model]['test']['precision'] for model in result_dict],
        'Recall': [result_dict[model]['test']['recall'] for model in result_dict],
        'F1 Score': [result_dict[model]['test']['f1'] for model in result_dict]})
    return df

In [53]:
models = [
    
    {
        'model_name': 'BERT',
        'source': 'bert-base-uncased',
    },
    {
        'model_name': 'GPT',
        'source': 'gpt2',
    },
    {
        'model_name': 'ColBERT',
        'source': 'colbert-ir/colbertv2.0',
    },

    {
        'model_name': 'LinkBERT',
        'source': 'michiyasunaga/LinkBERT-base',
    },
    {
        'model_name': 'BiomedNLP',
        'source': 'microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract',
    },
    {
        'model_name': 'BioLinkBERT',
        'source': 'michiyasunaga/BioLinkBERT-base',
    },

]

In [44]:
trainer = Trainandtest(pubmedqa_train, pubmedqa_test)

for model in models:
    model_name=model['model_name'],
    source=model['source'],
    trainer.model_compile(QAModel, model_name,source, batch_size=8)
    # Train the model
    trainer.training(model_name, epochs=3)
    
    # test the model
    test_result = trainer.val()
    trainer.results[model['model_name']] = test_result

Epoch 1, Loss: 0.9006132993926393, F1 Score: 0.5249099889435542
Epoch 2, Loss: 0.7098345870667315, F1 Score: 0.6958092577262693
Epoch 3, Loss: 0.5697074686276152, F1 Score: 0.7611704025147913
Model saved to /kaggle/working/models/BERT_model.pth


  _warn_prf(average, modifier, msg_start, len(result))


Test - Accuracy: 0.748, Precision: 0.6694068965517241, Recall: 0.748, F1-Score: 0.70531291715744
Epoch 1, Loss: 1.393192358473514, F1 Score: 0.40785338135254096
Epoch 2, Loss: 1.1123240907141503, F1 Score: 0.41203555542772197
Epoch 3, Loss: 1.032146523607538, F1 Score: 0.43241287380838644
Model saved to /kaggle/working/models/GPT_model.pth


  _warn_prf(average, modifier, msg_start, len(result))


Test - Accuracy: 0.552, Precision: 0.30470400000000003, Recall: 0.552, F1-Score: 0.392659793814433
Epoch 1, Loss: 0.9623821656754676, F1 Score: 0.4707225929057005
Epoch 2, Loss: 0.7839130653028793, F1 Score: 0.6467877762783025
Epoch 3, Loss: 0.5677921136325979, F1 Score: 0.7601542321805421
Model saved to /kaggle/working/models/ColBERT_model.pth


  _warn_prf(average, modifier, msg_start, len(result))


Test - Accuracy: 0.736, Precision: 0.6570966841650607, Recall: 0.736, F1-Score: 0.6938642056690837
Epoch 1, Loss: 0.9824871433542129, F1 Score: 0.45623053111762785
Epoch 2, Loss: 0.8014798021696984, F1 Score: 0.6310760208825001
Epoch 3, Loss: 0.6627186527277561, F1 Score: 0.7320635448666813
Model saved to /kaggle/working/models/LinkBERT_model.pth


  _warn_prf(average, modifier, msg_start, len(result))


Test - Accuracy: 0.74, Precision: 0.6640471698113207, Recall: 0.74, F1-Score: 0.6985478163493841
Epoch 1, Loss: 0.9246221917106751, F1 Score: 0.5620592798606044
Epoch 2, Loss: 0.6177987788427384, F1 Score: 0.7432733337182548
Epoch 3, Loss: 0.4287259772102884, F1 Score: 0.8118622911910612
Model saved to /kaggle/working/models/BiomedNLP_model.pth
Test - Accuracy: 0.756, Precision: 0.6822384779399705, Recall: 0.756, F1-Score: 0.7070682119205298
Epoch 1, Loss: 0.9633053610933587, F1 Score: 0.4426174883428436
Epoch 2, Loss: 0.7026493197425883, F1 Score: 0.7145666029027515
Epoch 3, Loss: 0.550870918213053, F1 Score: 0.7752358324651903
Model saved to /kaggle/working/models/BioLinkBERT_model.pth
Test - Accuracy: 0.784, Precision: 0.7036995305164319, Recall: 0.784, F1-Score: 0.7395857142857143


  _warn_prf(average, modifier, msg_start, len(result))


# Testing with adam instead of AdamW 

In [42]:
trainer_adam = Trainandtest(pubmedqa_train, pubmedqa_test)

for model in models:
    model_name=model['model_name'],
    source=model['source'],
    trainer_adam.model_compile(QAModel, model_name,source, adamw=False,batch_size=8)
    # Train the model
    trainer_adam.training(model_name, epochs=3)
    
    # test the model
    test_result = trainer_adam.val()
    trainer_adam.results[model['model_name']] = test_result

Epoch 1, Loss: 0.9944657968713883, F1 Score: 0.4253417483729644
Epoch 2, Loss: 0.7733636637951465, F1 Score: 0.6548607825295722
Epoch 3, Loss: 0.5842612776508991, F1 Score: 0.7550026857380494
Model saved to /kaggle/working/models/BERT_model.pth


  _warn_prf(average, modifier, msg_start, len(result))


Test - Accuracy: 0.756, Precision: 0.6714458404074704, Recall: 0.756, F1-Score: 0.7112167330829219
Epoch 1, Loss: 1.2827900267661887, F1 Score: 0.43629321470457905
Epoch 2, Loss: 1.064146750784935, F1 Score: 0.43621814171491585
Epoch 3, Loss: 1.0238550039047891, F1 Score: 0.450740495819702
Model saved to /kaggle/working/models/GPT_model.pth


  _warn_prf(average, modifier, msg_start, len(result))


Test - Accuracy: 0.552, Precision: 0.30470400000000003, Recall: 0.552, F1-Score: 0.392659793814433
Epoch 1, Loss: 0.9681339682416713, F1 Score: 0.49036236684242235
Epoch 2, Loss: 0.8045094900942863, F1 Score: 0.6229481668773704
Epoch 3, Loss: 0.5787445804540147, F1 Score: 0.7569829223879129
Model saved to /kaggle/working/models/ColBERT_model.pth


  _warn_prf(average, modifier, msg_start, len(result))


Test - Accuracy: 0.724, Precision: 0.6443233082706766, Recall: 0.724, F1-Score: 0.6817145888594165
Epoch 1, Loss: 0.9647965082462798, F1 Score: 0.4447860985053272
Epoch 2, Loss: 0.7196468939172461, F1 Score: 0.6949428613612875
Epoch 3, Loss: 0.5734080284675385, F1 Score: 0.760998794755239
Model saved to /kaggle/working/models/LinkBERT_model.pth


  _warn_prf(average, modifier, msg_start, len(result))


Test - Accuracy: 0.788, Precision: 0.7010075187969925, Recall: 0.788, F1-Score: 0.7418228116710875
Epoch 1, Loss: 0.9780284312177212, F1 Score: 0.5217758791110249
Epoch 2, Loss: 0.6510280216944978, F1 Score: 0.7361572286721654
Epoch 3, Loss: 0.4577103276202019, F1 Score: 0.8080678988997622
Model saved to /kaggle/working/models/BiomedNLP_model.pth
Test - Accuracy: 0.796, Precision: 0.7685895642818721, Recall: 0.796, F1-Score: 0.7588327432375017
Epoch 1, Loss: 0.9089553800035031, F1 Score: 0.5260186850522093
Epoch 2, Loss: 0.6385407179911086, F1 Score: 0.7309744816418116
Epoch 3, Loss: 0.4850178955400244, F1 Score: 0.7964205240258372
Model saved to /kaggle/working/models/BioLinkBERT_model.pth
Test - Accuracy: 0.824, Precision: 0.7824133333333333, Recall: 0.824, F1-Score: 0.7913226315503075


# mixing artificial data 

In [38]:
trainer_mix = Trainandtest(pubmed_mix, pubmedqa_test)

# Function to free up memory
def free_memory():
    gc.collect()
    torch.cuda.empty_cache()

for model in models:
    model_name=model['model_name'],
    source=model['source'],
    trainer_mix.model_compile(QAModel,model_name,source,batch_size=8)
    free_memory()
    # Train the model
    trainer_mix.training(model_name, epochs=3)
    
    # test the model
    test_result = trainer_mix.val()
    trainer_mix.results[model['model_name']] = test_result

Epoch 1, Loss: 0.6736238362378182, F1 Score: 0.691544910967982
Epoch 2, Loss: 0.465376404594613, F1 Score: 0.8295694071702636
Epoch 3, Loss: 0.33468898533754154, F1 Score: 0.8769848317757546
Model saved to /kaggle/working/models/BERT_model.pth


  _warn_prf(average, modifier, msg_start, len(result))


Test - Accuracy: 0.744, Precision: 0.6626754966887417, Recall: 0.744, F1-Score: 0.7007689602359748
Epoch 1, Loss: 1.1697420526634563, F1 Score: 0.4557046728980926
Epoch 2, Loss: 0.9441103918985887, F1 Score: 0.4662551558983837
Epoch 3, Loss: 0.9006876452402635, F1 Score: 0.4617393230350778
Model saved to /kaggle/working/models/GPT_model.pth


  _warn_prf(average, modifier, msg_start, len(result))


Test - Accuracy: 0.552, Precision: 0.30470400000000003, Recall: 0.552, F1-Score: 0.392659793814433
Epoch 1, Loss: 0.6935228329134858, F1 Score: 0.6728090010628573
Epoch 2, Loss: 0.4662316624568478, F1 Score: 0.8209139196858206
Epoch 3, Loss: 0.31377549711870006, F1 Score: 0.8815665162193244
Model saved to /kaggle/working/models/ColBERT_model.pth
Test - Accuracy: 0.732, Precision: 0.6584333333333334, Recall: 0.732, F1-Score: 0.6919905437352245
Epoch 1, Loss: 0.6730421713838294, F1 Score: 0.6990942692374034
Epoch 2, Loss: 0.46076451884965375, F1 Score: 0.8314749850609237
Epoch 3, Loss: 0.3329331569169482, F1 Score: 0.8858639456298941
Model saved to /kaggle/working/models/LinkBERT_model.pth


  _warn_prf(average, modifier, msg_start, len(result))


Test - Accuracy: 0.756, Precision: 0.6699550802139038, Recall: 0.756, F1-Score: 0.7092652176460249


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1, Loss: 0.6295522175150919, F1 Score: 0.7207359059710398
Epoch 2, Loss: 0.393806324506255, F1 Score: 0.8586774804932878
Epoch 3, Loss: 0.2491684401892636, F1 Score: 0.9110065858041375
Model saved to /kaggle/working/models/BiomedNLP_model.pth
Test - Accuracy: 0.78, Precision: 0.6954216867469879, Recall: 0.78, F1-Score: 0.7336621493854395


tokenizer_config.json:   0%|          | 0.00/379 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/447k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/559 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Epoch 1, Loss: 0.6713117689030356, F1 Score: 0.695257681121867
Epoch 2, Loss: 0.42754317891516097, F1 Score: 0.8409749716283901
Epoch 3, Loss: 0.3084790704763371, F1 Score: 0.8889605442515863
Model saved to /kaggle/working/models/BioLinkBERT_model.pth
Test - Accuracy: 0.804, Precision: 0.7171698595146871, Recall: 0.804, F1-Score: 0.7574175438596492


# Testing with bidirectional LSTM for output

In [47]:
import torch
import torch.nn as nn

class BiLSTMmodel(nn.Module):
    def __init__(self, model, classes=3, lstm_hidden_size=256, lstm_layers=1, dropout_prob=0.5):
        super(BiLSTMmodel, self).__init__()
        self.bert = model
        self.dropout1 = nn.Dropout(dropout_prob)
        
        # BiLSTM layer
        self.lstm = nn.LSTM(input_size=model.config.hidden_size,
                            hidden_size=lstm_hidden_size,
                            num_layers=lstm_layers,
                            batch_first=True,
                            bidirectional=True)
        
        self.dropout2 = nn.Dropout(dropout_prob)
        self.linear1 = nn.Linear(lstm_hidden_size * 2, 128)  # Bidirectional LSTM hidden size is doubled
        self.dropout3 = nn.Dropout(dropout_prob)
        self.linear2 = nn.Linear(128, classes)  # number of classes may vary between BioASQ (2 classes) and PubMedQA (3 classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state  # BERT outputs
        
        # Pass BERT outputs through BiLSTM
        lstm_output, _ = self.lstm(sequence_output)
        lstm_output = lstm_output[:, 0, :]  # Use the hidden state of the first token (CLS token)
        
        lstm_output = self.dropout2(lstm_output)
        lstm_output = self.linear1(lstm_output)
        lstm_output = self.dropout3(lstm_output)
        logits = self.linear2(lstm_output)
        
        return logits


In [50]:
trainer_bilstm = Trainandtest(pubmedqa_train, pubmedqa_test)

for model in models:
    model_name=model['model_name'],
    source=model['source'],
    trainer_bilstm.model_compile(BiLSTMmodel, model_name,source,batch_size=8)
    # Train the model
    trainer_bilstm.training(model_name, epochs=3)
    
    # test the model
    test_result = trainer_bilstm.val()
    trainer_bilstm.results[model['model_name']] = test_result

Epoch 1, Loss: 0.9635301663520488, F1 Score: 0.48290765593552526
Epoch 2, Loss: 0.7392633834734876, F1 Score: 0.6988844637095462
Epoch 3, Loss: 0.5267599085544018, F1 Score: 0.7794557473839665
Model saved to /kaggle/working/models/BERT_model.pth


  _warn_prf(average, modifier, msg_start, len(result))


Test - Accuracy: 0.74, Precision: 0.6578823529411766, Recall: 0.74, F1-Score: 0.6939056065885334
Epoch 1, Loss: 1.3115251089664215, F1 Score: 0.43367769678223544
Epoch 2, Loss: 1.0169418936080121, F1 Score: 0.4643370417796646
Epoch 3, Loss: 1.0267750785705891, F1 Score: 0.4273241014715746
Model saved to /kaggle/working/models/GPT_model.pth


  _warn_prf(average, modifier, msg_start, len(result))


Test - Accuracy: 0.552, Precision: 0.30470400000000003, Recall: 0.552, F1-Score: 0.392659793814433
Epoch 1, Loss: 0.9932269111592719, F1 Score: 0.42351438326931284
Epoch 2, Loss: 0.8081247873128728, F1 Score: 0.642997106285956
Epoch 3, Loss: 0.6008845420276865, F1 Score: 0.7443287563364338
Model saved to /kaggle/working/models/ColBERT_model.pth


  _warn_prf(average, modifier, msg_start, len(result))


Test - Accuracy: 0.728, Precision: 0.6462502902252147, Recall: 0.728, F1-Score: 0.6789677888989991
Epoch 1, Loss: 0.9748588819453057, F1 Score: 0.44215684125563953
Epoch 2, Loss: 0.8172687174791985, F1 Score: 0.6304076789339947
Epoch 3, Loss: 0.6159245508148316, F1 Score: 0.7505618500273673
Model saved to /kaggle/working/models/LinkBERT_model.pth


  _warn_prf(average, modifier, msg_start, len(result))


Test - Accuracy: 0.74, Precision: 0.6620484720758694, Recall: 0.74, F1-Score: 0.6978807311956847
Epoch 1, Loss: 0.9573984653391736, F1 Score: 0.5095489964580875
Epoch 2, Loss: 0.6249953443382649, F1 Score: 0.7370382056805996
Epoch 3, Loss: 0.4561533945751317, F1 Score: 0.8216667785353308
Model saved to /kaggle/working/models/BiomedNLP_model.pth
Test - Accuracy: 0.804, Precision: 0.7839732136878056, Recall: 0.804, F1-Score: 0.7713454922279793
Epoch 1, Loss: 0.9686440977644413, F1 Score: 0.424868906455863
Epoch 2, Loss: 0.768838274986186, F1 Score: 0.6491089457125033
Epoch 3, Loss: 0.5697488480425895, F1 Score: 0.7712638846583408
Model saved to /kaggle/working/models/BioLinkBERT_model.pth
Test - Accuracy: 0.792, Precision: 0.7095912685445396, Recall: 0.792, F1-Score: 0.7467093961357157


  _warn_prf(average, modifier, msg_start, len(result))


# bilstm + adam

In [52]:
trainer_bilstm_adam = Trainandtest(pubmedqa_train, pubmedqa_test)

for model in models:
    model_name=model['model_name'],
    source=model['source'],
    trainer_bilstm_adam.model_compile(BiLSTMmodel, model_name,source,adamw=False, batch_size=8)
    # Train the model
    trainer_bilstm_adam.training(model_name, epochs=3)
    
    # test the model
    test_result = trainer_bilstm_adam.val()
    trainer_bilstm_adam.results[model['model_name']] = test_result

Epoch 1, Loss: 0.9335505280722963, F1 Score: 0.4956809169038876
Epoch 2, Loss: 0.7176162714653826, F1 Score: 0.7000559695528399
Epoch 3, Loss: 0.5201842948952888, F1 Score: 0.7708864527705387
Model saved to /kaggle/working/models/BERT_model.pth


  _warn_prf(average, modifier, msg_start, len(result))


Test - Accuracy: 0.748, Precision: 0.6643477079796265, Recall: 0.748, F1-Score: 0.7036946250500505
Epoch 1, Loss: 1.9795183552072404, F1 Score: 0.3883792578840153
Epoch 2, Loss: 1.1531561511628172, F1 Score: 0.42933204453510676
Epoch 3, Loss: 1.089423419313228, F1 Score: 0.42742713340781985
Model saved to /kaggle/working/models/GPT_model.pth


  _warn_prf(average, modifier, msg_start, len(result))


Test - Accuracy: 0.552, Precision: 0.30470400000000003, Recall: 0.552, F1-Score: 0.392659793814433
Epoch 1, Loss: 0.9634516144052465, F1 Score: 0.4520106452132586
Epoch 2, Loss: 0.7716564190514544, F1 Score: 0.6572602721961783
Epoch 3, Loss: 0.5557666971011365, F1 Score: 0.7670569836602411
Model saved to /kaggle/working/models/ColBERT_model.pth


  _warn_prf(average, modifier, msg_start, len(result))


Test - Accuracy: 0.72, Precision: 0.6373140477755862, Recall: 0.72, F1-Score: 0.6739880367189813
Epoch 1, Loss: 0.95411590439208, F1 Score: 0.4542520625889047
Epoch 2, Loss: 0.7634850464602734, F1 Score: 0.6668476381953001
Epoch 3, Loss: 0.6080983522090506, F1 Score: 0.7512189133798105
Model saved to /kaggle/working/models/LinkBERT_model.pth


  _warn_prf(average, modifier, msg_start, len(result))


Test - Accuracy: 0.764, Precision: 0.6777523390203632, Recall: 0.764, F1-Score: 0.7182162162162162
Epoch 1, Loss: 0.9656948662818746, F1 Score: 0.5176955226765354
Epoch 2, Loss: 0.6322591750228659, F1 Score: 0.7463169170551024
Epoch 3, Loss: 0.4217598030858852, F1 Score: 0.82901014775011
Model saved to /kaggle/working/models/BiomedNLP_model.pth
Test - Accuracy: 0.776, Precision: 0.7099284082254379, Recall: 0.776, F1-Score: 0.7405554679234394
Epoch 1, Loss: 0.9815204828343493, F1 Score: 0.45244928014067476
Epoch 2, Loss: 0.7912213304575454, F1 Score: 0.6509258327824234
Epoch 3, Loss: 0.5800189352098931, F1 Score: 0.7583522811161967
Model saved to /kaggle/working/models/BioLinkBERT_model.pth
Test - Accuracy: 0.804, Precision: 0.7348923076923076, Recall: 0.804, F1-Score: 0.7603266022827041


  _warn_prf(average, modifier, msg_start, len(result))


In [45]:
result_adamw= result_convert(trainer.results)
result_adamw

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,BERT,0.748,0.669407,0.748,0.705313
1,GPT,0.552,0.304704,0.552,0.39266
2,ColBERT,0.736,0.657097,0.736,0.693864
3,LinkBERT,0.74,0.664047,0.74,0.698548
4,BiomedNLP,0.756,0.682238,0.756,0.707068
5,BioLinkBERT,0.784,0.7037,0.784,0.739586


In [43]:
result_adam= result_convert(trainer_adam.results)
result_adam

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,BERT,0.756,0.671446,0.756,0.711217
1,GPT,0.552,0.304704,0.552,0.39266
2,ColBERT,0.724,0.644323,0.724,0.681715
3,LinkBERT,0.788,0.701008,0.788,0.741823
4,BiomedNLP,0.796,0.76859,0.796,0.758833
5,BioLinkBERT,0.824,0.782413,0.824,0.791323


In [41]:
result_mix= result_convert(trainer_mix.results)
result_mix

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,BERT,0.744,0.662675,0.744,0.700769
1,GPT,0.552,0.304704,0.552,0.39266
2,ColBERT,0.732,0.658433,0.732,0.691991
3,LinkBERT,0.756,0.669955,0.756,0.709265
4,BiomedNLP,0.78,0.695422,0.78,0.733662
5,BioLinkBERT,0.804,0.71717,0.804,0.757418


In [51]:
result_bilstm= result_convert(trainer_bilstm.results)
result_bilstm

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,BERT,0.74,0.657882,0.74,0.693906
1,GPT,0.552,0.304704,0.552,0.39266
2,ColBERT,0.728,0.64625,0.728,0.678968
3,LinkBERT,0.74,0.662048,0.74,0.697881
4,BiomedNLP,0.804,0.783973,0.804,0.771345
5,BioLinkBERT,0.792,0.709591,0.792,0.746709


In [53]:
result_bilstm_adam= result_convert(trainer_bilstm_adam.results)
result_bilstm_adam

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,BERT,0.748,0.664348,0.748,0.703695
1,GPT,0.552,0.304704,0.552,0.39266
2,ColBERT,0.72,0.637314,0.72,0.673988
3,LinkBERT,0.764,0.677752,0.764,0.718216
4,BiomedNLP,0.776,0.709928,0.776,0.740555
5,BioLinkBERT,0.804,0.734892,0.804,0.760327


# Testing with just context 

In [47]:
pubmed_text = pd.DataFrame(pubmedqa['train']['context'])
pubmed_text['full_context'] = pubmed_text['contexts'].apply(lambda x: ' '.join(x))
# Convert to a DataFrame
pubmedqa_train_df = pd.DataFrame(pubmedqa['train'])
pubmedqa_train_df['full_context']= pubmed_text['full_context']

# Convert the DataFrame back to a Dataset
pubmedqa_context = Dataset.from_pandas(pubmedqa_train_df)

# Create a DatasetDict
pubmedqa = DatasetDict({
    'train': pubmedqa_context
})

In [49]:
pubmedqa_context_train, pubmedqa_context_test = pubmed_train_test_split(pubmedqa)

In [57]:
import os
import torch
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel, GPT2Tokenizer, GPT2Model
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score

class QAModel(nn.Module):
    def __init__(self, model, classes=3, dropout_prob=0.5):
        super(QAModel, self).__init__()
        self.bert = model
        self.dropout1 = nn.Dropout(dropout_prob)
        self.linear1 = nn.Linear(model.config.hidden_size, 128)
        self.dropout2 = nn.Dropout(dropout_prob)
        self.linear2 = nn.Linear(128, classes)  # number of classes may vary between BioASQ (2 classes) and PubMedQA (3 classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # CLS token
        cls_output = self.dropout1(cls_output)  # Apply first dropout
        cls_output = self.linear1(cls_output)  # Apply first linear layer
        cls_output = self.dropout2(cls_output)  # Apply second dropout
        logits = self.linear2(cls_output)  # Apply second linear layer
        return logits

class Trainandtest:

    def __init__(self, df_train, df_test, stratify_col='decision_encoded'):
        self.train_data = df_train
        self.test_data = df_test
        self.loss_fn = nn.CrossEntropyLoss()
        self.stratify_col = stratify_col
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.results={}

    def initialize_tokenizer(self, model_name, source):
        if isinstance(source, tuple):
                source = source[0]
        if 'GPT' in model_name:
            tokenizer = GPT2Tokenizer.from_pretrained(source)
            if tokenizer.pad_token is None:
                tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            return tokenizer
        elif 'BioLinkBERT' in model_name or 'LinkBERT' in model_name:
            return AutoTokenizer.from_pretrained(source)
        else:
            return BertTokenizer.from_pretrained(source)

    def encode_data(self, df, tokenizer):
        inputs = tokenizer(
            text=df['question'], 
            text_pair=df['full_context'], 
            padding=True, 
            truncation=True, 
            return_tensors='pt', 
            max_length=128*4
        )
        labels = torch.tensor(df[self.stratify_col])
        return inputs, labels

    def create_dataloader(self, inputs, labels, batch_size):
        dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
        return DataLoader(dataset, batch_size=batch_size, shuffle=True)

    def import_model(self, QAModel, model_name, source, tokenizer):
        if isinstance(source, tuple):
            source = source[0]
        if 'GPT' in model_name:
            model = GPT2Model.from_pretrained(source)
            model.resize_token_embeddings(len(tokenizer))
            model = QAModel(model)
        elif 'BioLinkBERT' in model_name or 'LinkBERT' in model_name:
            model = AutoModel.from_pretrained(source)
            model = QAModel(model)
        else:
            model = BertModel.from_pretrained(source)
            model = QAModel(model)
        return model
    def model_compile(self, QAModel, model_name, source, batch_size=64, adamw=True):
        batch_size = 16 if 'GPT' in model_name else batch_size
        tokenizer = self.initialize_tokenizer(model_name, source)
        train_inputs, train_labels = self.encode_data(self.train_data, tokenizer)
        test_inputs, test_labels = self.encode_data(self.test_data, tokenizer)
        self.train_loader = self.create_dataloader(train_inputs, train_labels, batch_size)
        self.test_loader = self.create_dataloader(test_inputs, test_labels, batch_size)
        
        self.model = self.import_model(QAModel, model_name, source, tokenizer).to(self.device) 
        if adamw:
            self.optimizer = optim.AdamW(self.model.parameters(), lr=2e-5)
        else: 
            self.optimizer = optim.Adam(self.model.parameters(), lr=2e-5)
    
    def training(self, model_name, epochs=10):
        if isinstance(model_name, tuple):
            model_name = model_name[0]        
        self.model.train()
        for epoch in range(epochs):
            total_loss = 0
            all_preds = []
            all_labels = []
        
            for batch in self.train_loader:
                b_input_ids, b_attention_mask, b_labels = [t.to(self.device) for t in batch]
                self.optimizer.zero_grad()
            
                outputs = self.model(b_input_ids, b_attention_mask)
                loss = self.loss_fn(outputs, b_labels)
                loss.backward()
                self.optimizer.step()
                
                total_loss += loss.item()
            
                preds = outputs.detach().cpu().numpy()
                label_ids = b_labels.to('cpu').numpy()
                del b_input_ids 
                del b_attention_mask 
                del b_labels
                gc.collect()
                torch.cuda.empty_cache()
                all_preds.append(preds)
                all_labels.append(label_ids)
        
            avg_loss = total_loss / len(self.train_loader)
            all_preds = np.concatenate(all_preds, axis=0)
            all_labels = np.concatenate(all_labels, axis=0)
            avg_f1_score = self.calculate_f1_score(all_preds, all_labels)
        
            print(f"Epoch {epoch+1}, Loss: {avg_loss}, F1 Score: {avg_f1_score}")
        
        self.save_model(model_name)

    def save_model(self, model_name):
        os.makedirs('/kaggle/working/models', exist_ok=True)
        model_path = f'/kaggle/working/models/{model_name}_model.pth'
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
        }, model_path)
        print(f"Model saved to {model_path}")

    def load_model(self, model_path):
        checkpoint = torch.load(model_path)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        print(f"Model loaded from {model_path}")

    def calculate_f1_score(self, preds, labels):
        preds_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels.flatten()
        return f1_score(labels_flat, preds_flat, average='weighted')

    def evaluate(self, dataloader):
        self.model.eval()
        total_loss = 0
        predictions, true_labels = [], []
    
        with torch.no_grad():
            for batch in dataloader:
                b_input_ids, b_attention_mask, b_labels = [t.to(self.device) for t in batch]
                outputs = self.model(b_input_ids, b_attention_mask)
                logits = outputs.detach().cpu().numpy()
                label_ids = b_labels.cpu().numpy()
                predictions.extend(np.argmax(logits, axis=1))
                true_labels.extend(label_ids)
                del b_input_ids 
                del b_attention_mask 
                del b_labels
                gc.collect()
                torch.cuda.empty_cache()
    
        accuracy = accuracy_score(true_labels, predictions)
        precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')
    
        return accuracy, precision, recall, f1
        

    def val(self, load_model_path=None):
        if load_model_path:
            self.load_model(load_model_path)
                
        test_accuracy, test_precision, test_recall, test_f1 = self.evaluate(self.test_loader)
        print(f"Test - Accuracy: {test_accuracy}, Precision: {test_precision}, Recall: {test_recall}, F1-Score: {test_f1}")
    
        return {
            'test': {
                'accuracy': test_accuracy,
                'precision': test_precision,
                'recall': test_recall,
                'f1': test_f1
            }
        }

In [67]:
trainer_context = Trainandtest(pubmedqa_context_train, pubmedqa_context_test)

for model in models:
    model_name=model['model_name'],
    source=model['source'],
    trainer_context.model_compile(QAModel, model_name,source, adamw=False,batch_size=32)
    # Train the model
    trainer_context.training(model_name, epochs=10)
    
    # test the model
    test_result = trainer_context.val()
    trainer_context.results[model['model_name']] = test_result

Epoch 1, Loss: 0.9956087817748388, F1 Score: 0.44379389794495544
Epoch 2, Loss: 0.9260348826646805, F1 Score: 0.467974720792834
Epoch 3, Loss: 0.7666239080329736, F1 Score: 0.6489225499984872
Epoch 4, Loss: 0.6035305373370647, F1 Score: 0.7415143995953295
Epoch 5, Loss: 0.47606802855928737, F1 Score: 0.7946067327830272
Epoch 6, Loss: 0.34642056996623677, F1 Score: 0.8511775348168191
Epoch 7, Loss: 0.2580205186580618, F1 Score: 0.8866075525251556
Epoch 8, Loss: 0.18729669569681087, F1 Score: 0.9369975794868157
Epoch 9, Loss: 0.13886863629644117, F1 Score: 0.960059241154904
Epoch 10, Loss: 0.0845037210577478, F1 Score: 0.9779172647527911
Model saved to /kaggle/working/models/BERT_model.pth
Test - Accuracy: 0.684, Precision: 0.7294617424242424, Recall: 0.684, F1-Score: 0.7045881409897586
Epoch 1, Loss: 1.3585337527254795, F1 Score: 0.44299407780965916
Epoch 2, Loss: 1.0587855374559443, F1 Score: 0.45385088741681573
Epoch 3, Loss: 1.0438256200323714, F1 Score: 0.44054302488042835
Epoch 4, 

  _warn_prf(average, modifier, msg_start, len(result))


Test - Accuracy: 0.552, Precision: 0.30470400000000003, Recall: 0.552, F1-Score: 0.392659793814433
Epoch 1, Loss: 0.9520245417952538, F1 Score: 0.45297674916961317
Epoch 2, Loss: 0.9212048103411993, F1 Score: 0.4826229757472229
Epoch 3, Loss: 0.7950820500651995, F1 Score: 0.6363152721510011
Epoch 4, Loss: 0.6057927769919237, F1 Score: 0.7363657106752968
Epoch 5, Loss: 0.4558780913551648, F1 Score: 0.7984399635452267
Epoch 6, Loss: 0.3412573526923855, F1 Score: 0.8496917808219178
Epoch 7, Loss: 0.23623682030787072, F1 Score: 0.8973209773134148
Epoch 8, Loss: 0.19447369469950596, F1 Score: 0.9264559794355036
Epoch 9, Loss: 0.14640878606587648, F1 Score: 0.9546388083429811
Epoch 10, Loss: 0.11481602140702307, F1 Score: 0.9595680799282695
Model saved to /kaggle/working/models/ColBERT_model.pth
Test - Accuracy: 0.688, Precision: 0.7030860215053764, Recall: 0.688, F1-Score: 0.6932197595858558
Epoch 1, Loss: 1.0067957465847333, F1 Score: 0.4560220394560017
Epoch 2, Loss: 0.9534691696365675, F

In [68]:
result_context= result_convert(trainer_context.results)
result_context

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,BERT,0.684,0.729462,0.684,0.704588
1,GPT,0.552,0.304704,0.552,0.39266
2,ColBERT,0.688,0.703086,0.688,0.69322
3,LinkBERT,0.74,0.722004,0.74,0.73004
4,BiomedNLP,0.756,0.798076,0.756,0.774141
5,BioLinkBERT,0.708,0.805696,0.708,0.738557


In [69]:
trainer_context_adamw = Trainandtest(pubmedqa_context_train, pubmedqa_context_test)

for model in models:
    model_name=model['model_name'],
    source=model['source'],
    trainer_context_adamw.model_compile(QAModel, model_name,source,batch_size=32)
    # Train the model
    trainer_context_adamw.training(model_name, epochs=10)
    
    # test the model
    test_result = trainer_context_adamw.val()
    trainer_context_adamw.results[model['model_name']] = test_result

Epoch 1, Loss: 0.9762519747018814, F1 Score: 0.4699483311583402
Epoch 2, Loss: 0.923114595313867, F1 Score: 0.461531669637552
Epoch 3, Loss: 0.7446715695162615, F1 Score: 0.6811459012673045
Epoch 4, Loss: 0.5992327022055784, F1 Score: 0.7511168646563102
Epoch 5, Loss: 0.47549499819676083, F1 Score: 0.787570027877593
Epoch 6, Loss: 0.37996089334289235, F1 Score: 0.8323402266290735
Epoch 7, Loss: 0.3250357086459796, F1 Score: 0.8703587289410076
Epoch 8, Loss: 0.22723182663321495, F1 Score: 0.9252874017493735
Epoch 9, Loss: 0.19044163528208932, F1 Score: 0.9426072087826678
Epoch 10, Loss: 0.13953980958710113, F1 Score: 0.9535687984294463
Model saved to /kaggle/working/models/BERT_model.pth
Test - Accuracy: 0.728, Precision: 0.7235849731663685, Recall: 0.728, F1-Score: 0.7257289760348584
Epoch 1, Loss: 1.4827121978110456, F1 Score: 0.4296229401969969
Epoch 2, Loss: 1.110146200403254, F1 Score: 0.45954562368681473
Epoch 3, Loss: 1.0427006292850414, F1 Score: 0.45145540092523667
Epoch 4, Los

  _warn_prf(average, modifier, msg_start, len(result))


Test - Accuracy: 0.552, Precision: 0.30470400000000003, Recall: 0.552, F1-Score: 0.392659793814433
Epoch 1, Loss: 0.9739934826890627, F1 Score: 0.4464864515894175
Epoch 2, Loss: 0.9144586796561877, F1 Score: 0.48615408600753535
Epoch 3, Loss: 0.8352963477373123, F1 Score: 0.5982742597691155
Epoch 4, Loss: 0.6690496106942495, F1 Score: 0.7089612755545032
Epoch 5, Loss: 0.5227077280481657, F1 Score: 0.7713850612478728
Epoch 6, Loss: 0.37999746575951576, F1 Score: 0.8262018389810438
Epoch 7, Loss: 0.2986296160767476, F1 Score: 0.8821333461859778
Epoch 8, Loss: 0.22801320627331734, F1 Score: 0.9202210750158099
Epoch 9, Loss: 0.14270742796361446, F1 Score: 0.9618476024079696
Epoch 10, Loss: 0.08368351760630806, F1 Score: 0.9863758230074019
Model saved to /kaggle/working/models/ColBERT_model.pth
Test - Accuracy: 0.708, Precision: 0.6730457352171638, Recall: 0.708, F1-Score: 0.6881130361648445
Epoch 1, Loss: 0.978992094596227, F1 Score: 0.46778994976737287
Epoch 2, Loss: 0.9315755193432173, F

In [70]:
result_context_adamw= result_convert(trainer_context_adamw.results)
result_context_adamw

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,BERT,0.728,0.723585,0.728,0.725729
1,GPT,0.552,0.304704,0.552,0.39266
2,ColBERT,0.708,0.673046,0.708,0.688113
3,LinkBERT,0.732,0.685469,0.732,0.704661
4,BiomedNLP,0.776,0.767667,0.776,0.771358
5,BioLinkBERT,0.804,0.809602,0.804,0.806523
