# 1. Importing 

## 1.1. Importing libraries

In [1]:
import shutil
import os

# Remove the directory if already exist 
dir_name = 'neural_medical_qa'
if os.path.exists(dir_name):
    shutil.rmtree(dir_name)

#clone the repo from github
!git clone https://github.com/trduc97/neural_medical_qa.git
%cd neural_medical_qa
# install the requirement
!pip install -r requirements.txt

Cloning into 'neural_medical_qa'...
remote: Enumerating objects: 162, done.[K
remote: Counting objects: 100% (22/22), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 162 (delta 11), reused 0 (delta 0), pack-reused 140 (from 1)[K
Receiving objects: 100% (162/162), 1.82 MiB | 15.97 MiB/s, done.
Resolving deltas: 100% (81/81), done.
/kaggle/working/neural_medical_qa


In [2]:
from classifiers import QAModel, BiLSTMmodel
from train_and_test import Trainandtest
from processing import load_bioasq_pubmedqa, pubmed_train_test_split,result_convert 
from datasets import Dataset, DatasetDict
import pandas as pd
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation/runtime warnings raised by the
# libraries used below; consider narrowing the filter by category or module.
warnings.filterwarnings("ignore")

## 1.2. Importing data 

### 1.2.1. Importing PubMedQA and BioASQ

In [3]:
# Load both QA corpora through the project helper imported from `processing`.
# Called without arguments, so the helper's default data sources are used
# (those defaults are defined in the helper, not in this notebook).
bioasq, pubmedqa = load_bioasq_pubmedqa()

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

### 1.2.2. Importing PubMedQA artificial

In [4]:
# Load the PubMedQA "artificial" subset from the Kaggle input parquet file.
# The helper returns two objects; the first one is discarded here
# (presumably the BioASQ data already loaded above -- confirm).
# NOTE(review): assigning to `_` shadows IPython's "last output" variable for
# the remainder of the session.
_, pubmedqa_artificial = load_bioasq_pubmedqa(pubmed_kaggle_path='/kaggle/input/pubmed-qa/pubmed_qa_pga_artificial.parquet')

Map:   0%|          | 0/211269 [00:00<?, ? examples/s]

Map:   0%|          | 0/211269 [00:00<?, ? examples/s]

## 1.3. Preprocessing

### 1.3.1 Concatenating the full context for PubMedQA

In [5]:
# Expand the per-example `context` records into columns, then merge each
# example's list of passages (`contexts`) into one space-separated string
pubmed_text = pd.DataFrame(pubmedqa['train']['context'])
pubmed_text['full_context'] = pubmed_text['contexts'].map(' '.join)

# Training split as a DataFrame, with the merged text attached as a new column
# (both frames share the same default row index, so rows line up one-to-one)
pubmedqa_train_df = pd.DataFrame(pubmedqa['train']).assign(
    full_context=pubmed_text['full_context']
)

# Convert back to a Dataset and re-wrap it so `pubmedqa['train']` keeps working
pubmedqa_context = Dataset.from_pandas(pubmedqa_train_df)
pubmedqa = DatasetDict({'train': pubmedqa_context})

### 1.3.2. Concatenating the full context for PubMedQA artificial data

In [6]:
# Expand the per-example `context` records of the artificial split into
# columns, then merge each list of passages into one space-separated string
pubmed_text = pd.DataFrame(pubmedqa_artificial['train']['context'])
pubmed_text['full_context'] = pubmed_text['contexts'].map(' '.join)

# Artificial split as a DataFrame, with the merged text attached as a new column
# NOTE: `pubmed_text` and `pubmedqa_train_df` are reused names; after this cell
# they refer to the artificial data, not the labelled PubMedQA data.
pubmedqa_train_df = pd.DataFrame(pubmedqa_artificial['train']).assign(
    full_context=pubmed_text['full_context']
)

# Convert back to a Dataset and re-wrap it so `pubmedqa_artificial['train']`
# keeps working
pubmedqa_arti_context = Dataset.from_pandas(pubmedqa_train_df)
pubmedqa_artificial = DatasetDict({'train': pubmedqa_arti_context})

# 2. EDA

In [7]:
# Preview the first rows of the PubMedQA training split
print(pubmedqa['train'].to_pandas().head())

# Tally how often each answer label occurs in `final_decision`
responses = pubmedqa['train']['final_decision']
yes_count, no_count, maybe_count = (
    responses.count(label) for label in ('yes', 'no', 'maybe')
)

# Report the label distribution, one label per line
print(
    f"Yes: {yes_count}",
    f"No: {no_count}",
    f"Maybe: {maybe_count}",
    sep="\n",
)

      pubid                                           question  \
0  21645374  Do mitochondria play a role in remodelling lac...   
1  16418930  Landolt C and snellen e acuity: differences in...   
2   9488747  Syncope during bathing in infants, a pediatric...   
3  17208539  Are the long-term results of the transanal pul...   
4  10808977  Can tailored interventions increase mammograph...   

                                             context  \
0  {'contexts': ['Programmed cell death (PCD) is ...   
1  {'contexts': ['Assessment of visual acuity dep...   
2  {'contexts': ['Apparent life-threatening event...   
3  {'contexts': ['The transanal endorectal pull-t...   
4  {'contexts': ['Telephone counseling and tailor...   

                                         long_answer final_decision  \
0  Results depicted mitochondrial dynamics in viv...            yes   
1  Using the charts described, there was only a s...             no   
2  "Aquagenic maladies" could be a pediatric form... 

# 3. Splitting and mixing data for training

## 3.1. Splitting PubMedQA and BioASQ

In [8]:
# Split each corpus into train and test partitions with the project helper
# imported from `processing` (split ratio and seeding live in that helper and
# are not visible in this notebook).
# NOTE(review): despite its name, `pubmed_train_test_split` is applied to the
# BioASQ data as well.
pubmedqa_train, pubmedqa_test = pubmed_train_test_split(pubmedqa)
bioasq_train, bioasq_test = pubmed_train_test_split(bioasq)

## 3.2. Mixing Artificial data with PubMedQA labeled training data

In [9]:
# Materialise the artificial PubMedQA split as a DataFrame
df_artificial = pd.DataFrame(pubmedqa_artificial['train'])

# Partition the rows by encoded label. Only labels 0 and 2 are drawn from;
# no rows with any other `decision_encoded` value are sampled.
df_class_0 = df_artificial.loc[df_artificial['decision_encoded'].eq(0)]
df_class_2 = df_artificial.loc[df_artificial['decision_encoded'].eq(2)]

# 700 artificial examples in total, split evenly between the two labels
samples_per_class = 700 // 2

# Draw the same number of rows from each label (fixed seed for repeatability)
sampled_class_0 = df_class_0.sample(n=samples_per_class, random_state=42)
sampled_class_2 = df_class_2.sample(n=samples_per_class, random_state=42)

# Stack the two samples, then shuffle so the labels are interleaved
sampled_artificial = pd.concat([sampled_class_0, sampled_class_2])
shuffled_sampled_artificial = (
    sampled_artificial
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Dataset view of the sampled artificial rows only
sampled_pubmedqa_artificial = Dataset.from_pandas(shuffled_sampled_artificial)

# Labelled PubMedQA training partition as a DataFrame
df_train = pd.DataFrame(pubmedqa_train)

# Append the labelled rows to the artificial sample, then shuffle the mixture
combined_df = pd.concat(
    [shuffled_sampled_artificial, df_train],
    ignore_index=True,
)
shuffled_combined_df = (
    combined_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Final mixed training set (artificial + labelled) as a Dataset
pubmedqa_arti_context_mixed = Dataset.from_pandas(shuffled_combined_df)

In [10]:
# Tally how often each answer label occurs in the mixed training set
responses = pubmedqa_arti_context_mixed['final_decision']
yes_count, no_count, maybe_count = (
    responses.count(label) for label in ('yes', 'no', 'maybe')
)

# Report the label distribution, one label per line
print(
    f"Yes: {yes_count}",
    f"No: {no_count}",
    f"Maybe: {maybe_count}",
    sep="\n",
)

Yes: 736
No: 586
Maybe: 77


# 4. Training without context (reason-free setting)

In [11]:
# Hyper-parameters for the experiments below.
# NOTE(review): `learning_rate` is defined here but is not passed to any of the
# trainer calls visible in this notebook.
learning_rate = 1e-5
batch_size = 32
epochs = 10

# Suffix appended to every model name: <context tag><optimizer tag><dataset tag>
context = '_context'
opt = ''
data = ''
version = ''.join((context, opt, data))

# One entry per pretrained encoder: a display name (with the suffix above) and
# the checkpoint identifier it is loaded from.
# A GPT-2 entry (model_name 'GPT', source 'gpt2') was commented out of the
# original list and is therefore not included.
models = [
    {'model_name': label + version, 'source': checkpoint}
    for label, checkpoint in (
        ('BERT', 'bert-base-uncased'),
        ('ColBERT', 'colbert-ir/colbertv2.0'),
        ('LinkBERT', 'michiyasunaga/LinkBERT-base'),
        ('BiomedNLP', 'microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract'),
        ('BioLinkBERT', 'michiyasunaga/BioLinkBERT-base'),
    )
]

## 4.1. Testing with different optimization algorithms

In [12]:
# QAModel fine-tuned with the 'adam' optimizer on the PubMedQA split
trainer_adam = Trainandtest(pubmedqa_train, pubmedqa_test)

for model in models:
    # NOTE(review): these two values are 1-tuples, not strings. The original
    # assignments ended in a trailing comma, which builds a tuple; that is
    # written out explicitly here so it is visible. The recorded output shows
    # this run completing, so confirm what Trainandtest expects before
    # switching to plain strings.
    model_name = (model['model_name'],)
    source = (model['source'],)

    trainer_adam.model_compile(
        QAModel, model_name, source,
        optimizer='adam',
        batch_size=batch_size,
    )

    # Fit for the configured number of epochs
    trainer_adam.training(model_name, epochs=epochs)

    # Evaluate, then store the metrics under the plain string model name
    test_result = trainer_adam.val()
    trainer_adam.results[model['model_name']] = test_result

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1, Loss: 1.01187171719291, F1 Score: 0.4648839473470908, Time: 46.47 seconds
Epoch 2, Loss: 0.9644373899156397, F1 Score: 0.4286281058455812, Time: 45.82 seconds
Epoch 3, Loss: 0.9333845376968384, F1 Score: 0.45484602561846105, Time: 45.68 seconds
Epoch 4, Loss: 0.8485855947841298, F1 Score: 0.5783680046800348, Time: 45.66 seconds
Epoch 5, Loss: 0.7223394621502269, F1 Score: 0.7045784467894984, Time: 45.69 seconds
Epoch 6, Loss: 0.6277655308896845, F1 Score: 0.7393659589285584, Time: 45.60 seconds
Epoch 7, Loss: 0.5256653373891657, F1 Score: 0.7792578849410519, Time: 45.68 seconds
Epoch 8, Loss: 0.4606787006963383, F1 Score: 0.8088584409305992, Time: 45.75 seconds
Epoch 9, Loss: 0.39571518721905624, F1 Score: 0.8146001716604494, Time: 45.78 seconds
Epoch 10, Loss: 0.3437110429460352, F1 Score: 0.844984736482053, Time: 45.83 seconds
Model saved to /kaggle/working/models/BERT_context_model.pth
Test - Accuracy: 0.7740863787375415, Precision: 0.6983298913531472, Recall: 0.77408637873

tokenizer_config.json:   0%|          | 0.00/405 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Epoch 1, Loss: 1.0029005950147456, F1 Score: 0.46033315123091617, Time: 46.21 seconds
Epoch 2, Loss: 0.938882665200667, F1 Score: 0.4438576087717719, Time: 46.02 seconds
Epoch 3, Loss: 0.9214420833370902, F1 Score: 0.47615066462278754, Time: 45.98 seconds
Epoch 4, Loss: 0.8709744350476698, F1 Score: 0.5632172847411403, Time: 46.02 seconds
Epoch 5, Loss: 0.7698968567631461, F1 Score: 0.6704805080099371, Time: 46.01 seconds
Epoch 6, Loss: 0.6417545134370978, F1 Score: 0.7304054241836379, Time: 46.02 seconds
Epoch 7, Loss: 0.5527765466408296, F1 Score: 0.7642530072084002, Time: 46.02 seconds
Epoch 8, Loss: 0.4556568630717017, F1 Score: 0.8048308740134325, Time: 46.02 seconds
Epoch 9, Loss: 0.36299925297498703, F1 Score: 0.8207972198030586, Time: 46.05 seconds
Epoch 10, Loss: 0.32717840441248636, F1 Score: 0.8555396171727356, Time: 46.11 seconds
Model saved to /kaggle/working/models/ColBERT_context_model.pth
Test - Accuracy: 0.7574750830564784, Precision: 0.6803798351239602, Recall: 0.7574

tokenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/559 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Epoch 1, Loss: 1.030395345254378, F1 Score: 0.4312315340187528, Time: 48.68 seconds
Epoch 2, Loss: 0.9585994482040405, F1 Score: 0.4261994535543803, Time: 48.91 seconds
Epoch 3, Loss: 0.9432522085579959, F1 Score: 0.4535997919138714, Time: 48.77 seconds
Epoch 4, Loss: 0.9399441372264515, F1 Score: 0.45232745007739933, Time: 48.72 seconds
Epoch 5, Loss: 0.9110608994960785, F1 Score: 0.48720711469339645, Time: 48.71 seconds
Epoch 6, Loss: 0.8354668942364779, F1 Score: 0.5948980480224849, Time: 48.67 seconds
Epoch 7, Loss: 0.6949002729220823, F1 Score: 0.7015012764277541, Time: 48.64 seconds
Epoch 8, Loss: 0.5925933989611539, F1 Score: 0.7585219046327403, Time: 48.46 seconds
Epoch 9, Loss: 0.5213158760558475, F1 Score: 0.7814161177018761, Time: 48.49 seconds
Epoch 10, Loss: 0.4566050253131173, F1 Score: 0.8107164361492218, Time: 48.59 seconds
Model saved to /kaggle/working/models/LinkBERT_context_model.pth
Test - Accuracy: 0.7408637873754153, Precision: 0.6597531370420244, Recall: 0.74086

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1, Loss: 1.156344237652692, F1 Score: 0.4186302559678643, Time: 44.59 seconds
Epoch 2, Loss: 1.0037611045620658, F1 Score: 0.47428918140253346, Time: 44.98 seconds
Epoch 3, Loss: 0.9483869617635553, F1 Score: 0.5254986503646498, Time: 44.78 seconds
Epoch 4, Loss: 0.8572289049625397, F1 Score: 0.5844199081285901, Time: 45.01 seconds
Epoch 5, Loss: 0.6812061830000444, F1 Score: 0.7200893933345499, Time: 44.97 seconds
Epoch 6, Loss: 0.5594368102875623, F1 Score: 0.756930076832864, Time: 45.14 seconds
Epoch 7, Loss: 0.5176475874402306, F1 Score: 0.7770859650453299, Time: 45.00 seconds
Epoch 8, Loss: 0.4375656010075049, F1 Score: 0.826910402590777, Time: 44.95 seconds
Epoch 9, Loss: 0.38879949328574265, F1 Score: 0.8303214489243775, Time: 45.02 seconds
Epoch 10, Loss: 0.31109025464816525, F1 Score: 0.8780570234173817, Time: 44.97 seconds
Model saved to /kaggle/working/models/BiomedNLP_context_model.pth
Test - Accuracy: 0.8239202657807309, Precision: 0.7742123289871929, Recall: 0.82392

tokenizer_config.json:   0%|          | 0.00/379 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/447k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/559 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Epoch 1, Loss: 1.0120107396082445, F1 Score: 0.44848177041515963, Time: 45.02 seconds
Epoch 2, Loss: 0.9395774088122628, F1 Score: 0.4418815035488408, Time: 45.04 seconds
Epoch 3, Loss: 0.9507878449830142, F1 Score: 0.4535812028818925, Time: 45.11 seconds
Epoch 4, Loss: 0.911650915037502, F1 Score: 0.4833275582917339, Time: 45.13 seconds
Epoch 5, Loss: 0.8158634142442183, F1 Score: 0.6394480511266492, Time: 45.17 seconds
Epoch 6, Loss: 0.6674754890528592, F1 Score: 0.732567683450485, Time: 45.20 seconds
Epoch 7, Loss: 0.5588268271901391, F1 Score: 0.7678226015135728, Time: 45.19 seconds
Epoch 8, Loss: 0.5049648901278322, F1 Score: 0.7879722373378095, Time: 45.14 seconds
Epoch 9, Loss: 0.4303083988753232, F1 Score: 0.8224714027609792, Time: 45.09 seconds
Epoch 10, Loss: 0.37618971548297186, F1 Score: 0.8405806250769383, Time: 45.21 seconds
Model saved to /kaggle/working/models/BioLinkBERT_context_model.pth
Test - Accuracy: 0.8239202657807309, Precision: 0.7382763581416049, Recall: 0.823

In [13]:
# QAModel fine-tuned with the 'adamw' optimizer on the PubMedQA split
trainer_adamw = Trainandtest(pubmedqa_train, pubmedqa_test)

for model in models:
    # NOTE(review): these two values are 1-tuples, not strings. The original
    # assignments ended in a trailing comma, which builds a tuple; that is
    # written out explicitly here so it is visible. Confirm what Trainandtest
    # expects before switching to plain strings.
    model_name = (model['model_name'],)
    source = (model['source'],)

    trainer_adamw.model_compile(
        QAModel, model_name, source,
        optimizer='adamw',
        batch_size=batch_size,
    )

    # Fit for the configured number of epochs.
    # NOTE(review): the recorded output shows the checkpoint being written to
    # BERT_context_model.pth; model names carry no optimizer tag (`opt` is
    # empty), so runs with different optimizers save to the same file names.
    trainer_adamw.training(model_name, epochs=epochs)

    # Evaluate, then store the metrics under the plain string model name
    test_result = trainer_adamw.val()
    trainer_adamw.results[model['model_name']] = test_result

Epoch 1, Loss: 1.0118717795068568, F1 Score: 0.4648839473470908, Time: 45.91 seconds
Epoch 2, Loss: 0.9644354473460804, F1 Score: 0.4286281058455812, Time: 45.78 seconds
Epoch 3, Loss: 0.933366060256958, F1 Score: 0.45484602561846105, Time: 45.61 seconds
Epoch 4, Loss: 0.8485424030910839, F1 Score: 0.5765534451496245, Time: 45.57 seconds
Epoch 5, Loss: 0.7222516970200972, F1 Score: 0.7045784467894984, Time: 45.63 seconds
Epoch 6, Loss: 0.627579312432896, F1 Score: 0.7380810458174434, Time: 45.57 seconds
Epoch 7, Loss: 0.5254375433379953, F1 Score: 0.7792578849410519, Time: 45.54 seconds
Epoch 8, Loss: 0.4602521759542552, F1 Score: 0.8088584409305992, Time: 45.65 seconds
Epoch 9, Loss: 0.3953639580444856, F1 Score: 0.8153952219749705, Time: 45.60 seconds
Epoch 10, Loss: 0.34422232617031445, F1 Score: 0.844984736482053, Time: 45.73 seconds
Model saved to /kaggle/working/models/BERT_context_model.pth
Test - Accuracy: 0.7807308970099668, Precision: 0.7040986373407714, Recall: 0.78073089700

In [14]:
# QAModel fine-tuned with the 'sgd' optimizer on the PubMedQA split
trainer_sgd = Trainandtest(pubmedqa_train, pubmedqa_test)

for model in models:
    # NOTE(review): these two values are 1-tuples, not strings. The original
    # assignments ended in a trailing comma, which builds a tuple; that is
    # written out explicitly here so it is visible. Confirm what Trainandtest
    # expects before switching to plain strings.
    model_name = (model['model_name'],)
    source = (model['source'],)

    trainer_sgd.model_compile(
        QAModel, model_name, source,
        optimizer='sgd',
        batch_size=batch_size,
    )

    # Fit for the configured number of epochs
    trainer_sgd.training(model_name, epochs=epochs)

    # Evaluate, then store the metrics under the plain string model name
    test_result = trainer_sgd.val()
    trainer_sgd.results[model['model_name']] = test_result

Epoch 1, Loss: 1.221792687069286, F1 Score: 0.22296992996754597, Time: 44.96 seconds
Epoch 2, Loss: 1.255124119195071, F1 Score: 0.1691594650190066, Time: 44.98 seconds
Epoch 3, Loss: 1.2405251535502346, F1 Score: 0.22253443953694912, Time: 44.86 seconds
Epoch 4, Loss: 1.2258132371035488, F1 Score: 0.22372856142916417, Time: 44.92 seconds
Epoch 5, Loss: 1.2209528684616089, F1 Score: 0.2101625420980019, Time: 44.87 seconds
Epoch 6, Loss: 1.2200871922753074, F1 Score: 0.2269705679784656, Time: 44.85 seconds
Epoch 7, Loss: 1.2187405770475215, F1 Score: 0.21327475349985855, Time: 44.67 seconds
Epoch 8, Loss: 1.199682279066606, F1 Score: 0.26350643415625646, Time: 44.71 seconds
Epoch 9, Loss: 1.2073645212433555, F1 Score: 0.2698126507859709, Time: 44.61 seconds
Epoch 10, Loss: 1.1930437142198735, F1 Score: 0.2556963307887038, Time: 44.66 seconds
Model saved to /kaggle/working/models/BERT_context_model.pth
Test - Accuracy: 0.11960132890365449, Precision: 0.08522115596096635, Recall: 0.119601

## 4.2. Testing with different classification layers

#### (1) AdamW + BiLSTM

In [15]:
# BiLSTMmodel fine-tuned with the 'adamw' optimizer on the PubMedQA split
trainer_adamw_bilstm = Trainandtest(pubmedqa_train, pubmedqa_test)

for model in models:
    # NOTE(review): these two values are 1-tuples, not strings. The original
    # assignments ended in a trailing comma, which builds a tuple; that is
    # written out explicitly here so it is visible. Confirm what Trainandtest
    # expects before switching to plain strings.
    model_name = (model['model_name'],)
    source = (model['source'],)

    trainer_adamw_bilstm.model_compile(
        BiLSTMmodel, model_name, source,
        optimizer='adamw',
        batch_size=batch_size,
    )

    # Fit for the configured number of epochs
    trainer_adamw_bilstm.training(model_name, epochs=epochs)

    # Evaluate, then store the metrics under the plain string model name
    test_result = trainer_adamw_bilstm.val()
    trainer_adamw_bilstm.results[model['model_name']] = test_result

Epoch 1, Loss: 1.0219351188703016, F1 Score: 0.4117609143360215, Time: 45.96 seconds
Epoch 2, Loss: 0.9495659524744208, F1 Score: 0.4058769567304948, Time: 46.21 seconds
Epoch 3, Loss: 0.9332725920460441, F1 Score: 0.4028005363767064, Time: 46.59 seconds
Epoch 4, Loss: 0.8935398134318265, F1 Score: 0.4929180265167173, Time: 46.66 seconds
Epoch 5, Loss: 0.777571521022103, F1 Score: 0.6820254997000541, Time: 46.68 seconds
Epoch 6, Loss: 0.6785551336678591, F1 Score: 0.7420369018717505, Time: 46.70 seconds
Epoch 7, Loss: 0.5888508775017478, F1 Score: 0.7771363451701402, Time: 46.43 seconds
Epoch 8, Loss: 0.540947446768934, F1 Score: 0.7924884911961464, Time: 46.35 seconds
Epoch 9, Loss: 0.4710500694134019, F1 Score: 0.8097200887215473, Time: 46.44 seconds
Epoch 10, Loss: 0.4367422542788766, F1 Score: 0.8321551676466329, Time: 46.41 seconds
Model saved to /kaggle/working/models/BERT_context_model.pth
Test - Accuracy: 0.7541528239202658, Precision: 0.67470572110473, Recall: 0.75415282392026

#### (2) Adam + BiLSTM

In [16]:
# BiLSTMmodel fine-tuned with the 'adam' optimizer on the PubMedQA split
trainer_adam_bilstm = Trainandtest(pubmedqa_train, pubmedqa_test)

for model in models:
    # NOTE(review): these two values are 1-tuples, not strings. The original
    # assignments ended in a trailing comma, which builds a tuple; that is
    # written out explicitly here so it is visible. Confirm what Trainandtest
    # expects before switching to plain strings.
    model_name = (model['model_name'],)
    source = (model['source'],)

    trainer_adam_bilstm.model_compile(
        BiLSTMmodel, model_name, source,
        optimizer='adam',
        batch_size=batch_size,
    )

    # Fit for the configured number of epochs
    trainer_adam_bilstm.training(model_name, epochs=epochs)

    # Evaluate, then store the metrics under the plain string model name
    test_result = trainer_adam_bilstm.val()
    trainer_adam_bilstm.results[model['model_name']] = test_result

Epoch 1, Loss: 1.0219350646842609, F1 Score: 0.4117609143360215, Time: 45.50 seconds
Epoch 2, Loss: 0.9495645544745706, F1 Score: 0.4058769567304948, Time: 46.29 seconds
Epoch 3, Loss: 0.9332668727094476, F1 Score: 0.4028005363767064, Time: 46.43 seconds
Epoch 4, Loss: 0.8934180357239463, F1 Score: 0.4929180265167173, Time: 46.44 seconds
Epoch 5, Loss: 0.7774821709502827, F1 Score: 0.6776520877347739, Time: 46.31 seconds
Epoch 6, Loss: 0.6784960722381418, F1 Score: 0.7393910702559864, Time: 46.09 seconds
Epoch 7, Loss: 0.5891391702673652, F1 Score: 0.7771541881384029, Time: 45.89 seconds
Epoch 8, Loss: 0.5423751094124534, F1 Score: 0.7923769082576095, Time: 45.95 seconds
Epoch 9, Loss: 0.47593092850663443, F1 Score: 0.8083524837453331, Time: 45.93 seconds
Epoch 10, Loss: 0.4430389146913182, F1 Score: 0.8293558360497492, Time: 45.97 seconds
Model saved to /kaggle/working/models/BERT_context_model.pth
Test - Accuracy: 0.7508305647840532, Precision: 0.6720665395799846, Recall: 0.750830564

## 4.3. Validating results with the BioASQ dataset

In [17]:
# Model-name suffix for the BioASQ run: only the dataset tag is set
context = ''
opt = ''
data = '_bioasq'
version = ''.join((context, opt, data))

# Same five pretrained encoders as before, renamed with the BioASQ suffix.
# This rebinds the module-level `models` list used by the training loops.
models = [
    {'model_name': label + version, 'source': checkpoint}
    for label, checkpoint in (
        ('BERT', 'bert-base-uncased'),
        ('ColBERT', 'colbert-ir/colbertv2.0'),
        ('LinkBERT', 'michiyasunaga/LinkBERT-base'),
        ('BiomedNLP', 'microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract'),
        ('BioLinkBERT', 'michiyasunaga/BioLinkBERT-base'),
    )
]

In [18]:
# QAModel fine-tuned on the BioASQ train/test partitions
trainer_bioasq = Trainandtest(bioasq_train, bioasq_test)

for model in models:
    # NOTE(review): these two values are 1-tuples, not strings. The original
    # assignments ended in a trailing comma, which builds a tuple; that is
    # written out explicitly here so it is visible. Confirm what Trainandtest
    # expects before switching to plain strings.
    model_name = (model['model_name'],)
    source = (model['source'],)

    # No `optimizer` argument is given, so model_compile's default is used.
    # NOTE(review): the results cell labels this run "Adam" -- confirm that
    # this matches the default.
    trainer_bioasq.model_compile(
        QAModel, model_name, source,
        batch_size=batch_size,
    )

    # Fit for the configured number of epochs
    trainer_bioasq.training(model_name, epochs=epochs)

    # Evaluate, then store the metrics under the plain string model name
    test_result = trainer_bioasq.val()
    trainer_bioasq.results[model['model_name']] = test_result

Epoch 1, Loss: 1.0118717009370977, F1 Score: 0.4648839473470908, Time: 45.25 seconds
Epoch 2, Loss: 0.9644373736598275, F1 Score: 0.4286281058455812, Time: 45.05 seconds
Epoch 3, Loss: 0.9333602352575823, F1 Score: 0.45484602561846105, Time: 45.43 seconds
Epoch 4, Loss: 0.8485702682625164, F1 Score: 0.5765534451496245, Time: 45.71 seconds
Epoch 5, Loss: 0.7222238806161013, F1 Score: 0.7045784467894984, Time: 45.77 seconds
Epoch 6, Loss: 0.6275632300160148, F1 Score: 0.7380810458174434, Time: 45.79 seconds
Epoch 7, Loss: 0.5257374251430685, F1 Score: 0.7792578849410519, Time: 45.75 seconds
Epoch 8, Loss: 0.46052905375307257, F1 Score: 0.8088453176664636, Time: 45.77 seconds
Epoch 9, Loss: 0.3961173438213088, F1 Score: 0.8159413264882908, Time: 45.66 seconds
Epoch 10, Loss: 0.345219304615801, F1 Score: 0.8527522337126, Time: 45.63 seconds
Model saved to /kaggle/working/models/BERT_bioasq_model.pth
Test - Accuracy: 0.7774086378737541, Precision: 0.7009950564939693, Recall: 0.7774086378737

## 4.4. Results

In [19]:
# Convert each trainer's collected metrics with the project helper
# `result_convert`, in the same order as the experiments were run
(
    result_adam,
    result_adamw,
    result_sgd,
    result_adam_bilstm,
    result_adamw_bilstm,
) = (
    result_convert(trainer.results)
    for trainer in (
        trainer_adam,
        trainer_adamw,
        trainer_sgd,
        trainer_adam_bilstm,
        trainer_adamw_bilstm,
    )
)

# Disabled: SGD+BiLSTM conversion and per-configuration printouts
# (no `trainer_sgd_bilstm` is created in the cells shown above)
#result_sgd_bilstm= result_convert(trainer_sgd_bilstm.results)
#print('Adam optimiser+Linear layer\n',result_adam[['Model','Accuracy','F1 Score']])
#print('AdamW optimiser+Linear layer\n', result_adamw[['Model','Accuracy','F1 Score']])
#print('SGD optimiser+Linear layer\n', result_sgd[['Model','Accuracy','F1 Score']])
#print('Adam optimiser+BiLSTM\n',result_adam_bilstm[['Model','Accuracy','F1 Score']])
#print('AdamW optimiser+BiLSTM\n', result_adamw_bilstm[['Model','Accuracy','F1 Score']])
#print('SGD optimiser+BiLSTM\n', result_sgd_bilstm[['Model','Accuracy','F1 Score']])


In [20]:
# Pull the accuracy and F1 columns for each optimizer and prefix the column
# labels with the optimizer name so they stay distinct after concatenation
adam_metrics = result_adam[['Accuracy', 'F1 Score']].add_prefix('Adam ')
adamw_metrics = result_adamw[['Accuracy', 'F1 Score']].add_prefix('AdamW ')
sgd_metrics = result_sgd[['Accuracy', 'F1 Score']].add_prefix('SGD ')

# Place the model names and the three metric blocks side by side
# (rows are aligned on the frames' index)
combined_metrics = pd.concat(
    [result_adam['Model'], adam_metrics, adamw_metrics, sgd_metrics],
    axis=1,
)

# Show the comparison table
print('Linear Layer Results\n', combined_metrics)

Linear Layer Results
                  Model  Adam Accuracy  Adam F1 Score  AdamW Accuracy  \
0         BERT_context       0.774086       0.734250        0.780731   
1      ColBERT_context       0.757475       0.716696        0.754153   
2     LinkBERT_context       0.740864       0.697958        0.740864   
3    BiomedNLP_context       0.823920       0.782909        0.823920   
4  BioLinkBERT_context       0.823920       0.778321        0.827243   

   AdamW F1 Score  SGD Accuracy  SGD F1 Score  
0        0.740356      0.119601      0.040140  
1        0.713691      0.328904      0.357285  
2        0.701125      0.338870      0.171537  
3        0.782909      0.392027      0.410031  
4        0.779046      0.338870      0.179900  


In [21]:
# F1 score per classifier head and optimizer, with column labels prefixed so
# each column name states which configuration it belongs to
adam_linear = result_adam[['F1 Score']].add_prefix('Adam Linear ')
adam_bilstm = result_adam_bilstm[['F1 Score']].add_prefix('Adam BiLSTM ')

adamw_linear = result_adamw[['F1 Score']].add_prefix('AdamW Linear ')
adamw_bilstm = result_adamw_bilstm[['F1 Score']].add_prefix('AdamW BiLSTM ')

# Place the model names and the four F1 columns side by side
# (rows are aligned on the frames' index)
combined_metrics = pd.concat(
    [result_adam['Model'], adam_linear, adam_bilstm, adamw_linear, adamw_bilstm],
    axis=1,
)

# Show the comparison table
print('Comparison of Linear Layer vs BiLSTM across different optimizers\n', combined_metrics)

Comparison of Linear Layer vs BiLSTM across different optimizers
                  Model  Adam Linear F1 Score  Adam BiLSTM F1 Score  \
0         BERT_context              0.734250              0.709082   
1      ColBERT_context              0.716696              0.711283   
2     LinkBERT_context              0.697958              0.720583   
3    BiomedNLP_context              0.782909              0.768386   
4  BioLinkBERT_context              0.778321              0.765895   

   AdamW Linear F1 Score  AdamW BiLSTM F1 Score  
0               0.740356               0.712103  
1               0.713691               0.711283  
2               0.701125               0.727771  
3               0.782909               0.768386  
4               0.779046               0.780626  


In [22]:
# Convert the BioASQ trainer's metrics and show the headline columns
result_linear_bioasq = result_convert(trainer_bioasq.results)
print(
    'Results using BioASQ dataset with Adam optimiser+Linear layer\n',
    result_linear_bioasq.loc[:, ['Model', 'Accuracy', 'F1 Score']],
)

Results using BioASQ dataset with Adam optimiser+Linear layer
                 Model  Accuracy  F1 Score
0         BERT_bioasq  0.777409  0.737084
1      ColBERT_bioasq  0.757475  0.716696
2     LinkBERT_bioasq  0.744186  0.701028
3    BiomedNLP_bioasq  0.823920  0.782909
4  BioLinkBERT_bioasq  0.837209  0.788880
