# 1. Importing 

## 1.1. Importing libraries

In [1]:
import shutil
import os

# Remove the directory if already exist 
dir_name = 'neural_medical_qa'
if os.path.exists(dir_name):
    shutil.rmtree(dir_name)

#clone the repo from github
!git clone https://github.com/trduc97/neural_medical_qa.git
%cd neural_medical_qa
# install the requirement
!pip install -r requirements.txt

Cloning into 'neural_medical_qa'...
remote: Enumerating objects: 179, done.[K
remote: Counting objects: 100% (39/39), done.[K
remote: Compressing objects: 100% (39/39), done.[K
remote: Total 179 (delta 19), reused 0 (delta 0), pack-reused 140 (from 1)[K
Receiving objects: 100% (179/179), 1.89 MiB | 28.95 MiB/s, done.
Resolving deltas: 100% (89/89), done.
/kaggle/working/neural_medical_qa


In [2]:
from classifiers import QAModel, BiLSTMmodel
from train_and_test import Trainandtest
from processing import load_bioasq_pubmedqa, pubmed_train_test_split,result_convert 
from datasets import Dataset, DatasetDict
import pandas as pd
import warnings
# Suppress all warnings from here on; the filter is global, so it applies to every later cell.
warnings.filterwarnings("ignore")

## 1.2. Importing data 

### 1.2.1. Importing PubMedQA and BioASQ

In [3]:
# Load both corpora with the project helper's default arguments; it returns a (bioasq, pubmedqa) pair.
bioasq, pubmedqa = load_bioasq_pubmedqa()

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

### 1.2.2. Importing PubMedQA artificial

In [4]:
# Call the same loader again, pointing it at the artificial PubMedQA parquet file.
# Only the second return value is kept; the first one is discarded.
# NOTE(review): absolute Kaggle input path - this cell only works where that dataset is mounted.
_, pubmedqa_artificial = load_bioasq_pubmedqa(pubmed_kaggle_path='/kaggle/input/pubmed-qa/pubmed_qa_pga_artificial.parquet')

Map:   0%|          | 0/211269 [00:00<?, ? examples/s]

Map:   0%|          | 0/211269 [00:00<?, ? examples/s]

## 1.3. Preprocessing

### 1.3.1. Concatenating the full context for PubMedQA

In [5]:
# Expand the per-question 'context' records into columns, then flatten each
# list of passages under 'contexts' into one space-separated string.
pubmed_text = pd.DataFrame(pubmedqa['train']['context'])
pubmed_text['full_context'] = [' '.join(passages) for passages in pubmed_text['contexts']]

# Materialise the train split as a DataFrame and attach the joined text to it.
pubmedqa_train_df = pd.DataFrame(pubmedqa['train'])
pubmedqa_train_df['full_context'] = pubmed_text['full_context']

# Go back to a Dataset and re-wrap it as a DatasetDict holding only 'train'.
pubmedqa_context = Dataset.from_pandas(pubmedqa_train_df)
pubmedqa = DatasetDict({'train': pubmedqa_context})

### 1.3.2. Concatenating the full context for PubMedQA artificial data

In [6]:
# Expand the per-question 'context' records of the artificial split into columns,
# then flatten each list of passages under 'contexts' into one space-separated string.
pubmed_text = pd.DataFrame(pubmedqa_artificial['train']['context'])
pubmed_text['full_context'] = [' '.join(passages) for passages in pubmed_text['contexts']]

# Materialise the artificial train split as a DataFrame and attach the joined text.
pubmedqa_train_df = pd.DataFrame(pubmedqa_artificial['train'])
pubmedqa_train_df['full_context'] = pubmed_text['full_context']

# Go back to a Dataset and re-wrap it as a DatasetDict holding only 'train'.
pubmedqa_arti_context = Dataset.from_pandas(pubmedqa_train_df)
pubmedqa_artificial = DatasetDict({'train': pubmedqa_arti_context})

# 2. EDA

In [7]:
# Peek at the first rows of the PubMedQA train split.
print(pubmedqa['train'].to_pandas().head())

# Tally how often each answer label occurs in 'final_decision'.
responses = pubmedqa['train']['final_decision']
yes_count, no_count, maybe_count = (
    responses.count(label) for label in ('yes', 'no', 'maybe')
)

# Report the tallies.
print(f"Yes: {yes_count}")
print(f"No: {no_count}")
print(f"Maybe: {maybe_count}")

      pubid                                           question  \
0  21645374  Do mitochondria play a role in remodelling lac...   
1  16418930  Landolt C and snellen e acuity: differences in...   
2   9488747  Syncope during bathing in infants, a pediatric...   
3  17208539  Are the long-term results of the transanal pul...   
4  10808977  Can tailored interventions increase mammograph...   

                                             context  \
0  {'contexts': ['Programmed cell death (PCD) is ...   
1  {'contexts': ['Assessment of visual acuity dep...   
2  {'contexts': ['Apparent life-threatening event...   
3  {'contexts': ['The transanal endorectal pull-t...   
4  {'contexts': ['Telephone counseling and tailor...   

                                         long_answer final_decision  \
0  Results depicted mitochondrial dynamics in viv...            yes   
1  Using the charts described, there was only a s...             no   
2  "Aquagenic maladies" could be a pediatric form... 

# 3. Splitting and mixing data for training

## 3.1. Splitting PubMedQA and BioASQ

In [8]:
# Split each corpus into a train part and a test part with the project helper (default arguments).
# NOTE(review): split ratio and seed are decided inside pubmed_train_test_split - confirm there.
pubmedqa_train, pubmedqa_test = pubmed_train_test_split(pubmedqa)
bioasq_train, bioasq_test = pubmed_train_test_split(bioasq)

## 3.2. Mixing Artificial data with PubMedQA labeled training data

In [9]:
# Pull the artificial split into pandas so it can be filtered and sampled.
df_artificial = pd.DataFrame(pubmedqa_artificial['train'])

# One frame per sampled class; only rows encoded as 0 or 2 are drawn from.
df_class_0 = df_artificial.loc[df_artificial['decision_encoded'].eq(0)]
df_class_2 = df_artificial.loc[df_artificial['decision_encoded'].eq(2)]

# 700 artificial rows in total, split evenly between the two classes.
samples_per_class = 700 // 2

# Draw the same number of rows from each class with a fixed seed.
sampled_class_0, sampled_class_2 = (
    class_frame.sample(n=samples_per_class, random_state=42)
    for class_frame in (df_class_0, df_class_2)
)

# Stack the two samples, then shuffle them so the classes are interleaved.
sampled_artificial = pd.concat([sampled_class_0, sampled_class_2])
shuffled_sampled_artificial = (
    sampled_artificial
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Dataset view of the sampled artificial rows.
sampled_pubmedqa_artificial = Dataset.from_pandas(shuffled_sampled_artificial)

# Labeled PubMedQA training rows as a DataFrame.
df_train = pd.DataFrame(pubmedqa_train)

# Append the labeled rows to the artificial sample and shuffle the result again.
combined_df = pd.concat([shuffled_sampled_artificial, df_train], ignore_index=True)
shuffled_combined_df = (
    combined_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Final mixed training set as a Dataset.
pubmedqa_arti_context_mixed = Dataset.from_pandas(shuffled_combined_df)

In [10]:
# Label balance of the mixed (artificial + labeled) training set.
responses = pubmedqa_arti_context_mixed['final_decision']
yes_count, no_count, maybe_count = (
    responses.count(label) for label in ('yes', 'no', 'maybe')
)

# Report the tallies.
print(f"Yes: {yes_count}")
print(f"No: {no_count}")
print(f"Maybe: {maybe_count}")

Yes: 736
No: 586
Maybe: 77


# 4. Training without context (reason-free setting)

In [11]:
# Hyper-parameters for the runs in this section.
learning_rate = 1e-5
batch_size = 32
epochs = 10

# Tags concatenated into the suffix that is appended to every model name.
context = '_withanswer'
opt = ''
data = ''
version = ''.join((context, opt, data))

# One entry per encoder under comparison: display name + pretrained checkpoint id.
# GPT ('gpt2') is currently disabled.
models = [
    {'model_name': label + version, 'source': checkpoint}
    for label, checkpoint in (
        ('BERT', 'bert-base-uncased'),
        ('ColBERT', 'colbert-ir/colbertv2.0'),
        ('LinkBERT', 'michiyasunaga/LinkBERT-base'),
        ('BiomedNLP', 'microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract'),
        ('BioLinkBERT', 'michiyasunaga/BioLinkBERT-base'),
    )
]

## 4.1. Testing different optimization algorithms

In [12]:
# QAModel runs with the 'adam' optimizer, one per entry in `models`.
trainer_adam = Trainandtest(pubmedqa_train, pubmedqa_test)

for model in models:
    # NOTE(review): both values are 1-tuples, not plain strings. The recorded
    # runs were produced with this exact form, so confirm what
    # Trainandtest.model_compile / .training expect before simplifying.
    model_name = (model['model_name'],)
    source = (model['source'],)

    trainer_adam.model_compile(
        QAModel, model_name, source,
        optimizer='adam',
        batch_size=batch_size,
    )

    # Train for the configured number of epochs.
    trainer_adam.training(model_name, epochs=epochs)

    # Evaluate and store the metrics under the plain string name.
    test_result = trainer_adam.val()
    trainer_adam.results[model['model_name']] = test_result

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1, Loss: 1.0118717497045344, F1 Score: 0.4648839473470908, Time: 44.92 seconds
Epoch 2, Loss: 0.9644374657760967, F1 Score: 0.4286281058455812, Time: 44.37 seconds
Epoch 3, Loss: 0.9333867078477686, F1 Score: 0.45484602561846105, Time: 44.19 seconds
Epoch 4, Loss: 0.8485437279397791, F1 Score: 0.5783680046800348, Time: 44.25 seconds
Epoch 5, Loss: 0.7225005437027324, F1 Score: 0.7016650225157844, Time: 44.17 seconds
Epoch 6, Loss: 0.6277269273996353, F1 Score: 0.7393910702559864, Time: 44.19 seconds
Epoch 7, Loss: 0.5256873545321551, F1 Score: 0.7792578849410519, Time: 44.25 seconds
Epoch 8, Loss: 0.46014320714907214, F1 Score: 0.8088584409305992, Time: 44.24 seconds
Epoch 9, Loss: 0.39589236067100003, F1 Score: 0.8175576099920115, Time: 44.26 seconds
Epoch 10, Loss: 0.3432431044903668, F1 Score: 0.8527688536903626, Time: 44.27 seconds
Model saved to /kaggle/working/models/BERT_withanswer_model.pth
Test - Accuracy: 0.7740863787375415, Precision: 0.7006195564335099, Recall: 0.7740

tokenizer_config.json:   0%|          | 0.00/405 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Epoch 1, Loss: 1.0029005787589333, F1 Score: 0.46033315123091617, Time: 45.68 seconds
Epoch 2, Loss: 0.9388826814564791, F1 Score: 0.4438576087717719, Time: 45.59 seconds
Epoch 3, Loss: 0.9214420399882577, F1 Score: 0.47615066462278754, Time: 45.58 seconds
Epoch 4, Loss: 0.8709744702685963, F1 Score: 0.5632172847411403, Time: 45.62 seconds
Epoch 5, Loss: 0.769896691495722, F1 Score: 0.6704805080099371, Time: 45.68 seconds
Epoch 6, Loss: 0.641755691983483, F1 Score: 0.7304054241836379, Time: 45.67 seconds
Epoch 7, Loss: 0.5527756715362723, F1 Score: 0.7642530072084002, Time: 45.72 seconds
Epoch 8, Loss: 0.4556309323419224, F1 Score: 0.8048308740134325, Time: 45.67 seconds
Epoch 9, Loss: 0.3630063323812051, F1 Score: 0.8207972198030586, Time: 45.58 seconds
Epoch 10, Loss: 0.32694079726934433, F1 Score: 0.8555396171727356, Time: 45.51 seconds
Model saved to /kaggle/working/models/ColBERT_withanswer_model.pth
Test - Accuracy: 0.7541528239202658, Precision: 0.6775262342803163, Recall: 0.754

tokenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/559 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Epoch 1, Loss: 1.0303954834287816, F1 Score: 0.4312315340187528, Time: 48.11 seconds
Epoch 2, Loss: 0.9586172564463182, F1 Score: 0.4261994535543803, Time: 48.20 seconds
Epoch 3, Loss: 0.9434717134995894, F1 Score: 0.45375683931963456, Time: 48.37 seconds
Epoch 4, Loss: 0.9405762986703352, F1 Score: 0.4567115645441826, Time: 48.17 seconds
Epoch 5, Loss: 0.9151423356749795, F1 Score: 0.48909817573977465, Time: 48.86 seconds
Epoch 6, Loss: 0.8605260632254861, F1 Score: 0.5587174178290809, Time: 48.47 seconds
Epoch 7, Loss: 0.711298560554331, F1 Score: 0.6886760785795436, Time: 48.32 seconds
Epoch 8, Loss: 0.6032899008555845, F1 Score: 0.7516483562169748, Time: 48.17 seconds
Epoch 9, Loss: 0.5293190682476218, F1 Score: 0.7813844518646158, Time: 48.15 seconds
Epoch 10, Loss: 0.4634697857228192, F1 Score: 0.8095676772076436, Time: 48.36 seconds
Model saved to /kaggle/working/models/LinkBERT_withanswer_model.pth
Test - Accuracy: 0.7441860465116279, Precision: 0.6647601750061576, Recall: 0.74

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1, Loss: 1.1563442051410675, F1 Score: 0.4186302559678643, Time: 44.30 seconds
Epoch 2, Loss: 1.0037611289457842, F1 Score: 0.47428918140253346, Time: 44.35 seconds
Epoch 3, Loss: 0.9483869834379717, F1 Score: 0.5254986503646498, Time: 44.28 seconds
Epoch 4, Loss: 0.8572289618578824, F1 Score: 0.5844199081285901, Time: 44.37 seconds
Epoch 5, Loss: 0.6812061775814403, F1 Score: 0.7200893933345499, Time: 44.19 seconds
Epoch 6, Loss: 0.5594368861480192, F1 Score: 0.756930076832864, Time: 44.42 seconds
Epoch 7, Loss: 0.5176475427367471, F1 Score: 0.7770859650453299, Time: 44.45 seconds
Epoch 8, Loss: 0.4375656579028476, F1 Score: 0.826910402590777, Time: 44.40 seconds
Epoch 9, Loss: 0.388799672099677, F1 Score: 0.8303214489243775, Time: 44.33 seconds
Epoch 10, Loss: 0.3110903643748977, F1 Score: 0.8780570234173817, Time: 44.27 seconds
Model saved to /kaggle/working/models/BiomedNLP_withanswer_model.pth
Test - Accuracy: 0.8239202657807309, Precision: 0.7742123289871929, Recall: 0.8239

tokenizer_config.json:   0%|          | 0.00/379 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/447k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/559 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Epoch 1, Loss: 1.0121474157680164, F1 Score: 0.44848177041515963, Time: 43.59 seconds
Epoch 2, Loss: 0.9394565333019603, F1 Score: 0.4369383162245147, Time: 43.43 seconds
Epoch 3, Loss: 0.9526875994422219, F1 Score: 0.45493520697916645, Time: 43.63 seconds
Epoch 4, Loss: 0.9163165796886791, F1 Score: 0.4784142755682132, Time: 43.74 seconds
Epoch 5, Loss: 0.827758092771877, F1 Score: 0.6145054661366902, Time: 43.67 seconds
Epoch 6, Loss: 0.6766355823386799, F1 Score: 0.7271895552521556, Time: 43.71 seconds
Epoch 7, Loss: 0.5693293687972155, F1 Score: 0.7597149794218567, Time: 43.77 seconds
Epoch 8, Loss: 0.5146864381703463, F1 Score: 0.7827510495294295, Time: 43.67 seconds
Epoch 9, Loss: 0.4357124756683003, F1 Score: 0.8184230103686756, Time: 43.65 seconds
Epoch 10, Loss: 0.3853335495699536, F1 Score: 0.8284205930211486, Time: 43.62 seconds
Model saved to /kaggle/working/models/BioLinkBERT_withanswer_model.pth
Test - Accuracy: 0.8172757475083057, Precision: 0.7299178310400664, Recall: 0

In [13]:
# QAModel runs with the 'adamw' optimizer, one per entry in `models`.
trainer_adamw = Trainandtest(pubmedqa_train, pubmedqa_test)

for model in models:
    # NOTE(review): both values are 1-tuples, not plain strings. The recorded
    # runs were produced with this exact form, so confirm what
    # Trainandtest.model_compile / .training expect before simplifying.
    model_name = (model['model_name'],)
    source = (model['source'],)

    trainer_adamw.model_compile(
        QAModel, model_name, source,
        optimizer='adamw',
        batch_size=batch_size,
    )

    # Train for the configured number of epochs.
    trainer_adamw.training(model_name, epochs=epochs)

    # Evaluate and store the metrics under the plain string name.
    test_result = trainer_adamw.val()
    trainer_adamw.results[model['model_name']] = test_result

Epoch 1, Loss: 1.011871793053367, F1 Score: 0.4648839473470908, Time: 44.59 seconds
Epoch 2, Loss: 0.9644355611367659, F1 Score: 0.4286281058455812, Time: 44.48 seconds
Epoch 3, Loss: 0.9333684823729775, F1 Score: 0.45484602561846105, Time: 44.24 seconds
Epoch 4, Loss: 0.8485561446710066, F1 Score: 0.5765534451496245, Time: 44.27 seconds
Epoch 5, Loss: 0.7223129136995836, F1 Score: 0.7031315923781901, Time: 44.29 seconds
Epoch 6, Loss: 0.6275780322876844, F1 Score: 0.7380810458174434, Time: 44.40 seconds
Epoch 7, Loss: 0.5251349996436726, F1 Score: 0.7792578849410519, Time: 44.29 seconds
Epoch 8, Loss: 0.4601030586795373, F1 Score: 0.8088584409305992, Time: 44.33 seconds
Epoch 9, Loss: 0.39594291082837363, F1 Score: 0.816762609679229, Time: 44.25 seconds
Epoch 10, Loss: 0.3437985743988644, F1 Score: 0.8499077000123562, Time: 44.21 seconds
Model saved to /kaggle/working/models/BERT_withanswer_model.pth
Test - Accuracy: 0.7774086378737541, Precision: 0.7035278300094504, Recall: 0.7774086

In [14]:
# QAModel runs with the 'sgd' optimizer, one per entry in `models`.
trainer_sgd = Trainandtest(pubmedqa_train, pubmedqa_test)

for model in models:
    # NOTE(review): both values are 1-tuples, not plain strings. The recorded
    # runs were produced with this exact form, so confirm what
    # Trainandtest.model_compile / .training expect before simplifying.
    model_name = (model['model_name'],)
    source = (model['source'],)

    trainer_sgd.model_compile(
        QAModel, model_name, source,
        optimizer='sgd',
        batch_size=batch_size,
    )

    # Train for the configured number of epochs.
    trainer_sgd.training(model_name, epochs=epochs)

    # Evaluate and store the metrics under the plain string name.
    test_result = trainer_sgd.val()
    trainer_sgd.results[model['model_name']] = test_result

Epoch 1, Loss: 1.2217926653948696, F1 Score: 0.22296992996754597, Time: 44.12 seconds
Epoch 2, Loss: 1.2551240866834468, F1 Score: 0.1691594650190066, Time: 43.97 seconds
Epoch 3, Loss: 1.2405251427130266, F1 Score: 0.22253443953694912, Time: 43.86 seconds
Epoch 4, Loss: 1.225813258777965, F1 Score: 0.22372856142916417, Time: 43.82 seconds
Epoch 5, Loss: 1.220952873880213, F1 Score: 0.2101625420980019, Time: 43.92 seconds
Epoch 6, Loss: 1.2200871976939114, F1 Score: 0.2269705679784656, Time: 43.83 seconds
Epoch 7, Loss: 1.218740533698689, F1 Score: 0.21327475349985855, Time: 43.82 seconds
Epoch 8, Loss: 1.1996823278340427, F1 Score: 0.26350643415625646, Time: 43.83 seconds
Epoch 9, Loss: 1.2073645374991677, F1 Score: 0.2698126507859709, Time: 43.76 seconds
Epoch 10, Loss: 1.1930437196384778, F1 Score: 0.2556963307887038, Time: 43.90 seconds
Model saved to /kaggle/working/models/BERT_withanswer_model.pth
Test - Accuracy: 0.11960132890365449, Precision: 0.08522115596096635, Recall: 0.119

## 4.2. Testing different classification layers

#### (1) AdamW + BiLSTM

In [15]:
# BiLSTMmodel runs with the 'adamw' optimizer, one per entry in `models`.
trainer_adamw_bilstm = Trainandtest(pubmedqa_train, pubmedqa_test)

for model in models:
    # NOTE(review): both values are 1-tuples, not plain strings. The recorded
    # runs were produced with this exact form, so confirm what
    # Trainandtest.model_compile / .training expect before simplifying.
    model_name = (model['model_name'],)
    source = (model['source'],)

    trainer_adamw_bilstm.model_compile(
        BiLSTMmodel, model_name, source,
        optimizer='adamw',
        batch_size=batch_size,
    )

    # Train for the configured number of epochs.
    trainer_adamw_bilstm.training(model_name, epochs=epochs)

    # Evaluate and store the metrics under the plain string name.
    test_result = trainer_adamw_bilstm.val()
    trainer_adamw_bilstm.results[model['model_name']] = test_result

Epoch 1, Loss: 1.0219351107423955, F1 Score: 0.4117609143360215, Time: 44.98 seconds
Epoch 2, Loss: 0.949566207148812, F1 Score: 0.4058769567304948, Time: 45.28 seconds
Epoch 3, Loss: 0.9332737597552213, F1 Score: 0.4028005363767064, Time: 45.66 seconds
Epoch 4, Loss: 0.8936033872040835, F1 Score: 0.4929180265167173, Time: 45.69 seconds
Epoch 5, Loss: 0.7777209769595753, F1 Score: 0.6820254997000541, Time: 45.51 seconds
Epoch 6, Loss: 0.6787005988034335, F1 Score: 0.7420121463294274, Time: 45.55 seconds
Epoch 7, Loss: 0.5893084379759702, F1 Score: 0.7771363451701402, Time: 45.54 seconds
Epoch 8, Loss: 0.5399105914614417, F1 Score: 0.795119561913061, Time: 45.42 seconds
Epoch 9, Loss: 0.47127320075576956, F1 Score: 0.8097200887215473, Time: 45.28 seconds
Epoch 10, Loss: 0.43895324116403406, F1 Score: 0.8340482727943974, Time: 45.27 seconds
Model saved to /kaggle/working/models/BERT_withanswer_model.pth
Test - Accuracy: 0.7541528239202658, Precision: 0.6765232655171787, Recall: 0.7541528

#### (2) Adam + BiLSTM

In [16]:
# BiLSTMmodel runs with the 'adam' optimizer, one per entry in `models`.
trainer_adam_bilstm = Trainandtest(pubmedqa_train, pubmedqa_test)

for model in models:
    # NOTE(review): both values are 1-tuples, not plain strings. The recorded
    # runs were produced with this exact form, so confirm what
    # Trainandtest.model_compile / .training expect before simplifying.
    model_name = (model['model_name'],)
    source = (model['source'],)

    trainer_adam_bilstm.model_compile(
        BiLSTMmodel, model_name, source,
        optimizer='adam',
        batch_size=batch_size,
    )

    # Train for the configured number of epochs.
    trainer_adam_bilstm.training(model_name, epochs=epochs)

    # Evaluate and store the metrics under the plain string name.
    test_result = trainer_adam_bilstm.val()
    trainer_adam_bilstm.results[model['model_name']] = test_result

Epoch 1, Loss: 1.0219350782307712, F1 Score: 0.4117609143360215, Time: 44.99 seconds
Epoch 2, Loss: 0.9495646926489744, F1 Score: 0.4058769567304948, Time: 45.60 seconds
Epoch 3, Loss: 0.9332670786164023, F1 Score: 0.4028005363767064, Time: 45.82 seconds
Epoch 4, Loss: 0.8933462229642001, F1 Score: 0.4952914425552941, Time: 45.82 seconds
Epoch 5, Loss: 0.7774139236320149, F1 Score: 0.6776520877347739, Time: 46.09 seconds
Epoch 6, Loss: 0.678723546591672, F1 Score: 0.7407281585211316, Time: 45.96 seconds
Epoch 7, Loss: 0.589258002963933, F1 Score: 0.7744339701405404, Time: 45.96 seconds
Epoch 8, Loss: 0.5420386344194412, F1 Score: 0.7950646097654244, Time: 45.83 seconds
Epoch 9, Loss: 0.4762162735516375, F1 Score: 0.8083524837453331, Time: 45.98 seconds
Epoch 10, Loss: 0.44395725564523175, F1 Score: 0.8307175792940045, Time: 45.71 seconds
Model saved to /kaggle/working/models/BERT_withanswer_model.pth
Test - Accuracy: 0.7574750830564784, Precision: 0.6773721893813391, Recall: 0.75747508

## 4.3. Validating results with the BioASQ dataset

In [17]:
# Tags concatenated into the suffix that is appended to every model name.
context = ''
opt = ''
data = '_bioasq'
version = ''.join((context, opt, data))

# One entry per encoder under comparison: display name + pretrained checkpoint id.
models = [
    {'model_name': label + version, 'source': checkpoint}
    for label, checkpoint in (
        ('BERT', 'bert-base-uncased'),
        ('ColBERT', 'colbert-ir/colbertv2.0'),
        ('LinkBERT', 'michiyasunaga/LinkBERT-base'),
        ('BiomedNLP', 'microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract'),
        ('BioLinkBERT', 'michiyasunaga/BioLinkBERT-base'),
    )
]

In [18]:
# QAModel runs on the BioASQ split, one per entry in `models`.
# No optimizer is passed, so model_compile falls back to its own default.
# NOTE(review): the recorded epoch-1 loss/F1 and test accuracy of this run match
# the PubMedQA BERT runs almost digit for digit - verify that the BioASQ data is
# really what gets trained on (loader and split helper).
trainer_bioasq = Trainandtest(bioasq_train, bioasq_test)

for model in models:
    # NOTE(review): both values are 1-tuples, not plain strings. The recorded
    # runs were produced with this exact form, so confirm what
    # Trainandtest.model_compile / .training expect before simplifying.
    model_name = (model['model_name'],)
    source = (model['source'],)

    trainer_bioasq.model_compile(
        QAModel, model_name, source,
        batch_size=batch_size,
    )

    # Train for the configured number of epochs.
    trainer_bioasq.training(model_name, epochs=epochs)

    # Evaluate and store the metrics under the plain string name.
    test_result = trainer_bioasq.val()
    trainer_bioasq.results[model['model_name']] = test_result

Epoch 1, Loss: 1.0118717334487222, F1 Score: 0.4648839473470908, Time: 44.55 seconds
Epoch 2, Loss: 0.9644372138110074, F1 Score: 0.4286281058455812, Time: 44.44 seconds
Epoch 3, Loss: 0.9333593438972126, F1 Score: 0.45484602561846105, Time: 44.45 seconds
Epoch 4, Loss: 0.8485908589579843, F1 Score: 0.5801760005900214, Time: 44.41 seconds
Epoch 5, Loss: 0.7222059369087219, F1 Score: 0.7031315923781901, Time: 44.38 seconds
Epoch 6, Loss: 0.6271873820911754, F1 Score: 0.7393910702559864, Time: 44.38 seconds
Epoch 7, Loss: 0.524963609196923, F1 Score: 0.7792578849410519, Time: 44.60 seconds
Epoch 8, Loss: 0.4593421640721234, F1 Score: 0.8055618748992576, Time: 44.53 seconds
Epoch 9, Loss: 0.3947972553697499, F1 Score: 0.816762609679229, Time: 44.64 seconds
Epoch 10, Loss: 0.3439501998099414, F1 Score: 0.8499077000123562, Time: 44.57 seconds
Model saved to /kaggle/working/models/BERT_bioasq_model.pth
Test - Accuracy: 0.7774086378737541, Precision: 0.7009950564939693, Recall: 0.777408637873

## 4.4. Results

In [19]:
# Run every trainer's collected results through result_convert, in the order
# Adam, AdamW, SGD (QAModel) and then Adam, AdamW (BiLSTMmodel).
(
    result_adam,
    result_adamw,
    result_sgd,
    result_adam_bilstm,
    result_adamw_bilstm,
) = (
    result_convert(trainer.results)
    for trainer in (
        trainer_adam,
        trainer_adamw,
        trainer_sgd,
        trainer_adam_bilstm,
        trainer_adamw_bilstm,
    )
)
# There is no SGD + BiLSTM table: no trainer_sgd_bilstm is created in the cells shown.


In [20]:
# Take Accuracy and F1 Score from each optimizer's table and prefix the column
# names with the optimizer so they stay distinguishable side by side.
adam_metrics = result_adam[['Accuracy', 'F1 Score']].add_prefix('Adam ')
adamw_metrics = result_adamw[['Accuracy', 'F1 Score']].add_prefix('AdamW ')
sgd_metrics = result_sgd[['Accuracy', 'F1 Score']].add_prefix('SGD ')

# Line the three metric blocks up next to the model names (taken from the Adam table).
combined_metrics = pd.concat(
    [result_adam['Model'], adam_metrics, adamw_metrics, sgd_metrics],
    axis=1,
)

# Show the comparison table.
print('Linear Layer Results\n', combined_metrics)

Linear Layer Results
                     Model  Adam Accuracy  Adam F1 Score  AdamW Accuracy  \
0         BERT_withanswer       0.774086       0.735499        0.777409   
1      ColBERT_withanswer       0.754153       0.713691        0.754153   
2     LinkBERT_withanswer       0.744186       0.702234        0.744186   
3    BiomedNLP_withanswer       0.823920       0.782909        0.823920   
4  BioLinkBERT_withanswer       0.817276       0.771118        0.827243   

   AdamW F1 Score  SGD Accuracy  SGD F1 Score  
0        0.738563      0.119601      0.040140  
1        0.713691      0.328904      0.357285  
2        0.701028      0.338870      0.171537  
3        0.782909      0.392027      0.410031  
4        0.780901      0.338870      0.179900  


In [24]:
# One single-column frame per (optimizer, head) pair, holding its F1 Score
# under a descriptive column name.
adam_linear = result_adam['F1 Score'].to_frame(name='Adam Linear F1 Score')
adam_bilstm = result_adam_bilstm['F1 Score'].to_frame(name='Adam BiLSTM F1 Score')

adamw_linear = result_adamw['F1 Score'].to_frame(name='AdamW Linear F1 Score')
adamw_bilstm = result_adamw_bilstm['F1 Score'].to_frame(name='AdamW BiLSTM F1 Score')

# Line the four F1 columns up next to the model names (taken from the Adam table).
combined_metrics = pd.concat(
    [result_adam['Model'], adam_linear, adam_bilstm, adamw_linear, adamw_bilstm],
    axis=1,
)

# Show the comparison table.
print('Comparison of Linear Layer vs BiLSTM across different optimizers\n', combined_metrics)

Comparison of Linear Layer vs BiLSTM across different optimizers
                     Model  Adam Linear F1 Score  Adam BiLSTM F1 Score  \
0         BERT_withanswer              0.735499              0.715123   
1      ColBERT_withanswer              0.713691              0.711283   
2     LinkBERT_withanswer              0.702234              0.714546   
3    BiomedNLP_withanswer              0.782909              0.768386   
4  BioLinkBERT_withanswer              0.771118              0.786298   

   AdamW Linear F1 Score  AdamW BiLSTM F1 Score  
0               0.738563               0.713192  
1               0.713691               0.711283  
2               0.701028               0.717528  
3               0.782909               0.768386  
4               0.780901               0.816047  


In [22]:
# Results using BioASQ data
# NOTE(review): the title printed below says "Adam", but trainer_bioasq.model_compile was
# called without an explicit optimizer, so this run used model_compile's default - confirm it is Adam.
result_linear_bioasq= result_convert(trainer_bioasq.results)
print('Results using BioASQ dataset with Adam optimiser+Linear layer\n',result_linear_bioasq[['Model','Accuracy','F1 Score']])

Results using BioASQ dataset with Adam optimiser+Linear layer
                 Model  Accuracy  F1 Score
0         BERT_bioasq  0.777409  0.737084
1      ColBERT_bioasq  0.754153  0.713691
2     LinkBERT_bioasq  0.750831  0.708113
3    BiomedNLP_bioasq  0.823920  0.782909
4  BioLinkBERT_bioasq  0.823920  0.776580
