In [1]:
import shutil
import os

# Remove the directory if already exist 
dir_name = 'neural_medical_qa'
if os.path.exists(dir_name):
    shutil.rmtree(dir_name)

#clone the repo from github
!git clone https://github.com/trduc97/neural_medical_qa.git
%cd neural_medical_qa
# install the requirement
!pip install -r requirements.txt

Cloning into 'neural_medical_qa'...
remote: Enumerating objects: 126, done.[K
remote: Counting objects: 100% (126/126), done.[K
remote: Compressing objects: 100% (119/119), done.[K
remote: Total 126 (delta 58), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (126/126), 1.79 MiB | 9.90 MiB/s, done.
Resolving deltas: 100% (58/58), done.
/kaggle/working/neural_medical_qa


In [3]:
import json
from datasets import Dataset, DatasetDict, load_dataset
from sklearn.model_selection import train_test_split
import pandas as pd

def load_bioasq_pubmedqa(bioasq_kaggle_path = '/kaggle/input/bioasq-training-12b/training12b_new.json', 
                         pubmed_kaggle_path='/kaggle/input/pubmed-qa/pubmed_qa_pga_labeled.parquet'):
    # Load the JSON file
    with open(bioasq_kaggle_path,'r') as f:
        bioasq_data=json.load(f)
    # Extract yes/no questions directly
    bioasq_yesno = [{
            'id':question['id'],
            'question':question['body'],
            'final_decision':question['exact_answer'],
            'long_answer':question['ideal_answer'], 
            'documents':question['documents']
        }
        for question in bioasq_data['questions'] if question['type'] == 'yesno']
    # Convert the list of yes/no questions to a Pandas DataFrame
    bioasq_df = pd.DataFrame(bioasq_yesno)

    # Convert the DataFrame to a Hugging Face Dataset
    bioasq_dataset = Dataset.from_pandas(bioasq_df)
    # Create a DatasetDict with the 'train' split
    bioasq_data=DatasetDict({'train': bioasq_dataset})

    # Read from parquet and translate to a dataset object
    pubmed_df=pd.read_parquet(pubmed_kaggle_path)
    dataset=Dataset.from_pandas(pubmed_df,preserve_index=False)
    #Setting into similar format as from huggingface
    pubmedqa_data = DatasetDict({'train': dataset})
    
    # Load the pubmedqa dataset
    #pubmedqa_data=load_dataset("pubmed_qa","pqa_labeled") # unstable connection

    #Encoding decisions 
    def decision_encode(question):
        labels_map = {'no': 0, 'maybe': 1, 'yes': 2}
        question['decision_encoded'] = labels_map[question['final_decision']]
        return question

    pubmedqa_data=pubmedqa_data.map(decision_encode)
    bioasq_data=pubmedqa_data.map(decision_encode)

    return bioasq_data, pubmedqa_data


def pubmed_train_test_split(datasetdict,train_size=0.75, 
                         strat_col='decision_encoded'):
    #Convert dataset to pandas DataFrame
    df = pd.DataFrame(datasetdict['train'])
    test_size=(1-train_size)
    # Define the stratification column
    stratify_col=strat_col

    #Split like normal
    train_df, test_df = train_test_split(
        df,
        test_size=test_size,
        stratify=df[stratify_col],
        random_state=42)
    # Convert DataFrames back to Dataset
    train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
    test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

    return train_dataset, test_dataset


def train_val_test_split(datasetdict,train_size=0.75, 
                         val_test_ratio=0.6,
                         strat_col='decision_encoded'):
    #Convert dataset to pandas DataFrame
    df = pd.DataFrame(datasetdict['train'])
    test_size=(1-train_size)
    # Define the stratification column
    stratify_col=strat_col

    #First, split for the train set
    train_df, val_test_df = train_test_split(
        df,
        test_size=test_size,
        stratify=df[stratify_col],
        random_state=42)
    #Then, split the remaining into validation and test
    val_df, test_df = train_test_split(
        val_test_df,
        test_size=val_test_ratio,
        stratify=val_test_df[stratify_col],
        random_state=42)
    # Convert DataFrames back to Dataset
    train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
    val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
    test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

    return train_dataset, val_dataset, test_dataset

In [4]:
from import_datasets import load_bioasq_pubmedqa, train_test_split

bioasq, pubmedqa = load_bioasq_pubmedqa()

# Display the first few samples of the PubMedQA dataset
print(pubmedqa['train'].to_pandas().head())

responses = pubmedqa['train']['final_decision']
# Counting the occurrences of each value
yes_count = responses.count('yes')
no_count = responses.count('no')
maybe_count = responses.count('maybe')

# Display the counts
print(f"Yes: {yes_count}")
print(f"No: {no_count}")
print(f"Maybe: {maybe_count}")

pubmedqa_train, pubmedqa_test = pubmed_train_test_split(pubmedqa)
print(f"Train size: {len(pubmedqa_train)}")
print(f"Test size: {len(pubmedqa_test)}")

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

      pubid                                           question  \
0  21645374  Do mitochondria play a role in remodelling lac...   
1  16418930  Landolt C and snellen e acuity: differences in...   
2   9488747  Syncope during bathing in infants, a pediatric...   
3  17208539  Are the long-term results of the transanal pul...   
4  10808977  Can tailored interventions increase mammograph...   

                                             context  \
0  {'contexts': ['Programmed cell death (PCD) is ...   
1  {'contexts': ['Assessment of visual acuity dep...   
2  {'contexts': ['Apparent life-threatening event...   
3  {'contexts': ['The transanal endorectal pull-t...   
4  {'contexts': ['Telephone counseling and tailor...   

                                         long_answer final_decision  \
0  Results depicted mitochondrial dynamics in viv...            yes   
1  Using the charts described, there was only a s...             no   
2  "Aquagenic maladies" could be a pediatric form... 

In [5]:
from collections import defaultdict
# Initialize a defaultdict to hold the bucket counts
length_buckets = defaultdict(int)

# Define the bucket size
bucket_size = 128

# Loop through each string in the list
for s in pubmedqa_train['long_answer']:
    # Determine the bucket for the current string length
    bucket = (len(s) // bucket_size) * bucket_size
    # Increment the count for the appropriate bucket
    length_buckets[bucket] += 1

# Display the counts for each bucket
for bucket, count in sorted(length_buckets.items()):
    print(f"Length {bucket} - {bucket + bucket_size - 1}: {count} strings")

Length 0 - 127: 66 strings
Length 128 - 255: 326 strings
Length 256 - 383: 244 strings
Length 384 - 511: 82 strings
Length 512 - 639: 25 strings
Length 640 - 767: 4 strings
Length 768 - 895: 3 strings


In [6]:
bioasq, pubmedqa_artificial = load_bioasq_pubmedqa(pubmed_kaggle_path='/kaggle/input/pubmed-qa/pubmed_qa_pga_artificial.parquet')

Map:   0%|          | 0/211269 [00:00<?, ? examples/s]

Map:   0%|          | 0/211269 [00:00<?, ? examples/s]

In [7]:
from datasets import load_dataset, concatenate_datasets, DatasetDict

# Convert to pandas DataFrame to handle schema mismatch
pubmedqa_df = pubmedqa['train'].to_pandas()
pubmedqa_artificial_df = pubmedqa_artificial['train'].to_pandas()
# Ensure both DataFrames have the same columns
common_columns = list(set(pubmedqa_df.columns).intersection(set(pubmedqa_artificial_df.columns)))

pubmedqa_df = pubmedqa_df[common_columns]
pubmedqa_artificial_df = pubmedqa_artificial_df[common_columns]

# Take 1000 rows from each
pubmedqa_sample = pubmedqa_df.sample(n=1000, random_state=42)
pubmedqa_artificial_sample = pubmedqa_artificial_df.sample(n=1000, random_state=42)

# Step 3: Combine the samples to create pubmed_mix
combined_df = pd.concat([pubmedqa_sample, pubmedqa_artificial_sample], ignore_index=True)

# Step 4: Shuffle the combined DataFrame to mix the rows
pubmed_mix_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Step 5: Convert back to DatasetDict format
pubmed_mix = Dataset.from_pandas(pubmed_mix_df)

# Create DatasetDict
pubmed_mix_dataset = DatasetDict({
    'train': pubmed_mix
})

# Verify the structure
print(pubmed_mix_dataset)

DatasetDict({
    train: Dataset({
        features: ['final_decision', 'decision_encoded', 'question', 'context', 'pubid', 'long_answer'],
        num_rows: 2000
    })
})


In [8]:
pubmedqa_mix_train, pubmedqa_mix_test = pubmed_train_test_split(pubmed_mix_dataset)
print(f"Train size: {len(pubmedqa_mix_train)}")
print(f"Test size: {len(pubmedqa_mix_test)}")

Train size: 1500
Test size: 500


In [9]:
import os
import torch
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel, GPT2Tokenizer, GPT2Model
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score

class QAModel(nn.Module):
    def __init__(self, model, classes=3, dropout_prob=0.5):
        super(QAModel, self).__init__()
        self.bert = model
        self.dropout1 = nn.Dropout(dropout_prob)
        self.linear1 = nn.Linear(model.config.hidden_size, 128)
        self.dropout2 = nn.Dropout(dropout_prob)
        self.linear2 = nn.Linear(128, classes)  # number of classes may vary between BioASQ (2 classes) and PubMedQA (3 classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # CLS token
        cls_output = self.dropout1(cls_output)  # Apply first dropout
        cls_output = self.linear1(cls_output)  # Apply first linear layer
        cls_output = self.dropout2(cls_output)  # Apply second dropout
        logits = self.linear2(cls_output)  # Apply second linear layer
        return logits

class Trainandtest:

    def __init__(self, df_train, df_test, stratify_col='decision_encoded'):
        self.train_data = df_train
        self.test_data = df_test
        self.loss_fn = nn.CrossEntropyLoss()
        self.stratify_col = stratify_col
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.results={}

    def initialize_tokenizer(self, model_name, source):
        if isinstance(source, tuple):
                source = source[0]
        if 'GPT' in model_name:
            tokenizer = GPT2Tokenizer.from_pretrained(source)
            if tokenizer.pad_token is None:
                tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            return tokenizer
        elif 'BioLinkBERT' in model_name or 'LinkBERT' in model_name:
            return AutoTokenizer.from_pretrained(source)
        else:
            return BertTokenizer.from_pretrained(source)

    def encode_data(self, df, tokenizer):
        inputs = tokenizer(
            text=df['question'], 
            text_pair=df['long_answer'], 
            padding=True, 
            truncation=True, 
            return_tensors='pt', 
            max_length=128*4
        )
        labels = torch.tensor(df[self.stratify_col])
        return inputs, labels

    def create_dataloader(self, inputs, labels, batch_size):
        dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
        return DataLoader(dataset, batch_size=batch_size, shuffle=True)

    def import_model(self, model_name, source, tokenizer):
        if isinstance(source, tuple):
            source = source[0]
        if 'GPT' in model_name:
            model = GPT2Model.from_pretrained(source)
            model.resize_token_embeddings(len(tokenizer))
            model = QAModel(model)
        elif 'BioLinkBERT' in model_name or 'LinkBERT' in model_name:
            model = AutoModel.from_pretrained(source)
            model = QAModel(model)
        else:
            model = BertModel.from_pretrained(source)
            model = QAModel(model)
        return model
    def model_compile(self, model_name, source, batch_size=64, adamw=True):
        batch_size = 16 if 'GPT' in model_name else batch_size
        tokenizer = self.initialize_tokenizer(model_name, source)
        train_inputs, train_labels = self.encode_data(self.train_data, tokenizer)
        test_inputs, test_labels = self.encode_data(self.test_data, tokenizer)
        self.train_loader = self.create_dataloader(train_inputs, train_labels, batch_size)
        self.test_loader = self.create_dataloader(test_inputs, test_labels, batch_size)
        
        self.model = self.import_model(model_name, source, tokenizer).to(self.device) 
        if adamw:
            self.optimizer = optim.AdamW(self.model.parameters(), lr=2e-5)
        else: 
            self.optimizer = optim.Adam(self.model.parameters(), lr=2e-5)
    
    def training(self, model_name, epochs=10):
        if isinstance(model_name, tuple):
            model_name = model_name[0]        
        self.model.train()
        for epoch in range(epochs):
            total_loss = 0
            all_preds = []
            all_labels = []
        
            for batch in self.train_loader:
                b_input_ids, b_attention_mask, b_labels = [t.to(self.device) for t in batch]
                self.optimizer.zero_grad()
            
                outputs = self.model(b_input_ids, b_attention_mask)
                loss = self.loss_fn(outputs, b_labels)
                loss.backward()
                self.optimizer.step()
                
                total_loss += loss.item()
            
                preds = outputs.detach().cpu().numpy()
                label_ids = b_labels.to('cpu').numpy()
                del b_input_ids 
                del b_attention_mask 
                del b_labels
                gc.collect()
                torch.cuda.empty_cache()
                all_preds.append(preds)
                all_labels.append(label_ids)
        
            avg_loss = total_loss / len(self.train_loader)
            all_preds = np.concatenate(all_preds, axis=0)
            all_labels = np.concatenate(all_labels, axis=0)
            avg_f1_score = self.calculate_f1_score(all_preds, all_labels)
        
            print(f"Epoch {epoch+1}, Loss: {avg_loss}, F1 Score: {avg_f1_score}")
        
        self.save_model(model_name)

    def save_model(self, model_name):
        os.makedirs('/kaggle/working/models', exist_ok=True)
        model_path = f'/kaggle/working/models/{model_name}_model.pth'
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
        }, model_path)
        print(f"Model saved to {model_path}")

    def load_model(self, model_path):
        checkpoint = torch.load(model_path)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        print(f"Model loaded from {model_path}")

    def calculate_f1_score(self, preds, labels):
        preds_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels.flatten()
        return f1_score(labels_flat, preds_flat, average='weighted')

    def evaluate(self, dataloader):
        self.model.eval()
        total_loss = 0
        predictions, true_labels = [], []
    
        with torch.no_grad():
            for batch in dataloader:
                b_input_ids, b_attention_mask, b_labels = [t.to(self.device) for t in batch]
                outputs = self.model(b_input_ids, b_attention_mask)
                logits = outputs.detach().cpu().numpy()
                label_ids = b_labels.cpu().numpy()
                predictions.extend(np.argmax(logits, axis=1))
                true_labels.extend(label_ids)
                del b_input_ids 
                del b_attention_mask 
                del b_labels
                gc.collect()
                torch.cuda.empty_cache()
    
        accuracy = accuracy_score(true_labels, predictions)
        precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')
    
        return accuracy, precision, recall, f1
        

    def val(self, load_model_path=None):
        if load_model_path:
            self.load_model(load_model_path)
                
        test_accuracy, test_precision, test_recall, test_f1 = self.evaluate(self.test_loader)
        print(f"Test - Accuracy: {test_accuracy}, Precision: {test_precision}, Recall: {test_recall}, F1-Score: {test_f1}")
    
        return {
            'test': {
                'accuracy': test_accuracy,
                'precision': test_precision,
                'recall': test_recall,
                'f1': test_f1
            }
        }

In [10]:
models = [
    
    {
        'model_name': 'BERT',
        'source': 'bert-base-uncased',
    },
    {
        'model_name': 'GPT',
        'source': 'gpt2',
    },
    {
        'model_name': 'ColBERT',
        'source': 'colbert-ir/colbertv2.0',
    },

    {
        'model_name': 'LinkBERT',
        'source': 'michiyasunaga/LinkBERT-base',
    },
    {
        'model_name': 'BiomedNLP',
        'source': 'microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract',
    },
    {
        'model_name': 'BioLinkBERT',
        'source': 'michiyasunaga/BioLinkBERT-base',
    },

]

In [31]:
trainer = Trainandtest(pubmedqa_train, pubmedqa_test)

for model in models:
    model_name=model['model_name'],
    source=model['source'],
    trainer.model_compile(model_name,source)
    # Train the model
    trainer.training(model_name, epochs=10)
    
    # test the model
    test_result = trainer.val()
    trainer.results[model['model_name']] = test_result

Epoch 1, Loss: 0.9844380617141724, F1 Score: 0.451102100456621
Epoch 2, Loss: 0.9382130602995554, F1 Score: 0.48689957230021924
Epoch 3, Loss: 0.8745568990707397, F1 Score: 0.4878706680244128
Epoch 4, Loss: 0.7442253679037094, F1 Score: 0.6828985274431058
Epoch 5, Loss: 0.6305228173732758, F1 Score: 0.7394395187833391
Epoch 6, Loss: 0.5366672997673353, F1 Score: 0.7673279708789403
Epoch 7, Loss: 0.4447324052453041, F1 Score: 0.8041047471620227
Epoch 8, Loss: 0.3756386563181877, F1 Score: 0.8379031285741234
Epoch 9, Loss: 0.3128013958533605, F1 Score: 0.869524328521953
Epoch 10, Loss: 0.24801820889115334, F1 Score: 0.8876733767126234
Model saved to /kaggle/working/models/BERT_model.pth
Test - Accuracy: 0.756, Precision: 0.7322769583103609, Recall: 0.756, F1-Score: 0.7432146154023473
Epoch 1, Loss: 1.4042836820825617, F1 Score: 0.42090071953689334
Epoch 2, Loss: 1.0755147274504318, F1 Score: 0.4217221361893262
Epoch 3, Loss: 1.0375938935482756, F1 Score: 0.42722761789827207
Epoch 4, Loss

  _warn_prf(average, modifier, msg_start, len(result))


Test - Accuracy: 0.552, Precision: 0.30470400000000003, Recall: 0.552, F1-Score: 0.392659793814433
Epoch 1, Loss: 0.9800732036431631, F1 Score: 0.4655292470457658
Epoch 2, Loss: 0.9360240449508032, F1 Score: 0.45662408362561036
Epoch 3, Loss: 0.9092981467644373, F1 Score: 0.5078878350984526
Epoch 4, Loss: 0.7796382904052734, F1 Score: 0.655855977887538
Epoch 5, Loss: 0.6443670441706976, F1 Score: 0.7173714285714284
Epoch 6, Loss: 0.5754364828268687, F1 Score: 0.754838728628801
Epoch 7, Loss: 0.4827369898557663, F1 Score: 0.7842122728871859
Epoch 8, Loss: 0.38864947110414505, F1 Score: 0.8298314503416319
Epoch 9, Loss: 0.32261645793914795, F1 Score: 0.8445088593548139
Epoch 10, Loss: 0.2478681622693936, F1 Score: 0.9132765272716787
Model saved to /kaggle/working/models/ColBERT_model.pth
Test - Accuracy: 0.708, Precision: 0.677927786499215, Recall: 0.708, F1-Score: 0.6897143818334734
Epoch 1, Loss: 1.0035943885644276, F1 Score: 0.4399729669645635
Epoch 2, Loss: 0.9621362288792928, F1 Sco

In [59]:
def result_convert(result_dict):
    df = pd.DataFrame({
        'Model': result_dict.keys(),
        'Accuracy': [result_dict[model]['test']['accuracy'] for model in result_dict],
        'Precision': [result_dict[model]['test']['precision'] for model in result_dict],
        'Recall': [result_dict[model]['test']['recall'] for model in result_dict],
        'F1 Score': [result_dict[model]['test']['f1'] for model in result_dict]})
    return df

result_adamw= result_convert(trainer.results)
result_adamw

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,BERT,0.756,0.732277,0.756,0.743215
1,GPT,0.552,0.304704,0.552,0.39266
2,ColBERT,0.708,0.677928,0.708,0.689714
3,LinkBERT,0.748,0.696186,0.748,0.71705
4,BiomedNLP,0.8,0.754382,0.8,0.770213
5,BioLinkBERT,0.804,0.768,0.804,0.766786


# Testing with adam instead of AdamW 

In [52]:
trainer_adam = Trainandtest(pubmedqa_train, pubmedqa_test)

for model in models:
    model_name=model['model_name'],
    source=model['source'],
    trainer_adam.model_compile(model_name,source, adamw=False)
    # Train the model
    trainer_adam.training(model_name, epochs=10)
    
    # test the model
    test_result = trainer_adam.val()
    trainer_adam.results[model['model_name']] = test_result

Epoch 1, Loss: 1.0309644838174183, F1 Score: 0.40058593679164584
Epoch 2, Loss: 0.9316036502520243, F1 Score: 0.4747573362442472
Epoch 3, Loss: 0.8613795091708502, F1 Score: 0.5061758701612865
Epoch 4, Loss: 0.7254956861337026, F1 Score: 0.700234471420467
Epoch 5, Loss: 0.6264600505431493, F1 Score: 0.7411422535411772
Epoch 6, Loss: 0.5110151842236519, F1 Score: 0.7740106259041619
Epoch 7, Loss: 0.41622703646620113, F1 Score: 0.8145444292878178
Epoch 8, Loss: 0.37562190741300583, F1 Score: 0.8349004296838056
Epoch 9, Loss: 0.3377520367503166, F1 Score: 0.8515858841168065
Epoch 10, Loss: 0.2603805400431156, F1 Score: 0.9020769182956309
Model saved to /kaggle/working/models/BERT_model.pth
Test - Accuracy: 0.74, Precision: 0.7112554872695347, Recall: 0.74, F1-Score: 0.7217710309930425
Epoch 1, Loss: 1.3248475029113445, F1 Score: 0.43854165753924795
Epoch 2, Loss: 1.0566828149430296, F1 Score: 0.4334618520959984
Epoch 3, Loss: 1.0054618094829804, F1 Score: 0.43878453721072985
Epoch 4, Loss

  _warn_prf(average, modifier, msg_start, len(result))


Test - Accuracy: 0.552, Precision: 0.30470400000000003, Recall: 0.552, F1-Score: 0.392659793814433
Epoch 1, Loss: 1.0021938135226567, F1 Score: 0.4206173589885405
Epoch 2, Loss: 0.9467902034521103, F1 Score: 0.4320542353976594
Epoch 3, Loss: 0.8945818791786829, F1 Score: 0.5402030197444831
Epoch 4, Loss: 0.8381931483745575, F1 Score: 0.5987180784833515
Epoch 5, Loss: 0.6877349317073822, F1 Score: 0.7134915750915752
Epoch 6, Loss: 0.5719812711079916, F1 Score: 0.7570669644356144
Epoch 7, Loss: 0.4945155245562394, F1 Score: 0.7798998244312088
Epoch 8, Loss: 0.4173024247090022, F1 Score: 0.8108981463255851
Epoch 9, Loss: 0.33342715601126355, F1 Score: 0.8606649627091386
Epoch 10, Loss: 0.261577049891154, F1 Score: 0.8855086290510183
Model saved to /kaggle/working/models/ColBERT_model.pth
Test - Accuracy: 0.72, Precision: 0.6557874636778745, Recall: 0.72, F1-Score: 0.6859459709074117


OutOfMemoryError: CUDA out of memory. Tried to allocate 58.00 MiB. GPU 0 has a total capacty of 15.89 GiB of which 13.12 MiB is free. Process 2197 has 15.87 GiB memory in use. Of the allocated memory 14.44 GiB is allocated by PyTorch, and 1.14 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [58]:
result_adam= result_convert(trainer_adam.results)
result_adam

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,BERT,0.74,0.711255,0.74,0.721771
1,GPT,0.552,0.304704,0.552,0.39266
2,ColBERT,0.72,0.655787,0.72,0.685946


# mixing artificial data 

In [None]:
trainer_mix = Trainandtest(pubmedqa_mix_train, pubmedqa_mix_test)

import torch
import gc
# Function to free up memory
def free_memory():
    gc.collect()
    torch.cuda.empty_cache()

for model in models:
    model_name=model['model_name'],
    source=model['source'],
    trainer_mix.model_compile(model_name,source,batch_size=8)
    free_memory()
    # Train the model
    trainer_mix.training(model_name, epochs=3)
    
    # test the model
    test_result = trainer_mix.val()
    trainer_mix.results[model['model_name']] = test_result

Epoch 1, Loss: 0.6263292911759717, F1 Score: 0.7266546606955939
Epoch 2, Loss: 0.42936402545409635, F1 Score: 0.8301333277319162
Epoch 3, Loss: 0.2957336701541901, F1 Score: 0.8827430232347814
Model saved to /kaggle/working/models/BERT_model.pth
Test - Accuracy: 0.842, Precision: 0.8159382698298586, Recall: 0.842, F1-Score: 0.8216820920080387


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Epoch 1, Loss: 0.9151097472677839, F1 Score: 0.6166968136848008
Epoch 2, Loss: 0.7974374842770556, F1 Score: 0.6303224806048497
Epoch 3, Loss: 0.7702962867123015, F1 Score: 0.638075781614834
Model saved to /kaggle/working/models/GPT_model.pth
Test - Accuracy: 0.738, Precision: 0.544644, Recall: 0.738, F1-Score: 0.6267479861910242


  _warn_prf(average, modifier, msg_start, len(result))


tokenizer_config.json:   0%|          | 0.00/405 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Epoch 1, Loss: 0.639295128273203, F1 Score: 0.7154349656129164
Epoch 2, Loss: 0.44220527273384813, F1 Score: 0.8249534732287269
Epoch 3, Loss: 0.2967134022272806, F1 Score: 0.8797971386603824
Model saved to /kaggle/working/models/ColBERT_model.pth
Test - Accuracy: 0.836, Precision: 0.7857278932018402, Recall: 0.836, F1-Score: 0.8054858255668382


  _warn_prf(average, modifier, msg_start, len(result))


tokenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/559 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


Epoch 1, Loss: 0.6404168241201563, F1 Score: 0.7252281312845033
Epoch 2, Loss: 0.44086362566164833, F1 Score: 0.8322019258281858
