In [1]:
import shutil
import os

# Remove the directory if already exist 
dir_name = 'neural_medical_qa'
if os.path.exists(dir_name):
    shutil.rmtree(dir_name)

#clone the repo from github
!git clone https://github.com/trduc97/neural_medical_qa.git
%cd neural_medical_qa
# install the requirement
!pip install -r requirements.txt

Cloning into 'neural_medical_qa'...
remote: Enumerating objects: 112, done.[K
remote: Counting objects: 100% (112/112), done.[K
remote: Compressing objects: 100% (105/105), done.[K
remote: Total 112 (delta 50), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (112/112), 1.77 MiB | 9.61 MiB/s, done.
Resolving deltas: 100% (50/50), done.
/kaggle/working/neural_medical_qa


In [2]:
from import_datasets import load_bioasq_pubmedqa,  train_val_test_split

# Load both corpora; only PubMedQA is explored in this cell
bioasq, pubmedqa = load_bioasq_pubmedqa()

# Peek at the first rows of the PubMedQA 'train' split
train_split = pubmedqa['train']
print(train_split.to_pandas().head())

# Tally how often each decision value occurs
responses = train_split['final_decision']
yes_count, no_count, maybe_count = (
    responses.count(answer) for answer in ('yes', 'no', 'maybe')
)

print(f"Yes: {yes_count}")
print(f"No: {no_count}")
print(f"Maybe: {maybe_count}")

# Split into train / validation / test and report the size of each part
pubmedqa_train, pubmedqa_val, pubmedqa_test = train_val_test_split(pubmedqa)
print(f"Train size: {len(pubmedqa_train)}")
print(f"Validation size: {len(pubmedqa_val)}")
print(f"Test size: {len(pubmedqa_test)}")

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

      pubid                                           question  \
0  21645374  Do mitochondria play a role in remodelling lac...   
1  16418930  Landolt C and snellen e acuity: differences in...   
2   9488747  Syncope during bathing in infants, a pediatric...   
3  17208539  Are the long-term results of the transanal pul...   
4  10808977  Can tailored interventions increase mammograph...   

                                             context  \
0  {'contexts': ['Programmed cell death (PCD) is ...   
1  {'contexts': ['Assessment of visual acuity dep...   
2  {'contexts': ['Apparent life-threatening event...   
3  {'contexts': ['The transanal endorectal pull-t...   
4  {'contexts': ['Telephone counseling and tailor...   

                                         long_answer final_decision  \
0  Results depicted mitochondrial dynamics in viv...            yes   
1  Using the charts described, there was only a s...             no   
2  "Aquagenic maladies" could be a pediatric form... 

In [None]:
from collections import defaultdict
# Histogram of long-answer lengths, grouped into fixed-width buckets
bucket_size = 128
length_buckets = defaultdict(int)

# Count each answer under the lower edge of the bucket its length falls into
for s in pubmedqa_train['long_answer']:
    length_buckets[len(s) - len(s) % bucket_size] += 1

# Report the buckets from the shortest to the longest
for bucket in sorted(length_buckets):
    count = length_buckets[bucket]
    print(f"Length {bucket} - {bucket + bucket_size - 1}: {count} strings")

# BERT-base

In [None]:
import torch
from transformers import BertModel, BertTokenizer
import torch.nn as nn
import torch.optim as optim
from linear_model import QAModel
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score
import torch

# Name of the label column; encode_data below reads it as a notebook-level global
stratify_col = 'decision_encoded'

# Tokenizer for the same 'bert-base-uncased' checkpoint that is loaded as the encoder later in this cell
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def encode_data(df, tokenizer):
    """Tokenize every (question, long_answer) pair of a split and gather its labels.

    Args:
        df: Split indexed as ``df[column]``; must expose 'question',
            'long_answer' and the column named by the global ``stratify_col``.
        tokenizer: Callable tokenizer, invoked once for the whole split.

    Returns:
        tuple: ``(encoded, targets)`` -- the tokenizer output (requested with
        ``return_tensors='pt'``) and a tensor built from the label column.
    """
    tokenizer_options = {
        'padding': True,
        'truncation': True,
        'return_tensors': 'pt',
        'max_length': 128 * 4,  # 512
    }
    encoded = tokenizer(text=df['question'], text_pair=df['long_answer'], **tokenizer_options)
    targets = torch.tensor(df[stratify_col])
    return encoded, targets


# Encode each split once, up front
train_inputs, train_labels = encode_data(pubmedqa_train, tokenizer)
validate_inputs, validate_labels = encode_data(pubmedqa_val, tokenizer)
test_inputs, test_labels = encode_data(pubmedqa_test, tokenizer)

# Create DataLoader
def create_dataloader(inputs, labels, batch_size=64, shuffle=True):
    """Wrap tokenized inputs and labels in a batched ``DataLoader``.

    Args:
        inputs: Mapping providing 'input_ids' and 'attention_mask' tensors.
        labels: Tensor of labels, one entry per example.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles the examples. Defaults to True,
            which is what every existing call relies on; pass False for
            validation/test data to get a deterministic batch order.

    Returns:
        DataLoader yielding ``(input_ids, attention_mask, labels)`` batches.
    """
    dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Build one loader per split from the tensors returned by encode_data above
train_loader = create_dataloader(train_inputs, train_labels)
validate_loader = create_dataloader(validate_inputs, validate_labels)
test_loader = create_dataloader(test_inputs, test_labels)

# Pretrained encoder that QAModel wraps below
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Classification head on top of a pretrained encoder
class QAModel(nn.Module):
    """Encoder followed by a two-layer classification head.

    The hidden state at sequence position 0 goes through
    dropout -> linear(hidden_size, 128) -> dropout -> linear(128, classes).
    """

    def __init__(self, bert_model, classes=3, dropout_prob=0.5):
        """
        Args:
            bert_model: Encoder exposing ``config.hidden_size`` and returning an
                object with ``last_hidden_state`` when called.
            classes: Number of logits produced per example.
            dropout_prob: Drop probability used by both dropout layers.
        """
        super().__init__()
        self.bert = bert_model
        encoder_width = bert_model.config.hidden_size
        self.dropout1 = nn.Dropout(dropout_prob)
        self.linear1 = nn.Linear(encoder_width, 128)
        self.dropout2 = nn.Dropout(dropout_prob)
        self.linear2 = nn.Linear(128, classes)

    def forward(self, input_ids, attention_mask):
        """Return one row of ``classes`` logits per input sequence."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        first_token = encoded.last_hidden_state[:, 0, :]  # position 0, used as the [CLS] summary
        hidden = self.linear1(self.dropout1(first_token))
        return self.linear2(self.dropout2(hidden))

# Wrap the pretrained encoder in the classification head defined above
model = QAModel(bert_model)

# Move model to GPU if available; `device` is also read as a global by train() below
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def calculate_f1_score(preds, labels):
    """Weighted F1 between arg-max class predictions and reference labels.

    Args:
        preds: Array of per-class scores with the classes along axis 1.
        labels: Array of reference class ids; flattened before scoring.

    Returns:
        The result of ``f1_score(..., average='weighted')``.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(labels.flatten(), predicted_classes, average='weighted')

def evaluate(model, dataloader, device):
    """Score ``model`` on every batch of ``dataloader`` without tracking gradients.

    Args:
        model: Called as ``model(input_ids, attention_mask)``; returns logits.
        dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.
        device: Device each batch tensor is moved to before the forward pass.

    Returns:
        tuple: ``(accuracy, precision, recall, f1)``; the last three are the
        weighted averages from ``precision_recall_fscore_support``.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, targets = (tensor.to(device) for tensor in batch)
            # Bring the logits back to the CPU before handing them to numpy
            scores = model(input_ids, attention_mask).detach().cpu().numpy()
            predicted.extend(np.argmax(scores, axis=1))
            expected.extend(targets.cpu().numpy())

    accuracy = accuracy_score(expected, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(
        expected, predicted, average='weighted'
    )
    return accuracy, precision, recall, f1
# Optimizer and loss used by the training loop below
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()


def train(model, train_loader, optimizer, loss_fn, epochs=15):
    """Fine-tune ``model`` and print the mean loss and weighted F1 after every epoch.

    Batches are moved to the notebook-level ``device``; the F1 is computed by
    ``calculate_f1_score`` over the logits collected during the epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Yields ``(input_ids, attention_mask, labels)`` batches.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Called as ``loss_fn(logits, labels)``.
        epochs: Number of passes over ``train_loader``.
    """
    model.train()
    for epoch in range(epochs):
        running_loss = 0
        epoch_logits, epoch_targets = [], []

        for batch in train_loader:
            input_ids, attention_mask, targets = (tensor.to(device) for tensor in batch)
            optimizer.zero_grad()

            logits = model(input_ids, attention_mask)
            batch_loss = loss_fn(logits, targets)
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()

            # Keep CPU copies so the epoch-level F1 can be computed with numpy
            epoch_logits.append(logits.detach().cpu().numpy())
            epoch_targets.append(targets.to('cpu').numpy())

        avg_loss = running_loss / len(train_loader)
        avg_f1_score = calculate_f1_score(
            np.concatenate(epoch_logits, axis=0),
            np.concatenate(epoch_targets, axis=0),
        )

        print(f"Epoch {epoch+1}, Loss: {avg_loss}, F1 Score: {avg_f1_score}")


# Run fine-tuning with the default number of epochs
train(model, train_loader, optimizer, loss_fn)

In [None]:
# Re-resolve the device and make sure the model is on it before scoring
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Score the validation and test loaders with evaluate() from the training cell
val_accuracy, val_precision, val_recall, val_f1 = evaluate(model, validate_loader, device)
test_accuracy, test_precision, test_recall, test_f1 = evaluate(model, test_loader, device)

# Report accuracy plus the weighted precision / recall / F1 returned by evaluate()
print(f"Validation - Accuracy: {val_accuracy}, Precision: {val_precision}, Recall: {val_recall}, F1-Score: {val_f1}")
print(f"Test - Accuracy: {test_accuracy}, Precision: {test_precision}, Recall: {test_recall}, F1-Score: {test_f1}")

## BERT + artificial data

In [None]:
# Call the loader again, this time pointing it at a parquet file under /kaggle/input
# (presumably the PubMedQA artificial subset, going by the file name -- confirm).
# NOTE: this also rebinds `bioasq`.
bioasq, pubmedqa_artificial = load_bioasq_pubmedqa(pubmed_kaggle_path='/kaggle/input/pubmed-qa/pubmed_qa_pga_artificial.parquet')

In [None]:
from datasets import DatasetDict, Dataset
from sklearn.model_selection import train_test_split

# Work on a pandas copy of the artificial 'train' split
df_artificial=pubmedqa_artificial['train'].to_pandas()
# Keep the 5% "train" side of a stratified split (test_size=0.95); the 95% remainder is discarded
df_sample, _=train_test_split(df_artificial, test_size=0.95, random_state=42, stratify=df_artificial['decision_encoded'])   
# Restrict to the columns used downstream, then convert back to a Dataset without the pandas index
df_sample=df_sample[['pubid', 'question', 'context', 'long_answer', 'final_decision', 'decision_encoded']]
data_art=Dataset.from_pandas(df_sample,preserve_index=False)

In [None]:
# Wrap the sample in a DatasetDict so the same splitting helper can be reused
pubmedqa_arti = DatasetDict({'train': data_art})
# Only pubmedqa_art_train is used by the training cell below; evaluation there
# uses pubmedqa_val / pubmedqa_test from the original split
pubmedqa_art_train,pubmedqa_art_val, pubmedqa_art_test = train_val_test_split(pubmedqa_arti)

In [None]:
import gc
# Collect unreachable Python objects, then ask PyTorch to release its cached GPU memory.
# NOTE(review): `model` from the previous section is still bound at this point, so its
# weights presumably stay allocated -- confirm whether a `del model` is wanted first.
gc.collect()
torch.cuda.empty_cache()

In [None]:
import torch
from transformers import BertModel, BertTokenizer
import torch.nn as nn
import torch.optim as optim
from linear_model import QAModel
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score
import torch

# Name of the label column; encode_data below reads it as a notebook-level global
stratify_col = 'decision_encoded'

# Tokenizer for the same 'bert-base-uncased' checkpoint that is loaded as the encoder later in this cell
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def encode_data(df, tokenizer):
    """Tokenize every (question, long_answer) pair of a split and gather its labels.

    Args:
        df: Split indexed as ``df[column]``; must expose 'question',
            'long_answer' and the column named by the global ``stratify_col``.
        tokenizer: Callable tokenizer, invoked once for the whole split.

    Returns:
        tuple: ``(encoded, targets)`` -- the tokenizer output (requested with
        ``return_tensors='pt'``) and a tensor built from the label column.
    """
    tokenizer_options = {
        'padding': True,
        'truncation': True,
        'return_tensors': 'pt',
        'max_length': 128 * 4,  # 512
    }
    encoded = tokenizer(text=df['question'], text_pair=df['long_answer'], **tokenizer_options)
    targets = torch.tensor(df[stratify_col])
    return encoded, targets


# Train on the artificial sample; validate and test on the original PubMedQA splits
train_inputs, train_labels = encode_data(pubmedqa_art_train, tokenizer)
validate_inputs, validate_labels = encode_data(pubmedqa_val, tokenizer)
test_inputs, test_labels = encode_data(pubmedqa_test, tokenizer)

# Create DataLoader
def create_dataloader(inputs, labels, batch_size=16, shuffle=True):
    """Wrap tokenized inputs and labels in a batched ``DataLoader``.

    Args:
        inputs: Mapping providing 'input_ids' and 'attention_mask' tensors.
        labels: Tensor of labels, one entry per example.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles the examples. Defaults to True,
            which is what every existing call relies on; pass False for
            validation/test data to get a deterministic batch order.

    Returns:
        DataLoader yielding ``(input_ids, attention_mask, labels)`` batches.
    """
    dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Build one loader per split from the tensors returned by encode_data above
train_loader = create_dataloader(train_inputs, train_labels)
validate_loader = create_dataloader(validate_inputs, validate_labels)
test_loader = create_dataloader(test_inputs, test_labels)

# Fresh copy of the pretrained encoder, so this run does not start from the weights fine-tuned above
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Classification head on top of a pretrained encoder
class QAModel(nn.Module):
    """Encoder followed by a two-layer classification head.

    The hidden state at sequence position 0 goes through
    dropout -> linear(hidden_size, 128) -> dropout -> linear(128, classes).
    """

    def __init__(self, bert_model, classes=3, dropout_prob=0.5):
        """
        Args:
            bert_model: Encoder exposing ``config.hidden_size`` and returning an
                object with ``last_hidden_state`` when called.
            classes: Number of logits produced per example.
            dropout_prob: Drop probability used by both dropout layers.
        """
        super().__init__()
        self.bert = bert_model
        encoder_width = bert_model.config.hidden_size
        self.dropout1 = nn.Dropout(dropout_prob)
        self.linear1 = nn.Linear(encoder_width, 128)
        self.dropout2 = nn.Dropout(dropout_prob)
        self.linear2 = nn.Linear(128, classes)

    def forward(self, input_ids, attention_mask):
        """Return one row of ``classes`` logits per input sequence."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        first_token = encoded.last_hidden_state[:, 0, :]  # position 0, used as the [CLS] summary
        hidden = self.linear1(self.dropout1(first_token))
        return self.linear2(self.dropout2(hidden))

# Wrap the pretrained encoder in the classification head defined above
model = QAModel(bert_model)

# Move model to GPU if available; `device` is also read as a global by train() below
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def calculate_f1_score(preds, labels):
    """Weighted F1 between arg-max class predictions and reference labels.

    Args:
        preds: Array of per-class scores with the classes along axis 1.
        labels: Array of reference class ids; flattened before scoring.

    Returns:
        The result of ``f1_score(..., average='weighted')``.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(labels.flatten(), predicted_classes, average='weighted')

def evaluate(model, dataloader, device):
    """Score ``model`` on every batch of ``dataloader`` without tracking gradients.

    Args:
        model: Called as ``model(input_ids, attention_mask)``; returns logits.
        dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.
        device: Device each batch tensor is moved to before the forward pass.

    Returns:
        tuple: ``(accuracy, precision, recall, f1)``; the last three are the
        weighted averages from ``precision_recall_fscore_support``.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, targets = (tensor.to(device) for tensor in batch)
            # Bring the logits back to the CPU before handing them to numpy
            scores = model(input_ids, attention_mask).detach().cpu().numpy()
            predicted.extend(np.argmax(scores, axis=1))
            expected.extend(targets.cpu().numpy())

    accuracy = accuracy_score(expected, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(
        expected, predicted, average='weighted'
    )
    return accuracy, precision, recall, f1
# Optimizer and loss used by the training loop below
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()


def train(model, train_loader, optimizer, loss_fn, epochs=10):
    """Fine-tune ``model`` and print the mean loss and weighted F1 after every epoch.

    Batches are moved to the notebook-level ``device``; the F1 is computed by
    ``calculate_f1_score`` over the logits collected during the epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Yields ``(input_ids, attention_mask, labels)`` batches.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Called as ``loss_fn(logits, labels)``.
        epochs: Number of passes over ``train_loader``.
    """
    model.train()
    for epoch in range(epochs):
        running_loss = 0
        epoch_logits, epoch_targets = [], []

        for batch in train_loader:
            input_ids, attention_mask, targets = (tensor.to(device) for tensor in batch)
            optimizer.zero_grad()

            logits = model(input_ids, attention_mask)
            batch_loss = loss_fn(logits, targets)
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()

            # Keep CPU copies so the epoch-level F1 can be computed with numpy
            epoch_logits.append(logits.detach().cpu().numpy())
            epoch_targets.append(targets.to('cpu').numpy())

        avg_loss = running_loss / len(train_loader)
        avg_f1_score = calculate_f1_score(
            np.concatenate(epoch_logits, axis=0),
            np.concatenate(epoch_targets, axis=0),
        )

        print(f"Epoch {epoch+1}, Loss: {avg_loss}, F1 Score: {avg_f1_score}")


# Run fine-tuning with the default number of epochs
train(model, train_loader, optimizer, loss_fn)

In [None]:
# Re-resolve the device and make sure the model is on it before scoring
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Score the validation and test loaders with evaluate() from the training cell
val_accuracy, val_precision, val_recall, val_f1 = evaluate(model, validate_loader, device)
test_accuracy, test_precision, test_recall, test_f1 = evaluate(model, test_loader, device)

# Report accuracy plus the weighted precision / recall / F1 returned by evaluate()
print(f"Validation - Accuracy: {val_accuracy}, Precision: {val_precision}, Recall: {val_recall}, F1-Score: {val_f1}")
print(f"Test - Accuracy: {test_accuracy}, Precision: {test_precision}, Recall: {test_recall}, F1-Score: {test_f1}")

# BioLinkBERT

In [None]:
# Clear GPU memory: release PyTorch's cached blocks and wait for pending GPU work


torch.cuda.empty_cache()
torch.cuda.synchronize()

# Reset GPU memory settings
# NOTE(review): the allocator configuration is presumably read when CUDA is first
# initialised; earlier cells have already used the GPU, so setting it here may have
# no effect in this kernel -- confirm, or move this to the very first cell.
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import torch.optim as optim
from linear_model import QAModel
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score
import torch

# Load the tokenizer once. The encoder weights are loaded a single time further down
# in this cell (as `bert_model`), so no AutoModel is instantiated here.
tokenizer = AutoTokenizer.from_pretrained("michiyasunaga/BioLinkBERT-base")

# Name of the label column; encode_data below reads it as a notebook-level global
stratify_col = 'decision_encoded'
def encode_data(df, tokenizer):
    """Tokenize every (question, long_answer) pair of a split and gather its labels.

    Args:
        df: Split indexed as ``df[column]``; must expose 'question',
            'long_answer' and the column named by the global ``stratify_col``.
        tokenizer: Callable tokenizer, invoked once for the whole split.

    Returns:
        tuple: ``(encoded, targets)`` -- the tokenizer output (requested with
        ``return_tensors='pt'``) and a tensor built from the label column.
    """
    tokenizer_options = {
        'padding': True,
        'truncation': True,
        'return_tensors': 'pt',
        'max_length': 128 * 4,  # 512
    }
    encoded = tokenizer(text=df['question'], text_pair=df['long_answer'], **tokenizer_options)
    targets = torch.tensor(df[stratify_col])
    return encoded, targets


# Encode each split once, up front
train_inputs, train_labels = encode_data(pubmedqa_train, tokenizer)
validate_inputs, validate_labels = encode_data(pubmedqa_val, tokenizer)
test_inputs, test_labels = encode_data(pubmedqa_test, tokenizer)

# Create DataLoader
def create_dataloader(inputs, labels, batch_size=64, shuffle=True):
    """Wrap tokenized inputs and labels in a batched ``DataLoader``.

    Args:
        inputs: Mapping providing 'input_ids' and 'attention_mask' tensors.
        labels: Tensor of labels, one entry per example.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles the examples. Defaults to True,
            which is what every existing call relies on; pass False for
            validation/test data to get a deterministic batch order.

    Returns:
        DataLoader yielding ``(input_ids, attention_mask, labels)`` batches.
    """
    dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Build one loader per split from the tensors returned by encode_data above
train_loader = create_dataloader(train_inputs, train_labels)
validate_loader = create_dataloader(validate_inputs, validate_labels)
test_loader = create_dataloader(test_inputs, test_labels)

# Pretrained BioLinkBERT encoder that QAModel wraps below
bert_model = AutoModel.from_pretrained("michiyasunaga/BioLinkBERT-base")

# Classification head on top of a pretrained encoder
class QAModel(nn.Module):
    """Encoder followed by a two-layer classification head.

    The hidden state at sequence position 0 goes through
    dropout -> linear(hidden_size, 128) -> dropout -> linear(128, classes).
    """

    def __init__(self, bert_model, classes=3, dropout_prob=0.5):
        """
        Args:
            bert_model: Encoder exposing ``config.hidden_size`` and returning an
                object with ``last_hidden_state`` when called.
            classes: Number of logits produced per example.
            dropout_prob: Drop probability used by both dropout layers.
        """
        super().__init__()
        self.bert = bert_model
        encoder_width = bert_model.config.hidden_size
        self.dropout1 = nn.Dropout(dropout_prob)
        self.linear1 = nn.Linear(encoder_width, 128)
        self.dropout2 = nn.Dropout(dropout_prob)
        self.linear2 = nn.Linear(128, classes)

    def forward(self, input_ids, attention_mask):
        """Return one row of ``classes`` logits per input sequence."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        first_token = encoded.last_hidden_state[:, 0, :]  # position 0, used as the [CLS] summary
        hidden = self.linear1(self.dropout1(first_token))
        return self.linear2(self.dropout2(hidden))

# Wrap the pretrained encoder in the classification head defined above
model = QAModel(bert_model)

# Move model to GPU if available; `device` is also read as a global by train() below
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def calculate_f1_score(preds, labels):
    """Weighted F1 between arg-max class predictions and reference labels.

    Args:
        preds: Array of per-class scores with the classes along axis 1.
        labels: Array of reference class ids; flattened before scoring.

    Returns:
        The result of ``f1_score(..., average='weighted')``.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(labels.flatten(), predicted_classes, average='weighted')

def evaluate(model, dataloader, device):
    """Score ``model`` on every batch of ``dataloader`` without tracking gradients.

    Args:
        model: Called as ``model(input_ids, attention_mask)``; returns logits.
        dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.
        device: Device each batch tensor is moved to before the forward pass.

    Returns:
        tuple: ``(accuracy, precision, recall, f1)``; the last three are the
        weighted averages from ``precision_recall_fscore_support``.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, targets = (tensor.to(device) for tensor in batch)
            # Bring the logits back to the CPU before handing them to numpy
            scores = model(input_ids, attention_mask).detach().cpu().numpy()
            predicted.extend(np.argmax(scores, axis=1))
            expected.extend(targets.cpu().numpy())

    accuracy = accuracy_score(expected, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(
        expected, predicted, average='weighted'
    )
    return accuracy, precision, recall, f1
# Optimizer and loss used by the training loop below
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()


def train(model, train_loader, optimizer, loss_fn, epochs=15):
    """Fine-tune ``model`` and print the mean loss and weighted F1 after every epoch.

    Batches are moved to the notebook-level ``device``; the F1 is computed by
    ``calculate_f1_score`` over the logits collected during the epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Yields ``(input_ids, attention_mask, labels)`` batches.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Called as ``loss_fn(logits, labels)``.
        epochs: Number of passes over ``train_loader``.
    """
    model.train()
    for epoch in range(epochs):
        running_loss = 0
        epoch_logits, epoch_targets = [], []

        for batch in train_loader:
            input_ids, attention_mask, targets = (tensor.to(device) for tensor in batch)
            optimizer.zero_grad()

            logits = model(input_ids, attention_mask)
            batch_loss = loss_fn(logits, targets)
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()

            # Keep CPU copies so the epoch-level F1 can be computed with numpy
            epoch_logits.append(logits.detach().cpu().numpy())
            epoch_targets.append(targets.to('cpu').numpy())

        avg_loss = running_loss / len(train_loader)
        avg_f1_score = calculate_f1_score(
            np.concatenate(epoch_logits, axis=0),
            np.concatenate(epoch_targets, axis=0),
        )

        print(f"Epoch {epoch+1}, Loss: {avg_loss}, F1 Score: {avg_f1_score}")


# Run fine-tuning with the default number of epochs
train(model, train_loader, optimizer, loss_fn)

In [None]:
# Re-resolve the device and make sure the model is on it before scoring
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Score the validation and test loaders with evaluate() from the training cell
val_accuracy, val_precision, val_recall, val_f1 = evaluate(model, validate_loader, device)
test_accuracy, test_precision, test_recall, test_f1 = evaluate(model, test_loader, device)

# Report accuracy plus the weighted precision / recall / F1 returned by evaluate()
print(f"Validation - Accuracy: {val_accuracy}, Precision: {val_precision}, Recall: {val_recall}, F1-Score: {val_f1}")
print(f"Test - Accuracy: {test_accuracy}, Precision: {test_precision}, Recall: {test_recall}, F1-Score: {test_f1}")

# GPT-2

In [None]:
from transformers import GPT2Tokenizer, GPT2Model
# NOTE(review): the next cell repeats exactly these loads, so this cell looks redundant -- confirm before removing
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
# Only when the tokenizer has no pad token: register '[PAD]' and resize the embeddings to the new vocabulary size
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel,AutoModelForCausalLM
import torch.nn as nn
import torch.optim as optim
from linear_model import QAModel
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score
import torch
from transformers import GPT2Tokenizer, GPT2Model

import gc

# Disabled manual teardown of a previous model, kept for reference
#model.cpu()
#del model, checkpoint
# Collect unreachable Python objects and release PyTorch's cached GPU memory before loading GPT-2
gc.collect()
torch.cuda.empty_cache()

# Name of the label column; encode_data below reads it as a notebook-level global
stratify_col = 'decision_encoded'
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
# Only when the tokenizer has no pad token: register '[PAD]' and resize the embeddings to the new vocabulary size
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

def encode_data(df, tokenizer):
    """Tokenize every (question, long_answer) pair of a split and gather its labels.

    Args:
        df: Split indexed as ``df[column]``; must expose 'question',
            'long_answer' and the column named by the global ``stratify_col``.
        tokenizer: Callable tokenizer, invoked once for the whole split.

    Returns:
        tuple: ``(encoded, targets)`` -- the tokenizer output (requested with
        ``return_tensors='pt'``) and a tensor built from the label column.
    """
    tokenizer_options = {
        'padding': True,
        'truncation': True,
        'return_tensors': 'pt',
        'max_length': 128 * 4,  # 512
    }
    encoded = tokenizer(text=df['question'], text_pair=df['long_answer'], **tokenizer_options)
    targets = torch.tensor(df[stratify_col])
    return encoded, targets


# Encode each split once, up front
train_inputs, train_labels = encode_data(pubmedqa_train, tokenizer)
validate_inputs, validate_labels = encode_data(pubmedqa_val, tokenizer)
test_inputs, test_labels = encode_data(pubmedqa_test, tokenizer)

# Create DataLoader
def create_dataloader(inputs, labels, batch_size=16, shuffle=True):
    """Wrap tokenized inputs and labels in a batched ``DataLoader``.

    Args:
        inputs: Mapping providing 'input_ids' and 'attention_mask' tensors.
        labels: Tensor of labels, one entry per example.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles the examples. Defaults to True,
            which is what every existing call relies on; pass False for
            validation/test data to get a deterministic batch order.

    Returns:
        DataLoader yielding ``(input_ids, attention_mask, labels)`` batches.
    """
    dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Build one loader per split from the tensors returned by encode_data above
train_loader = create_dataloader(train_inputs, train_labels)
validate_loader = create_dataloader(validate_inputs, validate_labels)
test_loader = create_dataloader(test_inputs, test_labels)

# Define the model
class QAModel(nn.Module):
    """Decoder-only backbone followed by a two-layer classification head.

    A GPT-style backbone attends causally, so the hidden state at position 0
    has only seen the first token. The head is therefore fed the hidden state
    of the last non-padded token of each sequence, which has attended to the
    whole question/answer pair.
    """

    def __init__(self, model, classes=3, dropout_prob=0.5):
        """
        Args:
            model: Backbone exposing ``config.hidden_size`` and returning an
                object with ``last_hidden_state`` when called.
            classes: Number of logits produced per example.
            dropout_prob: Drop probability used by both dropout layers.
        """
        super().__init__()
        self.bert = model  # attribute name kept so it matches the other QAModel variants
        self.dropout1 = nn.Dropout(dropout_prob)
        self.linear1 = nn.Linear(model.config.hidden_size, 128)
        self.dropout2 = nn.Dropout(dropout_prob)
        self.linear2 = nn.Linear(128, classes)

    def forward(self, input_ids, attention_mask):
        """Return one row of ``classes`` logits per input sequence."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        batch_size, seq_len = hidden_states.shape[0], hidden_states.shape[1]

        # Index of the last real token per row: padded positions contribute 0 to
        # the product, so the arg-max is the highest position whose mask is 1.
        # This holds for both right- and left-padded batches.
        positions = torch.arange(seq_len, device=hidden_states.device)
        mask = attention_mask.to(device=positions.device, dtype=positions.dtype)
        last_token = (positions * mask).argmax(dim=1)
        rows = torch.arange(batch_size, device=hidden_states.device)
        pooled = hidden_states[rows, last_token]

        pooled = self.dropout1(pooled)
        pooled = self.linear1(pooled)
        pooled = self.dropout2(pooled)
        logits = self.linear2(pooled)
        return logits

# Wrap the GPT-2 backbone in the classification head; note that `model` is rebound from backbone to classifier
model = QAModel(model)

# Move model to GPU if available; `device` is also read as a global by train() below
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def calculate_f1_score(preds, labels):
    """Weighted F1 between arg-max class predictions and reference labels.

    Args:
        preds: Array of per-class scores with the classes along axis 1.
        labels: Array of reference class ids; flattened before scoring.

    Returns:
        The result of ``f1_score(..., average='weighted')``.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(labels.flatten(), predicted_classes, average='weighted')

def evaluate(model, dataloader, device):
    """Score ``model`` on every batch of ``dataloader`` without tracking gradients.

    Args:
        model: Called as ``model(input_ids, attention_mask)``; returns logits.
        dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.
        device: Device each batch tensor is moved to before the forward pass.

    Returns:
        tuple: ``(accuracy, precision, recall, f1)``; the last three are the
        weighted averages from ``precision_recall_fscore_support``.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, targets = (tensor.to(device) for tensor in batch)
            # Bring the logits back to the CPU before handing them to numpy
            scores = model(input_ids, attention_mask).detach().cpu().numpy()
            predicted.extend(np.argmax(scores, axis=1))
            expected.extend(targets.cpu().numpy())

    accuracy = accuracy_score(expected, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(
        expected, predicted, average='weighted'
    )
    return accuracy, precision, recall, f1
# Optimizer and loss used by the training loop below
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()


def train(model, train_loader, optimizer, loss_fn, epochs=5):
    """Fine-tune ``model`` and print the mean loss and weighted F1 after every epoch.

    Batches are moved to the notebook-level ``device``; the F1 is computed by
    ``calculate_f1_score`` over the logits collected during the epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Yields ``(input_ids, attention_mask, labels)`` batches.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Called as ``loss_fn(logits, labels)``.
        epochs: Number of passes over ``train_loader``.
    """
    model.train()
    for epoch in range(epochs):
        running_loss = 0
        epoch_logits, epoch_targets = [], []

        for batch in train_loader:
            input_ids, attention_mask, targets = (tensor.to(device) for tensor in batch)
            optimizer.zero_grad()

            logits = model(input_ids, attention_mask)
            batch_loss = loss_fn(logits, targets)
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()

            # Keep CPU copies so the epoch-level F1 can be computed with numpy
            epoch_logits.append(logits.detach().cpu().numpy())
            epoch_targets.append(targets.to('cpu').numpy())

        avg_loss = running_loss / len(train_loader)
        avg_f1_score = calculate_f1_score(
            np.concatenate(epoch_logits, axis=0),
            np.concatenate(epoch_targets, axis=0),
        )

        print(f"Epoch {epoch+1}, Loss: {avg_loss}, F1 Score: {avg_f1_score}")


# Run fine-tuning with the default number of epochs
train(model, train_loader, optimizer, loss_fn)

In [None]:
# Re-resolve the device and make sure the model is on it before scoring
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Score the validation and test loaders with evaluate() from the training cell
val_accuracy, val_precision, val_recall, val_f1 = evaluate(model, validate_loader, device)
test_accuracy, test_precision, test_recall, test_f1 = evaluate(model, test_loader, device)

# Report accuracy plus the weighted precision / recall / F1 returned by evaluate()
print(f"Validation - Accuracy: {val_accuracy}, Precision: {val_precision}, Recall: {val_recall}, F1-Score: {val_f1}")
print(f"Test - Accuracy: {test_accuracy}, Precision: {test_precision}, Recall: {test_recall}, F1-Score: {test_f1}")

# BioGPT 

In [None]:
# Presumably required by the BioGPT tokenizer loaded below -- confirm.
# Written without a leading % or !, so it only runs when IPython automagic is enabled.
pip install sacremoses

In [None]:
# Leftover inspection cell: displays whatever `model` is currently bound to
# (on a top-to-bottom run, the classifier built in the previous section)
model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel,AutoModelForCausalLM, GPT2Tokenizer, GPT2Model
import torch.nn as nn
import torch.optim as optim
from linear_model import QAModel
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score

# Load model directly
from transformers import BioGptTokenizer, BioGptForCausalLM

tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
# Hidden states are requested because QAModel below reads `outputs.hidden_states`
gpt_model = AutoModelForCausalLM.from_pretrained("microsoft/biogpt", output_hidden_states=True)

# Name of the label column; encode_data below reads it as a notebook-level global
stratify_col = 'decision_encoded'
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # Resize the backbone loaded in this cell. `model` would still be the
    # classifier left over from an earlier cell at this point.
    gpt_model.resize_token_embeddings(len(tokenizer))

def encode_data(df, tokenizer):
    """Tokenize every (question, long_answer) pair of a split and gather its labels.

    Args:
        df: Split indexed as ``df[column]``; must expose 'question',
            'long_answer' and the column named by the global ``stratify_col``.
        tokenizer: Callable tokenizer, invoked once for the whole split.

    Returns:
        tuple: ``(encoded, targets)`` -- the tokenizer output (requested with
        ``return_tensors='pt'``) and a tensor built from the label column.
    """
    tokenizer_options = {
        'padding': True,
        'truncation': True,
        'return_tensors': 'pt',
        'max_length': 128 * 4,  # 512
    }
    encoded = tokenizer(text=df['question'], text_pair=df['long_answer'], **tokenizer_options)
    targets = torch.tensor(df[stratify_col])
    return encoded, targets


# Encode each split once, up front
train_inputs, train_labels = encode_data(pubmedqa_train, tokenizer)
validate_inputs, validate_labels = encode_data(pubmedqa_val, tokenizer)
test_inputs, test_labels = encode_data(pubmedqa_test, tokenizer)

# Create DataLoader
def create_dataloader(inputs, labels, batch_size=16, shuffle=True):
    """Wrap tokenized inputs and labels in a batched ``DataLoader``.

    Args:
        inputs: Mapping providing 'input_ids' and 'attention_mask' tensors.
        labels: Tensor of labels, one entry per example.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles the examples. Defaults to True,
            which is what every existing call relies on; pass False for
            validation/test data to get a deterministic batch order.

    Returns:
        DataLoader yielding ``(input_ids, attention_mask, labels)`` batches.
    """
    dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Build one loader per split from the tensors returned by encode_data above
train_loader = create_dataloader(train_inputs, train_labels)
validate_loader = create_dataloader(validate_inputs, validate_labels)
test_loader = create_dataloader(test_inputs, test_labels)

# Define the model
class QAModel(nn.Module):
    """Causal-LM backbone followed by a two-layer classification head.

    The backbone must return ``hidden_states`` (it is loaded with
    ``output_hidden_states=True``). The last layer's hidden state is pooled to
    one vector per sequence -- the last non-padded token, which under causal
    attention has seen the whole input -- so the head yields one row of logits
    per example, as ``CrossEntropyLoss`` with per-example labels requires.
    """

    def __init__(self, model, classes=3, dropout_prob=0.5):
        """
        Args:
            model: Backbone exposing ``config.hidden_size`` and returning an
                object with ``hidden_states`` when called.
            classes: Number of logits produced per example.
            dropout_prob: Drop probability used by both dropout layers.
        """
        super().__init__()
        self.bert = model  # attribute name kept so it matches the other QAModel variants
        self.dropout1 = nn.Dropout(dropout_prob)
        self.linear1 = nn.Linear(model.config.hidden_size, 128)
        self.dropout2 = nn.Dropout(dropout_prob)
        self.linear2 = nn.Linear(128, classes)

    def forward(self, input_ids, attention_mask):
        """Return one row of ``classes`` logits per input sequence."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.hidden_states[-1]  # output of the final layer
        batch_size, seq_len = hidden_states.shape[0], hidden_states.shape[1]

        # Index of the last real token per row: padded positions contribute 0 to
        # the product, so the arg-max is the highest position whose mask is 1.
        # This holds for both right- and left-padded batches.
        positions = torch.arange(seq_len, device=hidden_states.device)
        mask = attention_mask.to(device=positions.device, dtype=positions.dtype)
        last_token = (positions * mask).argmax(dim=1)
        rows = torch.arange(batch_size, device=hidden_states.device)
        pooled = hidden_states[rows, last_token]

        pooled = self.dropout1(pooled)
        pooled = self.linear1(pooled)
        pooled = self.dropout2(pooled)
        logits = self.linear2(pooled)
        return logits

# Wrap the BioGPT backbone in the classification head defined above
model = QAModel(gpt_model)

# Move model to GPU if available; `device` is also read as a global by train() below
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def calculate_f1_score(preds, labels):
    """Weighted F1 between arg-max class predictions and reference labels.

    Args:
        preds: Array of per-class scores with the classes along axis 1.
        labels: Array of reference class ids; flattened before scoring.

    Returns:
        The result of ``f1_score(..., average='weighted')``.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(labels.flatten(), predicted_classes, average='weighted')

def evaluate(model, dataloader, device):
    """Score ``model`` on every batch of ``dataloader`` without tracking gradients.

    Args:
        model: Called as ``model(input_ids, attention_mask)``; returns logits.
        dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.
        device: Device each batch tensor is moved to before the forward pass.

    Returns:
        tuple: ``(accuracy, precision, recall, f1)``; the last three are the
        weighted averages from ``precision_recall_fscore_support``.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, targets = (tensor.to(device) for tensor in batch)
            # Bring the logits back to the CPU before handing them to numpy
            scores = model(input_ids, attention_mask).detach().cpu().numpy()
            predicted.extend(np.argmax(scores, axis=1))
            expected.extend(targets.cpu().numpy())

    accuracy = accuracy_score(expected, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(
        expected, predicted, average='weighted'
    )
    return accuracy, precision, recall, f1
# Define optimizer and loss function
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

def train(model, train_loader, optimizer, loss_fn, epochs=15, device=None):
    """Run ``epochs`` passes over ``train_loader``, printing loss and F1 per epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its output
            is passed to ``loss_fn`` together with the batch labels.
        train_loader: Sized iterable of ``(input_ids, attention_mask, labels)``
            tensor triples.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        epochs: Number of passes over ``train_loader``.
        device: Device each batch is moved to. Defaults to the device of the
            model's first parameter (CPU if it has none), so the function does
            not depend on a notebook-level ``device`` variable.

    Raises:
        ValueError: If ``train_loader`` has no batches.
    """
    if len(train_loader) == 0:
        raise ValueError("train_loader is empty; nothing to train on")
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    for epoch in range(epochs):
        # Re-assert training mode each epoch in case something switched the
        # model to eval mode in between.
        model.train()
        total_loss = 0
        all_preds = []
        all_labels = []

        for batch in train_loader:
            b_input_ids, b_attention_mask, b_labels = [t.to(device) for t in batch]  # Move batch to device
            optimizer.zero_grad()

            outputs = model(b_input_ids, b_attention_mask)
            loss = loss_fn(outputs, b_labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Keep the raw scores and labels on CPU for the epoch-level F1
            all_preds.append(outputs.detach().cpu().numpy())
            all_labels.append(b_labels.to('cpu').numpy())

        # Average loss per batch and F1 over every sample seen this epoch
        avg_loss = total_loss / len(train_loader)
        all_preds = np.concatenate(all_preds, axis=0)
        all_labels = np.concatenate(all_labels, axis=0)
        avg_f1_score = calculate_f1_score(all_preds, all_labels)

        print(f"Epoch {epoch+1}, Loss: {avg_loss}, F1 Score: {avg_f1_score}")

# Relies on `model`, `train_loader`, `optimizer`, `loss_fn` and `device` already existing in the kernel.
train(model, train_loader, optimizer, loss_fn)

In [None]:
from numba import cuda
# Select GPU 0 through numba, then close it.
# NOTE(review): this presumably also tears down the CUDA context PyTorch uses in
# this process, leaving GPU tensors unusable afterwards; confirm before keeping.
cuda.select_device(0)
cuda.close()

In [None]:
# numba.cuda exposes no `open()`; after `cuda.close()` the device is re-acquired
# by selecting it again, as the later GPU-reset cells in this notebook do.
cuda.select_device(0)

In [None]:
!pip install GPUtil

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    """Print GPU utilisation, release cached GPU memory, then print utilisation again.

    Calls GPUtil's ``showUtilization`` (imported as ``gpu_usage``) before and
    after, empties PyTorch's CUDA cache, and closes and re-selects device 0
    through numba.

    NOTE(review): closing the device through numba presumably invalidates any
    tensors or models still living on that GPU; confirm callers have deleted
    them first.
    """
    print("Initial GPU Usage")
    gpu_usage()

    # Return cached, currently unused blocks held by PyTorch's allocator
    torch.cuda.empty_cache()

    # Close device 0 and select it again so later cells can use it
    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()                           


In [None]:
import torch
from numba import cuda
# Same sequence as the body of free_gpu_cache(), without the utilisation printout:
# empty PyTorch's CUDA cache, then close and re-select device 0 through numba.
torch.cuda.empty_cache()
cuda.select_device(0)
cuda.close()
cuda.select_device(0)

# BiomedNLP

In [None]:
# Move the model and tensor to CPU
# Move the current `model` off the GPU before dropping it.
model.cpu()

# Delete the notebook-level reference to the model. Other live references
# (e.g. the `optimizer` built from its parameters) may still keep its tensors alive.
del model

# Clear the GPU cache
torch.cuda.empty_cache()

In [None]:
from transformers import AutoModelForMaskedLM

# Load the BiomedBERT checkpoint as a plain encoder.
# NOTE(review): `BertModel` is not imported in this cell (only AutoModelForMaskedLM is),
# so this depends on an earlier cell; the next cell loads the same checkpoint again.
bert_model = BertModel.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract")

In [None]:
import torch
from transformers import BertModel, BertTokenizer
import torch.nn as nn
import torch.optim as optim
from linear_model import QAModel
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score
import torch


from transformers import AutoTokenizer, AutoModelForMaskedLM

# Column holding the class label that encode_data() turns into the label tensor.
stratify_col = 'decision_encoded'

# Tokenizer matching the BiomedBERT checkpoint loaded below.
tokenizer =  AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract")
def encode_data(df, tokenizer):
    """Tokenize question / long-answer pairs and collect their labels.

    Args:
        df: Mapping-like object indexed by column name; the 'question',
            'long_answer' and ``stratify_col`` columns are read.
        tokenizer: Callable tokenizer, invoked once on all pairs.

    Returns:
        ``(inputs, labels)``: the tokenizer output (padded, truncated to 512
        tokens, PyTorch tensors requested) and a tensor built from the
        ``stratify_col`` column.
    """
    tokenizer_options = dict(
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=128*4,
    )
    encoded_pairs = tokenizer(text=df['question'], text_pair=df['long_answer'], **tokenizer_options)
    label_tensor = torch.tensor(df[stratify_col])
    return encoded_pairs, label_tensor

# Encode each split with the same tokenizer; the splits come from an earlier cell.
train_inputs, train_labels = encode_data(pubmedqa_train, tokenizer)
validate_inputs, validate_labels = encode_data(pubmedqa_val, tokenizer)
test_inputs, test_labels = encode_data(pubmedqa_test, tokenizer)

# Create DataLoader
def create_dataloader(inputs, labels, batch_size=64, shuffle=True):
    """Bundle token ids, attention masks and labels into a batched DataLoader.

    Args:
        inputs: Mapping with 'input_ids' and 'attention_mask' tensors.
        labels: Tensor of labels whose first dimension matches the inputs.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle samples each epoch. Defaults to True (the
            previous behaviour); pass False for validation/test loaders to get
            a reproducible batch order.

    Returns:
        A DataLoader yielding ``(input_ids, attention_mask, labels)`` batches.
    """
    dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Build one loader per split from the tensors encoded above (default batch size and shuffling).
train_loader = create_dataloader(train_inputs, train_labels)
validate_loader = create_dataloader(validate_inputs, validate_labels)
test_loader = create_dataloader(test_inputs, test_labels)

# Pre-trained BiomedBERT encoder that the QAModel head below is stacked on.
bert_model = BertModel.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract")

# Define the model
class QAModel(nn.Module):
    """Two-layer classification head on top of a pre-trained encoder.

    The hidden state of the first token is passed through
    dropout -> linear(hidden_size, 128) -> dropout -> linear(128, classes).
    """

    def __init__(self, bert_model, classes=3, dropout_prob=0.5):
        """Store the encoder and build the head.

        Args:
            bert_model: Encoder exposing ``config.hidden_size`` and returning an
                object with ``last_hidden_state`` when called.
            classes: Number of output classes.
            dropout_prob: Dropout probability used before each linear layer.
        """
        super().__init__()
        encoder_width = bert_model.config.hidden_size
        self.bert = bert_model
        self.dropout1 = nn.Dropout(dropout_prob)
        self.linear1 = nn.Linear(encoder_width, 128)
        self.dropout2 = nn.Dropout(dropout_prob)
        self.linear2 = nn.Linear(128, classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits computed from the first token's hidden state."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = encoded.last_hidden_state[:, 0, :]  # position 0 of every sequence
        projected = self.linear1(self.dropout1(first_token_state))
        return self.linear2(self.dropout2(projected))

# Stack the classification head on the BiomedBERT encoder loaded above.
model = QAModel(bert_model)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def calculate_f1_score(preds, labels):
    """Weighted F1 between arg-max class predictions and reference labels.

    Args:
        preds: 2-D array of per-class scores; the predicted class of each row
            is the index of its largest score.
        labels: Array of reference class ids; flattened before scoring.

    Returns:
        Whatever ``f1_score(..., average='weighted')`` returns.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    reference_classes = labels.flatten()
    return f1_score(reference_classes, predicted_classes, average='weighted')

def evaluate(model, dataloader, device):
    """Score ``model`` on every batch of ``dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning per-class scores with classes on axis 1.
        dataloader: Iterable of ``(input_ids, attention_mask, labels)`` tensor
            triples.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; the last three are the
        weighted averages from ``precision_recall_fscore_support``.
    """
    model.eval()  # NOTE: the model is left in eval mode when this returns
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            b_input_ids, b_attention_mask, b_labels = [t.to(device) for t in batch]  # Move batch to device
            outputs = model(b_input_ids, b_attention_mask)
            # No autograd graph exists under no_grad, so no .detach() is needed.
            logits = outputs.cpu().numpy()
            predictions.extend(np.argmax(logits, axis=1))
            true_labels.extend(b_labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')

    return accuracy, precision, recall, f1
# Define optimizer and loss function.
# AdamW updates every parameter returned by `model.parameters()`.
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

def train(model, train_loader, optimizer, loss_fn, epochs=15, device=None):
    """Run ``epochs`` passes over ``train_loader``, printing loss and F1 per epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its output
            is passed to ``loss_fn`` together with the batch labels.
        train_loader: Sized iterable of ``(input_ids, attention_mask, labels)``
            tensor triples.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        epochs: Number of passes over ``train_loader``.
        device: Device each batch is moved to. Defaults to the device of the
            model's first parameter (CPU if it has none), so the function does
            not depend on a notebook-level ``device`` variable.

    Raises:
        ValueError: If ``train_loader`` has no batches.
    """
    if len(train_loader) == 0:
        raise ValueError("train_loader is empty; nothing to train on")
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    for epoch in range(epochs):
        # Re-assert training mode each epoch in case something switched the
        # model to eval mode in between.
        model.train()
        total_loss = 0
        all_preds = []
        all_labels = []

        for batch in train_loader:
            b_input_ids, b_attention_mask, b_labels = [t.to(device) for t in batch]  # Move batch to device
            optimizer.zero_grad()

            outputs = model(b_input_ids, b_attention_mask)
            loss = loss_fn(outputs, b_labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Keep the raw scores and labels on CPU for the epoch-level F1
            all_preds.append(outputs.detach().cpu().numpy())
            all_labels.append(b_labels.to('cpu').numpy())

        # Average loss per batch and F1 over every sample seen this epoch
        avg_loss = total_loss / len(train_loader)
        all_preds = np.concatenate(all_preds, axis=0)
        all_labels = np.concatenate(all_labels, axis=0)
        avg_f1_score = calculate_f1_score(all_preds, all_labels)

        print(f"Epoch {epoch+1}, Loss: {avg_loss}, F1 Score: {avg_f1_score}")

# Relies on `model`, `train_loader`, `optimizer`, `loss_fn` and `device` defined earlier in this cell.
train(model, train_loader, optimizer, loss_fn)

In [None]:
# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Score the trained model on the held-out splits; `validate_loader` and
# `test_loader` come from the previous cell.
val_accuracy, val_precision, val_recall, val_f1 = evaluate(model, validate_loader, device)
test_accuracy, test_precision, test_recall, test_f1 = evaluate(model, test_loader, device)

# Precision, recall and F1 below are the weighted averages returned by evaluate().
print(f"Validation - Accuracy: {val_accuracy}, Precision: {val_precision}, Recall: {val_recall}, F1-Score: {val_f1}")
print(f"Test - Accuracy: {test_accuracy}, Precision: {test_precision}, Recall: {test_recall}, F1-Score: {test_f1}")

# BiDAF

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel

class BiDAF(nn.Module):
    """BiDAF-style span scorer that uses BERT token states as word embeddings.

    Pipeline: BERT states + character-CNN features -> highway network ->
    contextual BiLSTM -> attention flow -> modeling BiLSTM -> start/end scores.

    Args:
        bert_model_name: Checkpoint name passed to ``BertModel.from_pretrained``,
            or an already-instantiated encoder module exposing
            ``config.hidden_size`` and returning ``last_hidden_state``.
        hidden_size: Hidden size of each LSTM direction.
        dropout_prob: Dropout between the two modeling-LSTM layers.
    """

    def __init__(self, bert_model_name='bert-base-uncased', hidden_size=128, dropout_prob=0.2):
        super(BiDAF, self).__init__()
        self.hidden_size = hidden_size
        self.dropout_prob = dropout_prob

        # Accept a ready-made encoder as well as a checkpoint name, so callers
        # that already loaded BERT do not have to load it twice.
        if isinstance(bert_model_name, nn.Module):
            self.bert = bert_model_name
        else:
            self.bert = BertModel.from_pretrained(bert_model_name)
        bert_hidden_size = self.bert.config.hidden_size

        # Character embedding: 94 symbols (index 0 = padding) -> 8 dims, then a
        # convolution spanning 5 characters and the full embedding width.
        self.char_emb = nn.Embedding(num_embeddings=94, embedding_dim=8, padding_idx=0)
        self.char_conv = nn.Conv2d(in_channels=1, out_channels=100, kernel_size=(5, 8))

        # Highway network
        self.highway = Highway(input_size=bert_hidden_size + 100, num_layers=2)

        # Contextual embedding layer
        self.context_LSTM = nn.LSTM(input_size=bert_hidden_size + 100, hidden_size=hidden_size, num_layers=1, batch_first=True, bidirectional=True)

        # Attention flow layer
        self.att_flow = AttentionFlowLayer(hidden_size)

        # Modeling layer
        self.modeling_LSTM = nn.LSTM(input_size=hidden_size*8, hidden_size=hidden_size, num_layers=2, batch_first=True, bidirectional=True, dropout=dropout_prob)

        # Output layer
        self.output_LSTM = nn.LSTM(input_size=hidden_size*2, hidden_size=hidden_size, num_layers=1, batch_first=True, bidirectional=True)
        self.p1 = nn.Linear(hidden_size*10, 1)
        self.p2 = nn.Linear(hidden_size*10, 1)

    def _char_features(self, char_idxs):
        """Turn (batch, seq_len, word_len) character ids into (batch, seq_len, 100) features.

        ``word_len`` must be at least 5, the height of the convolution kernel.
        """
        batch_size, seq_len, word_len = char_idxs.shape
        char_vectors = self.char_emb(char_idxs)  # (batch, seq_len, word_len, 8)
        # Conv2d needs 4-D input: treat every token as its own one-channel "image".
        char_vectors = char_vectors.reshape(batch_size * seq_len, 1, word_len, -1)
        conv_out = self.char_conv(char_vectors).squeeze(-1)  # (batch*seq_len, 100, word_len-4)
        pooled = conv_out.max(dim=-1)[0]  # max over character positions
        return pooled.view(batch_size, seq_len, -1)

    def forward(self, context_input_ids, context_attention_mask, query_input_ids, query_attention_mask, context_char_idxs, query_char_idxs):
        """Score every context position as a span start / end.

        Args:
            context_input_ids, context_attention_mask: Context token ids and mask.
            query_input_ids, query_attention_mask: Query token ids and mask.
            context_char_idxs, query_char_idxs: Character ids shaped
                (batch, seq_len, word_len), with seq_len matching the token ids.

        Returns:
            ``(p1, p2)``: start and end scores, each (batch, context_len).
        """
        # BERT embeddings
        context_word_emb = self.bert(input_ids=context_input_ids, attention_mask=context_attention_mask).last_hidden_state
        query_word_emb = self.bert(input_ids=query_input_ids, attention_mask=query_attention_mask).last_hidden_state

        # Word + character features per token: (batch, len, bert_hidden_size + 100)
        context_emb = torch.cat([context_word_emb, self._char_features(context_char_idxs)], dim=-1)
        query_emb = torch.cat([query_word_emb, self._char_features(query_char_idxs)], dim=-1)

        # Highway network
        context_emb = self.highway(context_emb)
        query_emb = self.highway(query_emb)

        # Contextual embedding layer
        context_emb, _ = self.context_LSTM(context_emb)  # (batch, context_len, 2*hidden_size)
        query_emb, _ = self.context_LSTM(query_emb)  # (batch, query_len, 2*hidden_size)

        # Attention flow layer
        G = self.att_flow(context_emb, query_emb)  # (batch, context_len, 8*hidden_size)

        # Modeling layer
        M, _ = self.modeling_LSTM(G)  # (batch, context_len, 2*hidden_size)

        # Output layer
        M2, _ = self.output_LSTM(M)  # (batch, context_len, 2*hidden_size)

        # squeeze(-1) drops only the score axis, so a batch of one keeps its batch axis
        p1 = self.p1(torch.cat([G, M], dim=-1)).squeeze(-1)  # (batch, context_len)
        p2 = self.p2(torch.cat([G, M2], dim=-1)).squeeze(-1)  # (batch, context_len)

        return p1, p2

class Highway(nn.Module):
    """Stack of highway layers: ``x <- g * relu(W x) + (1 - g) * x`` with ``g = sigmoid(Wg x)``.

    Args:
        input_size: Feature size of the input (and output).
        num_layers: Number of highway layers applied in sequence.
    """

    def __init__(self, input_size, num_layers=2):
        super().__init__()
        self.num_layers = num_layers
        # All transform layers are created before the gate layers.
        self.linear = nn.ModuleList([nn.Linear(input_size, input_size) for _ in range(num_layers)])
        self.gate = nn.ModuleList([nn.Linear(input_size, input_size) for _ in range(num_layers)])

    def forward(self, x):
        """Apply every highway layer to ``x`` and return the result (same shape)."""
        for layer_idx in range(self.num_layers):
            gate_layer, transform_layer = self.gate[layer_idx], self.linear[layer_idx]
            carry_gate = torch.sigmoid(gate_layer(x))
            transformed = F.relu(transform_layer(x))
            x = carry_gate * transformed + (1 - carry_gate) * x
        return x

class AttentionFlowLayer(nn.Module):
    """Bidirectional attention between context and query encodings.

    Args:
        hidden_size: LSTM hidden size; inputs carry ``2 * hidden_size`` features.
    """

    def __init__(self, hidden_size):
        super(AttentionFlowLayer, self).__init__()
        self.hidden_size = hidden_size

        # Similarity S[i, j] = Wc.c_i + Wq.q_j + Wcq.(c_i * q_j)
        self.Wc = nn.Linear(2 * hidden_size, 1, bias=False)
        self.Wq = nn.Linear(2 * hidden_size, 1, bias=False)
        self.Wcq = nn.Linear(2 * hidden_size, 1, bias=False)

    def forward(self, context, query):
        """Fuse query information into every context position.

        Args:
            context: (batch, context_len, 2*hidden_size).
            query: (batch, query_len, 2*hidden_size).

        Returns:
            G: (batch, context_len, 8*hidden_size), the concatenation
            ``[c, c2q, c * c2q, c * q2c]``.
        """
        context_len = context.size(1)

        # Build S without materialising a (batch, context_len, query_len, 2h)
        # tensor; the inputs stay 3-D, as torch.bmm requires.
        context_term = self.Wc(context)  # (batch, context_len, 1)
        query_term = self.Wq(query).transpose(1, 2)  # (batch, 1, query_len)
        # Wcq.(c_i * q_j) == (c_i * w) . q_j, with w the single weight row of Wcq
        cross_term = torch.bmm(context * self.Wcq.weight, query.transpose(1, 2))  # (batch, context_len, query_len)
        S = context_term + query_term + cross_term

        # Context-to-query: each context position attends over the query
        c2q = torch.bmm(F.softmax(S, dim=-1), query)  # (batch, context_len, 2*hidden_size)
        # Query-to-context: one attention over context positions, tiled along context_len
        b = F.softmax(S.max(dim=-1)[0], dim=-1)  # (batch, context_len)
        q2c = torch.bmm(b.unsqueeze(1), context).expand(-1, context_len, -1)  # (batch, context_len, 2*hidden_size)

        G = torch.cat([context, c2q, context * c2q, context * q2c], dim=-1)  # (batch, context_len, 8*hidden_size)

        return G


In [None]:
import torch

# Initialize the model (loads the 'bert-base-uncased' weights inside BiDAF.__init__)
model = BiDAF(bert_model_name='bert-base-uncased')

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)


In [None]:
from transformers import BertTokenizer

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Example context and query
context = "The quick brown fox jumps over the lazy dog."
query = "What does the fox do?"

# Tokenize the context and query separately (each becomes a batch of one)
context_encodings = tokenizer(context, return_tensors='pt', padding=True, truncation=True, max_length=512)
query_encodings = tokenizer(query, return_tensors='pt', padding=True, truncation=True, max_length=512)

# Dummy character indices for context and query: random ids in [0, 94), ten per token,
# shaped (1, num_tokens, 10).
# (In practice, you need to create character indices based on your dataset)
context_char_idxs = torch.randint(0, 94, (1, context_encodings['input_ids'].size(1), 10))
query_char_idxs = torch.randint(0, 94, (1, query_encodings['input_ids'].size(1), 10))

# Move inputs to device
context_input_ids = context_encodings['input_ids'].to(device)
context_attention_mask = context_encodings['attention_mask'].to(device)
query_input_ids = query_encodings['input_ids'].to(device)
query_attention_mask = query_encodings['attention_mask'].to(device)
context_char_idxs = context_char_idxs.to(device)
query_char_idxs = query_char_idxs.to(device)


In [None]:
# Forward pass on the single example built in the previous cell
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    p1, p2 = model(context_input_ids, context_attention_mask, query_input_ids, query_attention_mask, context_char_idxs, query_char_idxs)

# p1 and p2 are the start and end logits for the answer span.
# The model is untrained here, so the printed values are not meaningful predictions.
print(p1)
print(p2)


In [None]:
import torch
from transformers import BertModel, BertTokenizer
import torch.nn as nn
import torch.optim as optim
from linear_model import QAModel
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score
import torch

# Column holding the class label that encode_data() turns into the label tensor.
stratify_col = 'decision_encoded'

# Tokenizer matching the 'bert-base-uncased' encoder loaded below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def encode_data(df, tokenizer):
    """Tokenize question / long-answer pairs and collect their labels.

    Args:
        df: Mapping-like object indexed by column name; the 'question',
            'long_answer' and ``stratify_col`` columns are read.
        tokenizer: Callable tokenizer, invoked once on all pairs.

    Returns:
        ``(inputs, labels)``: the tokenizer output (padded, truncated to 512
        tokens, PyTorch tensors requested) and a tensor built from the
        ``stratify_col`` column.
    """
    tokenizer_options = dict(
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=128*4,
    )
    encoded_pairs = tokenizer(text=df['question'], text_pair=df['long_answer'], **tokenizer_options)
    label_tensor = torch.tensor(df[stratify_col])
    return encoded_pairs, label_tensor

# Encode each split with the same tokenizer; the splits come from an earlier cell.
train_inputs, train_labels = encode_data(pubmedqa_train, tokenizer)
validate_inputs, validate_labels = encode_data(pubmedqa_val, tokenizer)
test_inputs, test_labels = encode_data(pubmedqa_test, tokenizer)

# Create DataLoader
def create_dataloader(inputs, labels, batch_size=64, shuffle=True):
    """Bundle token ids, attention masks and labels into a batched DataLoader.

    Args:
        inputs: Mapping with 'input_ids' and 'attention_mask' tensors.
        labels: Tensor of labels whose first dimension matches the inputs.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle samples each epoch. Defaults to True (the
            previous behaviour); pass False for validation/test loaders to get
            a reproducible batch order.

    Returns:
        A DataLoader yielding ``(input_ids, attention_mask, labels)`` batches.
    """
    dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Build one loader per split from the tensors encoded above (default batch size and shuffling).
train_loader = create_dataloader(train_inputs, train_labels)
validate_loader = create_dataloader(validate_inputs, validate_labels)
test_loader = create_dataloader(test_inputs, test_labels)

# Pre-trained 'bert-base-uncased' encoder, passed to BiDAF(...) at the end of this cell.
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Define the model
class BiDAF(nn.Module):
    """BiDAF-style span scorer that uses BERT token states as word embeddings.

    Pipeline: BERT states + character-CNN features -> highway network ->
    contextual BiLSTM -> attention flow -> modeling BiLSTM -> start/end scores.

    Args:
        bert_model_name: Checkpoint name passed to ``BertModel.from_pretrained``,
            or an already-instantiated encoder module exposing
            ``config.hidden_size`` and returning ``last_hidden_state``.
        hidden_size: Hidden size of each LSTM direction.
        dropout_prob: Dropout between the two modeling-LSTM layers.
    """

    def __init__(self, bert_model_name='bert-base-uncased', hidden_size=128, dropout_prob=0.2):
        super(BiDAF, self).__init__()
        self.hidden_size = hidden_size
        self.dropout_prob = dropout_prob

        # Accept a ready-made encoder as well as a checkpoint name, so callers
        # that already loaded BERT (e.g. `BiDAF(bert_model)`) keep working.
        if isinstance(bert_model_name, nn.Module):
            self.bert = bert_model_name
        else:
            self.bert = BertModel.from_pretrained(bert_model_name)
        bert_hidden_size = self.bert.config.hidden_size

        # Character embedding: 94 symbols (index 0 = padding) -> 8 dims, then a
        # convolution spanning 5 characters and the full embedding width.
        self.char_emb = nn.Embedding(num_embeddings=94, embedding_dim=8, padding_idx=0)
        self.char_conv = nn.Conv2d(in_channels=1, out_channels=100, kernel_size=(5, 8))

        # Highway network. The word-embedding width is the encoder's hidden size;
        # the `word_vectors` name used previously is not defined in this class.
        self.highway = Highway(input_size=bert_hidden_size + 100, num_layers=2)

        # Contextual embedding layer
        self.context_LSTM = nn.LSTM(input_size=bert_hidden_size + 100, hidden_size=hidden_size, num_layers=1, batch_first=True, bidirectional=True)

        # Attention flow layer
        self.att_flow = AttentionFlowLayer(hidden_size)

        # Modeling layer
        self.modeling_LSTM = nn.LSTM(input_size=hidden_size*8, hidden_size=hidden_size, num_layers=2, batch_first=True, bidirectional=True, dropout=dropout_prob)

        # Output layer
        self.output_LSTM = nn.LSTM(input_size=hidden_size*2, hidden_size=hidden_size, num_layers=1, batch_first=True, bidirectional=True)
        self.p1 = nn.Linear(hidden_size*10, 1)
        self.p2 = nn.Linear(hidden_size*10, 1)

    def _char_features(self, char_idxs):
        """Turn (batch, seq_len, word_len) character ids into (batch, seq_len, 100) features.

        ``word_len`` must be at least 5, the height of the convolution kernel.
        """
        batch_size, seq_len, word_len = char_idxs.shape
        char_vectors = self.char_emb(char_idxs)  # (batch, seq_len, word_len, 8)
        # Conv2d needs 4-D input: treat every token as its own one-channel "image".
        char_vectors = char_vectors.reshape(batch_size * seq_len, 1, word_len, -1)
        conv_out = self.char_conv(char_vectors).squeeze(-1)  # (batch*seq_len, 100, word_len-4)
        pooled = conv_out.max(dim=-1)[0]  # max over character positions
        return pooled.view(batch_size, seq_len, -1)

    def forward(self, context_input_ids, context_attention_mask, query_input_ids, query_attention_mask, context_char_idxs, query_char_idxs):
        """Score every context position as a span start / end.

        Args:
            context_input_ids, context_attention_mask: Context token ids and mask.
            query_input_ids, query_attention_mask: Query token ids and mask.
            context_char_idxs, query_char_idxs: Character ids shaped
                (batch, seq_len, word_len), with seq_len matching the token ids.

        Returns:
            ``(p1, p2)``: start and end scores, each (batch, context_len).
        """
        # BERT embeddings
        context_word_emb = self.bert(input_ids=context_input_ids, attention_mask=context_attention_mask).last_hidden_state
        query_word_emb = self.bert(input_ids=query_input_ids, attention_mask=query_attention_mask).last_hidden_state

        # Word + character features per token: (batch, len, bert_hidden_size + 100)
        context_emb = torch.cat([context_word_emb, self._char_features(context_char_idxs)], dim=-1)
        query_emb = torch.cat([query_word_emb, self._char_features(query_char_idxs)], dim=-1)

        # Highway network
        context_emb = self.highway(context_emb)
        query_emb = self.highway(query_emb)

        # Contextual embedding layer
        context_emb, _ = self.context_LSTM(context_emb)  # (batch, context_len, 2*hidden_size)
        query_emb, _ = self.context_LSTM(query_emb)  # (batch, query_len, 2*hidden_size)

        # Attention flow layer
        G = self.att_flow(context_emb, query_emb)  # (batch, context_len, 8*hidden_size)

        # Modeling layer
        M, _ = self.modeling_LSTM(G)  # (batch, context_len, 2*hidden_size)

        # Output layer
        M2, _ = self.output_LSTM(M)  # (batch, context_len, 2*hidden_size)

        # squeeze(-1) drops only the score axis, so a batch of one keeps its batch axis
        p1 = self.p1(torch.cat([G, M], dim=-1)).squeeze(-1)  # (batch, context_len)
        p2 = self.p2(torch.cat([G, M2], dim=-1)).squeeze(-1)  # (batch, context_len)

        return p1, p2

class Highway(nn.Module):
    """Stack of highway layers: ``x <- g * relu(W x) + (1 - g) * x`` with ``g = sigmoid(Wg x)``.

    Args:
        input_size: Feature size of the input (and output).
        num_layers: Number of highway layers applied in sequence.
    """

    def __init__(self, input_size, num_layers=2):
        super().__init__()
        self.num_layers = num_layers
        # All transform layers are created before the gate layers.
        self.linear = nn.ModuleList([nn.Linear(input_size, input_size) for _ in range(num_layers)])
        self.gate = nn.ModuleList([nn.Linear(input_size, input_size) for _ in range(num_layers)])

    def forward(self, x):
        """Apply every highway layer to ``x`` and return the result (same shape)."""
        for layer_idx in range(self.num_layers):
            gate_layer, transform_layer = self.gate[layer_idx], self.linear[layer_idx]
            carry_gate = torch.sigmoid(gate_layer(x))
            transformed = F.relu(transform_layer(x))
            x = carry_gate * transformed + (1 - carry_gate) * x
        return x

class AttentionFlowLayer(nn.Module):
    """Bidirectional attention between context and query encodings.

    Args:
        hidden_size: LSTM hidden size; inputs carry ``2 * hidden_size`` features.
    """

    def __init__(self, hidden_size):
        super(AttentionFlowLayer, self).__init__()
        self.hidden_size = hidden_size

        # Similarity S[i, j] = Wc.c_i + Wq.q_j + Wcq.(c_i * q_j)
        self.Wc = nn.Linear(2 * hidden_size, 1, bias=False)
        self.Wq = nn.Linear(2 * hidden_size, 1, bias=False)
        self.Wcq = nn.Linear(2 * hidden_size, 1, bias=False)

    def forward(self, context, query):
        """Fuse query information into every context position.

        Args:
            context: (batch, context_len, 2*hidden_size).
            query: (batch, query_len, 2*hidden_size).

        Returns:
            G: (batch, context_len, 8*hidden_size), the concatenation
            ``[c, c2q, c * c2q, c * q2c]``.
        """
        context_len = context.size(1)

        # Build S without materialising a (batch, context_len, query_len, 2h)
        # tensor; the inputs stay 3-D, as torch.bmm requires.
        context_term = self.Wc(context)  # (batch, context_len, 1)
        query_term = self.Wq(query).transpose(1, 2)  # (batch, 1, query_len)
        # Wcq.(c_i * q_j) == (c_i * w) . q_j, with w the single weight row of Wcq
        cross_term = torch.bmm(context * self.Wcq.weight, query.transpose(1, 2))  # (batch, context_len, query_len)
        S = context_term + query_term + cross_term

        # Context-to-query: each context position attends over the query
        c2q = torch.bmm(F.softmax(S, dim=-1), query)  # (batch, context_len, 2*hidden_size)
        # Query-to-context: one attention over context positions, tiled along context_len
        b = F.softmax(S.max(dim=-1)[0], dim=-1)  # (batch, context_len)
        q2c = torch.bmm(b.unsqueeze(1), context).expand(-1, context_len, -1)  # (batch, context_len, 2*hidden_size)

        G = torch.cat([context, c2q, context * c2q, context * q2c], dim=-1)  # (batch, context_len, 8*hidden_size)

        return G

# NOTE(review): `bert_model` is a loaded BertModel instance, but it lands in BiDAF's
# first parameter, which is named `bert_model_name`; confirm BiDAF accepts an instance here.
model = BiDAF(bert_model)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
import torch

# Initialize the model (replaces the `model` built at the end of the previous cell)
model = BiDAF(bert_model_name='bert-base-uncased')

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [None]:
def calculate_f1_score(preds, labels):
    """Weighted F1 between arg-max class predictions and reference labels.

    Args:
        preds: 2-D array of per-class scores; the predicted class of each row
            is the index of its largest score.
        labels: Array of reference class ids; flattened before scoring.

    Returns:
        Whatever ``f1_score(..., average='weighted')`` returns.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    reference_classes = labels.flatten()
    return f1_score(reference_classes, predicted_classes, average='weighted')

def evaluate(model, dataloader, device):
    """Score ``model`` on every batch of ``dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning per-class scores with classes on axis 1.
        dataloader: Iterable of ``(input_ids, attention_mask, labels)`` tensor
            triples.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; the last three are the
        weighted averages from ``precision_recall_fscore_support``.
    """
    model.eval()  # NOTE: the model is left in eval mode when this returns
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            b_input_ids, b_attention_mask, b_labels = [t.to(device) for t in batch]  # Move batch to device
            outputs = model(b_input_ids, b_attention_mask)
            # No autograd graph exists under no_grad, so no .detach() is needed.
            logits = outputs.cpu().numpy()
            predictions.extend(np.argmax(logits, axis=1))
            true_labels.extend(b_labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')

    return accuracy, precision, recall, f1
# Define optimizer and loss function.
# `model` is whatever an earlier cell left in the kernel; in this section that is the BiDAF instance.
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

def train(model, train_loader, optimizer, loss_fn, epochs=15, device=None):
    """Run ``epochs`` passes over ``train_loader``, printing loss and F1 per epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its output
            is passed to ``loss_fn`` together with the batch labels.
        train_loader: Sized iterable of ``(input_ids, attention_mask, labels)``
            tensor triples.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        epochs: Number of passes over ``train_loader``.
        device: Device each batch is moved to. Defaults to the device of the
            model's first parameter (CPU if it has none), so the function does
            not depend on a notebook-level ``device`` variable.

    Raises:
        ValueError: If ``train_loader`` has no batches.
    """
    if len(train_loader) == 0:
        raise ValueError("train_loader is empty; nothing to train on")
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    for epoch in range(epochs):
        # Re-assert training mode each epoch in case something switched the
        # model to eval mode in between.
        model.train()
        total_loss = 0
        all_preds = []
        all_labels = []

        for batch in train_loader:
            b_input_ids, b_attention_mask, b_labels = [t.to(device) for t in batch]  # Move batch to device
            optimizer.zero_grad()

            outputs = model(b_input_ids, b_attention_mask)
            loss = loss_fn(outputs, b_labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Keep the raw scores and labels on CPU for the epoch-level F1
            all_preds.append(outputs.detach().cpu().numpy())
            all_labels.append(b_labels.to('cpu').numpy())

        # Average loss per batch and F1 over every sample seen this epoch
        avg_loss = total_loss / len(train_loader)
        all_preds = np.concatenate(all_preds, axis=0)
        all_labels = np.concatenate(all_labels, axis=0)
        avg_f1_score = calculate_f1_score(all_preds, all_labels)

        print(f"Epoch {epoch+1}, Loss: {avg_loss}, F1 Score: {avg_f1_score}")

# Relies on `model`, `train_loader`, `optimizer`, `loss_fn` and `device` already existing in the kernel.
# NOTE(review): `model` was last assigned a BiDAF instance, whose forward() takes six
# inputs and returns a (p1, p2) pair, while train() calls model(input_ids, attention_mask)
# and feeds the result to the loss; this call cannot work as written. Confirm the intended model.
train(model, train_loader, optimizer, loss_fn)

# Test

In [None]:
import torch
from transformers import BertModel, BertTokenizer
import torch.nn as nn
import torch.optim as optim
from linear_model import QAModel
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score
import torch

class QAModel(nn.Module):
    """Two-layer classification head on top of an arbitrary pre-trained encoder.

    The hidden state at position 0 is passed through
    dropout -> linear(hidden_size, 128) -> dropout -> linear(128, classes).
    """

    def __init__(self, model, classes=3, dropout_prob=0.5):
        """Store the encoder and build the head.

        Args:
            model: Encoder exposing ``config.hidden_size`` and returning an
                object with ``last_hidden_state`` when called.
            classes: Number of output classes.
            dropout_prob: Dropout probability used before each linear layer.
        """
        super().__init__()
        encoder_width = model.config.hidden_size
        self.model_val = model
        self.dropout1 = nn.Dropout(dropout_prob)
        self.linear1 = nn.Linear(encoder_width, 128)
        self.dropout2 = nn.Dropout(dropout_prob)
        self.linear2 = nn.Linear(128, classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits computed from the hidden state at position 0."""
        encoded = self.model_val(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = encoded.last_hidden_state[:, 0, :]  # position 0 of every sequence
        projected = self.linear1(self.dropout1(first_token_state))
        return self.linear2(self.dropout2(projected))


class TrainandValidate:
    """Fine-tune and score a QAModel classifier on question / long-answer pairs.

    Construction tokenizes the three splits, builds their DataLoaders, and
    creates the model, optimizer (AdamW, lr 2e-5) and cross-entropy loss.

    Args:
        model_name: Free-form label. The substrings 'gpt2' and 'BioLinkBERT'
            select the tokenizer/encoder classes; 'gpt2' or 'artificial' select
            a batch size of 16 instead of 64.
        source: Checkpoint identifier passed to ``from_pretrained``.
        df_train, df_val, df_test: Splits indexed by column name; the
            'question', 'long_answer' and ``stratify_col`` columns are read.
        stratify_col: Name of the label column.
    """

    def __init__(self, model_name, source, df_train,df_val,df_test,stratify_col = 'decision_encoded'):
        self.name = model_name
        self.source = source
        if 'gpt2' in self.name or 'artificial' in self.name:
            self.batch_size = 16
        else:
            self.batch_size = 64

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = self.initialize_tokenizer()
        self.stratify_col = stratify_col

        self.train_inputs, self.train_labels = self.encode_data(df_train)
        self.validate_inputs, self.validate_labels = self.encode_data(df_val)
        self.test_inputs, self.test_labels = self.encode_data(df_test)

        # Only the training data needs reshuffling; evaluation metrics do not
        # depend on batch order, so those loaders keep a fixed order.
        self.train_loader = self.create_dataloader(self.train_inputs, self.train_labels)
        self.validate_loader = self.create_dataloader(self.validate_inputs, self.validate_labels, shuffle=False)
        self.test_loader = self.create_dataloader(self.test_inputs, self.test_labels, shuffle=False)

        self.model = self.create_model().to(self.device)
        self.optimizer = optim.AdamW(self.model.parameters(), lr=2e-5)
        self.loss_fn = nn.CrossEntropyLoss()


    def initialize_tokenizer(self):
        """Return the tokenizer matching ``self.name``, loaded from ``self.source``."""
        # Imported locally so the class does not rely on an earlier notebook
        # cell having imported these names.
        from transformers import AutoTokenizer, GPT2Tokenizer

        if 'gpt2' in self.name:
            tokenizer = GPT2Tokenizer.from_pretrained(self.source)
            # Padding is requested in encode_data, so make sure a pad token exists
            if tokenizer.pad_token is None:
                tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            return tokenizer
        elif 'BioLinkBERT' in self.name:
            return AutoTokenizer.from_pretrained(self.source)
        else:
            return BertTokenizer.from_pretrained(self.source)

    def encode_data(self,df):
        """Tokenize question / long-answer pairs of ``df`` and build the label tensor.

        Returns:
            ``(inputs, labels)``: tokenizer output (padded, truncated to 512
            tokens, PyTorch tensors) and a tensor of ``df[self.stratify_col]``.
        """
        inputs=self.tokenizer(
            text=df['question'],
            text_pair=df['long_answer'],
            padding=True,
            truncation=True,
            return_tensors='pt',
            max_length=128*4
        )
        labels = torch.tensor(df[self.stratify_col])
        return inputs, labels

    def create_dataloader(self, inputs, labels, shuffle=True):
        """Wrap ids, masks and labels in a DataLoader of ``self.batch_size`` batches.

        Args:
            inputs: Mapping with 'input_ids' and 'attention_mask' tensors.
            labels: Label tensor aligned with the inputs.
            shuffle: Whether to reshuffle each epoch (default True, as before).
        """
        dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle)

    def create_model(self):
        """Load the encoder selected by ``self.name`` and wrap it in QAModel."""
        # Imported locally for the same reason as in initialize_tokenizer.
        from transformers import AutoModel, GPT2Model

        if 'gpt2' in self.name:
            backbone = GPT2Model.from_pretrained(self.source)
            # The tokenizer may have gained a pad token; grow the embeddings to match
            backbone.resize_token_embeddings(len(self.tokenizer))
        elif 'BioLinkBERT' in self.name:
            backbone = AutoModel.from_pretrained(self.source)
        else:
            backbone = BertModel.from_pretrained(self.source)

        return QAModel(backbone)

    def calculate_f1_score(self, preds, labels):
        """Weighted F1 between arg-max predictions of ``preds`` and ``labels``."""
        preds_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels.flatten()
        return f1_score(labels_flat, preds_flat, average='weighted')

    def evaluate(self, dataloader):
        """Score ``self.model`` on ``dataloader``.

        Returns:
            ``(accuracy, precision, recall, f1)``; the last three are weighted
            averages. The model is left in eval mode.
        """
        self.model.eval()
        predictions, true_labels = [], []

        with torch.no_grad():
            for batch in dataloader:
                b_input_ids, b_attention_mask, b_labels = [t.to(self.device) for t in batch]  # Move batch to device
                outputs = self.model(b_input_ids, b_attention_mask)
                logits = outputs.cpu().numpy()  # no autograd graph under no_grad
                predictions.extend(np.argmax(logits, axis=1))
                true_labels.extend(b_labels.cpu().numpy())

        accuracy = accuracy_score(true_labels, predictions)
        precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')

        return accuracy, precision, recall, f1

    def training(self, epochs=15):
        """Train ``self.model`` for ``epochs`` passes, printing loss and F1 per epoch."""
        for epoch in range(epochs):
            # Re-assert training mode each epoch in case evaluate() ran in between
            self.model.train()
            total_loss = 0
            all_preds = []
            all_labels = []

            for batch in self.train_loader:
                b_input_ids, b_attention_mask, b_labels = [t.to(self.device) for t in batch]  # Move batch to device
                self.optimizer.zero_grad()

                outputs = self.model(b_input_ids, b_attention_mask)
                loss = self.loss_fn(outputs, b_labels)
                loss.backward()
                self.optimizer.step()

                total_loss += loss.item()

                # Collect predictions and true labels
                all_preds.append(outputs.detach().cpu().numpy())
                all_labels.append(b_labels.to('cpu').numpy())

            # Calculate average loss and F1 score for the epoch
            avg_loss = total_loss / len(self.train_loader)
            all_preds = np.concatenate(all_preds, axis=0)
            all_labels = np.concatenate(all_labels, axis=0)
            avg_f1_score = self.calculate_f1_score(all_preds, all_labels)

            print(f"Epoch {epoch+1}, Loss: {avg_loss}, F1 Score: {avg_f1_score}")


    def val(self, epochs = 10):
        """Print validation and test metrics.

        ``epochs`` is unused; it is kept so existing calls keep working.
        """
        val_accuracy, val_precision, val_recall, val_f1 = self.evaluate(self.validate_loader)
        print(f"Validation - Accuracy: {val_accuracy}, Precision: {val_precision}, Recall: {val_recall}, F1-Score: {val_f1}")

        test_accuracy, test_precision, test_recall, test_f1=self.evaluate(self.test_loader)
        print(f"Test - Accuracy: {test_accuracy}, Precision: {test_precision}, Recall: {test_recall}, F1-Score: {test_f1}")


In [None]:
# The 'artificial' substring in the name makes TrainandValidate use a batch size of 16;
# the name contains neither 'gpt2' nor 'BioLinkBERT', so BertTokenizer / BertModel are used.
model_name = 'BERT-artificial'
source = 'bert-base-uncased'

# NOTE(review): the `pubmedqa_art_*` splits are not defined in this part of the notebook;
# they must come from an earlier cell.
trainer = TrainandValidate(model_name,source,df_train = pubmedqa_art_train,df_val = pubmedqa_art_val, df_test = pubmedqa_art_test)




In [None]:
trainer.training()

In [None]:
trainer.val()

In [None]:
import gc

# Disabled: moving the model to CPU and deleting it before collecting.
#model.cpu()
#del model, checkpoint
# Collect unreachable Python objects first, then release PyTorch's cached GPU memory.
gc.collect()
torch.cuda.empty_cache()