In [11]:
# Install the necessary libraries
!pip install transformers datasets nltk

# Mount Google Drive to access the BioASQ dataset
from google.colab import drive
drive.mount('/content/drive')

import json
import os
from transformers import BertTokenizer, BertForMaskedLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import torch
from torch.utils.data import Dataset, DataLoader

# Download NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# Define the path to BioASQ data in Google Drive
bioasq_path = '/content/drive/My Drive/Fall2024/CS410/FinalProject/training12b_new.json'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
class BioASQProcessor:
    def __init__(self, bioasq_path):
        self.punctuations = '"\\,<>./?@#$%^&*_~/!()-[]{};:\''
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.bioasq_path = bioasq_path
        self.dataset = self.load_dataset()
        self.info = self.collect_info()

    def load_dataset(self):
        # Load BioASQ dataset
        with open(self.bioasq_path, 'r') as file:
            return json.load(file)

    def preprocess_text(self, text):
        # Lowercase and remove HTML
        text = text.lower().strip()
        if '<' in text:
            text = text.split('<')[0]
        if '>' in text:
            text = text.split('>')[-1]

        # Remove punctuation, tokenize, remove stopwords, and lemmatize
        clean_text = ''.join(char for char in text if char not in self.punctuations and not char.isdigit())
        words = clean_text.split()
        clean_words = [self.lemmatizer.lemmatize(word) for word in words if word not in self.stop_words]

        return ' '.join(clean_words)

    def preprocess_data(self):
        # Preprocess questions, snippets, and answers in each entry
        for entry in self.dataset['questions']:
            entry['body'] = self.preprocess_text(entry['body'])
            for snippet in entry['snippets']:
                snippet['text'] = self.preprocess_text(snippet['text'])
            if 'ideal_answer' in entry:
                entry['ideal_answer'] = [self.preprocess_text(ans) for ans in entry['ideal_answer']]
            if 'concepts' in entry:
                entry['concepts'] = [self.preprocess_text(con) for con in entry['concepts']]
        return self.dataset

    def collect_info(self):
        # Extract and display information about the dataset
        num_entries = len(self.dataset['questions'])
        dataset_size = os.path.getsize(self.bioasq_path) / 1024  # Size in KB
        vocab = set()

        for entry in self.dataset['questions']:
            vocab.update(entry['body'].split())
            for snippet in entry['snippets']:
                vocab.update(snippet['text'].split())
            if 'ideal_answer' in entry:
                for ans in entry['ideal_answer']:
                    vocab.update(ans.split())
            if 'concepts' in entry:
                vocab.update(entry['concepts'])

        return {
            'num_entries': num_entries,
            'dataset_size_kb': dataset_size,
            'vocab_size': len(vocab),
            'avg_words_per_entry': sum(len(entry['body'].split()) + sum(len(snippet['text'].split()) for snippet in entry['snippets']) for entry in self.dataset['questions']) / num_entries,
        }

# Process BioASQ Data
bioasq_processor = BioASQProcessor(bioasq_path)
preprocessed_data = bioasq_processor.preprocess_data()
info = bioasq_processor.info
print("Dataset Info:", info)

Dataset Info: {'num_entries': 5049, 'dataset_size_kb': 38994.5361328125, 'vocab_size': 138371, 'avg_words_per_entry': 348.4848484848485}


In [13]:
class BioASQProcessor:
    def __init__(self, bioasq_path):
        self.punctuations = '"\\,<>./?@#$%^&*_~/!()-[]{};:\''
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.bioasq_path = bioasq_path
        self.dataset = self.load_dataset()
        self.info = self.collect_info()

    def load_dataset(self):
        # Load BioASQ dataset
        with open(self.bioasq_path, 'r') as file:
            return json.load(file)

    def preprocess_text(self, text, word_counter):
        # Count words before preprocessing
        word_counter['before'] += len(text.split())

        # Lowercase and remove HTML
        text = text.lower().strip()
        if '<' in text:
            text = text.split('<')[0]
        if '>' in text:
            text = text.split('>')[-1]

        # Remove punctuation, tokenize, remove stopwords, and lemmatize
        clean_text = ''.join(char for char in text if char not in self.punctuations and not char.isdigit())
        words = clean_text.split()
        clean_words = [self.lemmatizer.lemmatize(word) for word in words if word not in self.stop_words]

        # Count words after preprocessing
        word_counter['after'] += len(clean_words)
        word_counter['unique_words'].update(clean_words)

        return ' '.join(clean_words)

    def preprocess_data(self):
        word_counter = {'before': 0, 'after': 0, 'unique_words': set()}

        # Preprocess questions, snippets, and answers in each entry
        for entry in self.dataset['questions']:
            entry['body'] = self.preprocess_text(entry['body'], word_counter)
            for snippet in entry['snippets']:
                snippet['text'] = self.preprocess_text(snippet['text'], word_counter)
            if 'ideal_answer' in entry:
                entry['ideal_answer'] = [self.preprocess_text(ans, word_counter) for ans in entry['ideal_answer']]
            if 'concepts' in entry:
                entry['concepts'] = [self.preprocess_text(con, word_counter) for con in entry['concepts']]

        # Add word count info to the dataset info
        self.info['word_count_before'] = word_counter['before']
        self.info['word_count_after'] = word_counter['after']
        self.info['unique_words_after'] = len(word_counter['unique_words'])

        return self.dataset

    def collect_info(self):
        # Extract and display information about the dataset
        num_entries = len(self.dataset['questions'])
        dataset_size = os.path.getsize(self.bioasq_path) / 1024  # Size in KB
        vocab = set()

        for entry in self.dataset['questions']:
            vocab.update(entry['body'].split())
            for snippet in entry['snippets']:
                vocab.update(snippet['text'].split())
            if 'ideal_answer' in entry:
                for ans in entry['ideal_answer']:
                    vocab.update(ans.split())
            if 'concepts' in entry:
                vocab.update(entry['concepts'])

        return {
            'num_entries': num_entries,
            'dataset_size_kb': dataset_size,
            'vocab_size': len(vocab),
            'avg_words_per_entry': sum(len(entry['body'].split()) + sum(len(snippet['text'].split()) for snippet in entry['snippets']) for entry in self.dataset['questions']) / num_entries,
        }

# Process BioASQ Data
bioasq_processor = BioASQProcessor(bioasq_path)
preprocessed_data = bioasq_processor.preprocess_data()
info = bioasq_processor.info

# Print dataset information including word counts before and after preprocessing
print("Dataset Info:", info)

Dataset Info: {'num_entries': 5049, 'dataset_size_kb': 38994.5361328125, 'vocab_size': 138371, 'avg_words_per_entry': 348.4848484848485, 'word_count_before': 2104161, 'word_count_after': 1345637, 'unique_words_after': 57154}


In [14]:
import json

print(json.dumps(info, indent=4))


{
    "num_entries": 5049,
    "dataset_size_kb": 38994.5361328125,
    "vocab_size": 138371,
    "avg_words_per_entry": 348.4848484848485,
    "word_count_before": 2104161,
    "word_count_after": 1345637,
    "unique_words_after": 57154
}


In [15]:
!pip install datasets
from datasets import Dataset




In [16]:
# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.save_pretrained("bioasq_tokenizer")

# Convert processed BioASQ data to text for MLM
texts = []
for entry in preprocessed_data['questions']:
    question = entry['body']
    snippets = " ".join(snippet['text'] for snippet in entry.get('snippets', []))
    combined_text = question + " " + snippets
    texts.append(combined_text)

# Create a Hugging Face Dataset and tokenize
bioasq_dataset = Dataset.from_dict({'text': texts})

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

# Tokenize the dataset
tokenized_datasets = bioasq_dataset.map(tokenize_function, batched=True, remove_columns=['text'])




Map:   0%|          | 0/5049 [00:00<?, ? examples/s]

In [17]:
# Load the BERT model for MLM
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

# Data collator with MLM functionality
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
from transformers import Trainer, TrainingArguments

# Define TrainingArguments with evaluation disabled
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    evaluation_strategy="no",  # Disable evaluation
)

# Initialize Trainer with only train dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,  # Your training data
    eval_dataset=None,  # Explicitly set to None to skip evaluation
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Train the model
trainer.train()



Step,Training Loss
500,0.8455
1000,2.3999
1500,2.7236


TrainOutput(global_step=1896, training_loss=2.1246726925363015, metrics={'train_runtime': 507.6458, 'train_samples_per_second': 29.838, 'train_steps_per_second': 3.735, 'total_flos': 996690824870400.0, 'train_loss': 2.1246726925363015, 'epoch': 3.0})

In [23]:
# Save the fine-tuned model
model.save_pretrained("./biomedical-bert")
tokenizer.save_pretrained("./biomedical-bert")


('./biomedical-bert/tokenizer_config.json',
 './biomedical-bert/special_tokens_map.json',
 './biomedical-bert/vocab.txt',
 './biomedical-bert/added_tokens.json')

In [26]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Load the trained model and tokenizer
model_path = "./biomedical-bert"
tokenizer_path = "./biomedical-bert"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
model = AutoModelForQuestionAnswering.from_pretrained(model_path)

def get_manual_answer(question, context):
    # Tokenize and encode question and context
    inputs = tokenizer(question, context, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        answer_start = torch.argmax(outputs.start_logits)
        answer_end = torch.argmax(outputs.end_logits) + 1
        answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs.input_ids[0][answer_start:answer_end]))
    return answer

def manual_testing():
    print("Manual Testing of BioASQ QA Model")
    print("Enter 'quit' to exit.")
    while True:
        # Input question
        question = input("Enter your biomedical question: ")
        if question.lower() == 'quit':
            print("Exiting manual testing.")
            break

        # Input optional context (or leave it blank)
        context = input("Enter context (or press Enter to skip): ")

        # Get answer from the model
        answer = get_manual_answer(question, context)
        print(f"Answer: {answer}\n")

# Start manual testing
manual_testing()

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at ./biomedical-bert and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Manual Testing of BioASQ QA Model
Enter 'quit' to exit.
Enter your biomedical question: quit
Exiting manual testing.
