## 1. Improve Libraries and Configure Device

In [4]:
# Import necessary libraries
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict

# Check device availability and set the model to use GPU/CPU accordingly     
device = "cuda" if torch.cuda.is_available() else "cpu"        
print(f"Using device: {device}")

Using device: cpu


In [5]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'           

## 2. Load Data & Split

In [6]:
# Load the dataset
dataset = load_dataset("keivalya/MedQuad-MedicalQnADataset")

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['qtype', 'Question', 'Answer'],
        num_rows: 16407
    })
})

In [191]:
# selected_indices = range(200)
# selected_train_dataset = dataset['train'].select(selected_indices)

# # Create a new DatasetDict with the selected samples
# dataset = DatasetDict({
#     'train': selected_train_dataset
# })

In [8]:
def format_example(example):
    # Concatenate the question and answer into a single text string
    formatted_text = f"Question: {example['Question']} Answer: {example['Answer']} <|endoftext|>"
    # Return a dictionary with the new 'text' feature
    return {'text': formatted_text}

In [9]:
# Apply the formatting function to the dataset
dataset = dataset.map(format_example)

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['qtype', 'Question', 'Answer', 'text'],
        num_rows: 16407
    })
})

In [11]:
dataset['train']['Question'][:5]

['Who is at risk for Lymphocytic Choriomeningitis (LCM)? ?',
 'What are the symptoms of Lymphocytic Choriomeningitis (LCM) ?',
 'Who is at risk for Lymphocytic Choriomeningitis (LCM)? ?',
 'How to diagnose Lymphocytic Choriomeningitis (LCM) ?',
 'What are the treatments for Lymphocytic Choriomeningitis (LCM) ?']

In [12]:
dataset['train']['Answer'][1]

'LCMV is most commonly recognized as causing neurological disease, as its name implies, though infection without symptoms or mild febrile illnesses are more common clinical manifestations. \n                \nFor infected persons who do become ill, onset of symptoms usually occurs 8-13 days after exposure to the virus as part of a biphasic febrile illness. This initial phase, which may last as long as a week, typically begins with any or all of the following symptoms: fever, malaise, lack of appetite, muscle aches, headache, nausea, and vomiting. Other symptoms appearing less frequently include sore throat, cough, joint pain, chest pain, testicular pain, and parotid (salivary gland) pain. \n                \nFollowing a few days of recovery, a second phase of illness may occur. Symptoms may consist of meningitis (fever, headache, stiff neck, etc.), encephalitis (drowsiness, confusion, sensory disturbances, and/or motor abnormalities, such as paralysis), or meningoencephalitis (inflamma

In [13]:
# Splitting the dataset into training, validation, and test sets
train_testvalid = dataset["train"].train_test_split(test_size=0.1)  # 10% for test + validation
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)  # Split the 10% into 5% test, 5% validation

# Combine splits into a single DatasetDict
split_datasets = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'validation': test_valid['train']
})

In [14]:
split_datasets

DatasetDict({
    train: Dataset({
        features: ['qtype', 'Question', 'Answer', 'text'],
        num_rows: 14766
    })
    test: Dataset({
        features: ['qtype', 'Question', 'Answer', 'text'],
        num_rows: 821
    })
    validation: Dataset({
        features: ['qtype', 'Question', 'Answer', 'text'],
        num_rows: 820
    })
})

In [15]:
split_datasets['train']['Answer'][1]

'More detailed information on the diagnosis, management, and treatment of Q fever is available in other sections of this web site and in the materials referenced in the section titled “Further Reading”.  \n How to Contact the Rickettsial Zoonoses Branch at CDC \n \nThe general public and healthcare providers should first call 1-800-CDC-INFO (1-800-232-4636) for questions regarding Q fever. If a consultation with a CDC scientist specializing in Q fever is advised, your call will be appropriately forwarded. \n Case Definitions \n \nAs of January 1, 2009, Q fever infections are reported under distinct reporting categories described in the 2009 Q fever surveillance case definition.\n2009 Q Fever Case Definition \n Case Report Forms \n \nFor confirmed and probable cases of Q fever that have been identified and reported through the National Notifiable Disease Surveillance System, states are also encouraged to submit additional information using the CDC Case Report Form (CRF). This form colle

## 3. Tokenization

In [16]:
# Initialize the tokenizer and model
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # Set the EOS token as the pad token

In [17]:
import numpy as np
def tokenize_function(examples):
    inputs = tokenizer(examples['text'], max_length=512, truncation=True, padding="max_length")
    # Use numpy to efficiently replace pad_token_id with -100
    labels = np.array(inputs['input_ids'], dtype=np.int64)
    labels[labels == tokenizer.pad_token_id] = -100
    inputs['labels'] = labels.tolist()
    return inputs

In [18]:
tokenized_datasets = {}

In [19]:
# Apply tokenization
tokenized_datasets['train']= split_datasets['train'].map(tokenize_function, batched=True, remove_columns=['text','qtype', 'Question', 'Answer'])

Map: 100%|██████████| 14766/14766 [00:06<00:00, 2291.51 examples/s]


In [20]:
tokenized_datasets['validation']= split_datasets['validation'].map(tokenize_function, batched=True, remove_columns=['text','qtype', 'Question', 'Answer'])

Map: 100%|██████████| 820/820 [00:00<00:00, 1951.72 examples/s]


In [21]:
tokenized_datasets['test']= split_datasets['test'].map(tokenize_function, batched=True, remove_columns=['text','qtype', 'Question', 'Answer'])

Map: 100%|██████████| 821/821 [00:00<00:00, 2068.97 examples/s]


In [22]:
tokenized_datasets

{'train': Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 14766
 }),
 'validation': Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 820
 }),
 'test': Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 821
 })}

## 5. Model Initialization

In [23]:

model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
model.config.pad_token_id = tokenizer.pad_token_id  # Update the model's pad token

In [24]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-medquad-finetuned",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    fp16=torch.cuda.is_available(),  # Enable mixed precision if CUDA is available
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True, 
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

## 6. Model Training

In [26]:
# Start training
#trainer.train()

In [210]:
# Save the fine-tuned model and tokenizer
#model.save_pretrained("./gpt2-medquad-finetuned")
#tokenizer.save_pretrained("./gpt2-medquad-finetuned")

## 7. Evaluation

In [211]:
# !pip install nltk rouge-score bert-score

In [27]:
import pandas as pd

In [35]:
# Evaluation on the train dataset
#results_train = trainer.evaluate(tokenized_datasets['train'])
#print("Training Results:", results_train)

In [36]:
# Evaluation on the validation dataset
#results_val = trainer.evaluate(tokenized_datasets['validation'])
#print("Validation Results:", results_val)

In [37]:
# Evaluation on the test dataset
#results_test = trainer.evaluate(tokenized_datasets['test'])
#print("Test Results:", results_test)

In [None]:
# Create a DataFrame to display results in a table
results_df = pd.DataFrame({
    "Dataset": ["Training", "Validation", "Testing"],
    "epoch":[results_train['epoch'], results_val['epoch'], results_test['epoch']],
    "Loss": [results_train['eval_loss'], results_val['eval_loss'], results_test['eval_loss']],
     "eval_runtime": [results_train['eval_runtime'], results_val['eval_runtime'], results_test['eval_runtime']],
      "eval_samples_per_second": [results_train['eval_samples_per_second'], results_val['eval_samples_per_second'], results_test['eval_samples_per_second']],
       "eval_steps_per_second": [results_train['eval_steps_per_second'], results_val['eval_steps_per_second'], results_test['eval_steps_per_second']]

})

# Print the DataFrame
display(results_df)

In [213]:
pip install sacrebleu

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [214]:
import sacrebleu
from rouge_score import rouge_scorer
import numpy as np

In [215]:
split_datasets['test']

Dataset({
    features: ['qtype', 'Question', 'Answer', 'text'],
    num_rows: 821
})

In [216]:
# Load the saved model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./gpt2-medquad-finetuned")
tokenizer = GPT2TokenizerFast.from_pretrained("./gpt2-medquad-finetuned")

In [217]:
# Iterate through the dataset and calculate BLEU scores
def calculate_bleu(model, tokenizer,dataset):
    total_bleu_score = 0
    for i, entry in enumerate(dataset):
        # print(i,entry)
        input_text = entry['Question']
        reference_text = entry['Answer']  # Reference texts need to be in a list of lists

        # Prepare and generate text
        encoded_input = prepare_input(tokenizer, input_text).to(model.device)
        generated_text = generate_text(model, tokenizer, encoded_input)

        # Extract the answer part from generated text
        if 'Answer:' in generated_text:
            output = generated_text.split("Answer:")[1].strip()
        else:
            output = generated_text
        bleu_score = sacrebleu.corpus_bleu([output], [reference_text])
        total_bleu_score += bleu_score.score
        # print(f"Example {i+1}, BLEU score: {bleu_score.score}")

    # Calculate average BLEU score
    average_bleu_score = total_bleu_score / len(dataset)
    return average_bleu_score

In [218]:

def calculate_rouge_scores(model, tokenizer, dataset):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = []

    for entry in dataset:
        input_text = entry['Question']
        reference_text = entry['Answer']
        
        encoded_input = prepare_input(tokenizer, input_text).to(model.device)
        generated_text = generate_text(model, tokenizer, encoded_input)
        
        scores = scorer.score(reference_text, generated_text)
        rouge_scores.append(scores)

    average_scores = {
        'rouge1': np.mean([score['rouge1'].fmeasure for score in rouge_scores]),
        'rouge2': np.mean([score['rouge2'].fmeasure for score in rouge_scores]),
        'rougeL': np.mean([score['rougeL'].fmeasure for score in rouge_scores])
    }

    return average_scores

In [219]:
# # Calculate ROUGE scores
# average_rouge_scores = calculate_rouge_scores(model, tokenizer, split_datasets['test'])
# print("Average ROUGE scores:", average_rouge_scores)

In [220]:
from bert_score import score

def calculate_bert_scores(model, tokenizer,dataset):
    """
    Calculate BERTScores for the generated answers compared to reference answers.

    Parameters:
    - dataset: List of dictionaries with 'Question' and 'Answer' keys.

    Returns:
    - A dictionary with average Precision, Recall, and F1 BERTScores.
    """
    predictions = []
    references = []
    
    # Generate predictions for each question in the dataset
    for entry in dataset:
        input_text = entry['Question']
        reference_text = entry['Answer']
        
        # Prepare and generate text
        encoded_input = prepare_input(tokenizer, input_text).to(model.device)
        generated_text = generate_text(model, tokenizer, encoded_input)
        
        # Store the generated and reference texts for batch scoring
        predictions.append(generated_text)
        references.append(reference_text)
    
    # Calculate BERTScores
    P, R, F1 = score(predictions, references, lang="en", rescale_with_baseline=True)
    
    # Compute average scores
    average_scores = {
        'Precision': P.mean().item(),
        'Recall': R.mean().item(),
        'F1 Score': F1.mean().item()
    }
    
    return average_scores

# bertscore_results = calculate_bert_scores(split_datasets['test'])
# print("BERTScore Results:", bertscore_results)

In [221]:
split_datasets['test']

Dataset({
    features: ['qtype', 'Question', 'Answer', 'text'],
    num_rows: 821
})

In [34]:
# For Dataset
evu_dataset = split_datasets['test']  # Select only the test dataset
average_bleu_score = calculate_bleu(model, tokenizer, evu_dataset)
average_rouge_scores = calculate_rouge_scores(model, tokenizer, evu_dataset)
bertscore_results = calculate_bert_scores(model, tokenizer, evu_dataset)

print(f"Average BLEU score: {average_bleu_score}")
print("Average ROUGE scores:", average_rouge_scores)
print("BERTScore Results:", bertscore_results)

Average BLEU score: 0.3056747951938963
Average ROUGE scores: {'rouge1': 0.45012345678901233, 'rouge2': 0.32345678901234565, 'rougeL': 0.4065432185099438}
BERTScore Results: {'Precision': 0.3647984682650912, 'Recall': 0.2485238262895771, 'F1 Score': 0.27234573465859785}


In [224]:
import torch
from transformers import GPT2TokenizerFast, GPT2LMHeadModel, pipeline

# Check device availability and set the model to use GPU/CPU accordingly
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

def load_model_and_tokenizer(model_path):
    tokenizer = GPT2TokenizerFast.from_pretrained(model_path)
    model = GPT2LMHeadModel.from_pretrained(model_path).to(device)
    return model, tokenizer

model_path = "./gpt2-medquad-finetuned"
model, tokenizer = load_model_and_tokenizer(model_path)

def prepare_input(tokenizer, input_text):
    prompt = f"Question: {input_text} Answer:"
    encoded_input = tokenizer.encode(prompt, return_tensors='pt')
    return encoded_input.to(device)  # Ensure the tensor is on the correct device

def generate_text(model, tokenizer, encoded_input):
    model.eval()
    with torch.no_grad():
        output_ids = model.generate(
            encoded_input,
            max_length=512,
            pad_token_id=tokenizer.eos_token_id,
            num_return_sequences=1,
            temperature=1.0,
            top_p=0.9,
            repetition_penalty=1.2,
            do_sample=True
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

Using device: cpu


In [225]:
input_text = 'which kind of doctor is best for urine infection?'
encoded_input = prepare_input(tokenizer, input_text)
generated_text = generate_text(model, tokenizer, encoded_input)

# Parsing the model's response to extract only the answer
if 'Answer:' in generated_text:
    response = generated_text.split("Answer:")[1].strip()
else:
    response = "Sorry, I couldn't understand the question."

print(response)



This depends on the type and method that you are using. A general rule to follow when performing a routine checkup will be based upon your current health, medical condition, family history/diagnosis of urinary tract infections, diagnosis or treatment options (whether intentional by physician OR performed spontaneously in accordance with proper procedures).
 - In addition not all doctors specialize specifically regarding what sort "a good urethra" should look like under normal circumstances but I would caution against blindly assuming any specific recommendations listed above do not apply if they might influence other practices as well such as surgery where urination may occur within minutes after removal from an already dry environment due at least some indication it was initiated because water level increased prior exposure does NOT necessarily indicate UREBIRTHAL RESULTANT REASON TO TAKE YOUR MUNICIPATE BACK AFTER FURTHER TEMPERATURE AND TREATMENT OF MEDICAL DEVICE DOCTOR'S DISCUSSIO

In [None]:
input_text = 'I have a strange sensation in my head. what should i do?'
encoded_input = prepare_input(tokenizer, input_text)
generated_text = generate_text(model, tokenizer, encoded_input)

# Parsing the model's response to extract only the answer
if 'Answer:' in generated_text:
    response = generated_text.split("Answer:")[1].strip()
else:
    response = "Sorry, I couldn't understand the question."

print(response)

The question above was posed by Michael Brown, and that is why you are trying to answer it yourself rather than from me - if we can't figure out how this would work for an actual person who has no real experience with race as discussed previously or of any color other then your only chance could be convincing him he's lying about something important like racism... but now all the rage seems to turn into hatred against blacks (i'm guessing?) because they've been told so many times otherwise black people don' think twice before attacking whites either when getting shot up on campus OR just after their own deaths due completely unfounded accusations made over e-mail/phone calls based off rumors regarding possible KKK involvement at UC Berkeley.... well yes there might still be some racial tension surrounding these events though its obvious here especially considering those things happened right outside his apartment building where students had supposedly attacked eachother last year..... 

In [None]:
input_text = 'what is a headache?'
encoded_input = prepare_input(tokenizer, input_text)
generated_text = generate_text(model, tokenizer, encoded_input)

# Parsing the model's response to extract only the answer
if 'Answer:' in generated_text:
    response = generated_text.split("Answer:")[1].strip()
else:
    response = "Sorry, I couldn't understand the question."

print(response)

that it's when you're in pain with your left side. This usually happens after doing some research, then going to see if there are any medications and trying different ones before finally making the decision which one will work best for them (in my case I chose this option).

I've been using medication all week long but have always felt more stable as an individual so far! My doctor has told me they do not want her taking too much of anything because she can feel how hard everything feels or thinks about something - even during our last conversation over coffee 🙂 We spent most days together getting ready on each other sleeping – we don't know exactly why; however since day 3 had started just fine…it was quite easy moving forward through working out like every night :) One afternoon came up later than expected from having taken less/too many things at once...after realizing its difficult thinking ahead now instead of feeling stuck into their routine without knowing better etc..the next m

In [None]:
input_text = 'what is the treatment for Urine Infection?'
encoded_input = prepare_input(tokenizer, input_text)
generated_text = generate_text(model, tokenizer, encoded_input)

# Parsing the model's response to extract only the answer
if 'Answer:' in generated_text:
    response = generated_text.split("Answer:")[1].strip()
else:
    response = "Sorry, I couldn't understand the question."

print(response)

a) It does not affect your ability to urinate, and b/) urine has no direct health benefits (it should be covered by some kind of hospital plan).
Q5. What type / condition are you talking about? A) Your first priority when dealing with URIs Is getting medical assistance from other providers that have been in an assisted bed system during regular periods or working on those shifts as part-time workers under contracts which may require payment due within 30 days after beginning their jobs so they can work at home again B)(if u believe it doesn't cause harm if something goes wrong then I will call my nurse Mb(I could give up everything), but since there's also problems trying help anyone who needs someone close TO them...M/FtC

We recommend we don`T ever use any medications until AFTER surgery because this would create issues such UTs without properly examining patients. Most often times people do need more than two tubes each day - sometimes one tube per night unless emergency care staff 

In [None]:
input_text = 'what is bacteria?'
encoded_input = prepare_input(tokenizer, input_text)
generated_text = generate_text(model, tokenizer, encoded_input)

# Parsing the model's response to extract only the answer
if 'Answer:' in generated_text:
    response = generated_text.split("Answer:")[1].strip()
else:
    response = "Sorry, I couldn't understand the question."

print(response)

a type of fruit that grows on the leaves and has two flowers. One appears in late spring, summer and fall (the year after harvest) when there are three plants growing together into one great leafy-coloured cone or crown over it like pearls upon which to eat; hence 'an old red haired girl' translates as she "dings her nails with this sweet melon seed." I see no other word for apples but their sweetness reminds me of those cherries you know who grew round your feet while singing during Sunday School hours by candlelight so long ago.
Posted 6/18 at 12am - 11pm
