# Health Fact or Fiction? A Comparison of BERT-Based Models and LLMs on Detecting Health Misinformation About COVID-19 and Measles
*High Risk Project, uaa99, Spring 2025*

### Part 0: Dependencies

For this project you will need to set your Google Gemini API key below.

In [None]:
!pip install transformers datasets scikit-learn
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
from google import genai
from google.genai import types
import numpy as np
import wandb
import json
import time
import logging
from concurrent.futures import ThreadPoolExecutor

import os

# Weights & Biases is initialised in disabled mode for this notebook.
wandb.init(mode='disabled')
# Prefer a key supplied through the GEMINI_API_KEY environment variable so the
# secret does not have to be pasted into (and saved with) the notebook. The
# literal placeholder is kept as a fallback for users who edit it in place.
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY", "YOUR API KEY HERE"))

## Part 1: Model Selection and Preparation

We're going to evaluate four models on this task: BERT, ClinicalBERT, BiomedBERT, and Gemini Flash.

In [None]:
# General-domain baseline: bert-base-uncased with a sequence-classification head.
# num_labels=3 matches the three integer classes used in this notebook (0, 1, 2).
# force_download=True asks for a fresh download rather than reusing a cached copy.
bertSeqClass = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3, force_download=True)
# Tokenizer loaded from the same checkpoint name as the model above.
bertSeqTokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Bio_ClinicalBERT checkpoint with a 3-class sequence-classification head
# (same num_labels as the other BERT-based models in this notebook).
clinicalBert = AutoModelForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", num_labels=3, force_download=True)
# Tokenizer loaded from the same checkpoint name as the model above.
clinicalBertTokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

In [None]:
# BiomedBERT (abstract + full-text) checkpoint with a 3-class
# sequence-classification head, matching the other BERT-based models.
bioMedBert = AutoModelForSequenceClassification.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext", num_labels=3, force_download=True)
# Tokenizer loaded from the same checkpoint name as the model above.
bioMedBertTokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext")

In [None]:
def get_response_from_gemini(system_instruction, content):
    """Send one request to the Gemini model and return the raw response object.

    Args:
        system_instruction: Text passed as the system instruction of the request.
        content: The user content to generate a reply for.

    Returns:
        Whatever ``client.models.generate_content`` returns for this request.
    """
    generation_config = types.GenerateContentConfig(
        system_instruction=system_instruction,
    )
    response = client.models.generate_content(
        model="gemini-2.0-flash-lite",
        config=generation_config,
        contents=content,
    )
    return response

In [None]:
def evaluate_claim_with_llm(claim, tokenizer, model):
    """Ask a local chat model to rate a claim and return its raw answer text.

    Args:
        claim: The claim to evaluate; sent as the user message.
        tokenizer: Object providing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        model: Object providing ``generate``.

    Returns:
        The decoded text that follows the last ``<|im_start|>assistant`` marker,
        stripped of surrounding whitespace. If the marker is absent, the whole
        decoded output is returned.
    """
    sys_message = '''
    You are an AI Medical Assistant trained on a vast dataset of health information. Please evaluate the provided claim
    and respond with the following determination:
    0 - The claim is false
    1 - The claim is true
    2 - I am unable to make a determination

    Please only respond with a 0, 1, or 2. Do not include any other text.
    '''
    system_turn = {"role": "system", "content": sys_message}
    user_turn = {"role": "user", "content": claim}

    # Render the two-turn conversation into a single prompt string, then tokenize it.
    rendered_prompt = tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )
    encoded = tokenizer(rendered_prompt, return_tensors="pt")
    generated = model.generate(**encoded, max_new_tokens=100, use_cache=False)

    # The decoded text still contains the prompt; keep only what follows the
    # final assistant marker.
    decoded = tokenizer.batch_decode(generated)
    response_text = decoded[0].strip()
    print(response_text)
    _, _, tail = response_text.rpartition('<|im_start|>assistant')
    return tail.strip()

## Part 2: Data Loading and Preparation
Let's begin by loading the data we need to train and evaluate our models. We will use the COVID-19 News Rumors dataset from [A COVID-19 Rumor Dataset](https://www.frontiersin.org/journals/psychology/articles/10.3389/fpsyg.2021.644801/full), published in Frontiers in Psychology, and the Measles Rumors dataset, which I created. Measles Rumors is publicly available at this link; please download it and save it somewhere accessible to this notebook.

In [None]:
# Path to the COVID-19 news rumor CSV, relative to the notebook's working directory.
covid_claims = "./news.csv"
# The file is read as header-less (header=None); column names are supplied here.
df_covid = pd.read_csv(covid_claims, header=None, names=["id", "label", "text", "sentiment"])
# Preview the first rows.
df_covid.head()

In [None]:
# Path to the measles claims CSV, relative to the notebook's working directory.
measles_claims = "./measles_claims.csv"
# Unlike the COVID file, this one is read with its own header row; the code
# below relies on it having 'label' and 'text' columns.
df_measles = pd.read_csv(measles_claims)
# Class distribution of the measles claims.
print(df_measles['label'].value_counts())

In [None]:
# Map the string labels to integer class ids: F -> 0, T -> 1, and both
# 'U' and 'U(Twitter)' -> 2.
label_map = {'F': 0, 'T': 1, 'U': 2, 'U(Twitter)': 2}
# label_map.get returns None for any label string not listed above, so
# unexpected labels are not rejected here.
df_covid['label'] = df_covid['label'].map(lambda x: label_map.get(x))
df_covid.head()

In [None]:
# Class distribution of the COVID claims after mapping labels to integers.
print(df_covid['label'].value_counts())

In [None]:
# Create the train/validation split of the COVID claims: 20% is held out for
# validation, and random_state is fixed so the split is reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_covid['text'].tolist(),
    df_covid['label'].tolist(),
    test_size=0.2,
    random_state=42
)

# The measles claims are used only for evaluation, so they are not split.
# NOTE(review): label_map is not applied here — assumes measles_claims.csv
# already stores integer labels 0/1/2; confirm.
val_texts_measles, val_labels_measles = df_measles['text'].tolist(), df_measles['label'].tolist()

In [None]:
# Custom PyTorch dataset pairing tokenizer encodings with labels.
class MisinformationDataset(Dataset):
    """Map-style dataset over tokenizer encodings and their class labels.

    ``encodings`` is a mapping from field name to a per-example sequence;
    ``labels`` is a sequence with one class id per example.
    """

    def __init__(self, encodings, labels):
        """Store the encodings and labels without copying them."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including a ``labels`` entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is stored as a long tensor under the key 'labels'.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

## Part 3: Model Training

Now let's train all three BERT-based models using the HuggingFace trainer API.

In [None]:
def get_training_args(num_epochs):
    """Build the TrainingArguments used for fine-tuning in this notebook.

    Args:
        num_epochs: Value passed as ``num_train_epochs``.

    Returns:
        A ``TrainingArguments`` instance. Evaluation and checkpointing are both
        set to 'epoch', and the best model is chosen by the 'accuracy' metric.
    """
    # NOTE(review): output_dir is passed as None — whether that is accepted
    # depends on the installed transformers version; confirm.
    batching = {
        'per_device_train_batch_size': 16,
        'per_device_eval_batch_size': 64,
    }
    optimisation = {
        'learning_rate': 2e-5,
        'lr_scheduler_type': 'linear',
        'warmup_steps': 100,
        'weight_decay': 0.01,
    }
    checkpointing = {
        'eval_strategy': 'epoch',
        'save_strategy': 'epoch',
        'load_best_model_at_end': True,
        'metric_for_best_model': 'accuracy',
    }
    reporting = {
        'logging_dir': './logs',
        'report_to': "none",
    }
    return TrainingArguments(
        output_dir=None,
        num_train_epochs=num_epochs,
        **batching,
        **optimisation,
        **checkpointing,
        **reporting,
    )

In [None]:
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        p: A pair that unpacks into ``(predictions, labels)``; the predicted
            class is the argmax of the predictions over their last axis.

    Returns:
        Dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    scores, gold = p
    predicted = scores.argmax(axis=-1)
    acc = accuracy_score(gold, predicted)
    # 'weighted' averaging is used because this is a multiclass problem.
    prf = precision_recall_fscore_support(gold, predicted, average='weighted')
    return {
        'accuracy': acc,
        'precision': prf[0],
        'recall': prf[1],
        'f1': prf[2],
    }

In [None]:
def get_trainer(model, tokenizer, training_args):
    """Create a Trainer for ``model`` on the COVID train/validation split.

    Reads the module-level ``train_texts``, ``train_labels``, ``val_texts``
    and ``val_labels``.

    Args:
        model: Sequence-classification model to fine-tune.
        tokenizer: Tokenizer used to encode both splits.
        training_args: Arguments passed to the Trainer as ``args``.

    Returns:
        A ``Trainer`` with train/eval datasets and ``compute_metrics`` attached.
    """
    # If the tokenizer has no pad token, reuse EOS as the pad token on both
    # the tokenizer and the model config.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id

    def _as_dataset(texts, labels):
        # Encode a whole split at once, with truncation and padding.
        encoded = tokenizer(texts, truncation=True, padding=True)
        return MisinformationDataset(encoded, labels)

    training_split = _as_dataset(train_texts, train_labels)
    validation_split = _as_dataset(val_texts, val_labels)

    return Trainer(
        model=model,
        args=training_args,
        train_dataset=training_split,
        eval_dataset=validation_split,
        compute_metrics=compute_metrics,
    )

In [None]:
# Initialise one trainer per BERT-based model. All three share the same
# TrainingArguments object (15 epochs) and therefore the same output and
# logging settings.
training_args = get_training_args(15)
bert_trainer = get_trainer(bertSeqClass, bertSeqTokenizer, training_args)
clinical_bert_trainer = get_trainer(clinicalBert, clinicalBertTokenizer, training_args)
bio_bert_trainer = get_trainer(bioMedBert, bioMedBertTokenizer, training_args)

In [None]:
# Train the three models one after another.
# The display names are keys only; the loop below does not use them.
trainers = {
    "BERT": bert_trainer,
    "ClinicalBERT": clinical_bert_trainer,
    "BioBERT": bio_bert_trainer,
}

for name, trainer in trainers.items():
  trainer.train()
  # Release cached GPU memory before the next model is trained.
  torch.cuda.empty_cache()

## Part 4: Evaluating Performance

In [None]:
# Evaluate each trainer on the eval dataset it was built with in get_trainer
# (the COVID validation split).
bert_results = bert_trainer.evaluate()
clinical_bert_results = clinical_bert_trainer.evaluate()
bio_bert_results = bio_bert_trainer.evaluate()

In [None]:
# Pretty-print the BERT metrics on the COVID validation split.
print("Bert Evaluation Results:", json.dumps(bert_results, indent=4))

In [None]:
# Pretty-print the ClinicalBERT metrics on the COVID validation split.
print("ClinicalBert Evaluation Results:", json.dumps(clinical_bert_results, indent=4))

In [None]:
# Pretty-print the BiomedBERT metrics on the COVID validation split.
print("BioBert Evaluation Results:", json.dumps(bio_bert_results, indent=4))

In [None]:
def evaluate_llm(val_texts, val_labels):
    """Query Gemini once per claim, throttled to a fixed requests-per-minute budget.

    Requests are sent sequentially. After every ``requests_per_minute``
    attempts the function sleeps for the remainder of the current 60-second
    window, if any, before continuing.

    Args:
        val_texts: Sequence of claim strings to classify.
        val_labels: Gold labels for the claims. Kept for signature
            compatibility; this function does not read them.

    Returns:
        A list with exactly one entry per claim, in input order: the response
        object returned by ``get_response_from_gemini``, or ``None`` when the
        request raised. Keeping a placeholder for failures means index ``i``
        of the result always corresponds to ``val_texts[i]``.
    """
    gemini_prompt = """
    You are a helpful medical assistant. Your job is to evaluate the factuality of a sentance about a health topic.
    Please respond with one of the following options:
    1. 0: The sentance is false, misleading, or inaccurate
    2. 1: The sentance is true, factual, or correct
    3. 2: You are unable to verify the factuality of the sentance.

    Do not include any other text with the response.
    """
    num_items = len(val_texts)
    requests_per_minute = 30
    preds = []
    start_time = time.time()

    for requests_sent, claim in enumerate(val_texts, start=1):
        try:
            preds.append(get_response_from_gemini(gemini_prompt, claim))
        except Exception as e:
            # Best effort: report the failure and carry on, but record a
            # placeholder so later predictions stay aligned with their labels.
            print(f"Error for request {requests_sent}/{num_items}: {e}")
            preds.append(None)

        # Failed attempts count against the rate limit too.
        if requests_sent % requests_per_minute == 0:
            elapsed_time = time.time() - start_time
            if elapsed_time < 60:
                sleep_duration = 60 - elapsed_time
                print(f"Sent {requests_sent}/{num_items} requests. Sleeping for {sleep_duration:.2f} seconds to maintain rate limit of {requests_per_minute} per minute.")
                time.sleep(sleep_duration)
            # Start a fresh 60-second window for the next batch.
            start_time = time.time()

    print(f"Finished sending {num_items} requests sequentially.")

    return preds

In [None]:
# Collect Gemini's raw responses for the COVID validation claims.
preds = evaluate_llm(val_texts, val_labels)

In [None]:
def _parse_llm_label(response):
    """Return the integer label carried by one LLM response, or None if there is none.

    A response whose text is a bare integer (surrounding whitespace allowed)
    yields that integer. Otherwise the first standalone 0, 1 or 2 in the text
    is used, which covers replies such as "1." or "**0**". ``None`` responses
    and responses with missing or empty text yield ``None``.
    """
    import re

    text = getattr(response, "text", None)
    if not text:
        return None
    text = text.strip()
    try:
        return int(text)
    except ValueError:
        match = re.search(r"\b[012]\b", text)
        return int(match.group()) if match else None


def compute_llm_metrics(predictions, val_labels):
    """Compute accuracy and weighted precision/recall/F1 for LLM responses.

    Args:
        predictions: Sequence of response objects exposing a ``.text``
            attribute. Entries may be ``None`` or have no text.
        val_labels: Gold integer labels, aligned with ``predictions``.

    Returns:
        Dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    parsed = [_parse_llm_label(pred) for pred in predictions]
    num_unusable = sum(label is None for label in parsed)
    if num_unusable:
        print(f"{num_unusable}/{len(parsed)} LLM responses had no usable label; counting them as 2 (unable to verify).")
    # A reply with no usable label is scored as class 2, the prompt's
    # "unable to verify" option, instead of aborting the whole evaluation.
    predictions = [2 if label is None else label for label in parsed]
    accuracy = accuracy_score(val_labels, predictions)
    # 'weighted' averaging is used because this is a multiclass problem.
    precision, recall, f1, _ = precision_recall_fscore_support(val_labels, predictions, average='weighted')
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [None]:
# compute_llm_metrics requires the gold labels as its second argument; `preds`
# was produced from val_texts, so the matching labels are val_labels.
llm_results = compute_llm_metrics(preds, val_labels)

In [None]:
# Pretty-print Gemini's metrics on the COVID validation split.
print("Gemini Evaluation Results:", json.dumps(llm_results, indent=4))

In [None]:
def load_gemini_covid_preds(path):
    """Load saved Gemini predictions from a text file with one integer per line.

    Args:
        path: Path of the text file to read.

    Returns:
        List of ints in file order. Blank or whitespace-only lines, such as a
        trailing newline at the end of the file, are skipped.

    Raises:
        ValueError: If a non-blank line is not an integer.
    """
    with open(path, 'r', encoding='utf-8') as file:
        # int() ignores surrounding whitespace, including the line ending.
        return [int(line) for line in file if line.strip()]

In [None]:
def get_covid_eval_dataset(tokenizer):
    """Encode the COVID validation claims with ``tokenizer`` and wrap them as a dataset.

    Reads the module-level ``val_texts`` and ``val_labels``.

    Returns:
        A ``MisinformationDataset`` over the encoded claims and their labels.
    """
    encoded_claims = tokenizer(val_texts, truncation=True, padding=True)
    return MisinformationDataset(encoded_claims, val_labels)

In [None]:
# Run each model over the COVID validation split, encoded with that model's
# own tokenizer. predict() returns a 3-tuple here; only its first element
# (the raw predictions) is kept.
bert_preds_covid, _, _ = bert_trainer.predict(get_covid_eval_dataset(bertSeqTokenizer))
clinical_bert_preds_covid, _, _ = clinical_bert_trainer.predict(get_covid_eval_dataset(clinicalBertTokenizer))
bio_bert_preds_covid, _, _ = bio_bert_trainer.predict(get_covid_eval_dataset(bioMedBertTokenizer))

In [None]:
# Collapse each example's raw prediction to a class id by taking the argmax
# over its last axis. The variables are rebound from raw predictions to lists
# of class ids.
bert_preds_covid = [prediction.argmax(axis=-1) for prediction in bert_preds_covid]
clinical_bert_preds_covid = [prediction.argmax(axis=-1) for prediction in clinical_bert_preds_covid]
bio_bert_preds_covid = [prediction.argmax(axis=-1) for prediction in bio_bert_preds_covid]

In [None]:
def evaluate_predictions(predictions, labels, texts, is_llm=False):
    """Split claims into misclassified and correctly classified, grouped by gold label.

    Args:
        predictions: Predicted class ids, one per claim.
        labels: Gold class ids (0, 1 or 2), aligned with ``predictions``.
        texts: Claim strings, aligned with ``predictions``.
        is_llm: Accepted for call-site compatibility; not used.

    Returns:
        ``(errors, correct)``. Both are dicts keyed by gold label 0/1/2.
        ``errors[label]`` is a list of ``{"claim", "pred"}`` dicts and
        ``correct[label]`` is a list of claim strings.
    """
    classes = (0, 1, 2)
    errors = {cls: [] for cls in classes}
    correct = {cls: [] for cls in classes}

    for idx, predicted in enumerate(predictions):
        gold = labels[idx]
        claim_text = texts[idx]
        if gold != predicted:
            errors[gold].append({"claim": claim_text, "pred": predicted})
            continue
        correct[gold].append(claim_text)

    return errors, correct

In [None]:
# Per-model breakdown of misclassified and correctly classified COVID
# validation claims, grouped by gold label.
errors_bert, correct_bert = evaluate_predictions(bert_preds_covid, val_labels, val_texts)
errors_clinical_bert, correct_clinical_bert = evaluate_predictions(clinical_bert_preds_covid, val_labels, val_texts)
errors_bio_bert, correct_bio_bert = evaluate_predictions(bio_bert_preds_covid, val_labels, val_texts)
# NOTE(review): gemini_covid_preds is not assigned anywhere in the visible
# notebook — presumably it should come from load_gemini_covid_preds(<path>)
# or from parsing `preds`; confirm before running this cell.
errors_gemini, correct_gemini = evaluate_predictions(gemini_covid_preds, val_labels, val_texts, True)

In [None]:
# Number of claims in the COVID validation split.
print(len(val_texts))

In [None]:
from sklearn.metrics import confusion_matrix
# One confusion matrix per model on the COVID validation split, with gold
# labels as the first argument and predictions as the second.
print(confusion_matrix(val_labels, bert_preds_covid))
print(confusion_matrix(val_labels, clinical_bert_preds_covid))
print(confusion_matrix(val_labels, bio_bert_preds_covid))
# NOTE(review): gemini_covid_preds is not assigned anywhere in the visible
# notebook; confirm where it is meant to come from.
print(confusion_matrix(val_labels, gemini_covid_preds))

In [None]:
# Claims with gold label 1 that each model misclassified, as sets of claim text.
error_text_bert = set([item["claim"] for item in errors_bert[1]])
error_text_clinical_bert = set([item["claim"] for item in errors_clinical_bert[1]])
error_text_bio_bert = set([item["claim"] for item in errors_bio_bert[1]])
# Claims that both ClinicalBERT and BERT got wrong.
common_errors = error_text_clinical_bert & error_text_bert
# Claims that BiomedBERT got wrong but BERT did not.
difference_errors = error_text_bio_bert - error_text_bert

In [None]:
# How many label-1 claims both ClinicalBERT and BERT misclassified.
print(len(common_errors))

In [None]:
# Show one example from difference_errors.
# NOTE(review): a set has no defined order, so index 11 is not a stable pick,
# and this raises IndexError when the set holds fewer than 12 claims.
print(list(difference_errors)[11])

In [None]:
# Claims with gold label 0 that Gemini misclassified.
print(errors_gemini[0])

In [None]:
# All label-1 claims that BiomedBERT got wrong but BERT did not.
print(difference_errors)

In [None]:
# Claims with gold label 1 that ClinicalBERT misclassified, with its predictions.
print(errors_clinical_bert[1])

## Part 5: Evaluating Performance on Claims about Measles

In [None]:
def get_measles_eval_dataset(tokenizer):
    """Encode the measles claims with ``tokenizer`` and wrap them as a dataset.

    Reads the module-level ``val_texts_measles`` and ``val_labels_measles``.

    Returns:
        A ``MisinformationDataset`` over the encoded claims and their labels.
    """
    encoded_claims = tokenizer(val_texts_measles, truncation=True, padding=True)
    return MisinformationDataset(encoded_claims, val_labels_measles)

In [None]:
# Evaluate the models on the measles claims. The trainers were built on the
# COVID training split, and each dataset is encoded with the model's own tokenizer.
bert_results_measles = bert_trainer.evaluate(eval_dataset=get_measles_eval_dataset(bertSeqTokenizer))
clinical_bert_results_measles = clinical_bert_trainer.evaluate(eval_dataset=get_measles_eval_dataset(clinicalBertTokenizer))
bio_bert_results_measles = bio_bert_trainer.evaluate(eval_dataset=get_measles_eval_dataset(bioMedBertTokenizer))

In [None]:
# Pretty-print the BERT metrics on the measles claims.
print("Bert Evaluation Results:", json.dumps(bert_results_measles, indent=4))

In [None]:
# Pretty-print the ClinicalBERT metrics on the measles claims.
print("ClinicalBert Evaluation Results:", json.dumps(clinical_bert_results_measles, indent=4))

In [None]:
# Pretty-print the BiomedBERT metrics on the measles claims.
print("BioBert Evaluation Results:", json.dumps(bio_bert_results_measles, indent=4))

In [None]:
# Collect Gemini's raw responses for the measles claims.
preds_measles = evaluate_llm(val_texts_measles, val_labels_measles)

In [None]:
# Score Gemini's measles responses against the measles gold labels.
llm_results_measles = compute_llm_metrics(preds_measles, val_labels_measles)

In [None]:
# Pretty-print Gemini's metrics on the measles claims.
print("Gemini Evaluation Results:", json.dumps(llm_results_measles, indent=4))