Imports

In [None]:
#imports
import inspect

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, set_seed

# Import our modules
import sys
import os

# Resolve the sibling 'utils' and 'models' directories (one level above the
# current working directory) and put them at the front of sys.path so the
# project modules imported below can be found.
parent_dir = os.path.join(os.getcwd(), '..')
utils_path = os.path.abspath(os.path.join(parent_dir, 'utils'))
models_path = os.path.abspath(os.path.join(parent_dir, 'models'))

# Insert in this order so 'models' ends up ahead of 'utils' on a fresh kernel.
for module_dir in (utils_path, models_path):
    if module_dir not in sys.path:
        sys.path.insert(0, module_dir)

from general_utils import load_data, prepare_all_samples, get_entity_date_pairs, calculate_metrics
from bert_training_utils import create_training_pairs, handle_class_imbalance, add_special_tokens, tokenize_function, compute_metrics, build_gold_lookup, get_label_for_pair
from bert_extractor_utils import preprocess_input, mark_entities_full_text
from bert_model import BertRC

Data Loading

In [None]:
# Read the synthetic training dataset through the project loader
dataset_csv = "../data/training_dataset_synthetic.csv"
df = load_data(dataset_csv)
print(f"Loaded {len(df)} records")

In [None]:
# Preview the first rows of `df` as a quick sanity check on the load
df.head()

In [None]:
# Convert the loaded records into samples with general_utils.prepare_all_samples
samples = prepare_all_samples(df)
print(f"Prepared {len(samples)} samples")
# Uncomment to inspect a single prepared sample:
#samples[0]

Model & Data Preparation for Finetuning

In [None]:
# Set the global seed (transformers.set_seed) for reproducibility.
# The same value, 42, is also passed to train_test_split and TrainingArguments below.
set_seed(42)

In [None]:
# Directory used both as the Trainer's output_dir and as the destination
# for the final saved model and tokenizer
model_save_path = '../models/bert_model/'

In [None]:
# Choose the model to fine-tune - any BERT model from HuggingFace can be used, see: https://huggingface.co/google-bert
# Leave exactly one `model_name` assignment uncommented.

model_name = "google/bert_uncased_L-2_H-128_A-2" #2 layers, 128 hidden dim, 2 attention heads
#model_name = "google-bert/bert-base-uncased" #12 layers, 768 hidden dim, 12 attention heads
#model_name = "google-bert/bert-base-cased" #12 layers, 768 hidden dim, 12 attention heads
#model_name = "prajjwal1/bert-tiny"  # 4.4M parameters, 2 layers, 128 hidden dim
#model_name = "distilroberta-base"  # ~82M parameters, 6 layers, 768 hidden dim
#model_name = "boltuix/EntityBERT"
#model_name = "yikuan8/Clinical-Longformer"
#model_name = "SpanBERT/spanbert-base-cased"  # 110M parameters, 12 layers, 768 hidden dim
#model_name = "allenai/biomed_roberta_base"  # 125M parameters, 12 layers, 768 hidden dim
#model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"  # 125M parameters, 12 layers, 768 hidden dim
#model_name = "../models/PubmedBERTbase-MimicSmall-EntityBERT/"

In [None]:
# Build the table of training pairs from the prepared samples
# (bert_training_utils.create_training_pairs)
processed_df = create_training_pairs(samples)
print(f"\nCreated {len(processed_df)} training pairs")

In [None]:
# Handle class imbalance with the 'weighted' method; the helper returns a
# DataFrame plus class weights that are passed to BertRC further down
balanced_df, class_weights = handle_class_imbalance(processed_df, method='weighted')
print(f"Class weights: {class_weights}")

# Label counts of the returned frame
# NOTE(review): whether 'weighted' resamples rows or only computes weights is
# decided inside bert_training_utils - confirm before trusting the "After balancing" heading
print("\nAfter balancing:")
print(balanced_df['label'].value_counts())

In [None]:
# Hold out 20% of the pairs, stratified on the label column
train_df, test_df = train_test_split(
    balanced_df,
    test_size=0.2,
    random_state=42,
    stratify=balanced_df['label'],
)
print(f"Train: {len(train_df)}, Test: {len(test_df)}")

# Per-split label counts
for heading, split_df in (
    ("\nTrain set distribution:", train_df),
    ("\nTest set distribution:", test_df),
):
    print(heading)
    print(split_df['label'].value_counts())

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint, then register the
# project's special tokens via bert_training_utils.add_special_tokens
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer = add_special_tokens(tokenizer)

In [None]:
# Wrap the text/label columns of each split as Hugging Face `datasets.Dataset` objects.
# preserve_index=False stops the shuffled pandas index left behind by
# train_test_split from being carried along as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df[['marked_text', 'label']], preserve_index=False)
test_dataset = Dataset.from_pandas(test_df[['marked_text', 'label']], preserve_index=False)

In [None]:
# Tokenize both splits with identical settings
def _tokenize_batch(batch):
    """Tokenize one batch with the shared tokenizer, using max_length=256."""
    return tokenize_function(batch, tokenizer, max_length=256)

train_tokenized = train_dataset.map(_tokenize_batch, batched=True)
test_tokenized = test_dataset.map(_tokenize_batch, batched=True)

In [None]:
# Expose only the model inputs and the label, formatted as torch tensors
model_columns = ['input_ids', 'attention_mask', 'label']
for tokenized_split in (train_tokenized, test_tokenized):
    tokenized_split.set_format(type='torch', columns=model_columns)

BERT Finetuning

In [None]:
# Create the custom BertRC model with span pooling
# Note: embeddings are resized inside bert_model.py, so no manual
# resize_token_embeddings call is needed here
# class_weights comes from the handle_class_imbalance cell
model = BertRC(
    model_name=model_name,
    tokenizer=tokenizer,
    num_labels=2,
    class_weights=class_weights
)

In [None]:
# Alternative: use the stock sequence-classification model instead of BertRC.
# Use only ONE of the two options; uncomment the two lines below to switch.
# NOTE(review): this path does not receive class_weights.
#model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
#model.resize_token_embeddings(len(tokenizer))

In [None]:
# Show the class of `model` to confirm which of the two model options is active
type(model)

In [None]:
# Training arguments
# - evaluation and checkpoint saving both run once per epoch
# - the best checkpoint, judged by the highest eval_positive_f1, is reloaded when training ends
training_args = TrainingArguments(
    output_dir=model_save_path,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_strategy="steps",
    logging_steps=100,  # training loss is logged every 100 steps
    metric_for_best_model="eval_positive_f1",
    greater_is_better=True,
    num_train_epochs=10,  # upper bound; the early-stopping callback on the Trainer can end training sooner
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_ratio=0.05,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is available
    report_to=[],  # no external experiment trackers
    seed=42,
)

In [None]:
# Create trainer
# Newer transformers releases take the tokenizer through `processing_class` and
# deprecate the `tokenizer` keyword; older ones only know `tokenizer`. Pick
# whichever name the installed Trainer accepts so the cell works on both.
trainer_init_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_init_params else "tokenizer"

# NOTE(review): eval_dataset is the same split that is later reported as the
# test set, so early stopping / best-checkpoint selection sees the test data.
# Consider carving out a separate validation split for model selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    compute_metrics=compute_metrics,
    # Stop when the monitored metric has not improved for 3 consecutive evaluations
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    **{tokenizer_kwarg: tokenizer},
)

In [None]:
# Run fine-tuning; per-epoch evaluation and early stopping are configured on the trainer
trainer.train()

In [None]:
# Loss and training curves
logs = pd.DataFrame(trainer.state.log_history)


def _plot_logged(frame, x, y_cols, title):
    """Plot the `y_cols` that exist in `frame` against `x`.

    Columns that were never logged are reported and skipped instead of
    raising, so a short or interrupted run still renders what it can.
    """
    available = [col for col in y_cols if col in frame.columns]
    missing = [col for col in y_cols if col not in frame.columns]
    if missing:
        print(f"{title}: not found in the training log, skipping {missing}")
    if frame.empty or x not in frame.columns or not available:
        print(f"{title}: nothing to plot")
        return
    frame.plot(x=x, y=available, title=title)
    plt.show()


# Training loss per logging step. "loss" entries only appear once at least
# `logging_steps` optimizer steps have run, so the column can be absent on
# short runs; reindex creates it (all-NaN) rather than raising.
train_loss = logs.reindex(columns=["step", "loss"]).dropna(subset=["loss"])
_plot_logged(train_loss, "step", ["loss"], "Training Loss")

# One row per evaluation pass (empty frame if no evaluation was logged)
if "eval_loss" in logs.columns:
    eval_logs = logs[logs["eval_loss"].notna()]
else:
    eval_logs = logs.iloc[0:0]

# Loss and overall metrics
_plot_logged(
    eval_logs,
    "epoch",
    ["eval_loss", "eval_f1_macro", "eval_f1_weighted"],
    "Overall Metrics",
)

# Positive class metrics, plotted separately
_plot_logged(
    eval_logs,
    "epoch",
    ["eval_positive_precision", "eval_positive_recall", "eval_positive_f1"],
    "Positive Class Metrics",
)

In [None]:
# Run evaluation on the held-out split and print the resulting metrics
eval_results = trainer.evaluate(test_tokenized)
print("\nTest Results:")

# Float metrics, grouped under the heading they are printed beneath
metric_sections = (
    ("\nOverall Metrics:", ['loss', 'accuracy', 'f1_macro', 'f1_weighted', 'f1_micro']),
    ("\nPositive Class Metrics:", ['positive_precision', 'positive_recall', 'positive_f1']),
)
for heading, metric_names in metric_sections:
    print(heading)
    for metric in metric_names:
        print(f"{metric}: {eval_results[f'eval_{metric}']:.4f}")

# Raw confusion-matrix counts as returned by compute_metrics
print("\nConfusion Matrix:")
confusion_cells = (
    ("True Positives", 'eval_true_positives'),
    ("False Positives", 'eval_false_positives'),
    ("True Negatives", 'eval_true_negatives'),
    ("False Negatives", 'eval_false_negatives'),
)
for cell_label, result_key in confusion_cells:
    print(f"{cell_label}: {eval_results[result_key]}")

In [None]:
# Save the final model and its tokenizer to model_save_path.
# NOTE: model_save_path is also the Trainer's output_dir, so the final files
# are written into the same directory the Trainer uses for its own output.
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"\nModel saved to {model_save_path}")