Imports

In [None]:
#imports
import sys
import os
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, set_seed
from datasets import Dataset
from tqdm import tqdm
from collections import Counter

# Make the project's sibling ``utils`` directory importable.
# The location is resolved from the current working directory, so this
# presumably expects the notebook to be run from its own folder.
utils_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'utils'))
if sys.path.count(utils_path) == 0:
    # Prepend so this directory is searched before the other sys.path entries.
    sys.path.insert(0, utils_path)

from general_utils import load_data
from bert_training_utils import handle_class_imbalance
from bert_relative_date_utils import create_token_label_dataset, upsample_relative_date_sequences, compute_token_metrics

Data Loading

In [None]:
# Load Data
# ``load_data`` is the project helper imported from general_utils; the path is
# relative to the current working directory.
df = load_data("../data/training_dataset.csv")
record_count = len(df)
print(f"Loaded {record_count} records")

In [None]:
#Inspect df
#df.head()

Model & Data Preparation for Finetuning

In [None]:
# Seed the random number generators through transformers' ``set_seed`` so
# repeated runs are reproducible. TrainingArguments below is given the same
# value (42); keep the two in sync.
SEED = 42
set_seed(SEED)

In [None]:
# Directory used both as the Trainer output_dir (checkpoints) and as the
# destination for the final saved model and tokenizer.
model_save_path = "../models/bert_model_relative_dates/"

In [None]:
# Choose the checkpoint to fine-tune. Any BERT model from HuggingFace can be
# used, see: https://huggingface.co/google-bert
# To switch models, assign one of the alternatives listed below instead.
model_name = "google/bert_uncased_L-2_H-128_A-2"  # 2 layers, 128 hidden dim, 2 attention heads

# Alternatives (hosted checkpoints):
#model_name = "prajjwal1/bert-tiny"  # 4.4M parameters, 2 layers, 128 hidden dim
#model_name = "distilbert/distilroberta-base"  # ~82M parameters, 6 layers, 768 hidden dim
#model_name = "google-bert/bert-base-uncased"  # 12 layers, 768 hidden dim, 12 attention heads
#model_name = "google-bert/bert-base-cased"  # 12 layers, 768 hidden dim, 12 attention heads
#model_name = "SpanBERT/spanbert-base-cased"  # 110M parameters, 12 layers, 768 hidden dim
#model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"  # 125M parameters, 12 layers, 768 hidden dim
#model_name = "allenai/biomed_roberta_base"  # 125M parameters, 12 layers, 768 hidden dim
#model_name = "boltuix/EntityBERT"
#model_name = "yikuan8/Clinical-Longformer"

# Alternative (local checkpoint directory):
#model_name = "../models/PubmedBERTbase-MimicSmall-EntityBERT/"

In [None]:
# Setup tokenizer
# Load the tokenizer published with the selected checkpoint (``model_name``).
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Optional step, currently disabled: ``add_special_tokens`` is not among the
# helpers imported at the top of this notebook, so import it before enabling.
#tokenizer = add_special_tokens(tokenizer)

In [None]:
# Prepare Token-Level Dataset
# ``create_token_label_dataset`` (project helper) turns the dataframe into a
# list of per-sequence examples; each example carries a "labels" entry.
print("Creating token-level dataset...")
examples = create_token_label_dataset(df, tokenizer)

dataset = Dataset.from_list(examples)
print(f"Prepared {len(dataset)} samples")

# Tally how often each label value occurs across every example.
counts = Counter(
    label
    for example in examples
    for label in example["labels"]
)
print(f"Counts of labels: {counts}")

In [None]:
# Split first, then upsample the training split only.
#
# Upsampling before the split would let copies of the same relative-date
# sequence land in train AND in val/test, which leaks training data into
# evaluation and inflates the reported metrics. Splitting first keeps the
# validation and test sets at the original label distribution.
# NOTE(review): assumes upsample_relative_date_sequences repeats existing
# examples rather than generating new ones -- confirm in bert_relative_date_utils.

# Convert to huggingFace dataset (raw, not upsampled)
dataset = Dataset.from_list(examples)
print(f"Prepared {len(dataset)} samples")

# Data splitting: 80% train, then the remaining 20% halved into val and test
train_dataset, temp_dataset = dataset.train_test_split(test_size=0.2, seed=42).values()
val_dataset, test_dataset = temp_dataset.train_test_split(test_size=0.5, seed=42).values()

# Upsample positive examples -- training split only.
# The helper is given a plain list of example dicts, as in the original call.
train_examples = upsample_relative_date_sequences(train_dataset.to_list(), factor=3)
train_dataset = Dataset.from_list(train_examples)

# Label counts of the (upsampled) training split
counts = Counter(label for ex in train_examples for label in ex["labels"])
print(f"Counts of labels: {counts}")

print(f"Train: {len(train_dataset)}, Val: {len(val_dataset)}, Test: {len(test_dataset)}")

In [None]:
# Model initialization
# Token-classification head with three classes: O, B-RELDATE, I-RELDATE
num_labels = 3
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

In [None]:
# Training arguments
# Mixed precision is switched on only when a CUDA device is visible.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    # Checkpoints are written here
    output_dir=model_save_path,
    # Optimisation schedule
    num_train_epochs=6,
    learning_rate=5e-5,
    warmup_ratio=0.05,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    fp16=use_fp16,
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Log every 100 steps; the empty list requests no reporting integrations
    logging_strategy="steps",
    logging_steps=100,
    report_to=[],
    # Reproducibility
    seed=42,
)

In [None]:
# Create trainer
# NOTE(review): no data_collator is passed, so batching relies on the Trainer
# default -- presumably create_token_label_dataset already pads every sequence
# (and its labels) to a common length; TODO confirm.
# NOTE(review): newer transformers releases deprecate ``tokenizer=`` in favour
# of ``processing_class=`` -- check against the installed version.
trainer = Trainer(
    # What to train and how
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    # Data: fit on the train split, evaluate on the validation split
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Metric function imported from bert_relative_date_utils
    compute_metrics=compute_token_metrics,
)

In [None]:
#Train
# Run fine-tuning with the datasets and arguments configured on ``trainer``.
print("Starting training...")
# Kept as a bare final expression: in a notebook cell its return value is
# shown as the cell output.
trainer.train()

In [None]:
# Evaluate on test set
# Runs the trainer's evaluation loop on the held-out test split (instead of the
# validation split configured on the trainer) and prints the returned metrics.
print("Evaluating on test set...")
results = trainer.evaluate(eval_dataset=test_dataset)
print(results)

In [None]:
# Save Model
# Write the fine-tuned model and its tokenizer into the same directory so the
# folder can be handed to ``from_pretrained`` later.
trainer.save_model(output_dir=model_save_path)
tokenizer.save_pretrained(save_directory=model_save_path)
print(f"\nModel saved to {model_save_path}")