In [None]:
# Install the `evaluate` package, upgrading it if a copy is already present.
!pip install evaluate --upgrade

In [None]:
# Plain install of `evaluate`; the previous cell already runs the same install
# with --upgrade.
!pip install evaluate

In [None]:
# =====================================
# Longformer Fine-Tuned (BlueScrubs) — F1-Optimized Version (compatible API)
# =====================================

# (Optional) Quietly install/refresh the dependencies. A failure here is
# harmless as long as the imports further down still succeed.
!pip -q install datasets accelerate evaluate matplotlib
# transformers is not in the install list above, so the version already present
# in the runtime (e.g. Colab's preinstalled one) is used.

# ==== Opt out of external loggers (W&B, Hugging Face telemetry)
import os
import gc
import torch

# Both switches are plain environment variables set for the current process.
os.environ.update(
    {
        "WANDB_DISABLED": "true",
        "HF_HUB_DISABLE_TELEMETRY": "1",
    }
)

# ==== Start from a clean slate: collect Python garbage first, then release
# ==== whatever GPU memory the CUDA caching allocator is still holding.
gc.collect()
torch.cuda.empty_cache()

# ==== Imports
import inspect

import evaluate
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import transformers as hf
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)

# Report the environment. The GPU name is only queried when CUDA is actually
# present, because torch.cuda.get_device_name() raises on CPU-only runtimes.
print("✅ Setup complete.")
print("🔢 transformers version:", hf.__version__)
if torch.cuda.is_available():
    print("💻 GPU available:", torch.cuda.get_device_name(0))
else:
    print("💻 GPU available: none (CPU-only runtime)")

# ==== (Recommended) remove old tuned checkpoints to avoid resume/max_steps issues
# This is the same directory that is passed as `output_dir` to TrainingArguments
# further down, so every run of this cell starts without earlier checkpoints.
!rm -rf longformer_bluescrubs_tuned

# ==== Paths
# CSV files for the three splits, resolved relative to the working directory.
# Each file is read by to_hfds(), which uses its `text` and `label` columns.
TRAIN = "./bluescrubs_train_chunked_longformer.csv"
VAL   = "./bluescrubs_val_chunked_longformer.csv"
TEST  = "./bluescrubs_test_chunked_longformer.csv"

# ==== Convert CSVs to Hugging Face Datasets
def to_hfds(path):
    """Load one CSV split and wrap it as a Hugging Face ``Dataset``.

    Args:
        path: Location of a CSV file containing ``text`` and ``label`` columns.

    Returns:
        A ``Dataset`` holding only the ``text`` and ``label`` columns, with
        ``label`` cast to ``int``.
    """
    frame = pd.read_csv(path).assign(
        label=lambda table: table["label"].astype(int)
    )
    wanted_columns = ["text", "label"]
    return Dataset.from_pandas(frame[wanted_columns])

# Build the train / validation / test collection from the three CSV paths,
# converting each file with to_hfds() in that order.
ds = DatasetDict({
    split_name: to_hfds(csv_path)
    for split_name, csv_path in (
        ("train", TRAIN),
        ("validation", VAL),
        ("test", TEST),
    )
})

# ==== Tokenization (with labels preserved)
# The same checkpoint name is reused below when the config and model are loaded.
MODEL_NAME = "allenai/longformer-base-4096"
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True,
)

def preprocess(batch):
    """Tokenize a batch of examples and carry the class labels along.

    Args:
        batch: Batched mapping with ``text`` and ``label`` entries, as passed
            by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output (texts truncated to at most 4096 tokens) with an
        extra ``labels`` entry copied from ``batch["label"]``.
    """
    encoded = tokenizer(
        batch["text"],
        max_length=4096,
        truncation=True,
    )
    encoded["labels"] = batch["label"]
    return encoded

# Tokenize every split in batches. The raw `text` column is dropped afterwards;
# the `labels` column added by preprocess() is kept.
print("🔄 Tokenizing datasets...")
tokenized = ds.map(preprocess, batched=True, remove_columns=["text"])
print("✅ Tokenization complete. Columns now:", tokenized["train"].column_names)

# ==== Model
# Seed the Python / NumPy / PyTorch RNGs before loading: the 2-label
# classification head is freshly initialised on top of the pretrained encoder,
# so without a seed every run would start from different head weights.
hf.set_seed(42)

config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=2)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

# Nothing has moved the model to the GPU at this point, so report the device
# training is expected to use, and never query CUDA when it is absent
# (torch.cuda.get_device_name() raises on CPU-only runtimes).
if torch.cuda.is_available():
    training_device = torch.cuda.get_device_name(0)
else:
    training_device = "CPU (no CUDA device found)"
print("✅ Model loaded successfully. Training device:", training_device)

# (Optional) gradient checkpointing for memory
if hasattr(model, "gradient_checkpointing_enable"):
    model.gradient_checkpointing_enable()
    print("🧠 Gradient checkpointing enabled.")

# ==== Metrics
# Load the four scorers once, in the same order as the names on the left.
accuracy, f1, precision, recall = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "precision", "recall")
)

def compute_metrics(eval_pred):
    """Turn raw evaluation output into accuracy, precision, recall and F1.

    Args:
        eval_pred: Pair of ``(logits, labels)``; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        The last three are computed with ``average="binary"``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    scores = {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
    }
    # These three scorers share the same call signature; only the key differs.
    for key, scorer in (("precision", precision), ("recall", recall), ("f1", f1)):
        scores[key] = scorer.compute(
            predictions=preds, references=labels, average="binary"
        )[key]
    return scores

# ==== Training Arguments (TUNED)
# NOTE: `eval_strategy` is the current argument name in transformers; the
# older spelling it replaced is `evaluation_strategy`. On an old transformers
# release that rejects `eval_strategy`, use `evaluation_strategy` instead.
args = TrainingArguments(
    output_dir="./longformer_bluescrubs_tuned",

    # Evaluate and checkpoint on the same 500-step cadence so that the best
    # checkpoint can be selected and reloaded at the end of training.
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,

    load_best_model_at_end=True,
    metric_for_best_model="f1",   # select best checkpoint by F1
    greater_is_better=True,

    learning_rate=1e-5,
    warmup_ratio=0.1,             # if this errors, you can comment it out
    lr_scheduler_type="cosine",   # same here; both usually supported

    # One sequence per step, accumulated over 4 steps (effective batch size 4).
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,

    num_train_epochs=3,           # longer than baseline
    weight_decay=0.01,
    # fp16 mixed precision needs a CUDA device; enabling it unconditionally
    # makes the notebook fail on CPU-only runtimes.
    fp16=torch.cuda.is_available(),

    dataloader_num_workers=0,
    logging_steps=100,
    report_to=[],                 # disable W&B

    overwrite_output_dir=True,
    max_steps=-1                  # rely on epochs, not a fixed step cap
)

# ==== Trainer (no early stopping; simple & robust)
# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Pick whichever name the installed Trainer accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# ==== Train (fresh run)
# resume_from_checkpoint=False: always start from the pretrained weights and
# never pick up a checkpoint left in the output directory.
print("🚀 Starting fine-tuning...")
train_result = trainer.train(resume_from_checkpoint=False)
print("✅ Training finished.")

# ==== Evaluate on Test Set with BEST model (by F1)
# Metrics come back with the "test" prefix instead of the default one.
print("\n🧪 Evaluating on test set with BEST checkpoint...")
test_results = trainer.evaluate(tokenized["test"], metric_key_prefix="test")
print("\n===== TEST RESULTS (TUNED MODEL) =====")
for k, v in test_results.items():
    try:
        print(f"{k}: {v:.4f}")
    except (TypeError, ValueError):
        # Values that cannot take a float format are printed unformatted:
        # None raises TypeError, a string raises ValueError.
        print(f"{k}: {v}")

# ==== Plot Training & Validation Loss
log_history = pd.DataFrame(trainer.state.log_history)

# A run shorter than logging_steps / eval_steps never logs "loss" / "eval_loss",
# in which case the column does not exist at all. Fall back to an empty
# selection instead of failing with a KeyError.
train_loss = (
    log_history[log_history["loss"].notna()]
    if "loss" in log_history.columns
    else log_history.iloc[0:0]
)
eval_metrics = (
    log_history[log_history["eval_loss"].notna()]
    if "eval_loss" in log_history.columns
    else log_history.iloc[0:0]
)

fig, ax = plt.subplots(figsize=(8, 5))
if not train_loss.empty:
    ax.plot(train_loss["step"], train_loss["loss"], label="Train Loss")
if not eval_metrics.empty:
    ax.plot(eval_metrics["step"], eval_metrics["eval_loss"], label="Validation Loss")
ax.set_xlabel("Steps")
ax.set_ylabel("Loss")
ax.set_title("Longformer Fine-Tuned: Training vs Validation Loss")
# Only draw a legend when at least one curve was plotted.
if ax.get_legend_handles_labels()[0]:
    ax.legend()
plt.show()

# ==== Save results
# The test metrics become a single-row CSV; the full Trainer log history
# (one row per logged event) goes to a second CSV. Both land in the working
# directory.
pd.DataFrame([test_results]).to_csv("longformer_tuned_results.csv", index=False)
log_history.to_csv("longformer_tuned_training_log.csv", index=False)
print("\n✅ Saved tuned test results to longformer_tuned_results.csv and training log to longformer_tuned_training_log.csv")


In [None]:
import os
from pathlib import Path

# Where to export this tuned model in Colab
LONGFORMER_EXPORT_DIR = "/content/longformer_oncosummarizer_tuned"
Path(LONGFORMER_EXPORT_DIR).mkdir(parents=True, exist_ok=True)

# Because load_best_model_at_end=True, trainer.model is already the BEST checkpoint (by F1)
best_model = trainer.model

best_model.save_pretrained(LONGFORMER_EXPORT_DIR)
tokenizer.save_pretrained(LONGFORMER_EXPORT_DIR)

print("üìÅ Saved tuned Longformer model to:", LONGFORMER_EXPORT_DIR)
!ls -la "$LONGFORMER_EXPORT_DIR"
