In [1]:
from google.colab import files

# Later cells hard-code this archive name, so fail here with a clear message if
# the upload was cancelled or a differently named file was chosen.
ARCHIVE_NAME = "briefly_fullpaper_run1.zip"

uploaded = files.upload()
if ARCHIVE_NAME not in uploaded:
    raise FileNotFoundError(
        f"Expected an upload named {ARCHIVE_NAME!r}, got {sorted(uploaded)}."
    )


Saving briefly_fullpaper_run1.zip to briefly_fullpaper_run1.zip


In [2]:
# Create the project directory (no-op if it already exists) and move the
# uploaded archive into it.
!mkdir -p /content/Briefly_FullPaper
!mv briefly_fullpaper_run1.zip /content/Briefly_FullPaper


In [3]:
%cd /content/Briefly_FullPaper
!unzip briefly_fullpaper_run1.zip


/content/Briefly_FullPaper
Archive:  briefly_fullpaper_run1.zip
  inflating: train_led.py            
  inflating: pdf_train.jsonl         
  inflating: pdf_val.jsonl           


In [4]:
!mkdir -p src
!mkdir -p data/processed/dataset

!mv train_led.py src/train_led.py
!mv pdf_train.jsonl data/processed/dataset/pdf_train.jsonl
!mv pdf_val.jsonl data/processed/dataset/pdf_val.jsonl



In [5]:
# Sanity check: list what ended up in src/ and data/processed/dataset/.
!ls src
!ls data/processed/dataset


train_led.py
pdf_train.jsonl  pdf_val.jsonl


In [6]:
# Report the GPU, driver and CUDA version available to this runtime.
!nvidia-smi


Thu Dec 18 19:08:36 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   33C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [1]:
# !pip -q install transformers datasets accelerate sentencepiece peft rouge-score
!python src/train_led.py


python3: can't open file '/content/src/train_led.py': [Errno 2] No such file or directory


In [9]:
import numpy as np, torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel
from rouge_score import rouge_scorer
from tqdm import tqdm

# ---------------------------------------------------------------
# Evaluation settings (LED-large)
# ---------------------------------------------------------------
MODEL_ID = "allenai/led-large-16384-arxiv"  # base checkpoint id
ADAPTER_PATH = "/content/Briefly_FullPaper/led_large_ckpt/final_adapter"  # LoRA adapter directory
VAL_JSONL = "/content/Briefly_FullPaper/data/processed/dataset/pdf_val.jsonl"  # validation split

MAX_INPUT_LEN = 2048   # tokenizer truncation length for the input article
MAX_NEW_TOKENS = 256   # cap on generated tokens per summary
NUM_BEAMS = 4          # beam-search width
MAX_CHARS = 12000      # safety clip on raw text before it reaches the tokenizer

# Set to a small int (e.g. 20) for a quick check; None evaluates every row.
LIMIT = None

# Half precision on GPU, full precision on CPU.
if torch.cuda.is_available():
    device, dtype = "cuda", torch.float16
else:
    device, dtype = "cpu", torch.float32

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

ds = load_dataset("json", data_files={"val": VAL_JSONL})["val"]
if LIMIT is not None:
    n_rows = min(LIMIT, len(ds))
    ds = ds.select(range(n_rows))

scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

def build_inputs(text: str):
    """Tokenize one article and build the matching LED global-attention mask.

    The text is clipped to ``MAX_CHARS`` characters, tokenized with truncation
    at ``MAX_INPUT_LEN`` tokens, and moved to ``device``.

    Returns:
        A ``(inputs, global_attention_mask)`` pair: the tokenizer output and a
        tensor shaped like ``input_ids`` that is 1 in the first column and 0
        everywhere else.
    """
    clipped = text[:MAX_CHARS]
    encoded = tokenizer(
        clipped,
        max_length=MAX_INPUT_LEN,
        truncation=True,
        return_tensors="pt",
    )
    encoded = encoded.to(device)

    # Only the first token is given global attention.
    first_token_global = torch.zeros_like(encoded["input_ids"])
    first_token_global[:, 0] = 1
    return encoded, first_token_global

@torch.no_grad()
def eval_model(model, name="model"):
    """Summarize every row of ``ds`` with ``model`` and average ROUGE F-measures.

    Args:
        model: Model exposing ``eval()`` and ``generate()``.
        name: Label shown in the progress bar.

    Returns:
        Dict mapping ``"rouge1"``, ``"rouge2"`` and ``"rougeL"`` to the mean
        F-measure (as a float) over all evaluated rows.
    """
    model.eval()

    generation_kwargs = dict(
        max_new_tokens=MAX_NEW_TOKENS,
        num_beams=NUM_BEAMS,
        no_repeat_ngram_size=3,
        repetition_penalty=1.15,
    )

    per_example = []
    for example in tqdm(ds, desc=f"Evaluating {name}", total=len(ds)):
        encoded, global_mask = build_inputs(example["article"])
        generated = model.generate(
            **encoded,
            global_attention_mask=global_mask,
            **generation_kwargs,
        )
        prediction = tokenizer.decode(generated[0], skip_special_tokens=True)
        # Score the generated summary against the row's reference abstract.
        per_example.append(scorer.score(example["abstract"], prediction))

    return {
        metric: float(np.mean([result[metric].fmeasure for result in per_example]))
        for metric in ("rouge1", "rouge2", "rougeL")
    }

import gc

# -----------------------
# 1) Baseline LED-large
# -----------------------
# `dtype=` is the current keyword; `torch_dtype=` is deprecated in recent
# transformers releases and triggers a warning on every load.
baseline = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID, dtype=dtype).to(device)
baseline_scores = eval_model(baseline, "ORIGINAL LED-LARGE (baseline)")

# Free memory before loading the fine-tuned model: drop the reference, collect
# any lingering garbage, then release cached CUDA blocks.
del baseline
gc.collect()
torch.cuda.empty_cache()

# -----------------------
# 2) Fine-tuned (LED-large + LoRA)
# -----------------------
# A fresh base model is loaded and wrapped with the adapter stored at ADAPTER_PATH.
ft_base = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID, dtype=dtype).to(device)
ft_model = PeftModel.from_pretrained(ft_base, ADAPTER_PATH).to(device)
finetuned_scores = eval_model(ft_model, "LoRA Fine-tuned LED-LARGE")

print("\n========== RESULTS ==========")
print("Baseline  :", baseline_scores)
print("Fine-tuned:", finetuned_scores)
# Per-metric improvement of the fine-tuned model over the baseline.
print("Delta     :", {k: finetuned_scores[k] - baseline_scores[k] for k in baseline_scores})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Generating val split: 0 examples [00:00, ? examples/s]

`torch_dtype` is deprecated! Use `dtype` instead!
Evaluating ORIGINAL LED-LARGE (baseline): 100%|██████████| 150/150 [12:29<00:00,  5.00s/it]
Evaluating LoRA Fine-tuned LED-LARGE: 100%|██████████| 150/150 [19:36<00:00,  7.84s/it]


Baseline  : {'rouge1': 0.6062529256521636, 'rouge2': 0.45530736788518994, 'rougeL': 0.5032599351798548}
Fine-tuned: {'rouge1': 0.6550154731464548, 'rouge2': 0.51244332771215, 'rougeL': 0.55323750832549}
Delta     : {'rouge1': 0.04876254749429121, 'rouge2': 0.057135959826960014, 'rougeL': 0.04997757314563522}





In [12]:
# Bundle the final adapter directory into a single zip for download.
# NOTE(review): an absolute path is passed to zip, so entries are stored under
# content/Briefly_FullPaper/led_large_ckpt/final_adapter/ — confirm that layout is intended.
!zip -r led_large_fullpaper_lora_adapter.zip /content/Briefly_FullPaper/led_large_ckpt/final_adapter


  adding: content/Briefly_FullPaper/led_large_ckpt/final_adapter/ (stored 0%)
  adding: content/Briefly_FullPaper/led_large_ckpt/final_adapter/adapter_model.safetensors (deflated 8%)
  adding: content/Briefly_FullPaper/led_large_ckpt/final_adapter/special_tokens_map.json (deflated 85%)
  adding: content/Briefly_FullPaper/led_large_ckpt/final_adapter/adapter_config.json (deflated 57%)
  adding: content/Briefly_FullPaper/led_large_ckpt/final_adapter/tokenizer_config.json (deflated 75%)
  adding: content/Briefly_FullPaper/led_large_ckpt/final_adapter/merges.txt (deflated 53%)
  adding: content/Briefly_FullPaper/led_large_ckpt/final_adapter/README.md (deflated 66%)
  adding: content/Briefly_FullPaper/led_large_ckpt/final_adapter/vocab.json (deflated 59%)
  adding: content/Briefly_FullPaper/led_large_ckpt/final_adapter/tokenizer.json (deflated 82%)


In [13]:
# Download the zipped adapter from the Colab runtime.
from google.colab import files
files.download("led_large_fullpaper_lora_adapter.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>