Mounting drive

In [None]:
from google.colab import drive


# Mount Google Drive at /content/drive; the project folder used by the later
# cells is addressed through this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Loading the cleaned dataset

In [None]:
from datasets import load_dataset, Dataset
import pandas as pd


# Locations of the cleaned CSV splits inside the mounted Drive folder.
projectFolderPath = "/content/drive/MyDrive/Colab Notebooks/Gen AI/Mini Project 1/"
trainingDatasetPath = projectFolderPath + "training_dataset.csv"
validationDatasetPath = projectFolderPath + "validation_dataset.csv"
testDatasetPath = projectFolderPath + "test_dataset.csv"

# Load the three CSV files as the splits of a single DatasetDict.
dataset = load_dataset(
    "csv",
    data_files={
        "train": trainingDatasetPath,
        "validation": validationDatasetPath,
        "test": testDatasetPath
    }
)

# Temporary: keep only the first rows of every split to shorten experiments.
# The cap is clamped to the split size so a smaller CSV does not make
# `select` fail with an out-of-range index.
maxRowsPerSplit = {"train": 70000, "validation": 10000, "test": 10000}
for splitName, maxRows in maxRowsPerSplit.items():
    rowCount = min(maxRows, len(dataset[splitName]))
    dataset[splitName] = dataset[splitName].select(range(rowCount))

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 70000
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 10000
    })
})


Setting up the model and tokenizer

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


# Pick the compute device: CUDA when available, otherwise Apple MPS when the
# backend exists and reports availability, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    mpsBackend = getattr(torch.backends, "mps", None)
    if mpsBackend and mpsBackend.is_available():
        device = "mps"
    else:
        device = "cpu"
print("Device:", device)

def assert_vocab_alignment(model, tokenizer):
    """Check that the embedding table, model config and tokenizer are consistent.

    Raises AssertionError when the number of input embeddings differs from
    ``model.config.vocab_size``, when the tokenizer holds more entries than the
    embedding table, or when the tokenizer has no pad token id.
    """
    embedding_rows = model.get_input_embeddings().num_embeddings
    tokenizer_size = len(tokenizer)
    config_vocab = model.config.vocab_size

    # The config must describe the embedding matrix that is actually allocated.
    assert embedding_rows == config_vocab, f"Embedding({embedding_rows}) != config.vocab_size({config_vocab})"
    # Every id the tokenizer can emit needs a row in the embedding matrix.
    assert tokenizer_size <= embedding_rows, f"Tokenizer size({tokenizer_size}) > embedding size({embedding_rows})"
    # Padded batches cannot be built without a pad id.
    assert tokenizer.pad_token_id is not None

# Base checkpoint that is fine-tuned in this notebook.
modelName = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(modelName, use_fast=True)

# Register "</SUMMARY>" as an additional special token unless the vocabulary
# already contains it. tokenizeDataset places this token between the source
# text and the target summary.
summaryEndToken = "</SUMMARY>"
specialTokens = []
if summaryEndToken not in tokenizer.get_vocab():
    specialTokens.append(summaryEndToken)

# Reuse EOS as the PAD token when the tokenizer defines none, so padded
# sequences can be built.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

if specialTokens:
    tokenizer.add_special_tokens({"additional_special_tokens": specialTokens})

# Id of the "</SUMMARY>" token. Despite the plural name a single token string
# is converted here, and the value is wrapped in a list where it is used.
specialTokenIDs = tokenizer.convert_tokens_to_ids(summaryEndToken)

model = AutoModelForCausalLM.from_pretrained(modelName)

# Resize embeddings to cover the tokens added above, then move to the device.
model.resize_token_embeddings(len(tokenizer))
model.to(device)

# Keep the pad/eos ids of the model config and the generation config in sync
# with the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.generation_config.pad_token_id = tokenizer.pad_token_id
model.generation_config.eos_token_id = tokenizer.eos_token_id

# Default generation settings.
# NOTE(review): max_new_tokens and max_length are both set; confirm which one
# generate() honours in the installed transformers version.
model.generation_config.max_new_tokens = 288   # 256-token target cap plus headroom
model.generation_config.num_beams = 1
model.generation_config.max_length = 1024      # prompt tokens + generated tokens
# Pad on the left, matching the manual left-padding done in tokenizeDataset.
tokenizer.padding_side = "left"

# Fail early if embeddings, config and tokenizer disagree after the resize.
assert_vocab_alignment(model, tokenizer)

Device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Tokenizing the dataset

In [None]:
import numpy as np

maxSourceTokens = 704   # cap on source (prompt) tokens kept per example
maxTargetTokens = 256   # cap on target (summary) tokens, EOS included
maxTokens= maxSourceTokens + maxTargetTokens + 64  # padded sequence length: 704 + 256 + 64 = 1024

# Align the tokenizer's advertised maximum length with the padded length.
tokenizer.model_max_length = maxTokens

summaryOutputColumn = "target"  # dataset column holding the reference summary
summaryInputColumn = "source"   # dataset column holding the text to summarize

def capTargetTokensWithEOS(ids, eosID):
    """Return ``ids`` followed by ``eosID``, at most ``maxTargetTokens`` long.

    When ``ids`` already has ``maxTargetTokens`` or more entries, only the first
    ``maxTargetTokens - 1`` are kept so that the EOS id still fits.
    """
    needsCap = len(ids) >= maxTargetTokens
    kept = ids[: maxTargetTokens - 1] if needsCap else ids
    return kept + [eosID]

def tokenizeDataset(row):
    """Turn one dataset row into a left-padded causal-LM training example.

    The token layout is ``[PAD ...] source </SUMMARY> target EOS``. Labels are
    -100 for the padding, the source tokens and the separator, so only the
    target tokens and the EOS contribute label values.

    Args:
        row: mapping that provides the ``summaryInputColumn`` and
            ``summaryOutputColumn`` fields; both are converted with ``str``.

    Returns:
        dict with the lists "input_ids", "attention_mask" and "labels", each
        left-padded up to ``maxTokens`` entries.
    """
    sourceText = str(row[summaryInputColumn]).rstrip() + "\n"
    targetText = str(row[summaryOutputColumn]).rstrip()

    sourceIDs = tokenizer(sourceText, add_special_tokens=False).input_ids
    targetIDs = tokenizer(targetText, add_special_tokens=False).input_ids

    # Hard-cap the target and append EOS explicitly so the model learns to stop.
    targetIDs = capTargetTokensWithEOS(targetIDs, tokenizer.eos_token_id)

    # Fixed target segment: separator id followed by the capped target ids.
    targetSegment = [specialTokenIDs] + targetIDs

    # Source budget: whatever the target segment leaves free, never more than
    # maxSourceTokens and never less than one token.
    maxSourceTokensLen = max(1, maxTokens - len(targetSegment))
    maxSourceTokensLen = min(maxSourceTokensLen, maxSourceTokens)

    # Keep the tail of an over-long source.
    if len(sourceIDs) > maxSourceTokensLen:
        sourceIDs = sourceIDs[-maxSourceTokensLen:]

    # If the sequence is still too long, drop the earliest source tokens and
    # leave the target intact. The cut uses an explicit start index: the former
    # ``sourceIDs[-sourceKeep:]`` evaluated to ``sourceIDs[-0:]`` when nothing
    # was to be kept, which returns the whole list instead of an empty one.
    overflow = len(sourceIDs) + len(targetSegment) - maxTokens
    if overflow > 0:
        sourceIDs = sourceIDs[min(overflow, len(sourceIDs)):]

    inputTextTokens = sourceIDs + targetSegment
    attentionMask = [1] * len(inputTextTokens)

    # Labels: -100 for every source token and for the separator, then the
    # target ids (EOS included) unchanged.
    labels = ([-100] * len(sourceIDs)) + ([-100]) + list(targetIDs)

    # Left-pad to the fixed length so every example can be stacked directly.
    padLength = maxTokens - len(inputTextTokens)
    if padLength > 0:
        padID = int(tokenizer.pad_token_id)
        inputTextTokens = [padID] * padLength + inputTextTokens
        attentionMask = [0] * padLength + attentionMask
        labels = [-100] * padLength + labels

    return {
        "input_ids": inputTextTokens,
        "attention_mask": attentionMask,
        "labels": labels,
    }

# Apply tokenizeDataset to every row of every split, dropping the raw text
# columns; load_from_cache_file=False asks for a fresh computation instead of
# a cached result.
tokenizedDataset = dataset.map(tokenizeDataset, remove_columns=dataset["train"].column_names, desc="Tokenizing & masking", load_from_cache_file=False)

Tokenizing & masking:   0%|          | 0/70000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1202 > 1024). Running this sequence through the model will result in indexing errors


Tokenizing & masking:   0%|          | 0/10000 [00:00<?, ? examples/s]

Tokenizing & masking:   0%|          | 0/10000 [00:00<?, ? examples/s]

Tokenizing the dataset for evaluation during training with prompt only

Configuring a LoRA adapter for parameter-efficient fine-tuning

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training


# LoRA adapter settings for causal-LM fine-tuning: rank 32, alpha 64 (2 x rank)
# and 5% dropout on the adapter path.
loraConfig = LoraConfig(
    r=32,
    lora_alpha=2*32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    # NOTE(review): if PEFT matches target modules by name suffix, "c_proj"
    # already covers "mlp.c_proj" and the last entry is redundant — confirm.
    target_modules=["c_attn", "c_proj", "mlp.c_fc", "mlp.c_proj"],
)

# Wrap the base model with the adapter; from here on `model` refers to the
# PEFT-wrapped model. The call below prints the trainable parameter counts.
model = get_peft_model(model, loraConfig)
model.print_trainable_parameters()

trainable params: 2,359,296 || all params: 84,272,640 || trainable%: 2.7996




Features collator for the dataset

In [None]:
import torch


def collateFeaturesOfDataset(features):
    """Stack already-padded examples into one int64 tensor per feature key.

    ``features`` is a list of dicts sharing the keys of its first element; the
    value lists must all have the same length because no padding happens here.
    """
    batch = {}
    for key in features[0]:
        rows = [example[key] for example in features]
        batch[key] = torch.tensor(rows, dtype=torch.long)
    return batch

Specifying trainer arguments and performing training

In [None]:
from transformers import (Trainer, TrainingArguments, Seq2SeqTrainingArguments, Seq2SeqTrainer)


# Training configuration. Checkpoints are written into the project folder.
modeltrainerArguments = Seq2SeqTrainingArguments(
    output_dir=projectFolderPath,
    # 2 examples per step x 16 accumulation steps = 32 examples per optimizer
    # update on each device.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=16,
    num_train_epochs=6.0,
    learning_rate=1e-4,
    weight_decay=0.01,
    warmup_ratio=0.1,
    logging_strategy="steps",
    logging_steps=50,
    # Evaluate and save on the same 250-step cadence; at most two checkpoints
    # are kept.
    eval_strategy="steps",
    eval_steps=250,
    save_steps=250,
    save_total_limit=2,
    lr_scheduler_type="cosine_with_restarts",
    fp16=False,
    bf16=True,
    dataloader_pin_memory=True,
    report_to="none",
    max_grad_norm = 0.5,
    # NOTE(review): tokenizeDataset pads every example to maxTokens, so
    # grouping by length presumably has nothing to group — confirm.
    group_by_length = True,
    # The best checkpoint is the one with the lowest eval_loss and is reloaded
    # when training ends.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    seed=42,
    # The datasets already contain exactly the columns the collator stacks.
    remove_unused_columns=False,
    # No text generation during evaluation.
    predict_with_generate=False,
    generation_num_beams=1,
    # dataloader_num_workers=2,
)

# Trainer over the tokenized train/validation splits, using the custom
# collator that stacks the pre-padded examples.
trainer = Seq2SeqTrainer(
    model=model,
    args=modeltrainerArguments,
    train_dataset=tokenizedDataset["train"],
    eval_dataset=tokenizedDataset["validation"],
    data_collator=collateFeaturesOfDataset,
    processing_class=tokenizer,
)

trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
250,4.7904,4.395711
500,4.2288,3.99343
750,4.0228,3.794136
1000,3.9015,3.678534
1250,3.7712,3.557521
1500,3.6599,3.444511
1750,3.5644,3.355908
2000,3.4796,3.287972
2250,3.4391,3.227671
2500,3.3767,3.177826




TrainOutput(global_step=13128, training_loss=3.219030669082744, metrics={'train_runtime': 15763.3966, 'train_samples_per_second': 26.644, 'train_steps_per_second': 0.833, 'total_flos': 1.1583275139072e+17, 'train_loss': 3.219030669082744, 'epoch': 6.0})

Saving the tokenizer and trained model

In [None]:
import os

# One output folder receives the trainer's model files, the tokenizer files
# and the generation config.
trainedModelDir = projectFolderPath + "Trained Model Data/"

trainer.save_model(trainedModelDir)
tokenizer.save_pretrained(trainedModelDir)
model.generation_config.save_pretrained(trainedModelDir)

# Confirm that the folder exists and report where everything was written.
print(os.path.exists(trainedModelDir))
print("Saved to:", trainedModelDir)

True
Saved to: /content/drive/MyDrive/Colab Notebooks/Gen AI/Mini Project 1/Trained Model Data/


Generation from trained model

In [None]:
from transformers import TextStreamer

# Put the model in evaluation mode and make sure it is on the selected device
# before generating.
model.eval()
model.to(device)

def generate_summary(input_text, max_new_tokens=maxTokens, decoding="beam"):
    """Generate a summary for ``input_text`` with the fine-tuned model.

    The prompt is built the way tokenizeDataset builds training examples: the
    source text plus a newline, truncated from the left to ``maxSourceTokens``
    tokens, followed by the ``</SUMMARY>`` separator id. Without the separator
    the model is prompted with a sequence shape it never saw during training.

    Args:
        input_text: source text to summarize.
        max_new_tokens: upper bound on generated tokens; it is reduced to the
            room left in the model's context window after the prompt.
        decoding: "beam" for 4-beam search; any other value selects nucleus
            sampling (top_p=0.9, temperature=0.8).

    Returns:
        The generated text as a string, cut at the first EOS or separator
        token and stripped of special tokens and surrounding whitespace.
    """
    ctx = getattr(model.config, "n_positions",
                  getattr(model.config, "max_position_embeddings", 1024))
    sepID = tokenizer.convert_tokens_to_ids(summaryEndToken)

    prompt = input_text.rstrip() + "\n"
    promptIDs = tokenizer(prompt, add_special_tokens=False).input_ids

    # Keep the tail of an over-long source, as training does, and leave room
    # for the separator plus at least one generated token.
    sourceBudget = max(1, min(maxSourceTokens, ctx - 2))
    promptIDs = promptIDs[-sourceBudget:] + [sepID]

    input_ids = torch.tensor([promptIDs], dtype=torch.long, device=device)
    attention_mask = torch.ones_like(input_ids)

    inputTokensLen = input_ids.shape[1]
    remainingTokensLen = max(ctx - inputTokensLen, 1)
    safeTokensLen = max(1, min(max_new_tokens, remainingTokensLen))

    gen_kwargs = dict(
        max_new_tokens=safeTokensLen,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,  # training appends EOS to every target
        no_repeat_ngram_size=3,
    )

    if decoding == "beam":
        gen_kwargs.update(dict(num_beams=4, length_penalty=1.05, early_stopping=True))
    else:  # nucleus
        gen_kwargs.update(dict(do_sample=True, top_p=0.9, temperature=0.8))

    with torch.no_grad():
        out = model.generate(input_ids=input_ids, attention_mask=attention_mask, **gen_kwargs)

    # Keep only the newly generated ids and stop at the first EOS/separator so
    # neither padding nor a repeated separator leaks into the text.
    newIDs = out[0][inputTokensLen:].tolist()
    for stopID in (tokenizer.eos_token_id, sepID):
        if stopID in newIDs:
            newIDs = newIDs[: newIDs.index(stopID)]

    text = tokenizer.decode(newIDs, skip_special_tokens=True).strip()

    return text

# Spot-check the first two validation rows: show a 600-character preview of
# the input, the generated summary and the reference summary.
for i in range(2):
    ex = dataset["validation"][i]
    gen = generate_summary(ex[summaryInputColumn])
    sections = (
        ("=== INPUT (truncated) ===", (ex[summaryInputColumn][:600], "...\n")),
        ("=== GENERATED ===", (gen, "\n")),
        ("=== REFERENCE ===", (ex[summaryOutputColumn], "\n")),
    )
    for heading, parts in sections:
        print(heading)
        print(*parts)
    print("="*80)

=== INPUT (truncated) ===
Summarize the following hospitalization for the discharge summary.
Chief Complaint: Fever
Discharge Diagnosis: Fever Thrombocytopenia Anemia Chronic lymphocytic leukemia Diabetes-non insulin dependent Chronic kidney disease Hypertension
Discharge Instructions: Ms  you were admitted to the hospital because of fever. You had a fever work-up including blood count, urine analysis, blood cutures and urine culture. You were started on antibiotics because of initial evidence of a possible urinary tract infection, but the final urine culture came back negative for an infection and you have no symptoms ...

=== GENERATED ===
The patient presented to the emergency department and was evaluated by the orthopedic surgery team. The patient was found to have<|endoftext|> 

=== REFERENCE ===
Summary: with chronic lymphocytic leukemia on low-dose BendamustineRituximab therapy who presens with fever 4 days after receiving BendamustineRituxan with neulasta support . . Fever:Pt a

Reloading the saved model and tokenizer

In [None]:
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from peft import PeftModel

# Folder holding the merged model and tokenizer.
# NOTE(review): this folder appears to be produced by the merge cell further
# down, so that cell must have been run once before this one — confirm.
path = Path("/content/drive/MyDrive/Colab Notebooks/Gen AI/Mini Project 1/Trained Model Merged")

# Load the tokenizer from local files only and make sure a pad token exists.
tokenizer = AutoTokenizer.from_pretrained(path, local_files_only=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Alternative path, kept for reference: rebuild the base model and attach the
# trained adapter instead of loading merged weights.
# base = AutoModelForCausalLM.from_pretrained("distilgpt2")
# base.resize_token_embeddings(len(tokenizer))
# base.config.pad_token_id = tokenizer.pad_token_id

# model = PeftModel.from_pretrained(base, path)   # <- attaches your trained adapter
# model.eval()

# after model is saved, run this code instead
# Force the config's vocabulary size to the tokenizer length before loading.
config = AutoConfig.from_pretrained(path, local_files_only=True)
config.vocab_size = len(tokenizer)

# NOTE(review): with ignore_mismatched_sizes=True a checkpoint whose embedding
# shape differs from the config presumably loads without raising — confirm the
# saved embeddings really match len(tokenizer).
model = AutoModelForCausalLM.from_pretrained(path, local_files_only=True,
                                             ignore_mismatched_sizes=True,
                                             config=config)
# Keep the embedding size and the pad/eos ids aligned with the tokenizer.
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id
model.generation_config.pad_token_id = tokenizer.pad_token_id
model.generation_config.eos_token_id = tokenizer.eos_token_id

The module name Trained Model Merged (originally Trained Model Merged) is not a valid Python identifier. Please rename the original module to avoid import issues.


In [None]:
# The model was not saved in merged form after training, so the adapter is
# merged into the base weights here and the result is saved with the tokenizer.
# NOTE(review): merge_and_unload() is called on `model`, so `model` must be the
# adapter-wrapped model (the commented-out PeftModel path of the reload cell),
# not the plain model that cell currently loads — confirm the run order.
# NOTE(review): the "..." in both paths looks like a placeholder for the real
# project folder — confirm before running.

merged = model.merge_and_unload()
merged.save_pretrained("/content/drive/MyDrive/.../Trained Model Merged")
tokenizer.save_pretrained("/content/drive/MyDrive/.../Trained Model Merged")

('/content/drive/MyDrive/.../Trained Model Merged/tokenizer_config.json',
 '/content/drive/MyDrive/.../Trained Model Merged/special_tokens_map.json',
 '/content/drive/MyDrive/.../Trained Model Merged/vocab.json',
 '/content/drive/MyDrive/.../Trained Model Merged/merges.txt',
 '/content/drive/MyDrive/.../Trained Model Merged/added_tokens.json',
 '/content/drive/MyDrive/.../Trained Model Merged/tokenizer.json')

Evaluation of trained model

In [None]:
# Evaluation dependencies: the `evaluate` metrics loader plus the ROUGE and
# BERTScore backends used by the cells below.
!pip install -q evaluate
!pip install -q rouge_score
!pip install -q bert_score

In [None]:
import evaluate, numpy as np
import torch

# Constants repeated from the tokenization cell so this cell can run after the
# reload cell without re-running tokenization.
maxSourceTokens = 704   # cap on source (prompt) tokens
maxTargetTokens = 256   # cap on target (summary) tokens
maxTokens= maxSourceTokens + maxTargetTokens + 64  # 704 + 256 + 64 = 1024
summaryOutputColumn = "target"
summaryInputColumn = "source"
# Same device selection as in the setup cell: CUDA, then MPS, then CPU.
device = (
    "cuda" if torch.cuda.is_available() else
    "mps"  if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available() else
    "cpu"
)
print("Device:", device)
model.to(device)

# 0) one-time setup
model.eval()
model.config.use_cache = True             # speeds up generation
tokenizer.padding_side = "left"           # decoder-only: pads go before the prompt
PAD = tokenizer.pad_token_id
# NOTE(review): summaryEndToken is defined in the model setup cell; running
# only the reload cell before this one would leave it undefined — confirm.
SEP = summaryEndToken                     # "</SUMMARY>"
MAX_NEW = 256                             # generation budget per example
BATCH = 16                                # prompts per generate() call; tune for your GPU

# 1) build a prompt-only validation slice
def build_prompt(ex):
    """Map a raw row to its generation prompt and its reference summary.

    The prompt is the right-stripped source text, a newline and the separator
    string ``SEP``; the reference is the right-stripped target text.
    """
    source_text = ex[summaryInputColumn].rstrip()
    prompt_text = "".join((source_text, "\n", SEP))
    reference_text = ex[summaryOutputColumn].rstrip()
    return {"prompt": prompt_text, "ref": reference_text}

# Take at most the first 5000 validation rows and replace the raw columns with
# the "prompt" and "ref" fields produced by build_prompt.
val = dataset["validation"].select(range(min(5000, len(dataset["validation"])))) \
                           .map(build_prompt, remove_columns=dataset["validation"].column_names)

# 2) pre-tokenize prompts in bulk (pad to longest per-batch later)
def batched_generate(prompts):
    """Greedily generate one continuation per prompt and return only new text.

    Args:
        prompts: list of prompt strings, each ending with the separator.

    Returns:
        List of generated strings, one per prompt, without the prompt text and
        without special tokens.
    """
    # Truncate over-long prompts on the left so the separator at the end of the
    # prompt survives; the tokenizer's previous setting is restored afterwards.
    previousSide = tokenizer.truncation_side
    tokenizer.truncation_side = "left"
    try:
        enc = tokenizer(prompts, padding=True, truncation=True,
                        max_length=1024 - MAX_NEW, return_tensors="pt")
    finally:
        tokenizer.truncation_side = previousSide

    enc = {k: v.to(model.device) for k, v in enc.items()}
    # With left padding every row has the same prompt length, so one column
    # offset separates prompt tokens from generated tokens for the whole batch.
    promptLen = enc["input_ids"].shape[1]

    with torch.inference_mode():
        gen = model.generate(
            **enc,
            max_new_tokens=MAX_NEW,
            do_sample=False,           # greedy
            num_beams=1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=PAD,
        )

    # Decode only the generated part. Splitting the decoded text on SEP cannot
    # work here: SEP is a special token and skip_special_tokens=True removes
    # it, which left the full prompt inside every prediction.
    texts = tokenizer.batch_decode(gen[:, promptLen:], skip_special_tokens=True)
    return [t.strip() for t in texts]

# 3) run generation in batches
# Walk the validation slice in chunks of BATCH rows, collecting the generated
# texts and the whitespace-stripped references in matching order.
preds, refs = [], []
for start in range(0, len(val), BATCH):
    chunk = val[start : start + BATCH]
    preds += batched_generate(chunk["prompt"])
    refs += [reference.strip() for reference in chunk["ref"]]

# 4) vectorized ROUGE (faster than per-example rouge_scorer loop)
# Score all predictions against the references in one call and report the
# four ROUGE variants rounded to four decimals.
rouge = evaluate.load("rouge")
res = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
print({metric: round(res[metric], 4)
       for metric in ("rouge1", "rouge2", "rougeL", "rougeLsum")})

# {'rouge1': np.float64(0.2625), 'rouge2': np.float64(0.0683), 'rougeL': np.float64(0.1147), 'rougeLsum': np.float64(0.162)}

Device: cuda
{'rouge1': np.float64(0.2626), 'rouge2': np.float64(0.0683), 'rougeL': np.float64(0.1147), 'rougeLsum': np.float64(0.162)}


BERTScore evaluation

In [None]:
from evaluate import load


# Compute BERTScore for the predictions and references produced by the ROUGE
# cell, using roberta-base as the scoring model and the same device and batch
# size as generation.
bertscore = load("bertscore")
results = bertscore.compute(
    predictions=preds, references=refs,
    lang="en",
    device=device,
    batch_size=BATCH,
    model_type="roberta-base",
)

# Report the mean of the per-example F1 scores.
print("BERTScore (F1):", np.mean(results["f1"]))
# Recorded result of a previous run:
# BERTScore (F1): 0.8109372292041779

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore (F1): 0.8109372292041779
