In [None]:
import os
# Disable Weights & Biases logging for this session. Transformers reports this
# variable as deprecated (see the warning printed when the training arguments
# are created) and suggests controlling integrations through `report_to`.
os.environ['WANDB_DISABLED']="true"

# Install and load libraries

In [None]:
# Install the quantization, dataset, LoRA and evaluation dependencies.
# NOTE(review): nothing is version-pinned and transformers/peft are installed
# from their GitHub repositories, so a later re-run may resolve to different
# versions than the ones that produced the outputs in this notebook.
!pip install -q bitsandbytes datasets accelerate loralib rouge_score evaluate
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
# install git-lfs for pushing model and logs to the Hugging Face Hub
!sudo apt-get install git-lfs --yes

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!nvidia-smi -L

GPU 0: NVIDIA A100-SXM4-40GB (UUID: GPU-ac8d3d25-c34a-2530-810a-6b95e1fabfcc)


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

from datasets import Dataset, load_metric

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Model set up with Lora for short dialogue

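Fine-tuned version of `allenai/led-large-16384`, AllenAI's Longformer Encoder-Decoder (LED). Model card of the checkpoint loaded below: https://huggingface.co/allenai/led-large-16384 (an arXiv-summarization variant is available at https://huggingface.co/allenai/led-large-16384-arxiv). Paper: https://arxiv.org/pdf/2004.05150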

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import BitsAndBytesConfig

model_name = "allenai/led-large-16384"
# Load the base LED checkpoint with 8-bit weights. The quantization settings are
# passed through `quantization_config` because the bare `load_in_8bit=True`
# argument is deprecated (see the warning this cell used to print).
led_large_lora = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

# Tokenizer matching the base checkpoint; reused for training and evaluation.
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [None]:
# Freeze every weight of the loaded model; trainable adapter weights are added
# in a later cell.
for param in led_large_lora.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

led_large_lora.gradient_checkpointing_enable()  # reduce number of stored activations
# Presumably required so gradients still flow through the checkpointed blocks
# even though the base weights are frozen - TODO confirm against the
# transformers documentation for `enable_input_require_grads`.
led_large_lora.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  """Sequential container whose result is always converted to float32."""

  def forward(self, x):
    # Run the wrapped modules as usual, then up-cast whatever they return.
    result = super().forward(x)
    return result.to(torch.float32)
# Wrap the LM head so that its output is returned as float32.
led_large_lora.lm_head = CastOutputToFloat(led_large_lora.lm_head)

In [None]:
def print_trainable_parameters(model):
    """
    Print how many parameters of `model` are trainable.

    Reports the trainable count, the total count, and the trainable share
    as a percentage of the total.
    """
    # (element count, is trainable) for every named parameter
    sizes = [(weight.numel(), weight.requires_grad) for _, weight in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, is_trainable in sizes if is_trainable)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [None]:
from peft import LoraConfig, TaskType, get_peft_model

# LoRA adapter configuration for the attention query/value projections.
config = LoraConfig(
    r=16,  # rank of the low-rank adapter matrices
    lora_alpha=32,  # LoRA scaling factor
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    # Use PEFT's task-type enum: "Seq2Seq" is not one of its task types, so the
    # model would not be wrapped as a sequence-to-sequence PEFT model.
    task_type=TaskType.SEQ_2_SEQ_LM,
)

# Attach the adapters to the frozen base model and report the trainable share.
led_large_lora = get_peft_model(led_large_lora, config)
print_trainable_parameters(led_large_lora)

trainable params: 1572864 || all params: 461374464 || trainable%: 0.34090833427660183


In [None]:
!nvidia-smi

Tue Jul 16 07:01:09 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0              50W / 400W |   1029MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Load data

In [None]:
# Location of the short-dialogue CSV splits on the mounted Drive
short_dialogue_dir = '/content/drive/MyDrive/Colab Notebooks/w266_Project Ideas/Data/Short_dialogue'
path_train_short = f'{short_dialogue_dir}/train_short.csv'
path_test_short = f'{short_dialogue_dir}/test_short.csv'
path_val_short = f'{short_dialogue_dir}/val_short.csv'

# Read the three splits (train, test, validation) into DataFrames
train_short, test_short, val_short = (
    pd.read_csv(csv_path)
    for csv_path in (path_train_short, path_test_short, path_val_short)
)

# Peek at the first training rows as a sanity check
print("short_dialogue_train_data:")
print(train_short.head())

# Report the (rows, columns) shape of each split
for split_tag, split_frame in (("train", train_short), ("test", test_short), ("val", val_short)):
    print(f"Shape of short_dialogue_{split_tag}_data:")
    print(split_frame.shape)


short_dialogue_train_data:
  section_header                                       section_text  \
0          GENHX  The patient is a 75-year-old female who comes ...   
1      FAM/SOCHX         Significant for diabetes and hypertension.   
2  PASTMEDICALHX                  Significant for anxiety disorder.   
3          GENHX  The patient is a 77-year-old female who is una...   
4      FAM/SOCHX                                   Noncontributory.   

                                            dialogue  Dialogue_Length  \
0  Doctor: Welcome to the clinic. I am Doctor Fra...             1396   
1  Doctor: Does anyone else in your family suffer...              175   
2  Doctor: Have we gone over your survey results ...              256   
3  Guest_clinician: How old is the patient? Docto...              438   
4  Doctor: Do you have a known- Patient: Drug all...              105   

   Summary_Length  
0             677  
1              42  
2              33  
3             325  
4      

In [None]:
# Report how many examples each split contains
for split_label, split_rows in (
    ("Training", train_short),
    ("Validation", val_short),
    ("Test", test_short),
):
    print(f"{split_label} Set Length:", len(split_rows))


Training Set Length: 1360
Validation Set Length: 119
Test Set Length: 222


In [None]:
train_short.head(1)

Unnamed: 0,section_header,section_text,dialogue,Dialogue_Length,Summary_Length
0,GENHX,The patient is a 75-year-old female who comes ...,Doctor: Welcome to the clinic. I am Doctor Fra...,1396,677


In [None]:
# Convert the pandas DataFrames into Hugging Face `Dataset` objects.
# NOTE(review): the DataFrame names are rebound to Datasets here, so re-running
# this cell without first re-running the CSV-loading cell passes a Dataset
# (not a DataFrame) to `from_pandas`.
train_short = Dataset.from_pandas(train_short)
test_short = Dataset.from_pandas(test_short)
val_short = Dataset.from_pandas(val_short)


# Fine-tuning the model

## Tokenization

In [None]:
# tokenize
max_input_length = 2048
max_output_length = 1024
def tokenize_function(batch):
    """Turn a batch of dialogues and reference notes into model inputs.

    Args:
        batch: mapping with a "dialogue" list (source texts) and a
            "section_text" list (target summaries).

    Returns:
        The same mapping with "input_ids", "attention_mask",
        "global_attention_mask" and "labels" added. Inputs are padded/truncated
        to `max_input_length`, targets to `max_output_length`, and padding
        positions in the labels are replaced by -100.
    """
    start_prompt = 'Summarize the following patientdoctor dialogue. Include all medical patientdoctor dialogue. Its relevant information, including family history, diagnosis, past medical and surgical history, immunizations, lab results and known allergies. You should first predict the most relevant clinical note section header and then summarize the dialogue. Dialogue:.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in batch["dialogue"]]
    # tokenize the inputs and labels
    inputs = tokenizer(
        prompt,
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )
    outputs = tokenizer(
        batch["section_text"],
        padding="max_length",
        truncation=True,
        max_length=max_output_length,
    )

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask

    # Global attention on the first token of every sample only. Each sample gets
    # its own list, so the result no longer depends on all rows aliasing one
    # shared list, and an empty batch simply yields an empty mask list.
    batch["global_attention_mask"] = [
        [1 if position == 0 else 0 for position in range(len(ids))]
        for ids in batch["input_ids"]
    ]

    # We have to make sure that the PAD token is ignored
    batch["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in labels]
        for labels in outputs.input_ids
    ]

    return batch

In [None]:
# map the function
# Tokenize the training split in batches of `batch_size` examples and drop the
# raw text/metadata columns so only the tokenized fields remain.
batch_size = 2
train_short = train_short.map(
    tokenize_function,
    batched=True,
    batch_size=batch_size,
    remove_columns=["section_header", "section_text", "dialogue","Dialogue_Length", "Summary_Length"],
)

Map:   0%|          | 0/1360 [00:00<?, ? examples/s]

In [None]:
# Tokenize the test split the same way as the training split.
test_short = test_short.map(
    tokenize_function,
    batched=True,
    batch_size=batch_size,
    remove_columns=["section_header", "section_text", "dialogue","Dialogue_Length", "Summary_Length"],
)

Map:   0%|          | 0/222 [00:00<?, ? examples/s]

In [None]:
# Tokenize the validation split the same way as the training split.
val_short = val_short.map(
   tokenize_function,
    batched=True,
    batch_size=batch_size,
    remove_columns=["section_header", "section_text", "dialogue","Dialogue_Length", "Summary_Length"],
)

Map:   0%|          | 0/119 [00:00<?, ? examples/s]

In [None]:
from random import sample
# check the shape after tokenization
# Slice the rows first and then pick the column, so only the first example is
# read instead of materialising the whole column just to keep one row.
first_row = train_short[0:1]
sample_input_id = first_row['input_ids']
sample_label = first_row['labels']
print("tokenized_train_exp: ", train_short)
print("\nsample_input_id: ", len(sample_input_id[0]), sample_input_id)
print("\nsample_label: ", len(sample_label[0]), sample_label)


tokenized_train_exp:  Dataset({
    features: ['input_ids', 'attention_mask', 'global_attention_mask', 'labels'],
    num_rows: 1360
})

sample_input_id:  2048 [[0, 38182, 3916, 2072, 5, 511, 3186, 44914, 6054, 4, 39682, 70, 1131, 3186, 44914, 6054, 4, 38, 6634, 4249, 335, 6, 217, 284, 750, 6, 9726, 6, 375, 1131, 8, 15535, 750, 6, 13998, 18391, 6, 6348, 775, 8, 684, 26331, 4, 370, 197, 78, 7006, 5, 144, 4249, 5154, 1591, 2810, 12734, 8, 172, 40402, 5, 6054, 4, 33854, 35, 4, 50118, 50118, 41152, 35, 14826, 7, 5, 8474, 4, 38, 524, 12521, 3848, 4, 27690, 35, 3837, 47, 4, 85, 16, 2579, 7, 972, 47, 4, 12521, 35, 653, 3291, 47, 88, 192, 162, 452, 116, 27690, 35, 38, 206, 38, 189, 33, 56, 10, 8579, 4, 12521, 35, 653, 1102, 7, 146, 47, 206, 14, 47, 56, 10, 8579, 116, 27690, 35, 152, 662, 6, 38, 1299, 101, 402, 11, 127, 14599, 4, 85, 21, 10, 7782, 2157, 4, 1892, 38, 1299, 269, 24719, 219, 4, 1308, 314, 865, 8, 15345, 1299, 31086, 4, 12521, 35, 6553, 47, 2217, 16253, 116, 27690, 35, 38, 218, 75,

## Training

In [None]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Metric
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    """Strip each text and put every sentence on its own line.

    Returns the processed predictions and labels as two lists, in that order.
    """
    def _one_sentence_per_line(text):
        # rougeLSum expects newline after each sentence
        return "\n".join(sent_tokenize(text.strip()))

    cleaned_preds = [_one_sentence_per_line(pred) for pred in preds]
    cleaned_labels = [_one_sentence_per_line(label) for label in labels]
    return cleaned_preds, cleaned_labels

def compute_metrics(eval_preds):
    """Compute ROUGE scores and mean generated length for an evaluation pass.

    Args:
        eval_preds: pair of (predictions, labels) arrays of token ids;
            predictions may arrive as a tuple whose first item is the ids.

    Returns:
        Dict of ROUGE values multiplied by 100 and rounded to 4 decimals, plus
        "gen_len", the mean count of non-pad tokens per prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    # Replace -100 with the pad id in BOTH arrays: -100 cannot be decoded, and
    # padded predictions can contain it just like the labels do.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    # Generated length = number of non-pad tokens in each prediction
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from transformers import DataCollatorForSeq2Seq

# ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
# Labels are padded with `label_pad_token_id`, and the collator is asked to pad
# to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=led_large_lora,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [None]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, get_scheduler
from torch.optim import AdamW

# Define custom optimizer
# Only parameters that still require gradients are handed to AdamW; the base
# model was frozen earlier, so its weights have nothing to optimize.
trainable_parameters = [param for param in led_large_lora.parameters() if param.requires_grad]
optimizer = AdamW(trainable_parameters, lr=3e-5, weight_decay=0.01)

# Define the output directory
# NOTE(review): `repository_id` is not used by the arguments below, which write
# to "outputs" - confirm which location is intended.
repository_id = "./output_directory"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=6,
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=3e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    #logging & evaluation strategies
    #logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # Turn off logging integrations explicitly instead of relying only on the
    # deprecated WANDB_DISABLED environment variable.
    report_to="none",
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Create Trainer instance
# Trains on the tokenized training split and evaluates on the validation split
# with `compute_metrics`; no learning-rate scheduler is supplied (None).
trainer = Seq2SeqTrainer(
    model=led_large_lora,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_short,
    eval_dataset=val_short,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None),  # Pass the custom optimizer
)

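The ROUGE scores in the table below are shown on a 0-100 scale (they are multiplied by 100 in `compute_metrics`), so divide them by 100 to obtain the usual 0-1 values. Gen Len is the mean number of generated tokens and is not rescaled.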

In [None]:
# Start training
# `torch.autocast(device_type="cuda")` replaces the deprecated
# `torch.cuda.amp.autocast()` context manager with the same CUDA autocasting.
with torch.autocast(device_type="cuda"):
    trainer.train()




Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.811933,0.1293,0.0,0.1293,0.1293,19.369748
2,No log,2.562354,0.8784,0.0856,0.8722,0.8939,18.621849
3,2.661400,2.50075,1.0293,0.2976,1.0009,1.0245,18.487395
4,2.661400,2.48963,1.3888,0.2557,1.1605,1.215,18.521008
5,2.661400,2.484824,1.5797,0.1684,1.2699,1.3722,18.739496




In [None]:
!nvidia-smi

Tue Jul 16 07:53:31 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0              51W / 400W |  12793MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# save the model
# Write the trained model files and the tokenizer files to the same local folder.
trainer.save_model("./led_large_finetuned_MTS_dialogue")
tokenizer.save_pretrained("./led_large_finetuned_MTS_dialogue")

('./led_large_finetuned_MTS_dialogue/tokenizer_config.json',
 './led_large_finetuned_MTS_dialogue/special_tokens_map.json',
 './led_large_finetuned_MTS_dialogue/vocab.json',
 './led_large_finetuned_MTS_dialogue/merges.txt',
 './led_large_finetuned_MTS_dialogue/added_tokens.json',
 './led_large_finetuned_MTS_dialogue/tokenizer.json')

In [None]:
# push to huggingface
from huggingface_hub import HfApi, HfFolder

# NOTE: `model_name` is rebound here from the base checkpoint name to the
# Hub repository id that receives the upload.
model_name = "agnesem/led_large_finetuned_MTS_dialogue"
api = HfApi()

# Create the repository if it doesn't exist
api.create_repo(repo_id=model_name, repo_type="model", exist_ok=True)

# Upload the model files
api.upload_folder(
    folder_path="./led_large_finetuned_MTS_dialogue",
    repo_id=model_name,
    repo_type="model"
)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/agnesem/led_large_finetuned_MTS_dialogue/commit/e8e6d3671f8347bece99fc711ce18ba125ef0f34', commit_message='Upload folder using huggingface_hub', commit_description='', oid='e8e6d3671f8347bece99fc711ce18ba125ef0f34', pr_url=None, pr_revision=None, pr_num=None)

## Model evaluation on short dataset

Load finetuned model

In [None]:
# Load the fine-tuned model and tokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Both are pulled from the Hub repository; the model weights are requested in
# bfloat16.
# NOTE(review): the model is not moved to a GPU in this cell - confirm whether
# CPU generation is intended for the evaluation below.
tokenizer = AutoTokenizer.from_pretrained("agnesem/led_large_finetuned_MTS_dialogue")
model = AutoModelForSeq2SeqLM.from_pretrained("agnesem/led_large_finetuned_MTS_dialogue", torch_dtype=torch.bfloat16)


adapter_model.safetensors:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

Load and prepare data

In [None]:
# Paths to the files
path_test_short = '/content/drive/MyDrive/Colab Notebooks/w266_Project Ideas/Data/Short_dialogue/test_short.csv'

# Load the datasets
# Re-read the raw test CSV: `test_short` was turned into a tokenized Dataset
# earlier, and the evaluation below needs the original text columns.
test_short = pd.read_csv(path_test_short)

# Check the shape
print("Shape of short_dialogue_test_data:")
print(test_short.shape)


Shape of short_dialogue_test_data:
(222, 5)


In [None]:
num_chunks = 5

# Split the DataFrame
# Chunk boundaries follow np.array_split's convention: the first
# `len(test_short) % num_chunks` chunks receive one extra row. Row-slicing with
# `.iloc` keeps every chunk a DataFrame and avoids the deprecated
# `DataFrame.swapaxes` call that `np.array_split` makes on a DataFrame.
rows_per_chunk, extra_rows = divmod(len(test_short), num_chunks)
chunk_bounds = [0]
for chunk_idx in range(num_chunks):
    chunk_bounds.append(chunk_bounds[-1] + rows_per_chunk + (1 if chunk_idx < extra_rows else 0))
dfs = [test_short.iloc[start:stop] for start, stop in zip(chunk_bounds[:-1], chunk_bounds[1:])]

In [None]:
# save dialogue and section_text to lists for the later ROUGE score
# One list of texts per chunk, in the same order as `dfs`.
dialogue_lists = [subset["dialogue"].tolist() for subset in dfs]
note_lists = [subset["section_text"].tolist() for subset in dfs]

In [None]:
# One Hugging Face Dataset per DataFrame chunk, in chunk order
datasets = [Dataset.from_pandas(chunk_frame) for chunk_frame in dfs]

In [None]:
datasets

[Dataset({
     features: ['section_header', 'section_text', 'dialogue', 'Dialogue_Length', 'Summary_Length'],
     num_rows: 45
 }),
 Dataset({
     features: ['section_header', 'section_text', 'dialogue', 'Dialogue_Length', 'Summary_Length'],
     num_rows: 45
 }),
 Dataset({
     features: ['section_header', 'section_text', 'dialogue', 'Dialogue_Length', 'Summary_Length'],
     num_rows: 44
 }),
 Dataset({
     features: ['section_header', 'section_text', 'dialogue', 'Dialogue_Length', 'Summary_Length'],
     num_rows: 44
 }),
 Dataset({
     features: ['section_header', 'section_text', 'dialogue', 'Dialogue_Length', 'Summary_Length'],
     num_rows: 44
 })]

Tokenization

In [None]:
# tokenize
max_input_length = 2048
max_output_length = 1024
def process_data_to_model_inputs(batch):
    """Turn a batch of dialogues and reference notes into evaluation inputs.

    Args:
        batch: mapping with a "dialogue" list (source texts) and a
            "section_text" list (reference summaries).

    Returns:
        The same mapping with "input_ids", "attention_mask",
        "global_attention_mask" and "labels" added. Inputs are padded/truncated
        to `max_input_length`, references to `max_output_length`; the labels
        keep the tokenizer's pad ids (no -100 masking is applied here).
    """
    start_prompt = 'Summarize the following patientdoctor dialogue. Include all medical patientdoctor dialogue. Its relevant information, including family history, diagnosis, past medical and surgical history, immunizations, lab results and known allergies. You should first predict the most relevant clinical note section header and then summarize the dialogue. Dialogue:.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in batch["dialogue"]]
    # tokenize the inputs and labels
    inputs = tokenizer(
        prompt,
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )
    outputs = tokenizer(
        batch["section_text"],
        padding="max_length",
        truncation=True,
        max_length=max_output_length,
    )

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask

    # Global attention on the first token of every sample only. Each sample gets
    # its own list, so the result no longer depends on all rows aliasing one
    # shared list, and an empty batch simply yields an empty mask list.
    batch["global_attention_mask"] = [
        [1 if position == 0 else 0 for position in range(len(ids))]
        for ids in batch["input_ids"]
    ]
    batch["labels"] = outputs.input_ids

    return batch

In [None]:
# Tokenize every chunk. Iterate over however many chunks exist instead of a
# hard-coded 5, so the loop stays correct if the number of chunks changes.
for i in range(len(datasets)):
    processed_dataset = datasets[i].map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=["section_header", "section_text", "dialogue","Dialogue_Length", "Summary_Length"],
    )
    # Create a variable name dynamically (dataset_0, dataset_1, ...)
    globals()[f'dataset_{i}'] = processed_dataset

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

Map:   0%|          | 0/44 [00:00<?, ? examples/s]

Map:   0%|          | 0/44 [00:00<?, ? examples/s]

Map:   0%|          | 0/44 [00:00<?, ? examples/s]

In [None]:
dataset_0

Dataset({
    features: ['input_ids', 'attention_mask', 'global_attention_mask', 'labels'],
    num_rows: 45
})

Model generation

subset_0

In [None]:
def model_generation(model, tokenizer, dataset):
    """Generate one summary per example with beam search.

    Args:
        model: object exposing `generate(...)`; if it has a `device` attribute,
            the input tensors are created on that device.
        tokenizer: object exposing `decode(ids, skip_special_tokens=True)`.
        dataset: iterable of examples, each providing "input_ids",
            "attention_mask" and "global_attention_mask" as lists of ints.

    Returns:
        List of decoded summaries, in the order of `dataset`.
    """
    # Build the inputs where the model lives so generation also works when the
    # model has been moved to a GPU; None leaves torch's default placement.
    device = getattr(model, "device", None)
    predictions = []

    for example in dataset:
        # Each example is sent through the model as a batch of one.
        outputs = model.generate(
            input_ids=torch.tensor([example["input_ids"]], device=device),
            attention_mask=torch.tensor([example["attention_mask"]], device=device),
            global_attention_mask=torch.tensor([example["global_attention_mask"]], device=device),
            num_beams=4,
            min_length=100,
            max_length=1024,
            length_penalty=2,
            no_repeat_ngram_size=3,
        )

        # Keep the decoded text of the single returned sequence
        predictions.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    return predictions

In [None]:
# Generate predictions for the first chunk.
# `torch.autocast(device_type="cuda")` replaces the deprecated
# `torch.cuda.amp.autocast()` context manager.
with torch.autocast(device_type="cuda"):
  predictions_sub_0 = model_generation(model, tokenizer, dataset_0)

In [None]:
from rouge_score import rouge_scorer

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Compute ROUGE scores for each pair of prediction and reference.
# RougeScorer.score takes (target, prediction), so the reference goes first;
# with the arguments swapped, precision and recall would be exchanged (the
# F-measure reported below is the same either way).
rouge_scores = []
for pred, ref in zip(predictions_sub_0, note_lists[0]):
    score = scorer.score(ref, pred)
    rouge_scores.append(score)


# Calculate average ROUGE scores
avg_rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
avg_rouge2 = sum([score['rouge2'].fmeasure for score in rouge_scores]) / len(rouge_scores)
avg_rougeL = sum([score['rougeL'].fmeasure for score in rouge_scores]) / len(rouge_scores)

# Print average ROUGE scores
print(f"Average ROUGE-1: {avg_rouge1}")
print(f"Average ROUGE-2: {avg_rouge2}")
print(f"Average ROUGE-L: {avg_rougeL}")

Average ROUGE-1: 0.20307918768669384
Average ROUGE-2: 0.08748904103258041
Average ROUGE-L: 0.12577010318172374


In [None]:
# Collect dialogue, reference note and model prediction for the first chunk
# side by side so they can be inspected later.
data = {
    "Dialogue": dialogue_lists[0],
    "Reference": note_lists[0],
    "Prediction": predictions_sub_0,
}
df = pd.DataFrame(data)

import io
import os

# Specify the directory to save the CSV files
output_dir = '/content/drive/MyDrive/Colab Notebooks/w266_Project Ideas/Results'

# Ensure the directory exists
os.makedirs(output_dir, exist_ok=True)
df.to_csv(os.path.join(output_dir,"led_large_lora_predictions_sub_0.csv"), index=False)

subset_1

In [None]:
from rouge_score import rouge_scorer

# Generate predictions for the second chunk.
# `torch.autocast(device_type="cuda")` replaces the deprecated
# `torch.cuda.amp.autocast()` context manager.
with torch.autocast(device_type="cuda"):
  predictions_sub_1 = model_generation(model, tokenizer, dataset_1)

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Compute ROUGE scores for each pair of prediction and reference.
# RougeScorer.score takes (target, prediction), so the reference goes first.
rouge_scores = []
for pred, ref in zip(predictions_sub_1, note_lists[1]):
    score = scorer.score(ref, pred)
    rouge_scores.append(score)


# Calculate average ROUGE scores
avg_rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
avg_rouge2 = sum([score['rouge2'].fmeasure for score in rouge_scores]) / len(rouge_scores)
avg_rougeL = sum([score['rougeL'].fmeasure for score in rouge_scores]) / len(rouge_scores)

# Print average ROUGE scores
print(f"Average ROUGE-1: {avg_rouge1}")
print(f"Average ROUGE-2: {avg_rouge2}")
print(f"Average ROUGE-L: {avg_rougeL}")

# Dialogue, reference and prediction for this chunk, side by side
data = {
    "Dialogue": dialogue_lists[1],
    "Reference": note_lists[1],
    "Prediction": predictions_sub_1,
}
df = pd.DataFrame(data)

import io
import os

# Specify the directory to save the CSV files
output_dir = '/content/drive/MyDrive/Colab Notebooks/w266_Project Ideas/Results'

# Ensure the directory exists
os.makedirs(output_dir, exist_ok=True)
df.to_csv(os.path.join(output_dir,"led_large_lora_predictions_sub_1.csv"), index=False)

Average ROUGE-1: 0.21975286337026312
Average ROUGE-2: 0.08924675083324828
Average ROUGE-L: 0.14074079121895242


subset_2

In [None]:
from rouge_score import rouge_scorer

# Generate predictions for the third chunk.
# `torch.autocast(device_type="cuda")` replaces the deprecated
# `torch.cuda.amp.autocast()` context manager.
with torch.autocast(device_type="cuda"):
  predictions_sub_2 = model_generation(model, tokenizer, dataset_2)

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Compute ROUGE scores for each pair of prediction and reference.
# RougeScorer.score takes (target, prediction), so the reference goes first.
rouge_scores = []
for pred, ref in zip(predictions_sub_2, note_lists[2]):
    score = scorer.score(ref, pred)
    rouge_scores.append(score)


# Calculate average ROUGE scores
avg_rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
avg_rouge2 = sum([score['rouge2'].fmeasure for score in rouge_scores]) / len(rouge_scores)
avg_rougeL = sum([score['rougeL'].fmeasure for score in rouge_scores]) / len(rouge_scores)

# Print average ROUGE scores
print(f"Average ROUGE-1: {avg_rouge1}")
print(f"Average ROUGE-2: {avg_rouge2}")
print(f"Average ROUGE-L: {avg_rougeL}")

# Dialogue, reference and prediction for this chunk, side by side
data = {
    "Dialogue": dialogue_lists[2],
    "Reference": note_lists[2],
    "Prediction": predictions_sub_2,
}
df = pd.DataFrame(data)

import io
import os

# Specify the directory to save the CSV files
output_dir = '/content/drive/MyDrive/Colab Notebooks/w266_Project Ideas/Results'

# Ensure the directory exists
os.makedirs(output_dir, exist_ok=True)
df.to_csv(os.path.join(output_dir,"led_large_lora_predictions_sub_2.csv"), index=False)

Average ROUGE-1: 0.20250855555916666
Average ROUGE-2: 0.08504236080526631
Average ROUGE-L: 0.1398445760779495


subset_3

In [None]:
# Generate predictions for the fourth chunk. Per-chunk ROUGE is not computed in
# this cell; only the predictions are produced and saved.
# `torch.autocast(device_type="cuda")` replaces the deprecated
# `torch.cuda.amp.autocast()` context manager.
with torch.autocast(device_type="cuda"):
  predictions_sub_3 = model_generation(model, tokenizer, dataset_3)

# Dialogue, reference and prediction for this chunk, side by side
data = {
    "Dialogue": dialogue_lists[3],
    "Reference": note_lists[3],
    "Prediction": predictions_sub_3,
}
df = pd.DataFrame(data)

import io
import os

# Specify the directory to save the CSV files
output_dir = '/content/drive/MyDrive/Colab Notebooks/w266_Project Ideas/Results'

# Ensure the directory exists
os.makedirs(output_dir, exist_ok=True)
df.to_csv(os.path.join(output_dir,"led_large_lora_predictions_sub_3.csv"), index=False)

subset_4

In [None]:
# Generate predictions for the fifth chunk. Per-chunk ROUGE is not computed in
# this cell; only the predictions are produced and saved.
# `torch.autocast(device_type="cuda")` replaces the deprecated
# `torch.cuda.amp.autocast()` context manager.
with torch.autocast(device_type="cuda"):
  predictions_sub_4 = model_generation(model, tokenizer, dataset_4)

# Dialogue, reference and prediction for this chunk, side by side
data = {
    "Dialogue": dialogue_lists[4],
    "Reference": note_lists[4],
    "Prediction": predictions_sub_4,
}
df = pd.DataFrame(data)

import io
import os

# Specify the directory to save the CSV files
output_dir = '/content/drive/MyDrive/Colab Notebooks/w266_Project Ideas/Results'

# Ensure the directory exists
os.makedirs(output_dir, exist_ok=True)
df.to_csv(os.path.join(output_dir,"led_large_lora_predictions_sub_4.csv"), index=False)

Combine and evaluate overall

In [None]:
# combine all the predictions:
# Concatenate the per-chunk prediction lists in chunk order
predictions = []
for chunk_no in range(5):
  predictions += globals()[f"predictions_sub_{chunk_no}"]


In [None]:
# Paths to the files
path_test_short = '/content/drive/MyDrive/Colab Notebooks/w266_Project Ideas/Data/Short_dialogue/test_short.csv'

# Load the datasets
# Re-read the full test CSV so dialogues and references can be taken from it
# for the overall evaluation.

test_short = pd.read_csv(path_test_short)

# Check the shape

print("Shape of short_dialogue_test_data:")
print(test_short.shape)

Shape of long_dialogue_test_data:
(222, 5)


In [None]:
# Full-test-set dialogues and reference notes, in file order.
dialogues = test_short["dialogue"].tolist()
reference = test_short["section_text"].tolist()

In [None]:
# Dialogue, reference and combined prediction for the whole test set, side by
# side. NOTE(review): this assumes `predictions` is in the same row order as the
# test CSV - verify against how the chunks were built.
data = {
    "Dialogue": dialogues,
    "Reference": reference,
    "Prediction": predictions,
}
df = pd.DataFrame(data)

import io
import os

# Specify the directory to save the CSV files
output_dir = '/content/drive/MyDrive/Colab Notebooks/w266_Project Ideas/Results'

# Ensure the directory exists
os.makedirs(output_dir, exist_ok=True)
df.to_csv(os.path.join(output_dir,"led_lora_short_predictions.csv"), index=False)

In [None]:
from rouge_score import rouge_scorer

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Compute ROUGE scores for each pair of prediction and reference.
# RougeScorer.score takes (target, prediction), so the reference goes first;
# with the arguments swapped, precision and recall would be exchanged (the
# F-measure reported below is the same either way).
rouge_scores = []
for pred, ref in zip(predictions,reference):
    score = scorer.score(ref, pred)
    rouge_scores.append(score)


# Calculate average ROUGE scores
avg_rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
avg_rouge2 = sum([score['rouge2'].fmeasure for score in rouge_scores]) / len(rouge_scores)
avg_rougeL = sum([score['rougeL'].fmeasure for score in rouge_scores]) / len(rouge_scores)

# Print average ROUGE scores
print(f"Average ROUGE-1: {avg_rouge1}")
print(f"Average ROUGE-2: {avg_rouge2}")
print(f"Average ROUGE-L: {avg_rougeL}")

Average ROUGE-1: 0.2101229163539918
Average ROUGE-2: 0.08529470533933904
Average ROUGE-L: 0.13506212431352624


**Average ROUGE-1: 0.2101**

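This is the mean ROUGE-1 F-measure over the test set (0.2101, i.e. about 21.01 on a 0-100 scale). ROUGE-1 measures unigram (single-word) overlap between each generated summary and its reference, combining precision and recall, so it is not literally the share of overlapping words. The value suggests a limited-to-moderate level of content matching at the word level.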

**Average ROUGE-2: 0.0853**

This is the mean ROUGE-2 F-measure over the test set (0.0853, i.e. about 8.53 on a 0-100 scale). ROUGE-2 measures overlap of bigrams (two consecutive words) between each generated summary and its reference. A lower score than ROUGE-1 is expected and indicates less overlap at the phrase level.

**Average ROUGE-L: 0.1351**

This is the mean ROUGE-L F-measure over the test set (0.1351, i.e. about 13.51 on a 0-100 scale). ROUGE-L is based on the longest common subsequence of words between each generated summary and its reference, so it reflects how much in-order word overlap there is. The value indicates a modest similarity in sentence-level structure.