In [None]:
import os
# Switch off the Weights & Biases integration through its environment variable.
# NOTE(review): the warning printed by the training cell reports this variable as
# deprecated in favour of the `report_to` setting.
os.environ['WANDB_DISABLED']="true"

Install and load libraries

In [None]:
%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

%pip install \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

[0m

In [None]:
# Install git-lfs for pushing the model and logs to the Hugging Face Hub
!sudo apt-get install git-lfs --yes

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

from datasets import Dataset, load_metric

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Load data

In [None]:
import pandas as pd
import numpy as np

# Directory holding the three short-dialogue CSV splits
short_dialogue_dir = '/content/drive/MyDrive/Colab Notebooks/w266_Project Ideas/Data/Short_dialogue'

# Paths to the files
path_train_short = f'{short_dialogue_dir}/train_short.csv'
path_test_short = f'{short_dialogue_dir}/test_short.csv'
path_val_short = f'{short_dialogue_dir}/val_short.csv'

# Load the datasets (one DataFrame per split)
train_short, test_short, val_short = (
    pd.read_csv(csv_path)
    for csv_path in (path_train_short, path_test_short, path_val_short)
)

# Check the first few rows to ensure they're loaded correctly
print("Short_dialogue_train_data:")
print(train_short.head())

# Function to calculate statistics
def _word_count_stats(column, name):
    """Summarise the whitespace-delimited word counts of one text column.

    Args:
        column: pandas Series of texts; missing values count as empty text.
        name: label inserted into the result keys (e.g. "dialogue").

    Returns:
        Dict with the keys 'Min/Ave/Max/95%/Median <name> length'. Every value is
        NaN when the column has no rows.
    """
    keys = [f'{prefix} {name} length' for prefix in ('Min', 'Ave', 'Max', '95%', 'Median')]
    if len(column) == 0:
        # np.percentile cannot handle an empty input, so report "no data" explicitly.
        return dict.fromkeys(keys, float('nan'))

    # Missing cells would otherwise reach len() as float NaN and raise TypeError.
    lengths = column.fillna("").astype(str).str.split().str.len()
    values = [
        lengths.min(),
        lengths.mean(),
        lengths.max(),
        np.percentile(lengths, 95),
        lengths.median(),
    ]
    return dict(zip(keys, values))


def calculate_statistics(data, text_column, label_column):
    """Compute word-count statistics for the dialogue and label columns of `data`.

    Words are counted by splitting each text on whitespace. Missing values are
    treated as empty text (zero words) instead of raising.

    Args:
        data: pandas DataFrame containing both columns.
        text_column: name of the column holding the dialogues.
        label_column: name of the column holding the reference texts.

    Returns:
        Tuple `(dialogue_stats, label_stats)`; each is a dict with the minimum,
        mean, maximum, 95th percentile and median word count.
    """
    dialogue_stats = _word_count_stats(data[text_column], 'dialogue')
    label_stats = _word_count_stats(data[label_column], 'label')
    return dialogue_stats, label_stats


Short_dialogue_train_data:
  section_header                                       section_text  \
0          GENHX  The patient is a 75-year-old female who comes ...   
1      FAM/SOCHX         Significant for diabetes and hypertension.   
2  PASTMEDICALHX                  Significant for anxiety disorder.   
3          GENHX  The patient is a 77-year-old female who is una...   
4      FAM/SOCHX                                   Noncontributory.   

                                            dialogue  Dialogue_Length  \
0  Doctor: Welcome to the clinic. I am Doctor Fra...             1396   
1  Doctor: Does anyone else in your family suffer...              175   
2  Doctor: Have we gone over your survey results ...              256   
3  Guest_clinician: How old is the patient? Docto...              438   
4  Doctor: Do you have a known- Patient: Drug all...              105   

   Summary_Length  
0             677  
1              42  
2              33  
3             325  
4      

# Load Flan-T5-large

Reference: [Flan-T5 paper (arXiv:2210.11416)](https://arxiv.org/pdf/2210.11416)

In [None]:
# Hub id of the base checkpoint that gets fine-tuned below.
model_name='google/flan-t5-large'

# The weights are requested in bfloat16 through `torch_dtype`.
t5_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
# Tokenizer of the same checkpoint; used for prompts, labels and decoding in later cells.
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def print_number_of_trainable_model_parameters(model):
    """Build a three-line report of trainable versus total parameter counts.

    Despite the name nothing is printed here: the report is returned as a string
    and the caller decides what to do with it.

    Args:
        model: object whose `named_parameters()` yields `(name, parameter)` pairs,
            each parameter offering `numel()` and `requires_grad`.

    Returns:
        String with the trainable count, the total count and the trainable share
        as a percentage with two decimals.
    """
    # (element count, trainable?) for every parameter, collected in a single pass
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_model_params = sum(count for count, _ in sizes)
    trainable_model_params = sum(count for count, trainable in sizes if trainable)
    share = 100 * trainable_model_params / all_model_params
    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {share:.2f}%"
    )

print(print_number_of_trainable_model_parameters(t5_model))


trainable model parameters: 783150080
all model parameters: 783150080
percentage of trainable model parameters: 100.00%


Test model

In [None]:
# Zero-shot check of the base model on one test example
index = 10

if len(test_short) <= index:
    # The requested row does not exist in the test DataFrame
    print(f"Index {index} is out of bounds for the test_short DataFrame with length {len(test_short)}.")
else:
    sample = test_short.iloc[index]
    dialogue = sample['dialogue']
    summary = sample['section_text']

    # The indentation inside the triple-quoted string is part of the prompt text.
    prompt = f"""
    Summarize the following conversation.

    {dialogue}

    Summary:
    """

    inputs = tokenizer(prompt, return_tensors='pt')
    generated = t5_model.generate(
        inputs["input_ids"],
        max_new_tokens=200,
    )
    output = tokenizer.decode(generated[0], skip_special_tokens=True)

    # Show prompt, human summary and model output separated by rulers
    dash_line = '-' * 100
    print(dash_line)
    print(f'INPUT PROMPT:\n{prompt}')
    print(dash_line)
    print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
    print(dash_line)
    print(f'MODEL GENERATION - ZERO SHOT:\n{output}')


----------------------------------------------------------------------------------------------------
INPUT PROMPT:

    Summarize the following conversation.

    Doctor: History of any illnesses, surgeries, or hospitalizations? Patient: I dislocated my wrist when I was a kid but that's all.

    Summary:
    
----------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
Surgeries: None. Injuries: Dislocated wrist. Illnesses: None.

----------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
Patient: I dislocated my wrist when I was a kid.


Tokenization

In [None]:
# Tokenize function
def tokenize_function(batch):
    """Convert a batch of raw examples into fixed-length model inputs and labels.

    Every dialogue is wrapped in the instruction prompt and tokenized to exactly
    1024 ids (padded or truncated). Every `section_text` is tokenized to exactly
    512 ids, and its padding positions are replaced by -100 so they are ignored
    by the loss.

    Args:
        batch: mapping with "dialogue" and "section_text" lists of strings, as
            passed by `Dataset.map(..., batched=True)`.

    Returns:
        The same `batch` object with `input_ids`, `attention_mask` and `labels`
        added as plain Python lists.
    """
    start_prompt = 'Summarize the following patient-doctor dialogue. Include all medically relevant information, including family history, diagnosis, past medical and surgical history, immunizations, lab results and known allergies. You should first predict the most relevant clinical note section header and then summarize the dialogue. Dialogue:.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in batch["dialogue"]]

    model_inputs = tokenizer(prompt, padding="max_length", max_length=1024, truncation=True)
    batch['input_ids'] = model_inputs["input_ids"]
    # The inputs are already padded to a fixed length here, so the mask has to be
    # stored with them: a mask rebuilt later from the sequence length alone could
    # not tell padding from real tokens.
    batch['attention_mask'] = model_inputs["attention_mask"]

    target_ids = tokenizer(batch["section_text"], padding="max_length", max_length=512, truncation=True)["input_ids"]
    pad_token_id = tokenizer.pad_token_id
    # -100 marks label positions that must not contribute to the loss.
    batch["labels"] = [
        [-100 if token == pad_token_id else token for token in labels]
        for labels in target_ids
    ]
    return batch

# Number of examples per `map` call below; the training cell reuses it as the
# per-device train/eval batch size.
batch_size = 2

# Convert the pandas DataFrames to `datasets.Dataset` objects, rebinding the same names.
# NOTE(review): because the names are rebound, running this cell a second time hands
# `from_pandas` a Dataset instead of a DataFrame; re-run the data loading cell first.
train_short = Dataset.from_pandas(train_short)
test_short = Dataset.from_pandas(test_short)
val_short = Dataset.from_pandas(val_short)


In [None]:
# Mapping: tokenize each split and drop the raw columns it was built from
raw_columns = ["section_header", "section_text", "dialogue", "Dialogue_Length", "Summary_Length"]
map_options = dict(batched=True, batch_size=batch_size, remove_columns=raw_columns)

train_short = train_short.map(tokenize_function, **map_options)
test_short = test_short.map(tokenize_function, **map_options)
val_short = val_short.map(tokenize_function, **map_options)


Map:   0%|          | 0/1360 [00:00<?, ? examples/s]

Map:   0%|          | 0/222 [00:00<?, ? examples/s]

Map:   0%|          | 0/119 [00:00<?, ? examples/s]

In [None]:
# rouge = evaluate.load('rouge')

# # Helper function to postprocess text
# import nltk
# nltk.download("punkt")
# from nltk.tokenize import sent_tokenize

# def postprocess_text(preds, labels):
#     preds = [pred.strip() for pred in preds]
#     labels = [label.strip() for label in labels]

#     # rougeLSum expects newline after each sentence
#     preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
#     labels = ["\n".join(sent_tokenize(label)) for label in labels]

#     return preds, labels

# def compute_metrics(eval_preds):
#     preds, labels = eval_preds
#     if isinstance(preds, tuple):
#         preds = preds[0]
#     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

#     decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

#     result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
#     result = {k: round(v * 100, 4) for k, v in result.items()}
#     prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
#     result["gen_len"] = np.mean(prediction_lens)

#     # Explicitly log the metrics for debugging
#     print(f"Evaluation metrics: {result}")

#     return result


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
# Download the NLTK "punkt" resource.
# NOTE(review): presumably required by `sent_tokenize` in postprocess_text — confirm.
nltk.download("punkt")

# Metric: ROUGE loaded through `evaluate`; compute_metrics calls `metric.compute`.
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    """Strip each text and lay its sentences out one per line.

    Predictions and labels get the same treatment: surrounding whitespace is
    removed, the text is split with `sent_tokenize`, and the sentences are joined
    with newlines (rougeLSum expects a newline after each sentence).

    Args:
        preds: list of predicted texts.
        labels: list of reference texts.

    Returns:
        Tuple `(preds, labels)` of two new lists of processed strings.
    """
    def _one_sentence_per_line(texts):
        return ["\n".join(sent_tokenize(text.strip())) for text in texts]

    return _one_sentence_per_line(preds), _one_sentence_per_line(labels)

def compute_metrics(eval_preds):
    """Score generated summaries against the references with ROUGE.

    Args:
        eval_preds: pair `(preds, labels)` of token-id arrays. `preds` may itself
            be a tuple, in which case its first element is used.

    Returns:
        Dict of the ROUGE values multiplied by 100 and rounded to 4 decimals, plus
        `gen_len`, the mean number of non-padding tokens per prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_token_id = tokenizer.pad_token_id
    # -100 is a padding/ignore marker, not a token id, so it cannot be decoded.
    # Replace it in the predictions as well as in the labels.
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_token_id)
    labels = np.where(labels != -100, labels, pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    # Counted after the replacement above, so -100 markers are not counted as tokens.
    prediction_lens = [np.count_nonzero(pred != pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from transformers import DataCollatorForSeq2Seq

# ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator configured with the tokenizer and model, `label_pad_token_id` as the
# label padding value and `pad_to_multiple_of=8`.
# NOTE(review): the training cell builds a collator with the same arguments again,
# so this instance is overwritten before it is used.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=t5_model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

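The ROUGE scores in the table below are scaled by 100 (see `compute_metrics`), so each displayed value must be divided by 100 to get the raw score. Gen Len is the average number of generated tokens and is not rescaled.

For example, a displayed value of 30.5592 corresponds to a ROUGE score of 0.305592.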

Training

In [None]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from torch.optim import AdamW

# Training configuration

repository_id = 'hankym/flan-t5'  # Hugging Face Hub repository

# Used by both the training arguments and the hand-built optimizer
learning_rate = 1e-4
weight_decay = 0.01

training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    fp16=False,  # fp16 mixed precision is switched off
    learning_rate=learning_rate,
    num_train_epochs=5,
    weight_decay=weight_decay,
    # Logging
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=500,
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Data collator for dynamic padding
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=t5_model,
    label_pad_token_id=-100,
    pad_to_multiple_of=8,
)

# Custom optimizer over all model parameters
optimizer = AdamW(t5_model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Define the trainer; the `None` scheduler slot leaves scheduler creation to the trainer
trainer = Seq2SeqTrainer(
    model=t5_model,
    args=training_args,
    train_dataset=train_short,
    eval_dataset=val_short,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None),
)

# Train the model
torch.cuda.empty_cache()
trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.0574,1.904215,30.5592,13.4807,26.0642,28.1615,14.739496
2,1.036,1.72177,31.2901,13.4791,26.2049,28.2566,14.151261
3,1.5398,1.674238,31.5327,13.888,26.885,28.6391,14.042017
4,1.5046,1.671612,31.2798,13.5902,26.528,28.3489,14.067227
5,1.4828,1.673516,31.7079,13.7886,27.1011,28.9632,14.067227


TrainOutput(global_step=3400, training_loss=1.3522232235179228, metrics={'train_runtime': 1642.5056, 'train_samples_per_second': 4.14, 'train_steps_per_second': 2.07, 'total_flos': 3.13448441315328e+16, 'train_loss': 1.3522232235179228, 'epoch': 5.0})

In [None]:
# save the model
# The trainer's model and the tokenizer files are written to the same local folder;
# that folder is what the upload cell pushes to the Hub.
trainer.save_model("./flan_t5_large_finetuned_MTS_dialogue")
tokenizer.save_pretrained("./flan_t5_large_finetuned_MTS_dialogue")

('./flan_t5_large_finetuned_MTS_dialogue/tokenizer_config.json',
 './flan_t5_large_finetuned_MTS_dialogue/special_tokens_map.json',
 './flan_t5_large_finetuned_MTS_dialogue/spiece.model',
 './flan_t5_large_finetuned_MTS_dialogue/added_tokens.json',
 './flan_t5_large_finetuned_MTS_dialogue/tokenizer.json')

In [None]:
# push to huggingface
from huggingface_hub import HfApi, HfFolder

# Target Hub repository.
# NOTE(review): this rebinds `model_name`, which earlier held the base checkpoint id.
model_name = "hankym/flan_t5_large_finetuned_MTS_dialogue"
api = HfApi()

# Create the repository if it doesn't exist
api.create_repo(repo_id=model_name, repo_type="model", exist_ok=True)

# Upload the model files
# Everything in the local save folder is uploaded to the repository created above.
api.upload_folder(
    folder_path="./flan_t5_large_finetuned_MTS_dialogue",
    repo_id=model_name,
    repo_type="model"
)

training_args.bin:   0%|          | 0.00/3.64k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/1.57G [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/hankym/flan_t5_large_finetuned_MTS_dialogue/commit/2aa7dff80334bb34a234920b934b5e222c544040', commit_message='Upload folder using huggingface_hub', commit_description='', oid='2aa7dff80334bb34a234920b934b5e222c544040', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Load the fine-tuned model and tokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Reload the tokenizer from the uploaded Hub repository (rebinds `tokenizer`).
tokenizer = AutoTokenizer.from_pretrained("hankym/flan_t5_large_finetuned_MTS_dialogue")
# The fine-tuned weights are loaded in bfloat16 under the new name `model`;
# `t5_model` keeps pointing at the object that was trained in this session.
model = AutoModelForSeq2SeqLM.from_pretrained("hankym/flan_t5_large_finetuned_MTS_dialogue", torch_dtype=torch.bfloat16)



tokenizer_config.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/788 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.57G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Paths to the files
path_test_short = '/content/drive/MyDrive/Colab Notebooks/w266_Project Ideas/Data/Short_dialogue/test_short.csv'

# Load the datasets
# `test_short` was rebound to a tokenized Dataset earlier, so the raw CSV is read
# again here to get a DataFrame with the original text columns.
test_short = pd.read_csv(path_test_short)

# Check the shape
print("Shape of short_dialogue_test_data:")
print(test_short.shape)

Shape of short_dialogue_test_data:
(222, 5)


In [None]:
# First five rows of the test DataFrame, kept as a small subset.
sub_set_test = test_short[:5]

In [None]:
# Convert the five-row subset to a `datasets.Dataset` (rebinds `sub_set_test`).
sub_set_test = Dataset.from_pandas(sub_set_test)

In [None]:
# Convert the full test DataFrame to a `datasets.Dataset` (rebinds `test_short`).
test_short = Dataset.from_pandas(test_short)

In [None]:
max_input_length = 1024
max_output_length = 512
def tokenize_function(batch):
    """Tokenize a batch of test examples for generation and reference decoding.

    This definition replaces the training-time `tokenize_function` from earlier in
    the notebook. It uses the same instruction prompt but keeps the label padding
    as pad tokens (no -100 replacement), so the labels can be decoded directly.

    Args:
        batch: mapping with "dialogue" and "section_text" lists of strings, as
            passed by `Dataset.map(..., batched=True)`.

    Returns:
        The same `batch` object with `input_ids` (padded or truncated to
        `max_input_length`) and `labels` (padded or truncated to
        `max_output_length`) added.
    """
    # Must be the exact prompt used for fine-tuning ("patient-doctor", with hyphen);
    # a different wording at evaluation time would not match what the model was trained on.
    start_prompt = 'Summarize the following patient-doctor dialogue. Include all medically relevant information, including family history, diagnosis, past medical and surgical history, immunizations, lab results and known allergies. You should first predict the most relevant clinical note section header and then summarize the dialogue. Dialogue:.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in batch["dialogue"]]
    # Plain id lists are sufficient here; no tensor conversion is needed before storage.
    batch['input_ids'] = tokenizer(prompt, padding="max_length", max_length=max_input_length, truncation=True)["input_ids"]
    batch['labels'] = tokenizer(batch["section_text"], padding="max_length", max_length=max_output_length, truncation=True)["input_ids"]

    return batch

In [None]:
# Tokenize the five-row subset in batches of `batch_size` and drop the raw columns.
sub_set_test = sub_set_test.map(
    tokenize_function,
    batched=True,
    batch_size=batch_size,
    remove_columns=["section_header", "section_text", "dialogue","Dialogue_Length","Summary_Length"],
)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [None]:
# Tokenize the full test split the same way (rebinds `test_short`).
test_short = test_short.map(
    tokenize_function,
    batched=True,
    batch_size=batch_size,
    remove_columns=["section_header", "section_text", "dialogue","Dialogue_Length","Summary_Length"],
)

Map:   0%|          | 0/222 [00:00<?, ? examples/s]

In [None]:
def model_generation(model, tokenizer, dataset, return_dialogues=False, **generate_kwargs):
    """Generate one output per example and decode the matching reference.

    Args:
        model: object exposing `generate(input_ids=..., **kwargs)`.
        tokenizer: object exposing `decode(ids, skip_special_tokens=...)`.
        dataset: iterable of examples, each with "input_ids" and "labels" id
            sequences.
        return_dialogues: when True, the decoded input texts are returned as well.
            These are the full tokenized inputs, i.e. whatever prompt text was
            tokenized together with the dialogue.
        **generate_kwargs: passed through unchanged to `model.generate`
            (for example `max_new_tokens`).

    Returns:
        `(predictions, references)` by default, or
        `(dialogues, predictions, references)` when `return_dialogues` is True.
    """
    dialogues = []
    predictions = []
    references = []

    # Build the input tensor on the model's device when the model exposes one.
    device = getattr(model, "device", None)

    for example in dataset:
        input_ids = example["input_ids"]

        # Decode the whole id sequence; indexing [0] would pick a single token id.
        dialogues.append(tokenizer.decode(input_ids, skip_special_tokens=True))
        references.append(tokenizer.decode(example["labels"], skip_special_tokens=True))

        # Generate for a batch of one example
        outputs = model.generate(
            input_ids=torch.tensor([input_ids], device=device),
            **generate_kwargs,
        )
        predictions.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

    if return_dialogues:
        return dialogues, predictions, references
    return predictions, references

# Call the function with the correct argument
# Runs generation over the tokenized test split with the reloaded fine-tuned model.
t5_predictions, t5_references = model_generation(model, tokenizer, test_short)



In [None]:
from rouge_score import rouge_scorer

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Compute ROUGE scores for each pair of prediction and reference.
# RougeScorer.score expects (target, prediction), so the reference goes first;
# with the arguments swapped, precision and recall would be exchanged.
rouge_scores = [
    scorer.score(ref, pred)
    for pred, ref in zip(t5_predictions, t5_references)
]

# Calculate average ROUGE scores (F-measure) over all examples
num_scores = len(rouge_scores)
avg_rouge1 = sum(score['rouge1'].fmeasure for score in rouge_scores) / num_scores
avg_rouge2 = sum(score['rouge2'].fmeasure for score in rouge_scores) / num_scores
avg_rougeL = sum(score['rougeL'].fmeasure for score in rouge_scores) / num_scores

# Print average ROUGE scores
print(f"Average ROUGE-1: {avg_rouge1}")
print(f"Average ROUGE-2: {avg_rouge2}")
print(f"Average ROUGE-L: {avg_rougeL}")

Average ROUGE-1: 0.32690432810436415
Average ROUGE-2: 0.15535610008885106
Average ROUGE-L: 0.2860573732805059


In [None]:
with torch.cuda.amp.autocast():
    # model_generation hands back a (predictions, references) pair; unpacking it
    # into three names raises ValueError.
    t5_predictions, t5_references = model_generation(model, tokenizer, test_short)

# The results table also needs the input texts, so decode them from the tokenized
# inputs here. Each text is the full tokenized input (prompt plus dialogue).
dialogues = [
    tokenizer.decode(example["input_ids"], skip_special_tokens=True)
    for example in test_short
]

In [None]:
from rouge_score import rouge_scorer

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Compute ROUGE scores for each pair of prediction and reference.
# RougeScorer.score expects (target, prediction), so the reference goes first;
# with the arguments swapped, precision and recall would be exchanged.
rouge_scores = [
    scorer.score(ref, pred)
    for pred, ref in zip(t5_predictions, t5_references)
]

# Calculate average ROUGE scores (F-measure) over all examples
num_scores = len(rouge_scores)
avg_rouge1 = sum(score['rouge1'].fmeasure for score in rouge_scores) / num_scores
avg_rouge2 = sum(score['rouge2'].fmeasure for score in rouge_scores) / num_scores
avg_rougeL = sum(score['rougeL'].fmeasure for score in rouge_scores) / num_scores

# Print average ROUGE scores
print(f"Average ROUGE-1: {avg_rouge1}")
print(f"Average ROUGE-2: {avg_rouge2}")
print(f"Average ROUGE-L: {avg_rougeL}")

Average ROUGE-1: 0.32690432810436415
Average ROUGE-2: 0.15535610008885106
Average ROUGE-L: 0.2860573732805059


In [None]:
# Results table: input texts, reference summaries and model predictions side by side
data = dict(
    Dialogues=dialogues,
    Reference=t5_references,
    Prediction=t5_predictions,
)
df = pd.DataFrame(data)

import io
import os

# Specify the directory to save the CSV files
output_dir = '/content/drive/MyDrive/Colab Notebooks/w266_Project Ideas/Results/Models/Baseline_T5large'

# Ensure the directory exists, then write the table without the index column
os.makedirs(output_dir, exist_ok=True)
predictions_csv = os.path.join(output_dir, "baseline_flan_t5_large_shortdialogue_predictions.csv")
df.to_csv(predictions_csv, index=False)