Check GPU Availability:

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Fri Jul 19 07:45:20 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0              48W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

Set Environment Variables

In [None]:
import os
# Switch off the Weights & Biases logging integration for this session.
# NOTE(review): the warning printed by the training cell says this variable is deprecated
# in favour of the `report_to` setting.
os.environ['WANDB_DISABLED']="true"

Install and load libraries

In [None]:
# Upgrade pip itself first (this is the only unpinned install in the cell).
%pip install --upgrade pip
# Pinned PyTorch packages.
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

# Pinned Hugging Face, ROUGE and LoRA/PEFT packages used by the rest of the notebook.
%pip install \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

[0m

In [None]:
# Install git-lfs, needed for pushing the model and logs to the Hugging Face Hub.
!sudo apt-get install git-lfs --yes

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


Login to Hugging Face Hub:

In [None]:
from huggingface_hub import notebook_login

# Show the interactive Hugging Face login widget so a token can be entered in the notebook
# rather than being hard-coded.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Import Libraries:

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

from datasets import Dataset, load_metric

from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths used below live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


# Load Flan-T5-base

Reference for Flan-T5: [Scaling Instruction-Finetuned Language Models](https://arxiv.org/pdf/2210.11416)

In [None]:
# T5Config is otherwise only imported in a later cell; import it here so this cell runs on a
# fresh kernel (Restart & Run All) instead of raising NameError.
from transformers import T5Config

# Checkpoint to fine-tune and its configuration, with the dropout rate set explicitly.
model_name = 'google/flan-t5-base'
config = T5Config.from_pretrained(model_name)
config.dropout_rate = 0.1

Load data

In [None]:
# Locations of the three cleaned short-dialogue splits on the mounted Drive.
path_train_short = '/content/drive/MyDrive/Colab Notebooks/w266_Project Ideas/Data/Clean_Short_NER_Clinical/train_short_clinical_NER.csv'
path_test_short = '/content/drive/MyDrive/Colab Notebooks/w266_Project Ideas/Data/Clean_Short_NER_Clinical/test_short_clinical_NER.csv'
path_val_short = '/content/drive/MyDrive/Colab Notebooks/w266_Project Ideas/Data/Clean_Short_NER_Clinical/val_short_clinical_NER.csv'

# Read the splits in train / test / validation order.
train_short, test_short, val_short = (
    pd.read_csv(csv_path)
    for csv_path in (path_train_short, path_test_short, path_val_short)
)

# Quick look at the first training rows.
print("Short_dialogue_train_data:")
print(train_short.head())

Short_dialogue_train_data:
  section_header                                       section_text  \
0          GENHX  The patient is a 75-year-old female who comes ...   
1      FAM/SOCHX         Significant for diabetes and hypertension.   
2  PASTMEDICALHX                  Significant for anxiety disorder.   
3          GENHX  The patient is a 77-year-old female who is una...   
4      FAM/SOCHX                                   Noncontributory.   

                                            dialogue  \
0  Doctor: Welcome to the clinic. I am Doctor Fra...   
1  Doctor: Does anyone else in your family suffer...   
2  Doctor: Have we gone over your survey results ...   
3  Guest_clinician: How old is the patient? Docto...   
4  Doctor: Do you have a known- Patient: Drug all...   

                                  clinical_ner_label  \
0  [{'phrase': 'stroke', 'label': 'DISEASE_DISORD...   
1  [{'phrase': 'family', 'label': 'HISTORY'}, {'p...   
2  [{'phrase': 'survey', 'label': 'DIAGNO

Calculate Statistics:

In [None]:
def calculate_statistics(data, text_column, label_column):
    """Summarise whitespace-token counts of a text column and a label column.

    Args:
        data: DataFrame holding both columns as strings.
        text_column: Name of the column reported under the "dialogue" keys.
        label_column: Name of the column reported under the "label" keys.

    Returns:
        A ``(dialogue_stats, label_stats)`` pair of dicts, each with the min, mean, max,
        95th percentile and median number of whitespace-separated tokens per row.
    """
    def _length_summary(column, noun):
        # Tokens per row: split each string on whitespace and count the pieces.
        token_counts = data[column].str.split().apply(len)
        return {
            f'Min {noun} length': token_counts.min(),
            f'Ave {noun} length': token_counts.mean(),
            f'Max {noun} length': token_counts.max(),
            f'95% {noun} length': np.percentile(token_counts, 95),
            f'Median {noun} length': token_counts.median(),
        }

    return _length_summary(text_column, 'dialogue'), _length_summary(label_column, 'label')

dialogue_column = 'dialogue'
label_column = 'clinical_ner_unique_no_label'

# Same length statistics for every split, computed in train / test / validation order.
split_stats = [
    calculate_statistics(split, dialogue_column, label_column)
    for split in (train_short, test_short, val_short)
]
(train_dialogue_stats, train_label_stats) = split_stats[0]
(test_dialogue_stats, test_label_stats) = split_stats[1]
(val_dialogue_stats, val_label_stats) = split_stats[2]

# Only the training-split numbers are printed.
for heading, stats in (
    ("Training Data Dialogue Stats:", train_dialogue_stats),
    ("Training Data Label Stats:", train_label_stats),
):
    print(heading)
    print(stats)


Training Data Dialogue Stats:
{'Min dialogue length': 6, 'Ave dialogue length': 103.7720588235294, 'Max dialogue length': 1509, '95% dialogue length': 325.04999999999995, 'Median dialogue length': 63.0}
Training Data Label Stats:
{'Min label length': 1, 'Ave label length': 16.394117647058824, 'Max label length': 166, '95% label length': 56.049999999999955, 'Median label length': 10.0}



Based on data analysis
- Min dialogue length: 6
- Ave dialogue length: 104
- Max dialogue length: 1509
- 95% dialogue length: 325
- Median dialogue length: 63
- Min summary length: 1
- Ave summary length: 13
- Max summary length: 82
- 95% summary length: 46
- Median summary length: 9

Model and Tokenizer Loading:

In [None]:
from transformers import T5Config, AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint to load, with its configuration fetched separately so dropout can be set first.
model_name = 'google/flan-t5-base'
config = T5Config.from_pretrained(model_name)
config.dropout_rate = 0.1

# Instantiate the model from the adjusted config with weights held in bfloat16,
# plus the matching tokenizer.
t5_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, config=config, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def print_number_of_trainable_model_parameters(model):
    """Build a three-line report of trainable versus total parameter counts.

    Despite the name, nothing is printed: the formatted report is returned so the caller
    can print it.

    Args:
        model: Object exposing ``named_parameters()`` (e.g. a ``torch.nn.Module``).

    Returns:
        A string with the trainable count, the total count and the trainable percentage
        (two decimals).
    """
    # (element count, requires_grad) for every named parameter.
    sizes = [(tensor.numel(), tensor.requires_grad) for _, tensor in model.named_parameters()]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    share = 100 * trainable / total
    return (
        f"trainable model parameters: {trainable}\n"
        f"all model parameters: {total}\n"
        f"percentage of trainable model parameters: {share:.2f}%"
    )

print(print_number_of_trainable_model_parameters(t5_model))

# Convert each pandas split into a `datasets.Dataset`, rebinding the same three names.
train_short, test_short, val_short = map(
    Dataset.from_pandas, (train_short, test_short, val_short)
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


Testing the model:

In [None]:
# Pick one test example and its human-written reference summary.
index = 10
example = test_short[index]
dialogue = example['dialogue']
summary = example['section_text']

# Minimal zero-shot summarization prompt around the raw dialogue.
prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

# Generate with the not-yet-fine-tuned model and decode the first returned sequence.
inputs = tokenizer(prompt, return_tensors='pt')
generated_ids = t5_model.generate(inputs["input_ids"], max_new_tokens=200)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Print prompt, reference and model output, each introduced by a separator line.
dash_line = '-' * 100
for section in (
    f'INPUT PROMPT:\n{prompt}',
    f'BASELINE HUMAN SUMMARY:\n{summary}\n',
    f'MODEL GENERATION - ZERO SHOT:\n{output}',
):
    print(dash_line)
    print(section)


----------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

Doctor: History of any illnesses, surgeries, or hospitalizations? Patient: I dislocated my wrist when I was a kid but that's all.

Summary:

----------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
Surgeries: None. Injuries: Dislocated wrist. Illnesses: None.

----------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
The doctor will take a blood test.


Tokenization Function:

In [None]:
max_input_length = 1024
max_output_length = 512
def tokenize_function(batch):
    """Turn a batch of raw rows into model inputs and labels.

    Reads the ``clinical_ner_unique_no_label``, ``dialogue`` and ``section_text`` columns
    of ``batch`` and uses the module-level ``tokenizer``. The batch is updated in place
    with ``input_ids``, ``attention_mask``, ``global_attention_mask`` and ``labels``,
    then returned.
    """
    instruction = (
    "Summarize the following patient-doctor. "
    "Include all medically relevant information, including family history, "
    "diagnosis, past medical and surgical history, immunizations, lab results, and known allergies. "
    "Use the following medical and chemical entities extracted from the dialogue to help summarization, but do not overly use them. "
    "Entities:\n\n")
    suffix = '\n\nSummary: '

    # One prompt per row: instruction, entity list, dialogue, then the summary cue.
    prompts = []
    for entities, dialogue in zip(batch['clinical_ner_unique_no_label'], batch['dialogue']):
        prompts.append(instruction + entities + "\n\n" + 'Dialogue: \n\n' + dialogue + suffix)

    # Inputs and targets are both padded/truncated to their fixed maximum lengths.
    encoded_inputs = tokenizer(
        prompts, padding="max_length", truncation=True, max_length=max_input_length
    )
    encoded_targets = tokenizer(
        batch["section_text"], padding="max_length", truncation=True, max_length=max_output_length
    )

    batch["input_ids"] = encoded_inputs.input_ids
    batch["attention_mask"] = encoded_inputs.attention_mask

    # Every row refers to the same zero-filled list, so setting the first position once
    # marks it for all rows of the batch.
    zero_row = [0] * len(batch["input_ids"][0])
    batch["global_attention_mask"] = [zero_row] * len(batch["input_ids"])
    batch["global_attention_mask"][0][0] = 1

    # Padding positions in the targets become -100 so the loss ignores them.
    pad_id = tokenizer.pad_token_id
    batch["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in target_ids]
        for target_ids in encoded_targets.input_ids
    ]

    return batch


Mapping the Tokenization Function:

In [None]:
batch_size = 2

# Raw CSV columns that are dropped once the tokenized features have been added.
raw_columns = ["section_header", "section_text","dialogue","clinical_ner_label", "clinical_ner_unique_label","clinical_ner_no_label","clinical_ner_unique_no_label"]
map_options = dict(batched=True, batch_size=batch_size, remove_columns=raw_columns)

# Tokenize each split with identical settings, rebinding the same names.
train_short = train_short.map(tokenize_function, **map_options)

test_short = test_short.map(tokenize_function, **map_options)

val_short = val_short.map(tokenize_function, **map_options)


Map:   0%|          | 0/1360 [00:00<?, ? examples/s]

Map:   0%|          | 0/222 [00:00<?, ? examples/s]

Map:   0%|          | 0/119 [00:00<?, ? examples/s]

Sample Checks:

In [None]:
# NOTE(review): `sample` is imported here but not used in this cell.
from random import sample

# First tokenized test example, taken as one-element slices of each feature column.
sample_input_id = test_short['input_ids'][0:1]
sample_label = test_short['labels'][0:1]
sample_attention_mask = test_short['attention_mask'][0:1]
sample_global_attention_mask = test_short['global_attention_mask'][0:1]

# Show the dataset summary, then the length and content of each sampled feature.
# NOTE(review): the first label says "tokenized_train_exp" but the object printed is `test_short`.
print("tokenized_train_exp: ", test_short)
print("\nsample_input_id: ", len(sample_input_id[0]), sample_input_id)
print("\nsample_label: ", len(sample_label[0]), sample_label)
print("\nsample_attention_mask: ", len(sample_attention_mask[0]), sample_attention_mask)
print("\nsample_global_attention_mask: ", len(sample_global_attention_mask[0]), sample_global_attention_mask)


tokenized_train_exp:  Dataset({
    features: ['input_ids', 'attention_mask', 'global_attention_mask', 'labels'],
    num_rows: 222
})

sample_input_id:  1024 [[12198, 1635, 1737, 8, 826, 1868, 18, 26, 32, 5317, 5, 15746, 66, 1035, 120, 2193, 251, 6, 379, 384, 892, 6, 8209, 6, 657, 1035, 11, 11685, 892, 6, 256, 51, 202, 1707, 7, 6, 7690, 772, 6, 11, 801, 18500, 5, 2048, 8, 826, 1035, 11, 5368, 12311, 21527, 45, 8, 7478, 12, 199, 4505, 1635, 1707, 6, 68, 103, 59, 147, 120, 169, 135, 5, 4443, 2197, 10, 784, 31, 13974, 533, 31, 6, 3, 31, 22149, 15, 26, 31, 6, 3, 31, 29, 3589, 15, 31, 6, 3, 31, 26, 13419, 4798, 169, 14863, 4845, 4845, 31, 6, 3, 31, 26, 22024, 31, 6, 3, 31, 3252, 2482, 31, 6, 3, 31, 60, 521, 226, 288, 3, 7, 31, 6, 3, 31, 3843, 1528, 11208, 31, 6, 3, 31, 3843, 1528, 31, 6, 3, 31, 2026, 11830, 7, 239, 23911, 115, 53, 31, 6, 3, 31, 15745, 1087, 31, 6, 3, 31, 5064, 1332, 31, 908, 5267, 10384, 10, 7582, 10, 2018, 6, 132, 5, 17656, 10, 2018, 5, 14252, 834, 15474, 10, 2018, 55, 94

In [None]:
# def print_number_of_trainable_model_parameters(model):
#     trainable_model_params = 0
#     all_model_params = 0
#     for _, param in model.named_parameters():
#         all_model_params += param.numel()
#         if param.requires_grad:
#             trainable_model_params += param.numel()
#     return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

# print(print_number_of_trainable_model_parameters(t5_model))


trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


# Fine-tuning the model

## Tokenization

In [None]:
# Decode the sampled input ids back to text to check the prompt the model actually sees.
decode_text = tokenizer.decode(sample_input_id[0], skip_special_tokens=True)
decode_text

"Summarize the following patient-doctor. Include all medically relevant information, including family history, diagnosis, past medical and surgical history, immunizations, lab results, and known allergies. Use the following medical and chemical entities extracted from the dialogue to help summarization, but do not overly use them. Entities: ['mental health','registered', 'nurse', 'drink alcohol use recreational drugs drugs', 'drugs','muscle','relaxant s','sedative medications','sedative', 'prescriptions day prescribing','social activities', 'last March'] Dialogue: Doctor: Hi, there. Patient: Hi. Guest_family: Hi! It is nice to meet you. Doctor: It is nice to meet you as well. You are the husband, correct? Guest_family: Yes. I am the husband. Doctor: How long have you two been married? Guest_family: We have been married for forty eight years. Doctor: Do you live in the same household? Patient: Yes. Doctor: Do you have a primary care physician? Patient: No, I am currently looking for som

Training

In [None]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
# Download the punkt data that sent_tokenize uses in postprocess_text.
nltk.download("punkt")

# ROUGE metric object used by compute_metrics.
metric = evaluate.load("rouge")

# helper that reformats decoded text for ROUGE
def postprocess_text(preds, labels):
    """Strip each text and put every sentence on its own line.

    rougeLSum expects a newline after each sentence, so both predictions and references
    are re-joined sentence by sentence.

    Returns:
        The reformatted ``(preds, labels)`` lists.
    """
    def _one_sentence_per_line(texts):
        return ["\n".join(sent_tokenize(text.strip())) for text in texts]

    return _one_sentence_per_line(preds), _one_sentence_per_line(labels)

def compute_metrics(eval_preds):
    """Compute ROUGE scores (scaled by 100) and the mean generated length.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays; predictions may be
            wrapped in a tuple, in which case the first element is used.

    Returns:
        Dict of ROUGE values multiplied by 100 and rounded to 4 decimals, plus
        ``gen_len``, the mean number of non-padding tokens per prediction (not scaled).
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    pad_id = tokenizer.pad_token_id
    # -100 cannot be decoded. Labels always carry it for ignored positions; predictions are
    # cleaned the same way as a safeguard, since gathered generations may be padded with
    # -100 in some transformers versions (NOTE(review): confirm for the installed version).
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # One sentence per line, as rougeLSum expects.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    # Generated length = number of non-padding token ids in each prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
from transformers import DataCollatorForSeq2Seq

# Label value used for padding; tokenize_function writes the same value (-100) over padded
# label positions so they are ignored in the loss.
label_pad_token_id = -100
# Collator for seq2seq batches; padded lengths are rounded up to a multiple of 8.
# NOTE(review): the training cell below rebinds `data_collator` to a new instance created
# without these two options.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=t5_model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

Adjust dropout and learning rate.

***lr_scheduler_type='linear':*** This argument sets the type of learning rate scheduler. 'linear' will linearly decrease the learning rate from the initial learning rate set by learning_rate to zero over the course of training.

***warmup_ratio=0.1:*** This specifies the proportion of total training steps to use for the warmup phase, where the learning rate gradually increases to the initial learning rate. Adjust this based on your training dynamics.

***Custom Optimizer Usage:*** The custom optimizer defined in the training cell below is passed to the trainer with the initial learning rate and weight decay it was created with. The Trainer handles the rest of the learning rate scheduling.

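ROUGE scores in the table below are scaled by 100, so each displayed value must be divided by 100 to get the raw score. Gen Len is the mean generated length in tokens and is not scaled.

For example, a displayed ROUGE value of 23.4212 corresponds to 0.234212.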

In [None]:
import numpy as np
import evaluate
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, EarlyStoppingCallback
from torch.optim import AdamW
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainerCallback
import torch

# Reload the base tokenizer and model for fine-tuning.
# NOTE(review): the model is reloaded without `config=config`, so the `config` object
# prepared in the earlier loading cell is not applied to this instance.
model_name = 'google/flan-t5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
t5_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)

# Clear any cached memory to prevent out-of-memory errors
torch.cuda.empty_cache()

# Load the ROUGE metric.
# NOTE(review): compute_metrics uses the module-level `metric` object; `rouge` is not
# referenced anywhere else in the visible notebook.
rouge = evaluate.load('rouge')

# Seq2seq collator returning PyTorch tensors; replaces the collator built in the previous cell.
data_collator = DataCollatorForSeq2Seq(tokenizer, t5_model, return_tensors="pt")

# Custom optimizer; lr and weight_decay match the values given to the training arguments.
optimizer = AdamW(t5_model.parameters(), lr=1e-4, weight_decay=0.01)

# Used below as the local output directory and as the parent of the logging directory.
repository_id = 'agnesem/flan-t5'  # Hugging Face Hub repository

class _DropoutScheduleCallback(TrainerCallback):
    """Forward the Trainer's epoch-begin event to ``CustomTrainer.on_epoch_begin``."""

    def __init__(self, trainer):
        self._trainer = trainer

    def on_epoch_begin(self, args, state, control, **kwargs):
        # state.epoch is presumably 0 before the first epoch and n after n completed epochs,
        # so +1 gives the 1-based number of the epoch about to start — TODO confirm when
        # resuming from a mid-epoch checkpoint.
        self._trainer.on_epoch_begin(int(round(state.epoch or 0)) + 1)


class CustomTrainer(Seq2SeqTrainer):
    """Seq2SeqTrainer that changes the model's dropout rate at chosen epochs.

    Args:
        dropout_changes: Optional ``{epoch_number: dropout_rate}`` mapping, with epochs
            counted from 1. All other arguments go to ``Seq2SeqTrainer`` unchanged.
    """

    def __init__(self, *args, dropout_changes=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.dropout_changes = dropout_changes or {}
        # A method named on_epoch_begin on the trainer itself is never invoked by the
        # training loop; epoch events are delivered to TrainerCallback objects only, so
        # register one that relays the event.
        self.add_callback(_DropoutScheduleCallback(self))

    def on_epoch_begin(self, epoch, **kwargs):
        """Apply the dropout rate scheduled for ``epoch``, if there is one."""
        if epoch in self.dropout_changes:
            new_dropout = self.dropout_changes[epoch]
            self.model.config.dropout_rate = new_dropout
            # Updating the config alone does not touch layers that were already built from
            # it, so push the new rate into the live modules as well.
            for module in self.model.modules():
                if isinstance(module, torch.nn.Dropout):
                    module.p = new_dropout
                elif isinstance(getattr(module, "dropout", None), float):
                    # Modules that keep the rate as a plain float attribute
                    # (presumably the T5 attention layers — verify).
                    module.dropout = new_dropout
            print(f"Updated dropout to {new_dropout} at epoch {epoch}")

# Dropout schedule passed to CustomTrainer: {epoch_number: dropout_rate}
dropout_changes = {1: 0.1, 2: 0.2, 3: 0.3}

# Training arguments: evaluate and checkpoint once per epoch, keep at most two checkpoints,
# and reload the best checkpoint when training ends.
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=4,  # Reduced batch size
    per_device_eval_batch_size=4,  # Reduced batch size
    gradient_accumulation_steps=2,  # Accumulate gradients over 2 steps
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=15,
    predict_with_generate=True,
    fp16=False,  # fp16 mixed precision is switched off
    load_best_model_at_end=True,
    lr_scheduler_type='linear',
    warmup_ratio=0.1,
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=500
)

# Detailed Logging Callback
class DetailedLoggingCallback(TrainerCallback):
    """Print the training and validation loss found in each log event."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        # `logs` defaults to None in the signature; treat that as "nothing logged" instead
        # of failing on `None.get`.
        logs = logs or {}
        print(f"Step {state.global_step}: Training Loss: {logs.get('loss', 'N/A')}, Validation Loss: {logs.get('eval_loss', 'N/A')}")

# Build the trainer: custom optimizer (the scheduler slot is left as None), the dropout
# schedule, and early stopping with a patience of 3 evaluations.
# NOTE(review): DetailedLoggingCallback is defined above but is not in this callbacks list.
trainer = CustomTrainer(
    model=t5_model,
    args=training_args,
    train_dataset=train_short,
    eval_dataset=val_short,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None),
    dropout_changes=dropout_changes,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Start training
trainer.train()

# ROUGE scores are reported scaled by 100: a displayed 23.4212 corresponds to 0.234212.
# Gen Len is the mean generated length from compute_metrics and is not scaled.


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.468816,23.4212,8.8542,20.0851,21.8305,14.840336
2,No log,2.265691,25.5003,8.8891,21.6324,23.6897,14.260504
3,2.556400,2.209821,25.5243,9.0626,21.9023,23.4577,14.739496
4,2.556400,2.184217,25.2524,9.0974,21.9305,23.5882,15.184874
5,2.556400,2.169249,25.1288,9.261,21.7374,23.2747,14.983193
6,2.251200,2.156644,25.3427,9.4632,21.8527,23.6358,14.655462
7,2.251200,2.145352,24.7661,9.0654,21.4608,23.0862,14.756303
8,2.251200,2.138065,25.5246,9.8187,22.0634,23.8978,14.613445
9,2.182400,2.132812,26.3587,10.032,22.7162,24.6683,14.638655
10,2.182400,2.131565,26.5091,10.2703,22.8917,24.8003,14.613445


TrainOutput(global_step=2550, training_loss=2.2555154718137254, metrics={'train_runtime': 1224.5197, 'train_samples_per_second': 16.66, 'train_steps_per_second': 2.082, 'total_flos': 2.79380999798784e+16, 'train_loss': 2.2555154718137254, 'epoch': 15.0})

Add an early stopper that ends training if the model does not improve after 3 consecutive evaluations, with dropout = 0.2. No changes in learning rate.

It is not effective: the model does not learn.

In [None]:
# from transformers import EarlyStoppingCallback

# # Add an early stopping callback
# callback = EarlyStoppingCallback(early_stopping_patience=3)

# trainer = Seq2SeqTrainer(
#     model=t5_model,
#     args=training_args,
#     data_collator=data_collator,
#     train_dataset=train_short,
#     eval_dataset=val_short,
#     compute_metrics=compute_metrics,
#     optimizers=(optimizer, None),
#     callbacks=[callback]  # Add early stopping
# )

# trainer.train()


In [None]:
# Below are rouge scores without dropout and early stopper. Same learning rate (1e-4), 15 epochs.

# # Start training
# torch.cuda.empty_cache()
# trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.327271,24.1737,8.3874,20.3572,22.4039,15.319328
2,No log,2.276458,25.2786,8.7507,21.4598,23.1982,14.848739
3,2.399000,2.251904,24.9176,9.0684,21.5494,23.0495,15.184874
4,2.399000,2.226759,25.5989,9.5076,21.8856,23.6668,15.07563
5,2.399000,2.217765,26.1475,10.3773,22.5518,24.3383,15.168067
6,2.223700,2.201484,25.863,10.3042,22.4025,24.0046,14.89916
7,2.223700,2.196232,25.5896,9.5565,21.8808,23.6185,14.92437
8,2.223700,2.190979,25.3709,9.3119,21.6933,23.3567,14.831933
9,2.185900,2.186318,26.0787,9.5289,22.3299,24.189,14.92437
10,2.185900,2.186843,26.1371,10.1335,22.4013,24.2204,14.932773


TrainOutput(global_step=2550, training_loss=2.22265931372549, metrics={'train_runtime': 1055.1624, 'train_samples_per_second': 19.334, 'train_steps_per_second': 2.417, 'total_flos': 2.79380999798784e+16, 'train_loss': 2.22265931372549, 'epoch': 15.0})

In [None]:
# Save the trained model and its tokenizer side by side in one local folder, so the folder
# can be uploaded or reloaded as a unit.
trainer.save_model("./flan_t5_base_finetuned_MTS_clinical_adj_unique_dialogue")
tokenizer.save_pretrained("./flan_t5_base_finetuned_MTS_clinical_adj_unique_dialogue")

('./flan_t5_base_finetuned_MTS_clinical_adj_unique_dialogue/tokenizer_config.json',
 './flan_t5_base_finetuned_MTS_clinical_adj_unique_dialogue/special_tokens_map.json',
 './flan_t5_base_finetuned_MTS_clinical_adj_unique_dialogue/spiece.model',
 './flan_t5_base_finetuned_MTS_clinical_adj_unique_dialogue/added_tokens.json',
 './flan_t5_base_finetuned_MTS_clinical_adj_unique_dialogue/tokenizer.json')

In [None]:
# Publish the saved model folder to the Hugging Face Hub.
from huggingface_hub import HfApi, HfFolder

# Target repository on the Hub (this rebinds `model_name`) and the local folder to upload.
model_name = "hankym/flan_t5_base_finetuned_MTS_clinical_adj_unique_dialogue"
local_model_dir = "./flan_t5_base_finetuned_MTS_clinical_adj_unique_dialogue"

api = HfApi()

# Make sure the repository exists, then upload every file in the folder.
api.create_repo(repo_id=model_name, repo_type="model", exist_ok=True)
api.upload_folder(folder_path=local_model_dir, repo_id=model_name, repo_type="model")

pytorch_model.bin:   0%|          | 0.00/495M [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/3.64k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/hankym/flan_t5_base_finetuned_MTS_clinical_adj_unique_dialogue/commit/1c7c4fefd9d35729717c07d2d9293010b3befc6b', commit_message='Upload folder using huggingface_hub', commit_description='', oid='1c7c4fefd9d35729717c07d2d9293010b3befc6b', pr_url=None, pr_revision=None, pr_num=None)

## Model evaluation

In [None]:
# Load the fine-tuned model and tokenizer back from the Hub repository they were pushed to.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# `tokenizer` is rebound here; `model` (bfloat16 weights) is the instance used for evaluation.
tokenizer = AutoTokenizer.from_pretrained("hankym/flan_t5_base_finetuned_MTS_clinical_adj_unique_dialogue")
model = AutoModelForSeq2SeqLM.from_pretrained("hankym/flan_t5_base_finetuned_MTS_clinical_adj_unique_dialogue", torch_dtype=torch.bfloat16)


In [None]:
# Location of the raw test split.
path_test_short = '/content/drive/MyDrive/Colab Notebooks/w266_Project Ideas/Data/Clean_Short_NER_Clinical/test_short_clinical_NER.csv'

# Re-read the raw CSV, since the earlier `test_short` was replaced by its tokenized version.
test_short = pd.read_csv(path_test_short)

# Report (rows, columns) of the reloaded frame.
print("Shape of short_dialogue_test_data:", test_short.shape, sep="\n")


Shape of short_dialogue_test_data:
(222, 7)


In [None]:
# Keep the first five test rows as a small subset.
sub_set_test = test_short[:5]

In [None]:
# Convert the five-row pandas subset into a `datasets.Dataset`.
sub_set_test = Dataset.from_pandas(sub_set_test)

In [None]:
# Convert the full pandas test frame into a `datasets.Dataset`, rebinding the same name.
test_short = Dataset.from_pandas(test_short)

In [None]:
# Tokenization settings and function for evaluation.
# NOTE(review): this cell repeats the earlier tokenize_function definition and rebinds the name.
max_input_length = 1024
max_output_length = 512
def tokenize_function(batch):
    """Turn a batch of raw rows into model inputs and labels.

    Reads the ``clinical_ner_unique_no_label``, ``dialogue`` and ``section_text`` columns
    of ``batch`` and uses the module-level ``tokenizer``. Inputs are padded/truncated to
    ``max_input_length`` and targets to ``max_output_length``. The batch is updated in
    place with ``input_ids``, ``attention_mask``, ``global_attention_mask`` and ``labels``
    (padding positions replaced by -100), then returned.
    """
    start_prompt = (
    "Summarize the following patient-doctor. "
    "Include all medically relevant information, including family history, "
    "diagnosis, past medical and surgical history, immunizations, lab results, and known allergies. "
    "Use the following medical and chemical entities extracted from the dialogue to help summarization, but do not overly use them. "
    "Entities:\n\n")

    # Define the end of the prompt to indicate where the summary should go
    end_prompt = '\n\nSummary: '

    # Construct the full prompt for each dialogue in the batch, using the corresponding entities
    prompt = [start_prompt + entities + "\n\n" + 'Dialogue: \n\n' + dialogue + end_prompt
               for entities, dialogue in zip(batch['clinical_ner_unique_no_label'], batch['dialogue'])]

    # tokenize the inputs and labels
    inputs = tokenizer(
        prompt,
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )
    outputs = tokenizer(
        batch["section_text"],
        padding="max_length",
        truncation=True,
        max_length=max_output_length,
    )

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask

    # create 0 global_attention_mask lists (one inner list object, repeated for every row)
    batch["global_attention_mask"] = len(batch["input_ids"]) * [
        [0 for _ in range(len(batch["input_ids"][0]))]
    ]

    # since above lists are references, the following line changes the 0 index for all samples
    batch["global_attention_mask"][0][0] = 1
    batch["labels"] = outputs.input_ids

    # We have to make sure that the PAD token is ignored
    batch["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in labels]
        for labels in batch["labels"]
    ]

    return batch

In [None]:
# Tokenize the five-row subset and drop the raw CSV columns afterwards.
subset_raw_columns = ["section_header", "section_text","dialogue","clinical_ner_label", "clinical_ner_unique_label","clinical_ner_no_label","clinical_ner_unique_no_label"]
sub_set_test = sub_set_test.map(
    tokenize_function,
    batched=True,
    batch_size=batch_size,
    remove_columns=subset_raw_columns,
)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [None]:
# Tokenize the full test split and drop the raw CSV columns afterwards.
test_raw_columns = ["section_header", "section_text","dialogue","clinical_ner_label", "clinical_ner_unique_label","clinical_ner_no_label","clinical_ner_unique_no_label"]
test_short = test_short.map(
    tokenize_function,
    batched=True,
    batch_size=batch_size,
    remove_columns=test_raw_columns,
)

Map:   0%|          | 0/222 [00:00<?, ? examples/s]

In [None]:
def model_generation(model, tokenizer, dataset, max_new_tokens=200):
    """Generate a summary for every tokenized example and decode inputs and references.

    Args:
        model: Seq2seq model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer exposing ``decode()``.
        dataset: Iterable of dicts with ``input_ids`` and ``labels`` (and optionally
            ``attention_mask``) as lists of token ids.
        max_new_tokens: Upper bound on generated tokens per example. Without an explicit
            bound, ``generate`` falls back to the model's default length limit, which cut
            the summaries off mid-sentence in this notebook's output.

    Returns:
        ``(dialogues, predictions, references)``: three lists of decoded strings, one
        entry per example.
    """
    model.eval()  # Ensure the model is in evaluation mode
    # Inputs must live on the same device as the model; objects without a `device`
    # attribute are used as-is.
    device = getattr(model, "device", None)
    dialogues = []
    predictions = []
    references = []

    for example in dataset:
        input_ids = torch.tensor([example["input_ids"]], dtype=torch.long)

        # Decoded prompt (instruction + entities + dialogue) as fed to the model.
        dialogues.append(tokenizer.decode(input_ids[0], skip_special_tokens=True))

        generate_kwargs = {"max_new_tokens": max_new_tokens}
        if "attention_mask" in example:
            # Inputs are padded to a fixed length; pass the mask explicitly so padding is
            # ignored rather than relying on it being inferred.
            generate_kwargs["attention_mask"] = torch.tensor(
                [example["attention_mask"]], dtype=torch.long
            )
        if device is not None:
            input_ids = input_ids.to(device)
            if "attention_mask" in generate_kwargs:
                generate_kwargs["attention_mask"] = generate_kwargs["attention_mask"].to(device)

        # Generate summary
        with torch.no_grad():
            outputs = model.generate(input_ids, **generate_kwargs)
        predictions.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

        # Reference ids contain -100 at padded positions; drop those before decoding.
        valid_reference_ids = [rid for rid in example["labels"] if rid != -100]
        references.append(tokenizer.decode(valid_reference_ids, skip_special_tokens=True))

    return dialogues, predictions, references


In [None]:
# Generate summaries for the whole tokenized test split with the fine-tuned model.
dialogues, t5_predictions, t5_references = model_generation(model, tokenizer, test_short)



In [None]:
from rouge_score import rouge_scorer

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Score every (reference, prediction) pair. RougeScorer.score takes the reference (target)
# first and the prediction second; the opposite order swaps precision and recall, although
# the F-measure averaged below is the same either way.
rouge_scores = []
for pred, ref in zip(t5_predictions, t5_references):
    score = scorer.score(ref, pred)
    rouge_scores.append(score)


# Calculate average ROUGE F-measures over all examples
avg_rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
avg_rouge2 = sum([score['rouge2'].fmeasure for score in rouge_scores]) / len(rouge_scores)
avg_rougeL = sum([score['rougeL'].fmeasure for score in rouge_scores]) / len(rouge_scores)

# Print average ROUGE scores
print(f"Average ROUGE-1: {avg_rouge1}")
print(f"Average ROUGE-2: {avg_rouge2}")
print(f"Average ROUGE-L: {avg_rougeL}")

Average ROUGE-1: 0.2890238200414747
Average ROUGE-2: 0.11900181497295013
Average ROUGE-L: 0.2497969654372268


In [None]:
# Repeat generation inside a CUDA autocast context.
# NOTE(review): no `.to("cuda")` call is visible for `model` or its inputs, and the ROUGE
# numbers printed after this run are identical to the previous run — confirm that this
# context has any effect here.
with torch.cuda.amp.autocast():
    dialogues, t5_predictions, t5_references = model_generation(model, tokenizer, test_short)

In [None]:
from rouge_score import rouge_scorer

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Score every (reference, prediction) pair. RougeScorer.score takes the reference (target)
# first and the prediction second; the opposite order swaps precision and recall, although
# the F-measure averaged below is the same either way.
rouge_scores = []
for pred, ref in zip(t5_predictions, t5_references):
    score = scorer.score(ref, pred)
    rouge_scores.append(score)


# Calculate average ROUGE F-measures over all examples
avg_rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
avg_rouge2 = sum([score['rouge2'].fmeasure for score in rouge_scores]) / len(rouge_scores)
avg_rougeL = sum([score['rougeL'].fmeasure for score in rouge_scores]) / len(rouge_scores)

# Print average ROUGE scores
print(f"Average ROUGE-1: {avg_rouge1}")
print(f"Average ROUGE-2: {avg_rouge2}")
print(f"Average ROUGE-L: {avg_rougeL}")

Average ROUGE-1: 0.2890238200414747
Average ROUGE-2: 0.11900181497295013
Average ROUGE-L: 0.2497969654372268


In [None]:
import io
import os

# One row per test example: decoded prompt, reference summary and model prediction.
data = {
    "Dialogues": dialogues,
    "Reference": t5_references,
    "Prediction": t5_predictions,
}
df = pd.DataFrame(data)

# Directory on the mounted Drive that receives the CSV; create it if it is missing.
output_dir = '/content/drive/MyDrive/Colab Notebooks/w266_Project Ideas/Results/Models/Clinical_adj'
os.makedirs(output_dir, exist_ok=True)

predictions_csv = os.path.join(output_dir,"flan_t5_base_NER_clinical_adj_predictions.csv")
df.to_csv(predictions_csv, index=False)

In [None]:
# Dump all generated summaries for a quick visual check.
print(t5_predictions)

['The patient is a 40-year-old male who lives in the same household as his', 'No cancer. No rashes, moles, or sudden weight loss.', 'Non-insured.', 'The patient is a six-year-old male who is taking Adderall for ', 'The patient has had an abortion recently. She is still bleeding. She will see our Gyn', 'Atrial fibrillation, no dizziness.', 'The patient has no sugar or BPP problem. The patient has no thyroid or other hormone related', 'The patient is a 63-year-old male with a history of chronic back', 'The patient is a 50-year-old female who has had a fall and ', 'No swelling or mass. No pain.', 'Dislocated wrist when a kid.', 'None.', 'None.', 'Disseminated CMV infection. Polymyositis. Predn', 'The patient is a mall-bound patient. She walks around at the mall to get her', 'The patient has a swollen lymph node for 2 days.', 'No medical issues. No surgery or hospitalization.', 'The patient is a 69-year-old male who passed out and was brought to', 'Lamictal was okay. There was no side effe