In [1]:
import torch
from transformers import AutoTokenizer, BioGptForCausalLM, AutoModelForCausalLM

model_checkpoint = "/nfs/turbo/umms-vgvinodv/models/finetuned-checkpoints/radsum/biogpt-mimic-cxr/checkpoint-7620"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
model.config.use_cache = False

model_ref = AutoModelForCausalLM.from_pretrained(model_checkpoint)

In [2]:
from peft import LoraConfig, TaskType, prepare_model_for_kbit_training, get_peft_model

peft_config = LoraConfig(
    r=64, #old=8
    lora_alpha=16, 
    lora_dropout=0.1, #old=0.05
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    #target_modules = ["q_proj", "v_proj"], 
    target_modules=["q_proj","k_proj","v_proj","o_proj"],
    #target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head",], #all linear layers
)

for param in model.parameters():
    # freeze base model's layers
    param.requires_grad = False

if hasattr(model, "enable_input_require_grads"):
    model.enable_input_require_grads()
else:
    def make_inputs_require_grad(module, input, output):
        output.requires_grad_(True)

    model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
        
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
#print_trainable_parameters(model)

trainable params: 9,437,184 || all params: 356,200,448 || trainable%: 2.649402619504847


In [3]:
from datasets import load_dataset

dataset = load_dataset("varuUM/mimic-cxr-dpo-with-metrics", split="train")

In [4]:
def filter_func(examples):
    return examples['rougeL'] < 0.3
    #return examples['rougeL'] < 0.2 and examples['F1RadGraph'] < 0.2

dpo_dataset = dataset.filter(filter_func)
dpo_dataset = dpo_dataset.remove_columns(["rougeL","F1RadGraph","F1CheXbert"])

In [5]:
sanity_check = True
if sanity_check:
    num_samples = int(0.25*len(dpo_dataset)) #10000#
    dpo_dataset = dpo_dataset.select(range(num_samples))

print(len(dpo_dataset))
print(dpo_dataset[1])

9214
{'prompt': 'Heart size is normal. Mediastinal contours are normal with mild aortic tortuosity. Post-surgical changes in the right hemithorax are stable including thickening of the pleura along the costal surface and blunting of the costophrenic sulcus. The right sixth rib surgical fracture is redemonstrated. There are no new lung nodules identified. The main impression based on the given FINDINGS section of the chest X-ray report are:', 'chosen': 'Stable chest radiograph.', 'rejected': 'No radiographic evidence of pneumonia.'}


### Remove examples with length longer thna model max input

In [None]:
'''def filter_function(examples):
    prompt = examples["prompt"]
    chosen = examples["chosen"]
    rejected = examples["rejected"]
    
    def generate_input(_text,_summary):
        return " ".join([_text,_summary])

    chosen_inputs = generate_input(prompt, chosen) #[generate_input(_text,_summary) for _text,_summary in zip(prompt,chosen)]
    rejected_inputs = generate_input(prompt, rejected) #[generate_input(_text,_summary) for _text,_summary in zip(prompt,rejected)]
    
    chosen_input_len = len(tokenizer(chosen_inputs, truncation=False).input_ids)
    rejected_input_len = len(tokenizer(rejected_inputs, truncation=False).input_ids)
    return (chosen_input_len < tokenizer.model_max_length) and (rejected_input_len < tokenizer.model_max_length)

filtered_dataset = dpo_dataset.filter(filter_function)'''

In [6]:
from transformers import TrainingArguments
from trl import DPOTrainer

dataset_config = "mimic-cxr"
model_name = "biogpt"
save_path: str="/nfs/turbo/umms-vgvinodv/models/finetuned-checkpoints/radsum"
save_path = f"{save_path}/{model_name}-dpo-{dataset_config}"

training_args = TrainingArguments(
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=16,
    gradient_checkpointing=True,
    num_train_epochs = 1,
    max_steps =-1,
    save_strategy = 'epoch',
    save_total_limit = 1,
    logging_strategy ='steps',
    logging_steps=20,
    learning_rate=5.0e-7,
    output_dir=save_path,
    remove_unused_columns=False,
    run_name="dpo_biogpt",
    overwrite_output_dir = True,
    bf16=True,
)


dpo_trainer = DPOTrainer(
    model,
    model_ref,
    args=training_args,
    beta=0.1,
    train_dataset=dpo_dataset,
    tokenizer=tokenizer,
    max_prompt_length=512,
    max_length=1024,
)

In [7]:
dpo_trainer.train()

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
20,0.698


TrainOutput(global_step=36, training_loss=0.6975356737772623, metrics={'train_runtime': 942.7424, 'train_samples_per_second': 9.774, 'train_steps_per_second': 0.038, 'total_flos': 0.0, 'train_loss': 0.6975356737772623, 'epoch': 1.0})

In [8]:
model.to("cuda")

prompt = "The main impression based on the given FINDINGS section of the chest X-ray report are:"
findings_example = f"""Lateral view somewhat limited due to overlying motion artifact. The lungs are low in volume.  There is no focal airspace consolidation to suggest pneumonia.  A 1.2-cm calcified granuloma just below the medial aspect of the right hemidiaphragm is unchanged from prior study.  No pleural effusions or pulmonary edema. There is no pneumothorax.  The inferior sternotomy wire is fractured but unchanged. Surgical clips and vascular markers in the thorax are related to prior CABG surgery."""
eval_prompt = findings_example + prompt
print(f"Model Input:\n{eval_prompt}\n")

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print("---\nGenerated Output:\n")
    print(tokenizer.decode(model.generate(**model_input, max_length=256)[0], skip_special_tokens=True).split(":")[-1].strip())
    
ground_truth_summary="""
---
Ground Truth:
No evidence of acute cardiopulmonary process.
"""    
print(ground_truth_summary)

Model Input:
Lateral view somewhat limited due to overlying motion artifact. The lungs are low in volume.  There is no focal airspace consolidation to suggest pneumonia.  A 1.2-cm calcified granuloma just below the medial aspect of the right hemidiaphragm is unchanged from prior study.  No pleural effusions or pulmonary edema. There is no pneumothorax.  The inferior sternotomy wire is fractured but unchanged. Surgical clips and vascular markers in the thorax are related to prior CABG surgery.The main impression based on the given FINDINGS section of the chest X-ray report are:

---
Generated Output:





No acute cardiopulmonary process.

---
Ground Truth:
No evidence of acute cardiopulmonary process.



## Evaluate

In [9]:
from nltk.tokenize import wordpunct_tokenize
from radgraph import F1RadGraph
from f1chexbert import F1CheXbert
import datasets
from pathlib import Path

def build_test_dataset(dataset_config, tokenizer, split="test"):
    data_path = '/nfs/turbo/umms-vgvinodv/data/bioNLP23-Task-1B/data/'
    findings_file_path = Path(data_path).joinpath(dataset_config).joinpath(split+'.findings.tok')
    impression_file_path = Path(data_path).joinpath(dataset_config).joinpath(split+'.impression.tok')


    findings = [line.strip() for line in open(findings_file_path).readlines()]
    impression = [line.strip() for line in open(impression_file_path).readlines()]

    dataset = datasets.Dataset.from_dict({"text":findings,"summary":impression}) 
    
    return dataset

def generate_summary(sample):
    texts = sample["text"]
    summaries = sample["summary"]
    prompt = "The main impression based on the given FINDINGS section of the chest X-ray report are:"

    def generate_input(_text):
        return " ".join([_text,prompt])

    inputs = generate_input(texts) 
    model_input = tokenizer(inputs, return_tensors="pt").to("cuda")
    with torch.no_grad():
        response = tokenizer.decode(model.generate(**model_input, max_new_tokens=512)[0], skip_special_tokens=True)
    
    formatted_response = response.split(":")[-1].strip()
    return {
        "text": inputs,
        "summary":summaries,
        "pred": formatted_response
    }

def process_impression(impression):
    impression = impression.lower()
    return ' '.join(wordpunct_tokenize(impression))

In [10]:
mini_test = False

test_dataset = build_test_dataset('mimic-cxr',tokenizer,'test')

if mini_test:
    num_samples = int(0.25*len(test_dataset))
    test_dataset = test_dataset.select(range(num_samples))

print(f'Number of test samples: {len(test_dataset)}')

Number of test samples: 1624


In [11]:
model.eval()
model.to("cuda")
results = test_dataset.map(generate_summary, remove_columns=list(test_dataset.features))

pred_str = results["pred"]
pred_str = list(map(process_impression,pred_str))
label_str = results["summary"]

import numpy as np
import evaluate

###################################
rouge = datasets.load_metric("rouge")
rouge_output = rouge.compute(predictions=pred_str, references=label_str)

res = {key: value.mid.fmeasure * 100 for key, value in rouge_output.items()}
print('ROUGE:')
print({k: round(v, 4) for k, v in res.items()})

#################################
f1radgraph = F1RadGraph(reward_level="partial")
score = f1radgraph(hyps=pred_str,refs=label_str)[0]
print("\nF1RadGraph:")
print(score*100)

#################################
f1chexbert = F1CheXbert(device="cuda")
accuracy, accuracy_not_averaged, class_report, class_report_5 = f1chexbert(
    hyps=pred_str,
    refs=label_str)
print("\nF1CheXbert:")
print(class_report_5["micro avg"]["f1-score"])



Map:   0%|          | 0/1624 [00:00<?, ? examples/s]

ROUGE:
{'rouge1': 46.6064, 'rouge2': 31.5597, 'rougeL': 42.8901, 'rougeLsum': 42.88}

F1RadGraph:
40.70844768219216

F1CheXbert:
0.7150368033648792


In [12]:
import numpy as np
import evaluate

hidden_test_dataset = build_test_dataset('mimic-cxr',tokenizer,'test.hidden')

print(f'Number of Hidden test samples: {len(hidden_test_dataset)}\n')

results_hidden_test = hidden_test_dataset.map(generate_summary, remove_columns=list(hidden_test_dataset.features))

pred_str = results_hidden_test["pred"]
pred_str = list(map(process_impression,pred_str))
label_str = results_hidden_test["summary"]

###################################
rouge = datasets.load_metric("rouge")
rouge_output = rouge.compute(predictions=pred_str, references=label_str)

res = {key: value.mid.fmeasure * 100 for key, value in rouge_output.items()}
print('ROUGE:')
print({k: round(v, 4) for k, v in res.items()})

#################################
f1radgraph = F1RadGraph(reward_level="partial")
score = f1radgraph(hyps=pred_str,refs=label_str)[0]
print("\nF1RadGraph:")
print(score*100)

#################################
f1chexbert = F1CheXbert(device="cuda")
accuracy, accuracy_not_averaged, class_report, class_report_5 = f1chexbert(
    hyps=pred_str,
    refs=label_str)
print("\nF1CheXbert:")
print(class_report_5["micro avg"]["f1-score"])

Number of Hidden test samples: 1000



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

ROUGE:
{'rouge1': 34.9319, 'rouge2': 19.483, 'rougeL': 29.8388, 'rougeLsum': 29.8586}

F1RadGraph:
13.099944417229642


Token indices sequence length is longer than the specified maximum sequence length for this model (521 > 512). Running this sequence through the model will result in indexing errors



F1CheXbert:
0.6434262948207171


## Testing

In [None]:
from radgraph import F1RadGraph
import evaluate
import pandas as pd

rouge = evaluate.load('rouge')
f1radgraph = F1RadGraph(reward_level="partial")

generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "max_new_tokens": 512,
}

bs = 32
game_data = dict()
filtered_dataset = dpo_dataset
filtered_dataset.set_format("pandas")
df_batch = filtered_dataset[:].sample(bs)

game_data["GT"] = df_batch["chosen"].tolist()

query_tensors = tokenizer(df_batch["prompt"].tolist()).input_ids
response_tensors_ref, response_tensors = [], []

#### get response from gpt2 and gpt2_ref
for i in range(bs):
    output = model_ref.generate(torch.tensor(query_tensors[i]).unsqueeze(dim=0).to("cuda"), **generation_kwargs).squeeze()
    response_tensors_ref.append(output)
    output = model.generate(torch.tensor(query_tensors[i]).unsqueeze(dim=0).to("cuda"), **generation_kwargs).squeeze()
    response_tensors.append(output)

#### decode responses
game_data["response (before)"] = [tokenizer.decode(response_tensors_ref[i],skip_special_tokens=True).split(":")[-1].strip() for i in range(bs)]
game_data["response (after)"] = [tokenizer.decode(response_tensors[i],skip_special_tokens=True).split(":")[-1].strip() for i in range(bs)]

#### sentiment analysis of query/response pairs before/after
game_data["F1RadGraph rewards (before)"] = f1radgraph(hyps=game_data["response (before)"], refs=game_data["GT"])[1]
game_data["F1RadGraph rewards (after)"] = f1radgraph(hyps=game_data["response (after)"], refs=game_data["GT"])[1]

game_data["rougeL rewards (before)"] = rouge.compute(predictions=game_data["response (before)"], references=game_data["GT"], rouge_types=['rougeL'],  use_aggregator=False)['rougeL']
game_data["rougeL rewards (after)"] = rouge.compute(predictions=game_data["response (after)"], references=game_data["GT"], rouge_types=['rougeL'],  use_aggregator=False)['rougeL']

# store results in a dataframe
df_results = pd.DataFrame(game_data)
df_results

In [None]:
from datasets  import Dataset

result = Dataset.from_pandas(df_results)

In [None]:
result[5]

In [None]:
result[30]

# Evaluation

In [None]:
import torch
from transformers import AutoTokenizer, BioGptForCausalLM

checkpoint = "/nfs/turbo/umms-vgvinodv/models/finetuned-checkpoints/radsum/biogpt-dpo-mimic-cxr/checkpoint-30290"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = BioGptForCausalLM.from_pretrained(checkpoint)

In [None]:
tokenizer.all_special_tokens
tokenizer.pad_token

In [None]:
model.to("cuda")

prompt = "The main impression based on the given FINDINGS section of the chest X-ray report are:"
findings_example = f"""Lateral view somewhat limited due to overlying motion artifact. The lungs are low in volume.  There is no focal airspace consolidation to suggest pneumonia.  A 1.2-cm calcified granuloma just below the medial aspect of the right hemidiaphragm is unchanged from prior study.  No pleural effusions or pulmonary edema. There is no pneumothorax.  The inferior sternotomy wire is fractured but unchanged. Surgical clips and vascular markers in the thorax are related to prior CABG surgery."""
eval_prompt = findings_example + prompt
print(f"Model Input:\n{eval_prompt}\n")

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print("---\nGenerated Output:\n")
    print(tokenizer.decode(model.generate(**model_input, max_length=256)[0], skip_special_tokens=True).split(":")[-1].strip())
    
ground_truth_summary="""
---
Ground Truth:
No evidence of acute cardiopulmonary process.
"""    
print(ground_truth_summary)

In [None]:
from nltk.tokenize import wordpunct_tokenize
from radgraph import F1RadGraph
from f1chexbert import F1CheXbert
import datasets
from pathlib import Path

def build_test_dataset(dataset_config, tokenizer, split="test"):
    data_path = '/nfs/turbo/umms-vgvinodv/data/bioNLP23-Task-1B/data/'
    findings_file_path = Path(data_path).joinpath(dataset_config).joinpath(split+'.findings.tok')
    impression_file_path = Path(data_path).joinpath(dataset_config).joinpath(split+'.impression.tok')


    findings = [line.strip() for line in open(findings_file_path).readlines()]
    impression = [line.strip() for line in open(impression_file_path).readlines()]

    dataset = datasets.Dataset.from_dict({"text":findings,"summary":impression}) 
    
    return dataset

def generate_summary(sample):
    texts = sample["text"]
    summaries = sample["summary"]
    prompt = "The main impression based on the given FINDINGS section of the chest X-ray report are:"

    def generate_input(_text):
        return " ".join([_text,prompt])

    inputs = generate_input(texts) 
    model_input = tokenizer(inputs, return_tensors="pt").to("cuda")
    with torch.no_grad():
        response = tokenizer.decode(model.generate(**model_input, max_new_tokens=512)[0], skip_special_tokens=True)
    
    formatted_response = response.split(":")[-1].strip()
    return {
        "text": inputs,
        "summary":summaries,
        "pred": formatted_response
    }

def process_impression(impression):
    impression = impression.lower()
    return ' '.join(wordpunct_tokenize(impression))

In [None]:
test_dataset = build_test_dataset('mimic-cxr',tokenizer,'test')
print(f'Number of test samples: {len(test_dataset)}')

model.eval()
model.to("cuda")
results = test_dataset.map(generate_summary, remove_columns=list(test_dataset.features))

pred_str = results["pred"]
pred_str = list(map(process_impression,pred_str))
label_str = results["summary"]

In [None]:
import numpy as np
import evaluate

###################################
rouge = datasets.load_metric("rouge")
rouge_output = rouge.compute(predictions=pred_str, references=label_str)

res = {key: value.mid.fmeasure * 100 for key, value in rouge_output.items()}
print('ROUGE:')
print({k: round(v, 4) for k, v in res.items()})

##################################
bertscore = datasets.load_metric("bertscore")
bertscore_output = bertscore.compute(predictions=pred_str, references=label_str, lang='en')
res = {key: np.asarray(value).mean()*100 for key, value in bertscore_output.items() if key != 'hashcode'}
print('BertScore:')
print({k: round(v,4) for k, v in res.items()})

#################################
f1radgraph = F1RadGraph(reward_level="partial")
score = f1radgraph(hyps=pred_str,refs=label_str)[0]
print("\nF1RadGraph:")
print(score*100)

#################################
f1chexbert = F1CheXbert(device="cuda")
accuracy, accuracy_not_averaged, class_report, class_report_5 = f1chexbert(
    hyps=pred_str,
    refs=label_str)
print("\nF1CheXbert:")
print(class_report_5["micro avg"]["f1-score"])