In [None]:
import torch
from transformers import AutoTokenizer, BioGptForCausalLM, AutoModelForCausalLM

# Checkpoint to load. The commented-out assignments are alternative checkpoints;
# keep exactly one assignment active.
#checkpoint = "gpt2-medium"
#checkpoint = "gpt2-large"
checkpoint = "microsoft/biogpt"
#checkpoint = "microsoft/BioGPT-Large"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# If the tokenizer defines no padding token, reuse the end-of-sequence token
# (and its id) for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Architecture-specific loader kept for reference; the Auto class is used instead.
#model = BioGptForCausalLM.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

## Check Base Model

In [None]:
# Sanity check: generate an impression for one hand-picked findings paragraph
# and print it next to the reference impression.
model.to("cuda")

prompt = "The main impression based on the given FINDINGS section of the chest X-ray report are:"
findings_example = """Lateral view somewhat limited due to overlying motion artifact. The lungs are low in volume.  There is no focal airspace consolidation to suggest pneumonia.  A 1.2-cm calcified granuloma just below the medial aspect of the right hemidiaphragm is unchanged from prior study.  No pleural effusions or pulmonary edema. There is no pneumothorax.  The inferior sternotomy wire is fractured but unchanged. Surgical clips and vascular markers in the thorax are related to prior CABG surgery."""
# Separate findings and prompt with a single space so the input has the same
# "<findings> <prompt>" layout that the fine-tuning samples are built with.
eval_prompt = " ".join([findings_example, prompt])
print(f"Model Input:\n{eval_prompt}\n")

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
# Number of prompt tokens; everything after this position is newly generated.
prompt_length = model_input["input_ids"].shape[1]

model.eval()
with torch.no_grad():
    # max_new_tokens bounds only the continuation, so a long prompt cannot
    # eat into the generation budget the way max_length would.
    output_ids = model.generate(**model_input, max_new_tokens=256)[0]

# Decode only the continuation. Splitting the full decoded text on ":" would
# cut off any generated impression that itself contains a colon.
generated_text = tokenizer.decode(output_ids[prompt_length:], skip_special_tokens=True).strip()
print("---\nGenerated Output:\n")
print(generated_text)

ground_truth_summary = """
---
Ground Truth:
No evidence of acute cardiopulmonary process.
"""
print(ground_truth_summary)

## Build Dataset

In [None]:
from pathlib import Path
import datasets
# Values noted for build_dataset's arguments:
#   dataset_config: 'mimic-cxr' or 'mimic-iii'
#   split: 'train', 'validate' or 'test'

# NOTE(review): max_input_length is not referenced anywhere in the visible
# notebook; presumably intended as a token limit for inputs — confirm or remove.
max_input_length = 768

def build_dataset(dataset_config, tokenizer, split,
                  data_path='/nfs/turbo/umms-vgvinodv/data/bioNLP23-Task-1B/data/'):
    """Build a tokenized causal-LM training dataset from findings/impression files.

    Reads ``<data_path>/<dataset_config>/<split>.findings.tok`` and
    ``<split>.impression.tok`` line by line, pairs line *i* of one file with
    line *i* of the other, and turns every pair into a single training string
    ``"<findings> <prompt> <impression><eos>"`` which is then tokenized.

    Args:
        dataset_config: Sub-directory of ``data_path`` holding the split files.
        tokenizer: Tokenizer used to encode the samples; its ``eos_token`` is
            appended to every sample.
        split: File-name prefix of the split (e.g. ``"train"``).
        data_path: Root directory of the data. Defaults to the path that was
            previously hard-coded.

    Returns:
        A ``datasets.Dataset`` containing only the tokenizer outputs; the raw
        ``text`` and ``summary`` columns are removed.

    Raises:
        ValueError: If the two files do not have the same number of lines.
    """
    split_dir = Path(data_path).joinpath(dataset_config)
    findings_file_path = split_dir.joinpath(split + '.findings.tok')
    impression_file_path = split_dir.joinpath(split + '.impression.tok')

    # Context managers close the files deterministically instead of relying on
    # garbage collection.
    with open(findings_file_path) as findings_file:
        findings = [line.strip() for line in findings_file]
    with open(impression_file_path) as impression_file:
        impression = [line.strip() for line in impression_file]

    if len(findings) != len(impression):
        raise ValueError(
            f"{findings_file_path} has {len(findings)} lines but "
            f"{impression_file_path} has {len(impression)}"
        )

    dataset = datasets.Dataset.from_dict({"text": findings, "summary": impression})

    def preprocess_function(samples):
        """Tokenize a batch of (findings, impression) pairs as prompt-formatted strings."""
        prompt = "The main impression based on the given FINDINGS section of the chest X-ray report are:"
        eos_token = tokenizer.eos_token

        # The EOS token marks where the impression ends within each sample.
        inputs = [
            " ".join([_text, prompt, _summary]) + eos_token
            for _text, _summary in zip(samples["text"], samples["summary"])
        ]
        # The tokenizer is called with its default arguments.
        return tokenizer(inputs)

    dataset = dataset.map(preprocess_function, batched=True, num_proc=4, remove_columns=list(dataset.features))

    return dataset

In [None]:
# Tokenize the training split and the split used for evaluation during training.
dataset_config = "mimic-cxr"
tokenized_train_data = build_dataset(
    dataset_config=dataset_config,
    tokenizer=tokenizer,
    split="train",
)
# NOTE(review): the "test" split is used as the Trainer's eval set here — confirm
# this is intended rather than a separate validation split.
tokenized_eval_data = build_dataset(
    dataset_config=dataset_config,
    tokenizer=tokenizer,
    split="test",
)

In [None]:
from itertools import chain

# Number of tokens in every training example produced by group_texts.
block_size = 256

def group_texts(examples):
    """Concatenate a batch of tokenized samples and re-split it into fixed-size blocks.

    Args:
        examples: Mapping from column name (must include ``"input_ids"``) to a
            list of per-sample token lists, as passed by a batched ``map``.

    Returns:
        A dict with the same keys plus ``"labels"``. Every value is a list of
        chunks of exactly ``block_size`` items. Trailing tokens that do not
        fill a whole block are dropped, so a batch shorter than ``block_size``
        yields no chunks at all. ``"labels"`` is a copy of the ``"input_ids"``
        chunk list.
    """
    # Flatten each column in linear time (sum(list_of_lists, []) is quadratic).
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Always round down to a multiple of block_size. Keeping a short final
    # chunk would produce examples of unequal length.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Re-chunk both tokenized datasets into fixed-size blocks with a "labels" column.
tokenized_train_data = tokenized_train_data.map(
    group_texts,
    batched=True,
    num_proc=4,
)
tokenized_eval_data = tokenized_eval_data.map(
    group_texts,
    batched=True,
    num_proc=4,
)

In [None]:
from transformers import default_data_collator

# Collator handed to the Trainer below; the library default is used as-is.
data_collator = default_data_collator

In [None]:
from transformers import TrainingArguments, Trainer

# Run configuration.
dataset_config = "mimic-cxr"
batch_size = 16
num_train_epochs = 1  # original inline note: 5
# Last path component of the checkpoint id, e.g. "biogpt" for "microsoft/biogpt".
model_name = checkpoint.rsplit("/", 1)[-1]

# Checkpoints are written to <root>/<model_name>-<dataset_config>.
save_path: str = "/nfs/turbo/umms-vgvinodv/models/finetuned-checkpoints/radsum"
save_path = "/".join((save_path, "-".join((model_name, dataset_config))))

# Training arguments: evaluate and checkpoint once per epoch, keeping a single checkpoint.
training_args = TrainingArguments(
    # where and how often to save / evaluate
    output_dir=save_path,
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=num_train_epochs,
    # batching
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
)

# Trainer wiring the model to the block-grouped train/eval datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_eval_data,
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# Sanity check after training: generate an impression for one hand-picked
# findings paragraph and print it next to the reference impression.
model.to("cuda")

prompt = "The main impression based on the given FINDINGS section of the chest X-ray report are:"
findings_example = """Lateral view somewhat limited due to overlying motion artifact. The lungs are low in volume.  There is no focal airspace consolidation to suggest pneumonia.  A 1.2-cm calcified granuloma just below the medial aspect of the right hemidiaphragm is unchanged from prior study.  No pleural effusions or pulmonary edema. There is no pneumothorax.  The inferior sternotomy wire is fractured but unchanged. Surgical clips and vascular markers in the thorax are related to prior CABG surgery."""
# Separate findings and prompt with a single space so the input has the same
# "<findings> <prompt>" layout that the fine-tuning samples are built with.
eval_prompt = " ".join([findings_example, prompt])
print(f"Model Input:\n{eval_prompt}\n")

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
# Number of prompt tokens; everything after this position is newly generated.
prompt_length = model_input["input_ids"].shape[1]

model.eval()
with torch.no_grad():
    # max_new_tokens bounds only the continuation, so a long prompt cannot
    # eat into the generation budget the way max_length would.
    output_ids = model.generate(**model_input, max_new_tokens=256)[0]

# Decode only the continuation. Splitting the full decoded text on ":" would
# cut off any generated impression that itself contains a colon.
generated_text = tokenizer.decode(output_ids[prompt_length:], skip_special_tokens=True).strip()
print("---\nGenerated Output:\n")
print(generated_text)

ground_truth_summary = """
---
Ground Truth:
No evidence of acute cardiopulmonary process.
"""
print(ground_truth_summary)

## Evaluate

In [None]:
from nltk.tokenize import wordpunct_tokenize
from radgraph import F1RadGraph
from f1chexbert import F1CheXbert
import datasets
from pathlib import Path

def build_test_dataset(dataset_config, tokenizer, split="test",
                       data_path='/nfs/turbo/umms-vgvinodv/data/bioNLP23-Task-1B/data/'):
    """Load raw (findings, impression) pairs for evaluation.

    Reads ``<data_path>/<dataset_config>/<split>.findings.tok`` and
    ``<split>.impression.tok`` and pairs them line by line. No tokenization is
    performed here.

    Args:
        dataset_config: Sub-directory of ``data_path`` holding the split files.
        tokenizer: Unused; kept so existing call sites keep working.
        split: File-name prefix of the split. Defaults to ``"test"``.
        data_path: Root directory of the data. Defaults to the path that was
            previously hard-coded.

    Returns:
        A ``datasets.Dataset`` with string columns ``"text"`` (findings) and
        ``"summary"`` (impression).

    Raises:
        ValueError: If the two files do not have the same number of lines.
    """
    split_dir = Path(data_path).joinpath(dataset_config)
    findings_file_path = split_dir.joinpath(split + '.findings.tok')
    impression_file_path = split_dir.joinpath(split + '.impression.tok')

    # Context managers close the files deterministically instead of relying on
    # garbage collection.
    with open(findings_file_path) as findings_file:
        findings = [line.strip() for line in findings_file]
    with open(impression_file_path) as impression_file:
        impression = [line.strip() for line in impression_file]

    if len(findings) != len(impression):
        raise ValueError(
            f"{findings_file_path} has {len(findings)} lines but "
            f"{impression_file_path} has {len(impression)}"
        )

    dataset = datasets.Dataset.from_dict({"text": findings, "summary": impression})

    return dataset

def generate_summary(sample):
    """Generate an impression for a single dataset row.

    Builds ``"<findings> <prompt>"`` from ``sample["text"]``, runs
    ``model.generate`` on the GPU with at most 256 new tokens, and decodes only
    the newly generated tokens. Relies on the notebook-level ``tokenizer`` and
    ``model``.

    Args:
        sample: Row with string fields ``"text"`` and ``"summary"``.

    Returns:
        Dict with ``"text"`` (the full prompt fed to the model), ``"summary"``
        (the reference, passed through unchanged) and ``"pred"`` (the stripped
        generated continuation).
    """
    prompt = "The main impression based on the given FINDINGS section of the chest X-ray report are:"
    inputs = " ".join([sample["text"], prompt])

    model_input = tokenizer(inputs, return_tensors="pt").to("cuda")
    # Number of prompt tokens; generate() returns them followed by the continuation.
    prompt_length = model_input["input_ids"].shape[1]
    with torch.no_grad():
        output_ids = model.generate(**model_input, max_new_tokens=256)[0]

    # Decode just the continuation. Taking the text after the last ":" of the
    # full output would truncate any prediction that itself contains a colon.
    formatted_response = tokenizer.decode(output_ids[prompt_length:], skip_special_tokens=True).strip()
    return {
        "text": inputs,
        "summary": sample["summary"],
        "pred": formatted_response
    }

def process_impression(impression):
    """Lower-case ``impression`` and return its wordpunct tokens joined by single spaces."""
    tokens = wordpunct_tokenize(impression.lower())
    return ' '.join(tokens)

In [None]:
# Set to True to evaluate on only the first quarter of the test split.
mini_test = False

test_dataset = build_test_dataset(
    dataset_config='mimic-cxr',
    tokenizer=tokenizer,
    split='test',
)

if mini_test:
    # Keep the leading 25% of the rows, in their original order.
    num_samples = int(0.25*len(test_dataset))
    test_dataset = test_dataset.select(range(num_samples))

print(f'Number of test samples: {len(test_dataset)}')

In [None]:
# Put the model in inference mode on the GPU before generating.
model.eval()
model.to("cuda")

# One generation per row; the input columns are dropped because
# generate_summary returns its own "text"/"summary"/"pred" fields.
results = test_dataset.map(
    generate_summary,
    remove_columns=list(test_dataset.features),
)

# Predictions are normalised with process_impression; references are used as stored.
pred_str = [process_impression(prediction) for prediction in results["pred"]]
label_str = results["summary"]

In [None]:
import numpy as np
import evaluate

###################################
# ROUGE. Loaded through `evaluate`, which this cell already imports;
# `datasets.load_metric` is deprecated and absent from recent `datasets` releases.
rouge = evaluate.load("rouge")
rouge_output = rouge.compute(predictions=pred_str, references=label_str)

# Depending on the metric-script version, each value is either an aggregate
# exposing `.mid.fmeasure` or already a plain float F-measure.
res = {
    key: (value.mid.fmeasure if hasattr(value, "mid") else value) * 100
    for key, value in rouge_output.items()
}
print('ROUGE:')
print({k: round(v, 4) for k, v in res.items()})

#################################
# F1RadGraph: the first element of the returned value is reported, scaled to percent.
f1radgraph = F1RadGraph(reward_level="partial")
score = f1radgraph(hyps=pred_str,refs=label_str)[0]
print("\nF1RadGraph:")
print(score*100)

#################################
# F1CheXbert: report the micro-averaged F1 of the 5-class report, in percent.
f1chexbert = F1CheXbert(device="cuda")
accuracy, accuracy_not_averaged, class_report, class_report_5 = f1chexbert(
    hyps=pred_str,
    refs=label_str)
print("\nF1CheXbert:")
print(100 * class_report_5["micro avg"]["f1-score"])

## Inference

In [None]:
import torch
from transformers import AutoTokenizer, BioGptForCausalLM

# Fine-tuned checkpoint directory to load for inference.
model_checkpoint = "/nfs/turbo/umms-vgvinodv/models/finetuned-checkpoints/radsum/biogpt-mimic-cxr/checkpoint-7620"

# Tokenizer and weights are both read from the checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = BioGptForCausalLM.from_pretrained(model_checkpoint)

# Switch to evaluation mode and move the model to the GPU.
model.eval()
model.to("cuda")

In [None]:
from nltk.tokenize import wordpunct_tokenize
from radgraph import F1RadGraph
from f1chexbert import F1CheXbert
import datasets
from pathlib import Path

def build_test_dataset(dataset_config, tokenizer, split="test",
                       data_path='/nfs/turbo/umms-vgvinodv/data/bioNLP23-Task-1B/data/'):
    """Load raw (findings, impression) pairs for evaluation.

    Reads ``<data_path>/<dataset_config>/<split>.findings.tok`` and
    ``<split>.impression.tok`` and pairs them line by line. No tokenization is
    performed here.

    Args:
        dataset_config: Sub-directory of ``data_path`` holding the split files.
        tokenizer: Unused; kept so existing call sites keep working.
        split: File-name prefix of the split. Defaults to ``"test"``.
        data_path: Root directory of the data. Defaults to the path that was
            previously hard-coded.

    Returns:
        A ``datasets.Dataset`` with string columns ``"text"`` (findings) and
        ``"summary"`` (impression).

    Raises:
        ValueError: If the two files do not have the same number of lines.
    """
    split_dir = Path(data_path).joinpath(dataset_config)
    findings_file_path = split_dir.joinpath(split + '.findings.tok')
    impression_file_path = split_dir.joinpath(split + '.impression.tok')

    # Context managers close the files deterministically instead of relying on
    # garbage collection.
    with open(findings_file_path) as findings_file:
        findings = [line.strip() for line in findings_file]
    with open(impression_file_path) as impression_file:
        impression = [line.strip() for line in impression_file]

    if len(findings) != len(impression):
        raise ValueError(
            f"{findings_file_path} has {len(findings)} lines but "
            f"{impression_file_path} has {len(impression)}"
        )

    dataset = datasets.Dataset.from_dict({"text": findings, "summary": impression})

    return dataset

def generate_summary(sample):
    """Generate an impression for a single dataset row.

    Builds ``"<findings> <prompt>"`` from ``sample["text"]``, runs
    ``model.generate`` on the GPU with at most 256 new tokens, and decodes only
    the newly generated tokens. Relies on the notebook-level ``tokenizer`` and
    ``model``.

    Args:
        sample: Row with string fields ``"text"`` and ``"summary"``.

    Returns:
        Dict with ``"text"`` (the full prompt fed to the model), ``"summary"``
        (the reference, passed through unchanged) and ``"pred"`` (the stripped
        generated continuation).
    """
    prompt = "The main impression based on the given FINDINGS section of the chest X-ray report are:"
    inputs = " ".join([sample["text"], prompt])

    model_input = tokenizer(inputs, return_tensors="pt").to("cuda")
    # Number of prompt tokens; generate() returns them followed by the continuation.
    prompt_length = model_input["input_ids"].shape[1]
    with torch.no_grad():
        output_ids = model.generate(**model_input, max_new_tokens=256)[0]

    # Decode just the continuation. Taking the text after the last ":" of the
    # full output would truncate any prediction that itself contains a colon.
    formatted_response = tokenizer.decode(output_ids[prompt_length:], skip_special_tokens=True).strip()
    return {
        "text": inputs,
        "summary": sample["summary"],
        "pred": formatted_response
    }

def process_impression(impression):
    """Lower-case ``impression`` and return its wordpunct tokens joined by single spaces."""
    tokens = wordpunct_tokenize(impression.lower())
    return ' '.join(tokens)

In [None]:
# Set to True to evaluate on only the first quarter of the test split.
mini_test = False

test_dataset = build_test_dataset(
    dataset_config='mimic-cxr',
    tokenizer=tokenizer,
    split='test',
)

if mini_test:
    # Keep the leading 25% of the rows, in their original order.
    num_samples = int(0.25*len(test_dataset))
    test_dataset = test_dataset.select(range(num_samples))

print(f'Number of test samples: {len(test_dataset)}')

In [None]:
# One generation per test report; the input columns are dropped because
# generate_summary returns its own "text"/"summary"/"pred" fields.
results = test_dataset.map(generate_summary, remove_columns=list(test_dataset.features))

# process_impression is already defined in the cell that defines
# generate_summary, so it is reused here instead of being redefined.
pred_str = results["pred"]
pred_str = list(map(process_impression,pred_str))
label_str = results["summary"]

In [None]:
import numpy as np
import evaluate

###################################
# ROUGE. Loaded through `evaluate`, which this cell already imports;
# `datasets.load_metric` is deprecated and absent from recent `datasets` releases.
rouge = evaluate.load("rouge")
rouge_output = rouge.compute(predictions=pred_str, references=label_str)

# Depending on the metric-script version, each value is either an aggregate
# exposing `.mid.fmeasure` or already a plain float F-measure.
res = {
    key: (value.mid.fmeasure if hasattr(value, "mid") else value) * 100
    for key, value in rouge_output.items()
}
print('ROUGE:')
print({k: round(v, 4) for k, v in res.items()})

#################################
# F1RadGraph: the first element of the returned value is reported, scaled to percent.
f1radgraph = F1RadGraph(reward_level="partial")
score = f1radgraph(hyps=pred_str,refs=label_str)[0]
print("\nF1RadGraph:")
print(score*100)

#################################
# F1CheXbert: micro-averaged F1 of the 5-class report, scaled to percent so it
# is on the same scale as the ROUGE and F1RadGraph figures printed above.
f1chexbert = F1CheXbert(device="cuda")
accuracy, accuracy_not_averaged, class_report, class_report_5 = f1chexbert(
    hyps=pred_str,
    refs=label_str)
print("\nF1CheXbert:")
print(100 * class_report_5["micro avg"]["f1-score"])

## Hidden Test

In [None]:
import numpy as np
import evaluate

# Load the files whose names start with 'test.hidden' for the 'mimic-cxr' config.
hidden_test_dataset = build_test_dataset(
    dataset_config='mimic-cxr',
    tokenizer=tokenizer,
    split='test.hidden',
)

print(f'Number of test samples: {len(hidden_test_dataset)}\n')

# One generation per row; the input columns are dropped because
# generate_summary returns its own "text"/"summary"/"pred" fields.
results_hidden_test = hidden_test_dataset.map(
    generate_summary,
    remove_columns=list(hidden_test_dataset.features),
)



In [None]:
# Predictions are normalised with process_impression; references are used as stored.
pred_str = results_hidden_test["pred"]
pred_str = list(map(process_impression,pred_str))
label_str = results_hidden_test["summary"]

###################################
# ROUGE. Loaded through `evaluate`, which the previous cell already imports;
# `datasets.load_metric` is deprecated and absent from recent `datasets` releases.
rouge = evaluate.load("rouge")
rouge_output = rouge.compute(predictions=pred_str, references=label_str)

# Depending on the metric-script version, each value is either an aggregate
# exposing `.mid.fmeasure` or already a plain float F-measure.
res = {
    key: (value.mid.fmeasure if hasattr(value, "mid") else value) * 100
    for key, value in rouge_output.items()
}
print('ROUGE:')
print({k: round(v, 4) for k, v in res.items()})

#################################
# F1RadGraph: the first element of the returned value is reported, scaled to percent.
f1radgraph = F1RadGraph(reward_level="partial")
score = f1radgraph(hyps=pred_str,refs=label_str)[0]
print("\nF1RadGraph:")
print(score*100)

#################################
# F1CheXbert: micro-averaged F1 of the 5-class report, scaled to percent so it
# is on the same scale as the ROUGE and F1RadGraph figures printed above.
f1chexbert = F1CheXbert(device="cuda")
accuracy, accuracy_not_averaged, class_report, class_report_5 = f1chexbert(
    hyps=pred_str,
    refs=label_str)
print("\nF1CheXbert:")
print(100 * class_report_5["micro avg"]["f1-score"])