# Llama 7b

In [1]:
import torch
from datasets import load_dataset
from peft import AutoPeftModelForCausalLM, LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import transformers

checkpoint = "meta-llama/Llama-2-7b-hf"

# load the base model in 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

base_model = AutoModelForCausalLM.from_pretrained(
    checkpoint,       
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
base_model.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Check base model

In [2]:
prompt = "The main impression based on the given FINDINGS section of the chest X-ray report are:"
findings_example = f"""Lateral view somewhat limited due to overlying motion artifact. The lungs are low in volume.  There is no focal airspace consolidation to suggest pneumonia.  A 1.2-cm calcified granuloma just below the medial aspect of the right hemidiaphragm is unchanged from prior study.  No pleural effusions or pulmonary edema. There is no pneumothorax.  The inferior sternotomy wire is fractured but unchanged. Surgical clips and vascular markers in the thorax are related to prior CABG surgery."""
eval_prompt = findings_example + prompt
print(f"Model Input:\n{eval_prompt}\n")

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

base_model.eval()
with torch.no_grad():
    print("---\nGenerated Output:\n")
    print(tokenizer.decode(base_model.generate(**model_input, max_new_tokens=256)[0], skip_special_tokens=True).split(":")[-1].strip())
    
ground_truth_summary="""
---
Ground Truth:
No evidence of acute cardiopulmonary process.
"""    
print(ground_truth_summary)

Model Input:
Lateral view somewhat limited due to overlying motion artifact. The lungs are low in volume.  There is no focal airspace consolidation to suggest pneumonia.  A 1.2-cm calcified granuloma just below the medial aspect of the right hemidiaphragm is unchanged from prior study.  No pleural effusions or pulmonary edema. There is no pneumothorax.  The inferior sternotomy wire is fractured but unchanged. Surgical clips and vascular markers in the thorax are related to prior CABG surgery.The main impression based on the given FINDINGS section of the chest X-ray report are:

---
Generated Output:





1. No pneumothorax 2. No focal airspace consolidation to suggest pneumonia 3. No pleural effusions 4. No pulmonary edema 5. No focal calcified granuloma 6. No surgical clips

---
Ground Truth:
No evidence of acute cardiopulmonary process.



### Build Dataset

In [3]:
from tqdm import tqdm
from itertools import chain
from torch.utils.data import Dataset
from pathlib import Path
import datasets

class Concatenator(object):
    def __init__(self, chunk_size=1024):
        self.chunk_size=chunk_size
        self.residual = {"input_ids": [], "attention_mask": []}
        
    def __call__(self, batch):
        concatenated_samples = {
            k: v + list(chain(*batch[k])) for k, v in self.residual.items()
        }

        total_length = len(concatenated_samples[list(concatenated_samples.keys())[0]])

        if total_length >= self.chunk_size:
            chunk_num = total_length // self.chunk_size
            result = {
                k: [
                    v[i : i + self.chunk_size]
                    for i in range(0, chunk_num * self.chunk_size, self.chunk_size)
                ]
                for k, v in concatenated_samples.items()
            }
            self.residual = {
                k: v[(chunk_num * self.chunk_size) :]
                for k, v in concatenated_samples.items()
            }
        else:
            result = concatenated_samples
            self.residual = {k: [] for k in concatenated_samples.keys()}

        result["labels"] = result["input_ids"].copy()

        return result
    
#dataset_config = 'mimic-cxr','mimic-iii'  
#split = 'train','validate',test
def build_dataset(dataset_config, tokenizer, split):
    data_path = '/nfs/turbo/umms-vgvinodv/data/bioNLP23-Task-1B/data/'
    findings_file_path = Path(data_path).joinpath(dataset_config).joinpath(split+'.findings.tok')
    impression_file_path = Path(data_path).joinpath(dataset_config).joinpath(split+'.impression.tok')


    findings = [line.strip() for line in open(findings_file_path).readlines()]
    impression = [line.strip() for line in open(impression_file_path).readlines()]

    dataset = datasets.Dataset.from_dict({"text":findings,"summary":impression}) 
    
   
    #prompt = (
    #    f"FINDINGS:{{text}}\n\n The main impression based on the given FINDINGS section of the chest X-ray report are as follows\n\nIMPRESSION:{{summary}}{{eos_token}}"
    #)
    prompt = (
        f"{{text}} The main impression based on the given FINDINGS section of the chest X-ray report are: {{summary}}{{eos_token}}"
    )

    def apply_prompt_template(sample):
        return {
            "text": prompt.format(
                text=sample["text"],
                summary=sample["summary"],
                eos_token=tokenizer.eos_token,
            )
        }

    dataset = dataset.map(apply_prompt_template, remove_columns=list(dataset.features))
    dataset = dataset.map(
        lambda sample: tokenizer(sample["text"]),
        batched=True,
        num_proc=4,
        remove_columns=list(dataset.features),
    ).map(Concatenator(), batched=True, num_proc=4)
    
    return dataset

In [4]:
dataset_config: str="mimic-cxr"
    
train_dataset = build_dataset(dataset_config, tokenizer, 'train')
eval_dataset = build_dataset(dataset_config, tokenizer, "test")

print(f'Number of training samples: {len(train_dataset)}')
print(f'Number of training samples: {len(eval_dataset)}')

Map:   0%|          | 0/125417 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/125417 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/125417 [00:00<?, ? examples/s]

Map:   0%|          | 0/1624 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1624 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1624 [00:00<?, ? examples/s]

Number of training samples: 16519
Number of training samples: 257


In [5]:
from transformers import default_data_collator

data_collator = default_data_collator

### Prepare PEFT Model

In [6]:
def create_peft_config(model):
    from peft import (
        get_peft_model,
        LoraConfig,
        TaskType,
        prepare_model_for_kbit_training,
    )

    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules = ["q_proj", "v_proj"]
    )

    # prepare int-8/int-4 model for training
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    return model, peft_config

# create peft config
model, lora_config = create_peft_config(base_model)

trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.06220594176090199


### Prepare for training

In [7]:
from transformers import Trainer, TrainingArguments

model_name = checkpoint.split("/")[-1]
batch_size = 16
num_train_epochs = 1
save_path: str="/nfs/turbo/umms-vgvinodv/models/finetuned-checkpoints/radsum"
save_path = f"{save_path}/{model_name}-{dataset_config}"

# Training Args
training_args = TrainingArguments(
    output_dir=save_path,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    overwrite_output_dir = True,
    fp16=True,
    #push_to_hub=True,
)
#'gradient_accumulation_steps': 4,
#'gradient_checkpointing': True,


# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# Start training
trainer.train()

### Evaluate

In [8]:
from nltk.tokenize import wordpunct_tokenize
from radgraph import F1RadGraph
from f1chexbert import F1CheXbert
import datasets
from pathlib import Path
import numpy as np

def build_test_dataset(dataset_config, tokenizer, split="test"):
    data_path = '/nfs/turbo/umms-vgvinodv/data/bioNLP23-Task-1B/data/'
    findings_file_path = Path(data_path).joinpath(dataset_config).joinpath(split+'.findings.tok')
    impression_file_path = Path(data_path).joinpath(dataset_config).joinpath(split+'.impression.tok')


    findings = [line.strip() for line in open(findings_file_path).readlines()]
    impression = [line.strip() for line in open(impression_file_path).readlines()]

    dataset = datasets.Dataset.from_dict({"text":findings,"summary":impression}) 
    
    return dataset

def generate_summary(sample):
    texts = sample["text"]
    summaries = sample["summary"]
    prompt = "The main impression based on the given FINDINGS section of the chest X-ray report are:"

    def generate_input(_text):
        return " ".join([_text,prompt])

    inputs = generate_input(texts) 
    model_input = tokenizer(inputs, return_tensors="pt").to("cuda")
    with torch.no_grad():
        response = tokenizer.decode(model.generate(**model_input, max_new_tokens=256)[0], skip_special_tokens=True)
    
    formatted_response = response.split(":")[-1].strip()
    return {
        "text": inputs,
        "summary":summaries,
        "pred": formatted_response
    }

def process_impression(impression):
    impression = impression.lower()
    return ' '.join(wordpunct_tokenize(impression))

def compute_metrics(pred_str, label_str):
    ###################################
    rouge = datasets.load_metric("rouge")
    rouge_output = rouge.compute(predictions=pred_str, references=label_str)

    res = {key: value.mid.fmeasure * 100 for key, value in rouge_output.items()}
    print('ROUGE:')
    print({k: round(v, 4) for k, v in res.items()})

    ##################################
    bertscore = datasets.load_metric("bertscore")
    bertscore_output = bertscore.compute(predictions=pred_str, references=label_str, lang='en')
    res = {key: np.asarray(value).mean()*100 for key, value in bertscore_output.items() if key != 'hashcode'}
    print('BertScore:')
    print({k: round(v,4) for k, v in res.items()})

    #################################
    f1radgraph = F1RadGraph(reward_level="partial")
    score = f1radgraph(hyps=pred_str,refs=label_str)[0]
    print("\nF1RadGraph:")
    print(score*100)

    #################################
    f1chexbert = F1CheXbert(device="cuda")
    accuracy, accuracy_not_averaged, class_report, class_report_5 = f1chexbert(
        hyps=pred_str,
        refs=label_str)
    print("\nF1CheXbert:")
    print(class_report_5["micro avg"]["f1-score"])

In [None]:
test_dataset = build_test_dataset(dataset_config, tokenizer, 'test')
print(f'Number of test samples: {len(test_dataset)}')

model.eval()

results = test_dataset.map(generate_summary, remove_columns=list(test_dataset.features))
pred_str = results["pred"]
pred_str = list(map(process_impression,pred_str))
label_str = results["summary"]

compute_metrics(pred_str, label_str)

Number of test samples: 1624


In [None]:
test_dataset = build_test_dataset(dataset_config, tokenizer, 'test.hidden')
print(f'Number of test samples: {len(test_dataset)}')

model.eval()

results = test_dataset.map(generate_summary, remove_columns=list(test_dataset.features))
pred_str = results["pred"]
pred_str = list(map(process_impression,pred_str))
label_str = results["summary"]

compute_metrics(pred_str, label_str)

# Testing

In [None]:
import transformers
import torch
model_name = "chaoyi-wu/PMC_LLAMA_7B"#'chaoyi-wu/MedLLaMA_13B'
tokenizer = transformers.LlamaTokenizer.from_pretrained(model_name)
model = transformers.LlamaForCausalLM.from_pretrained(model_name, device_map="auto", load_in_4bit=True)
sentence = 'Hello, doctor' 
batch = tokenizer(
            sentence,
            return_tensors="pt", 
            add_special_tokens=False
        )
with torch.no_grad():
    generated = model.generate(inputs = batch["input_ids"], max_length=200, do_sample=True, top_k=50)
    print('model predict: ',tokenizer.decode(generated[0]))


In [None]:
with torch.no_grad():
    generated = model.generate(inputs = batch["input_ids"].to('cuda'), max_length=100, do_sample=True, top_k=50)
    print('model predict: ',tokenizer.decode(generated[0]))

In [None]:
#model_name = "chaoyi-wu/PMC_LLAMA_7B"
#model_name = "chaoyi-wu/MedLLaMA_13B"
model_name = "meta-llama/Llama-2-7b-hf"

# load the base model in 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,       
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
base_model.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

In [None]:
import torch
from datasets import load_dataset
from peft import AutoPeftModelForCausalLM, LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import transformers

In [None]:
model_name = "chaoyi-wu/PMC_LLAMA_7B"#'chaoyi-wu/MedLLaMA_13B'
tokenizer = transformers.LlamaTokenizer.from_pretrained(model_name)
base_model = transformers.LlamaForCausalLM.from_pretrained(model_name, device_map="auto", load_in_4bit=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

In [None]:
print(tokenizer.all_special_tokens)

### Build Dataset

In [None]:
from tqdm import tqdm
from itertools import chain
from torch.utils.data import Dataset
from pathlib import Path
import datasets

class Concatenator(object):
    def __init__(self, chunk_size=1024):
        self.chunk_size=chunk_size
        self.residual = {"input_ids": [], "attention_mask": []}
        
    def __call__(self, batch):
        concatenated_samples = {
            k: v + list(chain(*batch[k])) for k, v in self.residual.items()
        }

        total_length = len(concatenated_samples[list(concatenated_samples.keys())[0]])

        if total_length >= self.chunk_size:
            chunk_num = total_length // self.chunk_size
            result = {
                k: [
                    v[i : i + self.chunk_size]
                    for i in range(0, chunk_num * self.chunk_size, self.chunk_size)
                ]
                for k, v in concatenated_samples.items()
            }
            self.residual = {
                k: v[(chunk_num * self.chunk_size) :]
                for k, v in concatenated_samples.items()
            }
        else:
            result = concatenated_samples
            self.residual = {k: [] for k in concatenated_samples.keys()}

        result["labels"] = result["input_ids"].copy()

        return result
    
#dataset_config = 'mimic-cxr','mimic-iii'  
#split = 'train','validate',test
def build_dataset(dataset_config, tokenizer, split):
    data_path = '/nfs/turbo/umms-vgvinodv/data/bioNLP23-Task-1B/data/'
    findings_file_path = Path(data_path).joinpath(dataset_config).joinpath(split+'.findings.tok')
    impression_file_path = Path(data_path).joinpath(dataset_config).joinpath(split+'.impression.tok')


    findings = [line.strip() for line in open(findings_file_path).readlines()]
    impression = [line.strip() for line in open(impression_file_path).readlines()]

    dataset = datasets.Dataset.from_dict({"text":findings,"summary":impression}) 
    
   
    prompt = (
        f"FINDINGS:{{text}}\n\n The main impression based on the given FINDINGS section of the chest X-ray report are as follows\n\nIMPRESSION:{{summary}}{{eos_token}}"
    )

    def apply_prompt_template(sample):
        return {
            "text": prompt.format(
                text=sample["text"],
                summary=sample["summary"],
                eos_token=tokenizer.eos_token,
            )
        }

    dataset = dataset.map(apply_prompt_template, remove_columns=list(dataset.features))
    dataset = dataset.map(
        lambda sample: tokenizer(sample["text"]),
        batched=True,
        remove_columns=list(dataset.features),
    ).map(Concatenator(), batched=True)
    
    return dataset

In [None]:
'''
from pathlib import Path
import datasets
import itertools
#dataset_config = 'mimic-cxr','mimic-iii'  
#split = 'train','validate',test

def build_dataset(dataset_config, tokenizer, split):
    data_path = '/nfs/turbo/umms-vgvinodv/data/bioNLP23-Task-1B/data/'
    findings_file_path = Path(data_path).joinpath(dataset_config).joinpath(split+'.findings.tok')
    impression_file_path = Path(data_path).joinpath(dataset_config).joinpath(split+'.impression.tok')


    findings = [line.strip() for line in open(findings_file_path).readlines()]
    impression = [line.strip() for line in open(impression_file_path).readlines()]

    dataset = datasets.Dataset.from_dict({"text":findings,"summary":impression}) 
    
   
    prompt = (
        f"{{text}} The main impression based on the given FINDINGS section of the chest X-ray report are:"
    )
    def apply_prompt_template(sample):
        return {
            "text": prompt.format(text=sample["text"]),
            "summary": sample["summary"],
        }
        
    dataset = dataset.map(apply_prompt_template, num_proc=4, remove_columns=list(dataset.features))
    
    def tokenize_add_label(samples):
        texts = samples["text"]
        summaries = samples["summary"]
        
        prompt_tokens = [tokenizer.encode(tokenizer.bos_token + _text, add_special_tokens=False) for _text in texts]
        answer_tokens = [tokenizer.encode(_summary +  tokenizer.eos_token, add_special_tokens=False) for _summary in summaries]
        
        dialog_tokens = list(itertools.chain.from_iterable(zip(prompt_tokens, answer_tokens)))
        
        labels_tokens = [[-100] * len(_prompt_tokens) + _answer_tokens for _prompt_tokens,_answer_tokens in zip(prompt_tokens,answer_tokens)]
        
        combined_tokens = {
            "input_ids": list(itertools.chain(*(t for t in dialog_tokens))),
            "labels": list(itertools.chain(*(t for t in labels_tokens))),
        }
        
        return dict(combined_tokens, attention_mask=[1]*len(combined_tokens["input_ids"]))
    
    dataset = dataset.map(tokenize_add_label, batched=True, num_proc=4, remove_columns=list(dataset.features))

    ########
    def preprocess_function(samples):
        texts = samples["text"]
        summaries = samples["summary"]
        prompt = "The main impression based on the given FINDINGS section of the chest X-ray report are:"
        
        def generate_input(_text,_summary):
            return " ".join([_text,prompt,_summary])

        inputs = [generate_input(_text,_summary) for _text,_summary in zip(texts,summaries)]
        model_inputs = tokenizer(inputs)
        
        return model_inputs
    
    dataset = dataset.map(preprocess_function, batched=True, num_proc=4, remove_columns=list(dataset.features))

    
    return dataset

'''

In [None]:
dataset_source: str="mimic-cxr"
    
train_dataset = build_dataset(dataset_source, tokenizer, 'train')
print(f'Number of training samples: {len(train_dataset)}')

eval_dataset = build_dataset(dataset_source,tokenizer,"test")
print(f'Number of training samples: {len(eval_dataset)}')

In [None]:
#num_samples = int(0.01*len(train_dataset))
#train_dataset = train_dataset.select(range(num_samples))
#print(len(train_dataset))

In [None]:
def create_peft_config(model):
    from peft import (
        get_peft_model,
        LoraConfig,
        TaskType,
        prepare_model_for_kbit_training,
    )

    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules = ["q_proj", "v_proj"]
    )

    # prepare int-8/int-4 model for training
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    return model, peft_config

# create peft config
model, lora_config = create_peft_config(base_model)

In [None]:
model_name = 'chaoyi-wu/MedLLaMA_13B'
name = model_name.split("/")[-1]
output_dir = f"finetuned-checkpoints/{name}-{dataset_source}"

config = {
    'lora_config': lora_config,
    'learning_rate': 2e-5,
    'num_train_epochs': 1,
    'gradient_accumulation_steps': 4,
    'per_device_train_batch_size': 8,
    'gradient_checkpointing': True,
}


In [None]:
import nltk
import numpy as np
import evaluate
metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True).to("cuda")
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True).to("cuda")
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    # Extract a few results
    #result = {key: value * 100 for key, value in result.items()}
    
    # Add mean generated length
    #prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    #result["gen_len"] = np.mean(prediction_lens)
    
    #return {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
#logits = p.predictions
#labels = p.label_ids
#probabilities = softmax(logits, axis=-1)
#loss = log_loss(labels.flatten(), probabilities.reshape(-1, probabilities.shape[-1]), labels=[i for i in range(logits.shape[-1])])
#perplexity = np.exp(loss)
#return {"perplexity": perplexity}

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Define training args
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    fp16=True,  # Use FP16 if available
    evaluation_strategy="epoch",
    save_strategy="no",#"epoch",
    optim="adamw_torch",
    **{k:v for k,v in config.items() if k != 'lora_config'}
)


# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    #compute_metrics=compute_metrics,
)

# Start training
trainer.train()

In [None]:
prompt = "The main impression based on the given FINDINGS section of the chest X-ray report are:"
findings_example = f"""Lateral view somewhat limited due to overlying motion artifact. The lungs are low in volume.  There is no focal airspace consolidation to suggest pneumonia.  A 1.2-cm calcified granuloma just below the medial aspect of the right hemidiaphragm is unchanged from prior study.  No pleural effusions or pulmonary edema. There is no pneumothorax.  The inferior sternotomy wire is fractured but unchanged. Surgical clips and vascular markers in the thorax are related to prior CABG surgery."""
eval_prompt = findings_example + prompt
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print("---\nGenerated Output:\n")
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True)[len(eval_prompt):].strip())
    
ground_truth_summary="""
---
Ground Truth:
No evidence of acute cardiopulmonary process.
"""    
print(ground_truth_summary)