In [None]:
# Install necessary packages
!pip install pandas numpy torch transformers nltk rouge_score bert_score moverscore pyemd pytorch_pretrained_bert accelerate bitsandbytes datasets evaluate

In [None]:
# Pin NumPy to 1.26.0, replacing whatever version the install cell above resolved.
# NOTE(review): presumably needed for compatibility with one of the packages above — confirm which.
!pip install --upgrade numpy==1.26.0

In [None]:
import pandas as pd
import numpy as np
import re
import torch
from tqdm import tqdm
import evaluate
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, BitsAndBytesConfig
from rouge_score import rouge_scorer
from peft import LoraConfig, get_peft_model, TaskType

In [None]:
# Load the original train dataset from Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive — no mount step is
# visible in this notebook, so a fresh kernel may fail here; confirm.
train_data = pd.read_csv('/content/drive/MyDrive/mimic_train.csv')

In [None]:
# Keep only the first 3000 rows of the original train dataset as our training pool
# (copied, so later edits work on an independent frame).
train_data = train_data.head(3000).copy()

In [None]:
# Pre-processing: discard the `subject_id` and `study_id` columns, which no later cell reads.
train_data = train_data.drop(['subject_id', 'study_id'], axis=1)

In [None]:
# Split train dataset into train and validation:
# 20% of the rows are held out for validation, and random_state is fixed so the
# same split is produced on every run.
train_df, val_df = train_test_split(train_data, test_size=0.2, random_state=42)

In [None]:
# Persist both splits as CSV files (index column omitted) so they can be reloaded later
for split_frame, split_path in ((train_df, "train_split.csv"), (val_df, "val_split.csv")):
    split_frame.to_csv(split_path, index=False)

In [None]:
# Tokenizer and model initialization
model_id = "meta-llama/Meta-Llama-3-8B"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Pad with the EOS token: the tokenization step pads every example to a fixed length,
# which requires the tokenizer to have a pad token.
tokenizer.pad_token = tokenizer.eos_token

# Load the weights in 4-bit. The quantization settings go through a BitsAndBytesConfig
# because the bare `load_in_4bit=True` keyword is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# LoRA adapter settings for causal language modelling; adapters are attached to the
# modules named "q_proj" and "v_proj" only.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the loaded model with the adapters described by lora_config
model = get_peft_model(model, lora_config)

In [None]:
# Reload the saved CSV splits as Hugging Face Datasets (train first, then validation)
train_dataset, val_dataset = (
    Dataset.from_csv(csv_path) for csv_path in ("train_split.csv", "val_split.csv")
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Prompt Template
# Layout: instruction, one worked example, then a `{finding}` slot filled via str.format().
# The text ends at "Impression:", so whatever follows it is the impression itself.
# The same template builds both the training examples and the inference prompts.
prompt_template = """You are an expert medical AI assistant.
Your task is to extract the key clinical information from the findings below and generate a single sentence, structured, concise, and clinically relevant Impression. Avoid speculating or including uncertain information. Focus on the most important diagnoses and observations.

## Examples:
# Findings: The heart size is normal. There is no pulmonary edema.
# Impression: Normal chest radiograph.

Findings:
{finding}

Impression:"""

In [None]:
# Tokenization Function
def format_example(example, max_length=512):
    """Turn one dataset row into a fixed-length causal-LM training example.

    The prompt built from ``example['finding']`` is joined to
    ``example['impression']`` with a single space, an explicit EOS token is
    appended, and the text is tokenized with truncation and padding to
    ``max_length`` tokens.

    Args:
        example: mapping with string values under ``'finding'`` and ``'impression'``.
        max_length: length every example is truncated/padded to (default 512).

    Returns:
        dict with the tokenizer's outputs (``input_ids``, ``attention_mask``)
        plus ``labels``: a copy of ``input_ids`` in which every padding
        position is replaced by -100 so it is ignored by the loss.
    """
    prompt = prompt_template.format(finding=example['finding'])
    target = example['impression']
    # The EOS token is added explicitly rather than relying on the tokenizer call to do it,
    # so the model sees a real end-of-sequence label after the impression.
    input_text = prompt + " " + target + tokenizer.eos_token
    tokenized = tokenizer(
        input_text,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    # The pad token is the EOS token, so padding cannot be recognised by token id;
    # the attention mask is what separates real tokens (1) from padding (0).
    tokenized["labels"] = [
        token_id if is_real else -100
        for token_id, is_real in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return dict(tokenized)

In [None]:
# Run format_example over every row of the train and validation datasets
train_dataset, val_dataset = (
    split.map(format_example) for split in (train_dataset, val_dataset)
)

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [None]:
# Training configuration, grouped by purpose
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir="./llama3-radiology",
    logging_dir='./logs',
    # optimisation schedule
    num_train_epochs=5,
    learning_rate=2e-4,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # evaluate once per epoch, log every 10 steps
    evaluation_strategy="epoch",
    logging_steps=10,
    # checkpoint every 500 steps, keeping at most one checkpoint on disk
    save_steps=500,
    save_total_limit=1,
)



In [None]:
# Trainer initialization
# No tokenizer or data collator is passed, so Trainer falls back to its defaults;
# the datasets were already padded to a fixed length by format_example.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Train the model.
# As the last expression of the cell, the value returned by trainer.train() is shown as the cell output.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33maninbhtry-01[0m ([33maninbhtry-01-jadavpur-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss
1,0.333,0.273948


Epoch,Training Loss,Validation Loss
1,0.333,0.273948
2,0.2785,0.263263
3,0.1924,0.260905
4,0.1921,0.265161
5,0.1299,0.277026


TrainOutput(global_step=6000, training_loss=0.21771506768465043, metrics={'train_runtime': 52186.4807, 'train_samples_per_second': 0.23, 'train_steps_per_second': 0.115, 'total_flos': 2.76912798695424e+17, 'train_loss': 0.21771506768465043, 'epoch': 5.0})

In [None]:
# Read the original test CSV and keep an independent copy of its first 500 rows as our test set
test_data = pd.read_csv('/content/drive/MyDrive/mimic_test.csv').head(500).copy()

In [None]:
# Pre-processing: discard the `subject_id` and `study_id` columns, which no later cell reads.
test_data = test_data.drop(['subject_id', 'study_id'], axis=1)

In [None]:
# Summarize function
def summarize_finding(finding, max_new_tokens=128):
    """Generate an impression for one findings text with the fine-tuned model.

    Args:
        finding: findings text inserted into ``prompt_template``.
        max_new_tokens: upper bound on the number of tokens generated *after*
            the prompt (default 128).

    Returns:
        The decoded continuation only (prompt excluded), with special tokens
        removed and surrounding whitespace stripped.
    """
    prompt = prompt_template.format(finding=finding)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    # max_new_tokens is used instead of max_length: max_length counts the prompt tokens too,
    # so a prompt near 512 tokens would leave little or no room for the impression.
    # No temperature is passed: sampling is not enabled, so it would have no effect on beam search.
    # NOTE(review): no_repeat_ngram_size / repetition_penalty presumably also take the prompt
    # tokens into account for a decoder-only model — confirm this is intended.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_beams=10,
        no_repeat_ngram_size=2,
        early_stopping=True,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.pad_token_id,
    )

    # Drop the prompt by position rather than by string replacement: the decoded text is
    # not guaranteed to reproduce the prompt character-for-character, in which case
    # str.replace would silently leave the whole prompt inside the summary.
    new_tokens = outputs[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

In [None]:
# Produce one summary per test row, in row order, and store them in a new "summary" column
test_data["summary"] = [summarize_finding(text) for text in test_data["finding"]]

In [None]:
# Convert to CSV: write the test rows plus the new "summary" column to Google Drive.
# NOTE(review): assumes Drive is mounted and the target folder already exists — confirm.
file_path = "/content/drive/MyDrive/ME THESIS/Predicted Summaries/Llama_finetuned.csv"
test_data.to_csv(file_path, index=False)

In [None]:
# Load the CSV file containing the model generated summaries.
# This rebinds test_data to the on-disk copy, so the metric cells below can run
# without regenerating the summaries.
test_data = pd.read_csv('/content/drive/MyDrive/ME THESIS/Predicted Summaries/Llama_finetuned.csv')

In [None]:
# Extract ground truth (impression) and generated summaries.
# An empty cell in the CSV is read back by pandas as NaN; without fillna("") it would be
# stringified to the literal text "nan" and scored as if that word had been written.
references = test_data["impression"].fillna("").astype(str).tolist()
predictions = test_data["summary"].fillna("").astype(str).tolist()

In [None]:
# Initialize ROUGE scorer for ROUGE-1, ROUGE-2 and ROUGE-L, with stemming enabled
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

In [None]:
# Per-example F-measure accumulators, one list per ROUGE variant (filled by the scoring loop)
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

In [None]:
from nltk.translate.meteor_score import meteor_score

In [None]:
import nltk
# Fetch the WordNet corpus into the local NLTK data directory.
# NOTE(review): presumably required by the METEOR computation further down — confirm.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Score every (reference, prediction) pair once, then record the F-measure of each ROUGE variant
pair_scores = [rouge.score(ref, pred) for ref, pred in zip(references, predictions)]
for rouge_key, bucket in (
    ('rouge1', rouge1_scores),
    ('rouge2', rouge2_scores),
    ('rougeL', rougeL_scores),
):
    bucket.extend(pair[rouge_key].fmeasure for pair in pair_scores)

In [None]:
from bert_score import score

In [None]:
# Compute BERTScore: predictions are passed first, references second.
# Of the three returned values only F1 is used afterwards (it is averaged in a later cell).
P, R, F1 = score(predictions, references, lang="en", verbose=False)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Mean of each per-example ROUGE list, followed by the mean BERTScore F1
average_scores = {
    label: sum(values) / len(values)
    for label, values in (
        ("ROUGE-1", rouge1_scores),
        ("ROUGE-2", rouge2_scores),
        ("ROUGE-L", rougeL_scores),
    )
}
average_scores["BERTScore-F1"] = F1.mean().item()

In [None]:
# Print the results
# The loop variable is deliberately not called `score`: that name is bound to
# bert_score's scoring function by an earlier import, and rebinding it to a float
# here would break the BERTScore cell if it were re-run afterwards.
print("Average Scores:")
for metric, value in average_scores.items():
    print(f"{metric}: {value:.4f}")

Average Scores:
ROUGE-1: 0.2732
ROUGE-2: 0.1111
ROUGE-L: 0.2465
BERTScore-F1: 0.8726


In [None]:
# Load the METEOR metric through the `evaluate` library
meteor = evaluate.load("meteor")

In [None]:
# Calculate METEOR over the whole test set in a single call.
# NOTE(review): the result is stored under the name `meteor_score`, which rebinds the
# function of the same name imported from nltk in an earlier cell.
meteor_score = meteor.compute(predictions=predictions, references=references)

In [None]:
# Print METEOR score (meteor_score holds the value returned by meteor.compute)
print("METEOR:", meteor_score)

METEOR: {'meteor': 0.23300526174550037}
