In [None]:
# Install necessary packages
!pip install pandas numpy torch transformers nltk rouge_score bert_score moverscore pyemd pytorch_pretrained_bert accelerate bitsandbytes datasets evaluate

In [None]:
!pip install --upgrade numpy==1.26.0

In [None]:
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, Pipeline
from rouge_score import rouge_scorer
from bert_score import score as bert_score
import numpy as np
import evaluate

In [None]:
# Tokenizer and model initialization
model_id = "meta-llama/Meta-Llama-3-8B"

# Quantization settings handed to the model loader:
# 4-bit NF4 weights, double quantization, bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Reuse the end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# device_map="auto" delegates device placement of the weights to the loader.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

In [None]:
# Summarize function
def summarize_finding(finding, max_new_tokens=128, max_input_tokens=512):
    """Generate an Impression for one radiology findings text.

    Wraps ``finding`` in a fixed one-shot prompt, runs beam search with the
    module-level ``model`` / ``tokenizer``, and returns only the text that the
    model generated after the prompt.

    Args:
        finding: Findings text to summarize. It is converted with ``str()``
            before being inserted into the prompt.
        max_new_tokens: Maximum number of tokens to generate. This budget is
            independent of the prompt length, so a long prompt can no longer
            use up the whole generation window.
        max_input_tokens: Maximum number of prompt tokens fed to the model.

    Returns:
        str: The decoded continuation, with special tokens removed and
        surrounding whitespace stripped.
    """
    template = """You are an expert medical AI assistant.
    Your task is to extract the key clinical information from the findings below and generate a single sentence, structured, concise, and clinically relevant Impression. Avoid speculating or including uncertain information. Focus on the most important diagnoses and observations.

    ## Examples:
    # Findings: The heart size is normal. There is no pulmonary edema.
    # Impression: Normal chest radiograph.

    Findings:
    {finding}

    Impression:"""

    finding = str(finding)

    # Shorten the findings text itself when it would overflow the input window.
    # Truncating the assembled prompt instead would cut from the right and drop
    # the trailing "Impression:" cue that the model is meant to continue.
    overhead = len(tokenizer(template.format(finding=""))["input_ids"])
    # Keep a few tokens of slack: token boundaries can shift slightly once the
    # findings text is spliced into the template.
    budget = max(max_input_tokens - overhead - 8, 0)
    finding_ids = tokenizer(finding, add_special_tokens=False)["input_ids"]
    if len(finding_ids) > budget:
        finding = tokenizer.decode(finding_ids[:budget], skip_special_tokens=True)

    prompt = template.format(finding=finding)
    # truncation=True stays as a safety net for the input window.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # No `temperature` here: sampling is not enabled (beam search), so that
    # setting had no effect on the output.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_beams=10,
        no_repeat_ngram_size=2,
        early_stopping=True,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.pad_token_id,
    )

    # The returned sequence starts with the prompt tokens; slice them off by
    # position instead of string-replacing the prompt, which only works when
    # the decoded text reproduces the prompt byte for byte.
    generated_ids = outputs[0][prompt_length:]
    summary = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    return summary

In [None]:
# Read the original test dataset and keep only its first 500 rows as our test set
file_path = '/content/drive/MyDrive/mimic_test.csv'
df = pd.read_csv(file_path).head(500).copy()

In [None]:
# Pre-process the data: remove unnecessary columns.
# errors="ignore" keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop() would raise KeyError.
df = df.drop(columns=['subject_id', 'study_id'], errors="ignore")

In [None]:
# Create a new column 'summary' with summarized findings.
# Each row triggers one beam-search generation, so register tqdm's pandas
# integration and use progress_apply to show a progress bar while it runs.
tqdm.pandas(desc="Summarizing findings")
df["summary"] = df["finding"].progress_apply(summarize_finding)

In [None]:
# Write the DataFrame (now including the 'summary' column) to CSV, omitting the index
file_path = "/content/drive/MyDrive/ME THESIS/Predicted Summaries/Llama.csv"
df.to_csv(path_or_buf=file_path, index=False)

In [None]:
# Read back the CSV file that holds the model-generated summaries
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/ME THESIS/Predicted Summaries/Llama.csv"
)

In [None]:
# Pull out the ground truth ('impression') and the generated text ('summary')
# as plain lists of strings, in row order
references, predictions = (
    df[column].astype(str).tolist() for column in ("impression", "summary")
)

In [None]:
# Set up a ROUGE scorer for ROUGE-1, ROUGE-2 and ROUGE-L with stemming enabled
rouge = rouge_scorer.RougeScorer(
    rouge_types=['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)

In [None]:
# Empty containers for the per-example ROUGE-1, ROUGE-2 and ROUGE-L scores
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

In [None]:
from nltk.translate.meteor_score import meteor_score

In [None]:
import nltk
# Download the WordNet corpus into the local NLTK data directory.
# NOTE(review): presumably required by the METEOR metric used later — confirm.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Compute ROUGE for every (reference, prediction) pair; the reference is passed
# first and the prediction second.
# The score lists are rebuilt here rather than appended to lists created in
# another cell, so re-running this cell does not duplicate entries and skew
# the averages.
rouge_results = [rouge.score(ref, pred) for ref, pred in zip(references, predictions)]
rouge1_scores = [result['rouge1'].fmeasure for result in rouge_results]
rouge2_scores = [result['rouge2'].fmeasure for result in rouge_results]
rougeL_scores = [result['rougeL'].fmeasure for result in rouge_results]

In [None]:
from bert_score import score

In [None]:
# Compute BERTScore (precision, recall, F1) for predictions against references.
# Called through the `bert_score` alias from the top-level import cell: the bare
# name `score` is generic and easy to rebind elsewhere in the notebook (e.g. as
# a loop variable), which would make this cell fail when it is re-run.
P, R, F1 = bert_score(predictions, references, lang="en", verbose=False)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Average each ROUGE variant over all examples, then add the mean BERTScore F1
average_scores = {
    label: sum(values) / len(values)
    for label, values in (
        ("ROUGE-1", rouge1_scores),
        ("ROUGE-2", rouge2_scores),
        ("ROUGE-L", rougeL_scores),
    )
}
average_scores["BERTScore-F1"] = F1.mean().item()

In [None]:
# Print the results, one metric per line with four decimals.
# The loop variable is called `value` rather than `score` so that it does not
# rebind the `score` function imported from bert_score.
print("Average Scores:")
for metric, value in average_scores.items():
    print(f"{metric}: {value:.4f}")

Average Scores:
ROUGE-1: 0.1393
ROUGE-2: 0.0479
ROUGE-L: 0.1247
BERTScore-F1: 0.8205


In [None]:
# Load the METEOR metric through the `evaluate` library
meteor = evaluate.load(path="meteor")

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# Calculate METEOR over all predictions against their references.
# NOTE: this assignment rebinds `meteor_score`, which an earlier cell imported
# as a function from nltk.translate.meteor_score; from here on the name refers
# to the result of compute(), not to that function.
meteor_score = meteor.compute(predictions=predictions, references=references)

In [None]:
# Print METEOR score (prints the whole result dict, which holds the value under the 'meteor' key)
print("METEOR:", meteor_score)

METEOR: {'meteor': 0.12849153248297387}
