# ***Prepare***

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import sys
sys.path.append('/content/drive/MyDrive/python_env/lib/python3.10/site-packages')

In [None]:
!mkdir -p /content/drive/MyDrive/python_env/lib/python3.10/site-packages


In [None]:
!pip install --target=/content/drive/MyDrive/python_env/lib/python3.10/site-packages -U bitsandbytes datasets accelerate transformers peft sacremoses rouge_score evaluate


In [None]:
!pip install datasets
!pip install --upgrade bitsandbytes
!pip install --upgrade accelerate transformers
!pip install peft
!pip install sacremoses
!pip install rouge_score
!pip install evaluate

# ***Prepare data***

In [None]:
import requests  # Import the requests library to handle HTTP requests
import json       # Import JSON library to parse JSON responses
import pandas as pd  # Import pandas for data manipulation and saving as CSV

# =========================== CONFIGURATION ===========================
# Define the total number of desired records
target_count = 100000    # The desired number of MRI-related records
retrieved_count = 0     # Initialize the counter to track the number of collected records
mri_records = []        # Initialize an empty list to store MRI-related data
page = 1                # Start fetching from page 1 (pagination)

# =========================== DATA EXTRACTION LOOP ===========================
# Continue fetching data until the desired number of records is reached
while retrieved_count < target_count:
    print('retrieved_count:', retrieved_count)  # Display the current count of collected records

    # =========================== API REQUEST ===========================
    # Define the API URL with pagination parameters
    # The 'query=Brain+MRI' term filters the data for MRI-related content
    # 'count=10' retrieves 10 records per request
    # 'page={page}' dynamically increments to request multiple pages
    url = f"https://openi.nlm.nih.gov/api/search?query=Brain+MRI&count=10&page={page}&format=json"

    # Make an HTTP GET request to fetch data from the API
    response = requests.get(url)

    # =========================== RESPONSE VALIDATION ===========================
    # Check if the request was successful (HTTP status code 200)
    if response.status_code == 200:
        data = response.json()  # Convert the JSON response to a Python dictionary

        # Exit the loop if no 'list' key exists or the list is empty (no more records)
        if "list" not in data or len(data["list"]) == 0:
            break

        # =========================== DATA EXTRACTION ===========================
        # Iterate through the list of articles in the response
        for paper in data["list"]:
            # Extract relevant fields with fallback values in case of missing data
            title = paper.get("title", "N/A")             # Extract the paper title
            authors = paper.get("authors", "N/A")         # Extract the authors
            journal = paper.get("journal_title", "N/A")   # Extract the journal name

            # Extract and format the publication date
            if "journal_date" in paper:
                publication_date = f"{paper['journal_date'].get('day', 'N/A')} " \
                                   f"{paper['journal_date'].get('month', 'N/A')} " \
                                   f"{paper['journal_date'].get('year', 'N/A')}"
            else:
                publication_date = "N/A"

            # Extract the PMC URL (Public Medical Center link)
            pmc_url = paper.get("pmc_url", "N/A")

            # Extract MRI-related findings (combine multiple outcomes with '|')
            mri_findings = " | ".join([outcome["#text"] for outcome in paper.get("Outcome", [])])

            # Extract the URL for MRI images (if available)
            img_url = paper.get("imgLarge", "N/A")

            # Append the extracted data as a dictionary into the list
            mri_records.append({
                "Title": title,
                "Authors": authors,
                "Journal": journal,
                "Publication Date": publication_date,
                "PMC URL": pmc_url,
                "MRI Findings": mri_findings,
                "Image URL": img_url
            })

        # =========================== COUNTER UPDATE ===========================
        # Update the total number of retrieved records after each loop iteration
        retrieved_count = len(mri_records)

        # Increment the `page` variable to fetch the next batch of results
        page += 1

    else:
        # =========================== ERROR HANDLING ===========================
        # Display an error message if the API request fails
        print(f"Request failed, status code: {response.status_code}")
        break  # Exit the loop if the API request encounters an error

# =========================== DATA STORAGE ===========================
# Convert the collected data into a Pandas DataFrame for easy manipulation
df = pd.DataFrame(mri_records)

# Define the output file name
csv_filename = "mri_reports_100000.csv"

# Save the DataFrame as a CSV file without including the DataFrame index
df.to_csv(csv_filename, index=False)

# Print the total number of records retrieved and the saved CSV file name
print(f"✅ Successfully retrieved {len(df)} records and saved them as '{csv_filename}'")



# ***Train the model***

In [1]:
import pandas as pd  # Import pandas for handling tabular data
import json  # Import JSON module for saving training data
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
# Import Hugging Face components for LLaMA model training

# ======================= Step 1: Load MRI Data =======================
# ✅ Load the CSV file containing MRI reports
# The dataset should contain detailed MRI findings and other relevant details
df = pd.read_csv("mri_reports_100000.csv")

# ======================= Step 2: Initialize Data Storage =======================
# ✅ Create an empty list to store training data
# The data will follow the 'input-output' format required for fine-tuning LLaMA
train_data = []

# ======================= Step 3: Data Processing Loop =======================
# ✅ Iterate over each row in the DataFrame to extract MRI-related information
for _, row in df.iterrows():

    # Extract the MRI findings (radiology report) from the dataset
    mri_findings = row["MRI Findings"]

    # Extract the title, providing a default value in case it's missing
    # Example: "Brain MRI Case" will be used if no title is available
    title = row.get("Title", "a brain MRI case")

    # ✅ Data Quality Check: Ensure meaningful content
    # Ignore empty or extremely short MRI reports (less than 10 characters)
    if isinstance(mri_findings, str) and len(mri_findings) > 10:

        # ======================= Step 4: Prompt Engineering =======================
        # ✅ Construct a clear prompt that guides the model on what to generate
        # The prompt encourages the model to generate structured radiology reports
        prompt = f"Write a radiology report for '{title}'."

        # Store the data as a dictionary with 'input' (prompt) and 'output' (report)
        train_data.append({"input": prompt, "output": mri_findings})

# ======================= Step 5: Save Data to JSON =======================
# ✅ Define the JSON file path for saving the training data
json_path = "mri_train_data.json"

# ✅ Save the extracted training data in JSON format with proper indentation for readability
with open(json_path, "w") as f:
    json.dump(train_data, f, indent=2)

# ======================= Step 6: Final Confirmation =======================
# ✅ Print a confirmation message showing the number of samples saved
print(f"The training data has been saved to {json_path}, Number of samples: {len(train_data)}")



The training data has been saved to mri_train_data.json, Number of samples: 90000


***load the llama model***

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
The token `EECS6895-Ass1` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `EECS6895-A

In [None]:
from transformers import BitsAndBytesConfig  # Import for efficient quantization using BitsAndBytes
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import pandas as pd  # For dataset manipulation
from sklearn.model_selection import train_test_split  # For splitting data into train/val sets
from peft import LoraConfig, get_peft_model  # Import PEFT for LoRA fine-tuning

# ==========================
# Load LLaMA Tokenizer & Model
# ==========================

# ✅ Specify the LLaMA model name (Meta's LLaMA 2 - 7B Chat model)
model_name = "meta-llama/Llama-2-7b-chat-hf"

# ==========================
# BitsAndBytes (bnb) Quantization Configuration
# ==========================
# ✅ BitsAndBytes enables 4-bit quantization to reduce memory usage while maintaining model accuracy.
# ✅ Quantization is crucial for handling large models like LLaMA on consumer GPUs.

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,            # ✅ Enables 4-bit quantization (significantly reduces memory consumption)
    bnb_4bit_compute_dtype=torch.float16,  # ✅ Use FP16 for computation — ideal for balancing speed and precision
    bnb_4bit_use_double_quant=True,   # ✅ Enables secondary quantization for improved performance
    bnb_4bit_quant_type="nf4",        # ✅ NF4 (Normalized Float 4) — optimal quantization format for LLMs
)

# ==========================
# Load Tokenizer
# ==========================
# ✅ The tokenizer converts text into token IDs that the model understands.
# ✅ Since LLaMA does not include a dedicated `<pad>` token, we set the EOS token as its padding token.

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # ✅ EOS token is used for padding in LLaMA models

# ==========================
# Load the Pretrained LLaMA Model
# ==========================
# ✅ Load the model with 4-bit quantization to optimize performance on limited GPU memory.
# ✅ The `device_map="auto"` option automatically assigns model layers across available devices (e.g., GPUs, CPU).

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,  # ✅ Apply 4-bit quantization
    device_map="auto"  # ✅ Automatically distribute the model across available GPUs and CPU
)

print("✅ LLaMA model and tokenizer successfully loaded.")

# ==========================
# Apply LoRA for Fine-tuning (Using PEFT)
# ==========================
# ✅ LoRA (Low-Rank Adaptation) enables efficient fine-tuning by updating only small trainable matrices
# ✅ LoRA is highly efficient for fine-tuning large models like LLaMA with minimal resource usage.

# ✅ LoRA Configuration:
lora_config = LoraConfig(
    r=64,                # ✅ Rank of the LoRA decomposition (smaller `r` reduces training cost)
    lora_alpha=128,      # ✅ Scaling factor for the LoRA updates (controls LoRA's contribution)
    target_modules=["q_proj", "v_proj"],  # ✅ Apply LoRA to the **Query** (`q_proj`) and **Value** (`v_proj`) layers
    lora_dropout=0.1,    # ✅ Dropout rate for LoRA layers (prevents overfitting)
    bias="none",         # ✅ No bias updates in LoRA layers (avoids unnecessary parameter growth)
    task_type="CAUSAL_LM"  # ✅ Task type: Causal Language Modeling (auto-regressive text generation)
)

# ✅ Wrap the LLaMA model with LoRA fine-tuning configuration
model = get_peft_model(model, lora_config)

# ✅ Display trainable parameters to verify that only LoRA layers are trainable
model.print_trainable_parameters()

print("✅ LoRA fine-tuning setup completed.")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

LLaMA model and tokenizer successfully loaded.
trainable params: 33,554,432 || all params: 6,771,970,048 || trainable%: 0.4955
LoRA fine-tuning setup completed.


In [None]:
# ==============================================
# ✅ Data Processing: Load and Preprocess MRI Reports
# ==============================================

# ✅ Load the dataset containing MRI radiology reports
# The dataset should have MRI findings and possibly case titles
df = pd.read_csv("mri_reports_100000.csv")

# ✅ Randomly select 4000 samples for training
# - This step reduces dataset size to improve training speed and minimize resource usage.
# - The `random_state=42` ensures reproducibility (consistent random sampling).
df = df.sample(n=4000, random_state=42)

# ✅ Initialize an empty list to store formatted training data
# Each entry will contain:
#    - `input`: Prompt text for the model
#    - `output`: The corresponding MRI findings
train_data = []

# ✅ Iterate through each row in the sampled dataset
for _, row in df.iterrows():
    mri_findings = row["MRI Findings"]  # Extract the MRI findings (medical report text)
    title = row.get("Title", "a brain MRI case")  # Extract the case title (use default if missing)

    # ✅ Ensure the MRI findings text is valid and not too short
    # This step prevents adding incomplete or irrelevant data to the training set.
    if isinstance(mri_findings, str) and len(mri_findings) > 10:
        # ✅ Construct the training prompt
        # The prompt is formatted to mimic realistic user queries for improved model performance.
        prompt = f"Write a radiology report for '{title}'."

        # ✅ Append the input-output pair to the training data list
        train_data.append({"input": prompt, "output": mri_findings})

# ✅ Split the dataset into training (90%) and evaluation (10%) sets
# The `train_test_split()` function ensures that:
# - The model gets ample training data while still retaining evaluation samples.
# - The `random_state=42` ensures reproducibility.
train_texts, eval_texts = train_test_split(train_data, test_size=0.1, random_state=42)

# ==============================================
# ✅ Tokenization: Convert Text into Model-Readable Format
# ==============================================

# ✅ Define a function to tokenize both input (prompt) and output (MRI findings)
def tokenize_function(example):
    """
    Tokenizes the input prompt and the corresponding output (MRI report).
    Ensures that both input and output are truncated and padded to a fixed length (1024 tokens).
    """

    # ✅ Tokenize the input prompt
    # - `truncation=True` ensures that long text is truncated to the maximum token limit.
    # - `padding="max_length"` ensures all samples have the same token length for efficient batch training.
    # - `max_length=1024` optimizes performance while allowing detailed MRI reports.
    model_input = tokenizer(
        example["input"],
        truncation=True,
        padding="max_length",
        max_length=1024
    )

    # ✅ Tokenize the output text (MRI findings) as labels for supervised fine-tuning
    # - Tokenizing the 'output' ensures the model is trained on precise diagnostic language.
    # - `["input_ids"]` extracts the tokenized text as input IDs for the model to learn from.
    labels = tokenizer(
        example["output"],
        truncation=True,
        padding="max_length",
        max_length=1024
    )["input_ids"]

    # ✅ Attach `labels` to the input dictionary (enabling supervised learning)
    model_input["labels"] = labels

    return model_input



In [None]:
# ==============================================
# ✅ Convert Data into Hugging Face Dataset Format
# ==============================================

from datasets import Dataset  # Import Hugging Face's Dataset library for efficient data handling

# ✅ Convert Python lists (containing text pairs) into Hugging Face Dataset objects
# Hugging Face's Dataset format is optimized for efficient I/O, batch processing, and parallelism.
# It also integrates seamlessly with the `Trainer` API for streamlined fine-tuning.

# ✅ Convert training data into Dataset format and apply tokenization
train_dataset = Dataset.from_list(train_texts).map(
    tokenize_function,  # Applies the tokenization function defined earlier
    batched=True,       # ✅ Enables batch processing for faster tokenization
    remove_columns=["input", "output"]  # ✅ Removes the original text entries after tokenization
)

# ✅ Convert evaluation data into Dataset format and apply tokenization
eval_dataset = Dataset.from_list(eval_texts).map(
    tokenize_function,  # Applies the tokenization function defined earlier
    batched=True,       # ✅ Enables batch processing for faster tokenization
    remove_columns=["input", "output"]  # ✅ Removes the original text entries after tokenization
)


Map:   0%|          | 0/3265 [00:00<?, ? examples/s]

Map:   0%|          | 0/363 [00:00<?, ? examples/s]

In [None]:
import torch
torch.cuda.empty_cache()  # ✅ Clears unused memory
torch.cuda.memory_reserved(0)  # ✅ Releases reserved memory

4324327424

In [None]:
# ==============================================
# ✅ Training Configuration: Define Training Hyperparameters
# ==============================================

# ✅ Free up memory before training
# Enabling `allow_tf32` improves speed and stability for matrix multiplications
# TF32 (TensorFloat-32) improves performance without significant precision loss
torch.backends.cuda.matmul.allow_tf32 = True

# ==============================================
# ✅ Mount Google Drive (for saving model checkpoints in Colab)
# ==============================================
from google.colab import drive
drive.mount('/content/drive')

# ==============================================
# ✅ Define Training Arguments
# ==============================================
from transformers import TrainingArguments  # Import Hugging Face's TrainingArguments for managing training configs

training_args = TrainingArguments(
    # ✅ Output Directory
    # This directory will store:
    #  - Model checkpoints
    #  - Logs
    #  - Other essential files like training metrics
    output_dir="/content/drive/MyDrive/llama_mri_model_",

    # ✅ Batch Size
    # Since LLaMA models are large, reducing batch size prevents Out-of-Memory (OOM) errors.
    # Batch size 4 balances memory efficiency and model convergence.
    per_device_train_batch_size=4,

    # ✅ Gradient Accumulation
    # Gradient accumulation combines gradients across multiple steps to simulate larger batch sizes.
    # This improves training stability when memory is limited.
    gradient_accumulation_steps=12,

    # ✅ Training Duration
    # The number of epochs can be adjusted depending on dataset size and model convergence speed.
    # 18 epochs is suitable for smaller datasets like 4,000 MRI reports.
    num_train_epochs=18,

    # ✅ Logging Configuration
    # Display training metrics (loss, learning rate, etc.) every 10 steps for progress monitoring.
    logging_steps=10,

    # ✅ Model Checkpoints
    # Save a checkpoint every 500 steps to prevent data loss in case of Colab disconnects or crashes.
    save_steps=500,

    # ✅ Evaluation Frequency
    # Evaluate the model’s performance every 100 steps for better monitoring.
    eval_steps=100,
    evaluation_strategy="steps",  # Evaluation occurs during training (not just at the end).

    # ✅ Precision Control
    # - `bf16=True` is optimal for **A100** or **H100 GPUs** (faster than FP16 with less precision loss).
    # - `fp16=True` is better for other GPUs like **V100** or **RTX 3090**.
    bf16=False,
    fp16=True,

    # ✅ Checkpoint Control
    # To save disk space, keep only the **latest 3 checkpoints** (older ones are deleted automatically).
    save_total_limit=3,

    # ✅ Best Model Selection
    # Automatically load the **best-performing model** (based on the lowest loss) when training completes.
    load_best_model_at_end=True,

    # ✅ Metric Control
    # Since lower loss values are better, set `greater_is_better=False`.
    # This ensures the model selects checkpoints with the lowest loss score.
    greater_is_better=False,

    # ✅ Optimizer
    # `adamw_bnb_8bit` is a memory-efficient optimizer that improves performance on large models.
    # This optimizer reduces VRAM usage while maintaining stable learning.
    optim="adamw_bnb_8bit",
)

# ==============================================
# ✅ Memory Optimization
# ==============================================
import os

# ✅ Avoids CUDA memory fragmentation during training
# By enabling "expandable_segments," PyTorch dynamically allocates memory blocks,
# reducing memory fragmentation and improving GPU efficiency during large model training.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

print("✅ Training configuration successfully set up!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




In [None]:
# ==============================================
# ✅ Training LLaMA with LoRA (Parameter-Efficient Fine-Tuning)
# ==============================================
import torch
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
import gc  # Import the garbage collector to manage memory in PyTorch

# ==============================================
# ✅ Memory Optimization for Stable Training
# ==============================================
# Large models like LLaMA can consume significant GPU memory.
# These commands ensure efficient memory usage to prevent crashes.

gc.collect()                # ✅ Clears unreferenced memory from Python objects
torch.cuda.empty_cache()    # ✅ Releases unused GPU memory, preventing memory leaks
torch.cuda.memory_reserved(0)  # ✅ Resets reserved GPU memory blocks (additional cleanup step)

# ==============================================
# ✅ Hugging Face Trainer Setup
# ==============================================
# The Trainer API simplifies the fine-tuning process, handling key components like:
# - Dataset management
# - Model evaluation
# - Checkpoint saving
# - Gradient accumulation
# - Early stopping

trainer = Trainer(
    model=model,  # ✅ Use the LLaMA model wrapped with LoRA (loaded earlier)
    args=training_args,  # ✅ Pass previously defined `TrainingArguments`

    # ✅ Load datasets
    train_dataset=train_dataset,  # The preprocessed training dataset
    eval_dataset=eval_dataset,    # The preprocessed evaluation dataset

    # ✅ Early Stopping for Preventing Overfitting
    # Stops training if `eval_loss` does not improve for **3 consecutive evaluations**.
    # This prevents the model from overfitting or wasting computational resources.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# ==============================================
# ✅ Start Fine-Tuning
# ==============================================
# ✅ This command initiates model training.
# During training, logs will display:
# - Loss metrics
# - Learning rate updates
# - Evaluation performance
# - Checkpoint saving progress
trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtg2935[0m ([33mtg2935-columbia-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
100,0.2313,0.176688
200,0.017,0.012712
300,0.009,0.007969
400,0.0084,0.007405
500,0.0087,0.007498
600,0.0086,0.007471
700,0.0084,0.007313
800,0.0082,0.007246
900,0.0073,0.007252
1000,0.008,0.007215


TrainOutput(global_step=1224, training_loss=0.1379731973660562, metrics={'train_runtime': 13202.6601, 'train_samples_per_second': 4.451, 'train_steps_per_second': 0.093, 'total_flos': 2.364579622353371e+18, 'train_loss': 0.1379731973660562, 'epoch': 17.74908200734394})

In [None]:
# ==============================================
# ✅ Model Evaluation Using Hugging Face's Trainer API
# ==============================================

# ✅ Evaluate the model
# This command evaluates the model's performance on the evaluation dataset (defined earlier).
# It calculates key metrics such as:
# - `eval_loss` → Model's loss on the evaluation set
# - `eval_runtime` → Total time taken for evaluation
# - `eval_samples_per_second` → Processing speed (important for large datasets)
# - `eval_steps_per_second` → Number of evaluation steps processed per second
# - `epoch` → The training epoch during which the evaluation took place
eval_results = trainer.evaluate()

# ==============================================
# ✅ Display Evaluation Results
# ==============================================

# ✅ Pretty-print the evaluation results for clear presentation
print("\n📌 Model Evaluation Results:")

# ✅ Loss (Primary Metric)
# - Measures how well the model is predicting the expected output.
# - Lower values indicate better model performance.
print(f"📉 Loss: {eval_results['eval_loss']:.4f}")

# ✅ Runtime
# - Shows the total time spent on evaluation (in seconds).
# - Useful for understanding model efficiency and system performance.
print(f"⏳ Runtime: {eval_results['eval_runtime']:.2f} sec")

# ✅ Samples per Second
# - Indicates how fast the model processes data during evaluation.
# - Higher values suggest efficient computation.
print(f"⚡ Samples per second: {eval_results['eval_samples_per_second']:.2f}")

# ✅ Steps per Second
# - Displays the number of evaluation steps completed per second.
# - Helps monitor performance during batch processing.
print(f"🔄 Steps per second: {eval_results['eval_steps_per_second']:.2f}")

# ✅ Epoch (Current Training Stage)
# - Displays the epoch during which evaluation was performed.
# - Helpful for tracking progress across multiple training cycles.
print(f"🔁 Epoch: {eval_results['epoch']:.2f}\n")



📌 Model Evaluation Results:
📉 Loss: 0.0072
⏳ Runtime: 33.25 sec
⚡ Samples per second: 10.92
🔄 Steps per second: 1.38
🔁 Epoch: 17.75



# ***Test***

In [None]:
import torch

torch.cuda.empty_cache()  # ✅ Clears unused memory
torch.cuda.memory_summary(device=None, abbreviated=False)  # ✅ Prints GPU memory usage




In [None]:
import random  # Import the random module for random sampling if needed
import pandas as pd  # Import pandas for handling tabular data
import evaluate  # Import evaluate library for model evaluation
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline  # Import required components from Hugging Face Transformers
from peft import PeftModel  # Import PeftModel for loading the fine-tuned LoRA model

# ✅ Load the Base LLaMA Model
# Define the name of the base LLaMA model to use
base_model_name = "meta-llama/Llama-2-7b-chat-hf"  # This should match the model you used for fine-tuning

# Load the base LLaMA model (before fine-tuning) from Hugging Face's model hub
base_model = AutoModelForCausalLM.from_pretrained(base_model_name)

# Load the corresponding tokenizer for the base model
base_tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# ✅ Load the Fine-Tuned LLaMA Model (with LoRA)
# Specify the path where the fine-tuned model checkpoint is stored
fine_tuned_model_path = "./llama_mri_model_/checkpoint-153"

# Load the fine-tuned model, applying the fine-tuning adapter (LoRA)
# PeftModel allows us to apply fine-tuned parameters on top of the base model
fine_tuned_model = PeftModel.from_pretrained(base_model, fine_tuned_model_path)

# Since the fine-tuned model uses the same tokenizer as the base model, we reuse it
fine_tuned_tokenizer = base_tokenizer

# ✅ Set Up Text Generation Pipelines
# Create a text-generation pipeline for the base model
# The pipeline provides an easy way to generate text using the model
base_generator = pipeline("text-generation", model=base_model, tokenizer=base_tokenizer)

# Create a text-generation pipeline for the fine-tuned model
fine_tuned_generator = pipeline("text-generation", model=fine_tuned_model, tokenizer=fine_tuned_tokenizer)

# ✅ Notes:
# - `base_generator` will generate text using the pre-trained LLaMA model (before fine-tuning).
# - `fine_tuned_generator` will generate text using the LLaMA model that has been fine-tuned on MRI-related radiology reports.
# - These pipelines can be used to compare the outputs from both models to evaluate the impact of fine-tuning.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCa

In [None]:
import random  # Import the random module for selecting a random medical condition

# ✅ Define Medical Conditions for Testing
# This list contains different brain-related medical conditions commonly diagnosed via MRI scans.
# These conditions will be used to test the model's ability to generate radiology reports.
conditions = [
    "a suspected glioblastoma multiforme (GBM) tumor",  # A highly aggressive type of brain tumor
    "multiple white matter hyperintensities suggestive of multiple sclerosis (MS)",  # Lesions indicating possible MS
    "ischemic stroke",  # A stroke caused by reduced blood supply to the brain
    "brain metastases from lung cancer",  # Secondary brain tumors originating from lung cancer
    "Alzheimer's disease with cortical atrophy",  # Brain shrinkage seen in Alzheimer's disease
    "meningioma near the frontal lobe",  # A slow-growing brain tumor near the frontal lobe
    "diffuse axonal injury from head trauma"  # Widespread brain damage due to traumatic brain injury
]

# Randomly select one condition from the list to simulate a unique MRI case scenario
selected_condition = random.choice(conditions)

# ✅ Construct Prompt
# The prompt is structured to instruct the model to generate a highly detailed and structured radiology report.
# It specifies that the MRI scan is performed with contrast, a common technique to enhance imaging details.
prompt = (
    f"You are an expert radiologist. Write a highly detailed and structured radiology report for a 'Brain MRI with contrast' "
    f"showing {selected_condition}. Use professional medical terminology and follow the standard structure:\n\n"
)


In [None]:
# ✅ Generate Reports from Base Models
# Generate a radiology report using the base LLaMA model (before fine-tuning).
# - `max_length=1024` ensures the generated report is long enough to be comprehensive.
# - `temperature=0.3` controls randomness in text generation (lower values produce more deterministic output).
base_report = base_generator(prompt, max_length=1024, temperature=0.3)[0]["generated_text"]

# Print the MRI report generated by the base LLaMA model.
print("\n📝 Base LLaMA MRI Report:\n", base_report)


📝 Base LLaMA MRI Report:
 You are an expert radiologist. Write a highly detailed and structured radiology report for a 'Brain MRI with contrast' showing brain metastases from lung cancer. Use professional medical terminology

Patient Information:
Name: MRI of the brain with contrast. Showing brain metastases from lung cancer.
Date of birth: 03/05/1965
Date of study: 02/22/2023
Time of study: 14:30
Location of study: MR room

MR findings:
The brain MRI with contrast is a highly detailed and structured report of the brain. It is a detailed report of the brain. It is a detailed report of the brain. It is a detailed report of the brain. It is a detailed report of the brain. It is a detailed report of the brain. It is a detailed report of the brain. It is a detailed report of the brain. It is a detailed report of the brain. It is a detailed report of the brain. It is a detailed report of the brain. It is a detailed report of the brain. It is a detailed report of the brain. It is a detailed

In [None]:
# ✅ Generate Reports from Both Models
# Generate a radiology report using the fine-tuned LLaMA model (trained specifically on MRI radiology data).
fine_tuned_report = fine_tuned_generator(prompt, max_length=1024, temperature=0.3)[0]["generated_text"]

# ✅ Print and Compare Outputs
# Print the randomly selected medical condition for reference.
print("\n📌 Condition:", selected_condition)


# Print the MRI report generated by the fine-tuned LLaMA model.
print("\n✅ Fine-Tuned LLaMA MRI Report:\n", fine_tuned_report)



📌 Condition: a suspected glioblastoma multiforme (GBM) tumor

✅ Fine-Tuned LLaMA MRI Report:
 You are an expert radiologist. Write a highly detailed and structured radiology report for a 'Brain MRI with contrast' showing a suspected glioblastoma multiforme (GBM) tumor. Use professional medical terminology and follow the standard structure:

Patient Information:

* Name: John Doe
* Age: 62 years
* Gender: Male

Imaging Information:

* MRI was performed using a 1.5T field in a sagittal, coronal, and axial plan.
* The MRI was performed with the use of gadolinium contrast.

Findings:

* A tumor is noted in the cerebral hemis and is aright- lateral to the third ventric.
* The tumor is approximately 3.0 x 2.0 cm in size and is hyperintense on the T2-weighted and fluid-attenuated inversionrecovery (FLAIR) sequences.
* The tumor is isointense on the T1-weighted sequence and shows a slight heterogeneity in signal intensity within the tumor.
* There is no evidence of edema or brain shift.

Diag

In [None]:
# ✅ Load Real MRI Reports
df = pd.read_csv("mri_reports_100000.csv")

# ✅ Drop NaN Values & Randomly Select 30 Reports
df = df.dropna(subset=["MRI Findings"])
reference_reports = df["MRI Findings"].sample(30).tolist()

# ✅ Normalize Reference Report Length (Avoid Length Bias)
reference_lengths = [len(ref.split()) for ref in reference_reports]
effective_ref_length = min(reference_lengths)  # Choose minimum length

reference_reports = [" ".join(ref.split()[:effective_ref_length]) for ref in reference_reports]


In [None]:
# ✅ Load NLP Evaluation Metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# ✅ Compute Scores for Base Model
bleu_base = bleu.compute(predictions=[base_report], references=[reference_reports], max_order=1)
rouge_base = rouge.compute(predictions=[base_report], references=[reference_reports])
bert_base = bertscore.compute(predictions=[base_report], references=[reference_reports], lang="en")

# ✅ Compute Scores for Fine-Tuned Model
bleu_fine = bleu.compute(predictions=[fine_tuned_report], references=[reference_reports], max_order=1)
rouge_fine = rouge.compute(predictions=[fine_tuned_report], references=[reference_reports])
bert_fine = bertscore.compute(predictions=[fine_tuned_report], references=[reference_reports], lang="en")

# ✅ Extract Scores from Dictionaries
bleu_base_score = bleu_base["bleu"]
bleu_fine_score = bleu_fine["bleu"]

rouge_base_1 = rouge_base["rouge1"]
rouge_fine_1 = rouge_fine["rouge1"]

rouge_base_2 = rouge_base["rouge2"]
rouge_fine_2 = rouge_fine["rouge2"]

rouge_base_l = rouge_base["rougeL"]
rouge_fine_l = rouge_fine["rougeL"]

bert_base_f1 = bert_base["f1"][0]  # BERTScore returns a list, so take first value
bert_fine_f1 = bert_fine["f1"][0]

# ✅ Print Results in a Readable Format
print("\n📊 Model Performance Comparison:")

print(f"\n🔹 **BLEU Score**:")
print(f"   - Base Model: {bleu_base_score:.4f}")
print(f"   - Fine-Tuned Model: {bleu_fine_score:.4f}")

print(f"\n🔹 **ROUGE Scores**:")
print(f"   - ROUGE-1: Base = {rouge_base_1:.4f}, Fine-Tuned = {rouge_fine_1:.4f}")
print(f"   - ROUGE-2: Base = {rouge_base_2:.4f}, Fine-Tuned = {rouge_fine_2:.4f}")
print(f"   - ROUGE-L: Base = {rouge_base_l:.4f}, Fine-Tuned = {rouge_fine_l:.4f}")

print(f"\n🔹 **BERTScore (F1)**:")
print(f"   - Base Model: {bert_base_f1:.4f}")
print(f"   - Fine-Tuned Model: {bert_fine_f1:.4f}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



📊 Model Performance Comparison:

🔹 **BLEU Score**:
   - Base Model: 0.0357
   - Fine-Tuned Model: 0.2055

🔹 **ROUGE Scores**:
   - ROUGE-1: Base = 0.0452, Fine-Tuned = 0.1873
   - ROUGE-2: Base = 0.0108, Fine-Tuned = 0.0337
   - ROUGE-L: Base = 0.0344, Fine-Tuned = 0.1271

🔹 **BERTScore (F1)**:
   - Base Model: 0.7715
   - Fine-Tuned Model: 0.8156
