In [1]:
!pip install sentencepiece protobuf datasets transformers trl textstat peft bitsandbytes nltk --quiet
!pip install -U bitsandbytes accelerate --quiet

In [2]:
with open("hf.token", "r") as f:
    hftoken = f.read().strip()  

import os
cache_dir = "/mnt/c/Users/yc/.cache/huggingface"
os.environ['HF_HOME'] = cache_dir

from v7, add a final clean up to remove the unrelated text

## import

In [None]:
# Standard library imports
import csv
import re
import torch
from typing import Dict, List, Tuple, Optional
import nltk
from nltk.corpus import stopwords, words
from collections import Counter

import textwrap
LINE_WIDTH = 140
# Third-party data and ML libraries
import pandas as pd
# Hugging Face ecosystem
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

# Optional: Uncomment if needed
from huggingface_hub import login
login(token=hftoken)  # Move token to environment variable

  from .autonotebook import tqdm as notebook_tqdm


## load model

In [None]:

model_name = "meta-llama/Llama-3.1-8B-Instruct"
device_map = {"": 0}
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                        #   cache_dir=cache_dir
                                          )
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    load_in_8bit=True
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 4/4 [00:33<00:00,  8.25s/it]


## data

In [35]:

def txt_to_dict(file_path):
    data_dict = {}
    with open(file_path, 'r') as file:
        lines = file.readlines()
        for i in range(0, len(lines) - 1, 2):
            key = lines[i].strip()    # Odd line are key
            value = lines[i + 1].strip()  # Even line are value
            data_dict[key] = value

    return data_dict

txt_file_path = 'formaldef.txt'
formaldic = txt_to_dict(txt_file_path)
len(formaldic)

meddict={}
for k,v in formaldic.items():
    meddict[k.split('Listen to pronunciation')[0].split('(')[0]]=v

In [36]:
filename= 'filtered_medical_dictionary.csv'
eighth_grade_words=set()
with open(filename, 'r', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # Skip the header row
    for row in reader:
        if row:  # Make sure row is not empty
            eighth_grade_words.add(row[0])  # Add the word (first column)
filtered_meddict = {word: explanation for word, explanation in meddict.items() 
                   if word in eighth_grade_words}
meddict=filtered_meddict

In [37]:
# load data
df = pd.read_csv('/mnt/c/Users/yc/Downloads/coral/unannotated/data/breastca_unannotated.csv')
df = df.sample(2, random_state=42) 

audience--> audiense str
key details, summary --> a summary
extraction terms --> a dict?
main prompt, takes in 123, out a str
check with key details and main Gen, takes 
final clean

# prompt, paragraph


In [48]:
# =============================================================================
# MEDICAL TERM EXTRACTION FUNCTIONS
# =============================================================================

def extract_medical_terms(text: str, meddict: Dict[str, str]) -> Dict[str, str]:

    found_terms = {}
    
    # Strategy 1: Single words
    words = re.findall(r'\b[A-Za-z]+(?:[-\'][A-Za-z]+)*\b', text)
    for word in words:
        definition = find_term_in_dict(word, meddict)
        if definition:
            found_terms[word] = definition
    
    # Strategy 2: Multi-word terms
    for n in range(2, 6):
        n_grams = get_n_grams(text, n)
        for phrase in n_grams:
            definition = find_term_in_dict(phrase, meddict)
            if definition:
                found_terms[phrase] = definition
    
    # Strategy 3: Medical abbreviations
    abbreviations = re.findall(r'\b[A-Z]{2,8}\b', text)
    for abbrev in abbreviations:
        definition = find_term_in_dict(abbrev, meddict)
        if definition:
            found_terms[abbrev] = definition
    
    # Strategy 4: Medical procedures and conditions with specific patterns
    medical_patterns = [
        r'\b\w+oscopy\b',          # bronchoscopy, endoscopy, etc.
        r'\b\w+ectomy\b',          # appendectomy, etc.
        r'\b\w+itis\b',            # bronchitis, arthritis, etc.
        r'\b\w+osis\b',            # fibrosis, stenosis, etc.
        r'\b\w+emia\b',            # anemia, septicemia, etc.
        r'\b\w+pathy\b',           # myopathy, neuropathy, etc.
        r'\b\w+malacia\b',         # tracheomalacia, etc.
    ]
    
    for pattern in medical_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        for match in matches:
            definition = find_term_in_dict(match, meddict)
            if definition:
                found_terms[match] = definition
    
    # Strategy 5: Medication names
    medication_patterns = [
        r'\b\w+cillin\b',          # penicillin, amoxicillin, etc.
        r'\b\w+mycin\b',           # streptomycin, etc.
        r'\b\w+floxacin\b',        # levofloxacin, ciprofloxacin, etc.
        r'\b\w+sone\b',            # prednisone, cortisone, etc.
        r'\b\w+pam\b',             # lorazepam, etc.
    ]
    
    for pattern in medication_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        for match in matches:
            definition = find_term_in_dict(match, meddict)
            if definition:
                found_terms[match] = definition
    
    return found_terms


def get_n_grams(text: str, n: int) -> List[str]:
    """Generate n-grams from text."""
    words = re.findall(r'\b[A-Za-z]+\b', text.lower())
    n_grams = []
    for i in range(len(words) - n + 1):
        phrase = ' '.join(words[i:i+n])
        n_grams.append(phrase)
    return n_grams


def find_term_in_dict(term: str, meddict: Dict[str, str]) -> Optional[str]:
    """Find term in medical dictionary."""
    search_formats = [
        term, term.lower(), term.upper(), term.title(), term.capitalize()
    ]
    
    for search_term in search_formats:
        if search_term in meddict:
            return meddict[search_term]
    
    # Partial matching
    for key in meddict.keys():
        if key.lower() == term.lower():
            return meddict[key]
    
    return None

def create_annotated_text(text: str, meddict: Dict[str, str]) -> str:

    # 1. Use the existing function to find all unique terms and their definitions.
    found_terms = extract_medical_terms(text, meddict)
    
    # 2. Sort terms by length in descending order to handle overlaps.
    # This is critical for terms like "cerebral palsy" and "palsy".
    sorted_terms = sorted(found_terms.keys(), key=len, reverse=True)
    
    annotated_text = text
    
    # 3. Iterate and replace.
    for term in sorted_terms:
        definition = found_terms[term]
        annotation = f"{term} [DEFINITION: {definition}]"
        pattern = r'\b' + re.escape(term) + r'\b'
        annotated_text = re.sub(pattern, annotation, annotated_text, count=1, flags=re.IGNORECASE)
        
    return annotated_text

# =============================================================================
# CORE EXECUTION FUNCTION
# =============================================================================
def run_model(prompt: str, model, tokenizer, generation_config: Dict) -> str:
    try:
        inputs = tokenizer(
            prompt, 
            return_tensors="pt", 
            truncation=True, 
            add_special_tokens=False  # This is still correct
        ).to(model.device)
        
        # We no longer need to calculate input_length manually
        # input_length = inputs["input_ids"].shape[1] 
        
        with torch.no_grad():
            outputs = model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                **generation_config
            )
        
        # --- FIX: Use a more robust decoding method ---
        # Instead of slicing the tensor, decode the whole thing and skip the prompt text.
        # This is less prone to off-by-one errors with special tokens.
        # The `outputs[0]` is important to select the first sequence in the batch.
        raw_output = tokenizer.decode(
            outputs[0], 
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True
        )

        # We must now remove the original prompt from the decoded string,
        # as skip_special_tokens=True doesn't always remove it perfectly with add_special_tokens=False.
        if prompt in raw_output:
            raw_output = raw_output.replace(prompt, "", 1) # Replace only the first occurrence
        
        # The rest of your cleanup logic is still valuable as a fallback.
        end_tokens = ['<|eot_id|>', '<|end_of_text|>', '</s>', '[/INST]']
        
        earliest_end = len(raw_output)
        for end_token in end_tokens:
            pos = raw_output.find(end_token)
            if pos != -1 and pos < earliest_end:
                earliest_end = pos
        
        if earliest_end < len(raw_output):
            raw_output = raw_output[:earliest_end]
        
        raw_output = raw_output.strip()
        
        return raw_output
        
    except Exception as e:
        return f"Error generating response: {str(e)}"
# =============================================================================
# PROMPT CREATION FUNCTIONS
# =============================================================================

def create_key_summary_prompt(original_text: str) -> str:

    prompt = f"""
<s>[INST] You are an expert medical information extractor with exceptional attention to detail. Your task is to carefully read the medical text below and extract ONLY the key factual details that are explicitly mentioned. You must be extremely precise and never infer, assume, or add any information not directly stated in the text.

**Medical Text:**
"{original_text}"

**Instructions:**

Carefully analyze the text and extract information for each category below. For each category, provide ONLY what is explicitly mentioned in the original text. If a category is not mentioned or unclear, write "Not mentioned" for that section.

**CRITICAL RULES:**
1. Extract ONLY facts explicitly stated in the text. If not mentioned, say "Not mentioned"
2. Use the EXACT wording from the original text when possible
3. Do not interpret, infer, or elaborate beyond what is written
4. If multiple items exist in a category, separate them with semicolons
5. Keep each section concise but complete
6. do not add any additional information, just extract and stop

**Format your response EXACTLY as follows:** 

**TREATMENTS RECEIVED:**
[List treatments, procedures, medications, surgeries, or therapeutic interventions explicitly mentioned as having been completed, given, or performed]

**MEDICAL CONDITIONS:**
[List diagnoses, medical conditions, diseases, or pathological findings explicitly stated]

**CANCER STAGE:**
[List cancer staging information if explicitly mentioned - include exact stage notation like "Stage IV", "T2N1M0", etc.]

**REFERRALS:**
[List the referrals to specialists, departments, other physicians, or healthcare facilities explicitly mentioned]

**CURRENT STATUS:**
[Patient's current medical condition, discharge status, vital status, or clinical state as explicitly stated]

**NEXT STEPS/PLAN:**
[List future medical plans, follow-up appointments, scheduled procedures, or treatment recommendations explicitly mentioned]

Now, begin the extraction, start with "**TREATMENTS RECEIVED:**":
[/INST]"""
    
    return prompt




def create_audience_determination_prompt(original_text: str) -> str:

    base_prompt = f"""
<s>[INST] You are an expert medical text classifier. Read the following medical text and determine the appropriate audience for a summary letter.

**Medical Text:**
"{original_text}"

**Instructions:**
Based on the text, who is the audience for the explanation letter?
- If the text describes a patient recovering, Discharge Condition says much improved, or having a positive or follow ongoing treatment plan, the audience is the **patient**.
- If the text mentions "died", "passed away," "deceased," or describes a fatal outcome such "comfort care" "hospice care" "pallliative care", "palliative extubate", the audience is the **patient's family**.

Respond with a single word ONLY: **patient** or **family**.
[/INST]
"""
    return base_prompt.format(original_text=original_text)


def create_explanation_prompt(annotated_text: str, audience: str, keysummary: str) -> str:

    
    if audience == 'family':
        audience_instruction = "The determined audience for this letter is the **patient's family**. You must address them directly as 'you' and refer to the patient in the third person (e.g., 'your loved one,' 'he/she')."
    else:  # patient
        audience_instruction = "The determined audience for this letter is the **patient**. You must address them directly as 'you' throughout the entire letter."

    base_prompt = f"""
<s>[INST] 
### Persona
You are an experienced and compassionate Oncologist (cancer specialist) and a skilled medical educator. Your primary role is to translate complex medical information into clear, understandable, and supportive explanations for patients and their families. Your tone should always be professional, empathetic, and honest, balancing realism with hope.

### Golden Rule: Radical Simplicity - Translate, Don't Transfer
Your single most important task is to convert medical terminology into simple, 8th-grade level English. Do not just define a medical term; replace it entirely with an easy-to-understand explanation.

**Examples of what you MUST do:**
* **INSTEAD OF:** "multifocal stage IIA breast cancer"
    * **WRITE:** "a type of breast cancer that was in an early stage and was found in more than one spot in the breast."
* **INSTEAD OF:** "a biopsy of the mass in your right axilla"
    * **WRITE:** "we will take a small sample of the lump in your right armpit to test it."
* **INSTEAD OF:** "mastectomy with sentinel node and implant reconstruction"
    * **WRITE:** "surgery to remove the breast, check the nearby glands to see if the cancer had spread, and rebuild the breast shape with an implant."

### Understanding the Medical Note's Structure
Before you write, you must understand how the original medical text is organized. This structure is key to creating a logical explanation.
* **Chief Complaint (CC):** This is the main reason for the visit in one short sentence. It answers the question, "Why are you here today?"
* **History of Present Illness (HPI):** This is the detailed story of the Chief Complaint. It explains the symptoms using a framework often remembered by the acronym "OLDCARTS":
    * **O**nset: When did the problem begin?
    * **L**ocation: Where is the symptom located?
    * **D**uration: How long has it been going on?
    * **C**haracterization: What does the symptom feel like (e.g., sharp, dull)?
    * **A**lleviating/Aggravating factors: What makes it better or worse?
    * **R**adiation: Does the sensation move anywhere else?
    * **T**emporal factor: Is it worse at a certain time of day?
    * **S**everity: How bad is it on a scale of 1 to 10?
* **History:** This section provides background context, including Oncology History (past cancer diagnoses/treatments), Past Medical History, Surgical History, Family History, and Social History.
* **Assessment and Plan (A&P):** This is the doctor’s summary and conclusion. The Assessment is the diagnosis, and the Plan outlines the next steps (tests, treatments, etc.).

### Original Medical Text:
"{annotated_text}"

### Internal Fact-Checking Reference:
This technical summary is for your internal use ONLY to ensure your response is factually accurate.
**STRICT INSTRUCTION:** You must treat this summary as a list of facts to be **translated** into simple language. DO NOT copy the medical terminology from this summary directly into the patient letter. You must translate these facts into the simple, empathetic language required by your persona.
"{keysummary}"

### {audience_instruction}

### Your Task:
Your goal is to write a single, complete, and polished letter that explains the information from the medical text above, following the logical flow of a clinical visit. Imagine you are sitting with the recipient and explaining this to them in person, then putting it in writing.
STRICT NEGATIVE CONSTRAINT: Under NO circumstances should you say something not exist in the note.

**1. Letter Structure and Flow:** Organize your letter logically using the following question-based headers to guide the reader.
    * **Empathetic Opening:** Start with a short, warm, and supportive salutation. (e.g., "Dear [Patient Name],") Acknowledge the reason for their recent visit.
    * **Why did you come to the clinic?**
        * Answer this using information from the **Chief Complaint (CC)** section. State the main symptom or reason for the visit clearly and simply.
    * **What was discussed?**
        * Answer this by translating the story from the **History of Present Illness (HPI)**. Describe the symptoms in plain language (when they started, what they feel like, what makes them better/worse, etc.), following the OLDCARTS framework.
        * Also, briefly recap relevant information from the **History** section (e.g., "As a reminder, your initial diagnosis was...").
        * **Crucial rule:** When discussing past treatment decisions, use neutral language. Instead of "you refused," say "At that time, the decision was made not to proceed with..."
    * **What did we find? (Assessment)**
        * Clearly and gently explain the main conclusion from the **Assessment** part of the note. Explain what terms like "metastatic" or "recurrence" mean. Use an analogy if helpful.
        * **STRICT NEGATIVE CONSTRAINT:** If the cancer has spread (metastasis), absolutely DO NOT list the specific organs affected. Instead, just say "the cancer has spread to other parts of your body."
    * **What is the plan? (Plan)**
        * Detail the next steps from the **Plan** section. For each step (biopsy, MRI, new medication), explain **WHAT** it is and, more importantly, **WHY** we are doing it.
        * If the note mentions palliative care, explain it as an active treatment focused on controlling cancer, managing symptoms, and maximizing quality of life. The focus must be on living well.
    * **Closing with Support:** End the letter by reinforcing that your team is there to support them.

**2. Language and Tone Directives:**
    * **Maintain Your Persona:** Use "we" for the medical team. Write with empathy and clarity.
    * **STRICT NEGATIVE CONSTRAINTS:**
        * Do not use fatalistic language. Avoid phrases like "until the end of your life" or "preparing for the end." The focus MUST be on quality and extension of LIFE.
        * Avoid lengthy, overly sympathetic sentences. Be concise and focused on the key information.

### Strict Output Formatting:
Provide ONLY the letter. Your output must start directly with the salutation and end with the signature. do not include any additional information

Dear [Patient Name],

I am writing to summarize our discussion from your recent visit. I know that receiving and processing this information can be overwhelming, and I hope this written summary is helpful.

**Why did you come to the clinic?**
[Your simplified explanation of the Chief Complaint]

**What was discussed?**
[Your simplified explanation of the HPI and relevant History]

**What did we find? (Assessment)**
[Your simplified explanation of the diagnosis]

**What is the plan?**
[Your simplified explanation of the next steps, tests, and treatment goals]

We are here to support you every step of the way. Please do not hesitate to contact our office with any questions you may have.

Sincerely,
Your Care Team at [Institution]

Please provide the patient-focused explanation now. Your output must start directly with the salutation
[/INST]
"""

    
    return base_prompt.format(
        annotated_text=annotated_text,
        keysummary=keysummary,
        audience_instruction=audience_instruction
    )

def create_cleaning_prompt(raw_response: str, audience: str) -> str:

    base_prompt = f"""
<s>[INST]
### Persona
You are an expert medical writer and editor. Your unique skill is communicating complex clinical information with absolute precision and clarity, and you are adept at tailoring your language for different audiences, from senior physicians to concerned patients. Your primary directive is to preserve the original meaning without fail.

### Target Audience:
{audience}

### Your Task
Your primary task is to revise the provided medical text. Based on the specified **Target Audience**, you will improve its quality in the following areas:

1.  **Clean Up Language:** Improve sentence structure and use professional language appropriate for the target audience.
2.  **Reduce Repetition:** Eliminate redundant words and phrases without losing critical information.
3.  **Improve Flow:** Enhance the logical flow and transitions to make the narrative easier to follow.
4.  **Define Medical Terms (Conditional Task):**
    * **IF the Target Audience is 'Patient/Layperson'**, you MUST perform this task: For any medical term or jargon a non-medical person would not understand, provide a simple, brief explanation in parentheses immediately after its first appearance.
    * **Example:** "The patient presented with tachycardia (a heart rate over 100 beats per minute) and pedal edema (swelling in the feet)."
    * **IF the Target Audience is 'Clinical Professional'**, you MUST NOT perform this task. Do not define standard medical terms.
5. remove the section headers (such as "**TREATMENTS RECEIVED:**"), and make it a flowing letter

### The Golden Rule: Preserve Clinical Meaning at All Costs
This is the most important rule. The revised text MUST be semantically and factually identical to the original.

**STRICT PROHIBITIONS:**
* **DO NOT** alter, add, or remove any clinical facts, diagnoses, measurements, dosages, or timelines.
* **DO NOT** change the certainty of a statement. A possibility ("suggests," "possible") must remain a possibility. A certainty ("diagnosed with," "confirmed") must remain a certainty.
* **DO NOT** reorder information in a way that changes the chronological or logical sequence of events.

---
### Instructions for Output

Please provide your response in two parts:
1.  **Revised Medical Text:** The complete, revised version of the text, tailored for the specified audience.
2. do not include any additional information, nor the change log

### Medical Text to Revise:

{raw_response}

Now, please proceed with the revision based on the specified Target Audience.
[/INST]
"""
    return base_prompt.format(raw_response=raw_response)





# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================

def get_default_generation_config(tokenizer) -> Dict:
    """Get default generation configuration with proper terminators."""
    
    # Define stop tokens for Llama 3
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
        # --- FIX: Add the [/INST] token ID as a terminator ---
        tokenizer.convert_tokens_to_ids("[/INST]") 
    ]
    
    # Filter out None values in case a token doesn't exist
    terminators = [t for t in terminators if t is not None]
    
    return {
        "max_new_tokens": 800,
        "do_sample": False,
        "num_beams": 1,
        "early_stopping": True,
        "pad_token_id": tokenizer.eos_token_id,
        "eos_token_id": terminators, # This now includes [/INST]
        "repetition_penalty": 1.2,
        "no_repeat_ngram_size": 3,
    }

def get_audience_generation_config(tokenizer) -> Dict:
    """Get generation configuration for audience determination."""
    return {
        "max_new_tokens": 5,
        "temperature": 0.01,
        "do_sample": False,
        "pad_token_id": tokenizer.eos_token_id
    }

def parse_audience_response(raw_response: str) -> str:
    """Parse and validate audience determination response."""
    cleaned_response = raw_response.lower().strip()
    if "family" in cleaned_response:
        return "family"
    else:
        return "patient"  # Default to patient



# =============================================================================
# HIGH-LEVEL PIPELINE FUNCTIONS
# =============================================================================

def determine_audience(original_text: str, model, tokenizer) -> str:
    """
    Determine the target audience for the medical explanation.
    
    Args:
        original_text: The original medical text
        model: The loaded model instance
        tokenizer: The tokenizer instance
    
    Returns:
        Audience type ('patient' or 'family')
    """
    try:
        prompt = create_audience_determination_prompt(original_text)
        config = get_audience_generation_config(tokenizer)
        raw_output = run_model(prompt, model, tokenizer, config)
        audience = parse_audience_response(raw_output)
        return audience
    except Exception as e:
        print(f"⚠️ Audience determination failed: {e}. Defaulting to 'patient'.")
        return "patient"



def parse_key_summary_response(original_text: str, model, tokenizer) -> str:
    """
    Extract key medical summary using LLM from original medical text.
    
    Args:
        original_text: The original medical text
        model: The loaded model instance
        tokenizer: The tokenizer instance
    
    Returns:
        Dictionary with parsed key summary sections
    """
    try:
        # Create the extraction prompt
        prompt = create_key_summary_prompt(original_text)
        # print(prompt)
        # Use precise generation config for factual extraction
        config = get_default_generation_config(tokenizer)
        
        # Generate the key summary
        keysummary = run_model(prompt, model, tokenizer, config)
        # print(keysummary)
        return keysummary
        
    except Exception as e:
        print(f"⚠️ Key summary extraction failed: {e}")
        return {
            'treatments_received': 'Extraction failed',
            'medical_conditions': 'Extraction failed',
            'cancer_stage': 'Extraction failed',
            'referrals': 'Extraction failed',
            'current_status': 'Extraction failed',
            'next_steps_plan': 'Extraction failed', 
        }



# parse_key_summary_response(text, model, tokenizer)

def generate_explanation(annotated_text: str, 
                        audience: str, keysummary: str, model, tokenizer) -> str:
    """
    Generate medical explanation letter.
    
    Args:
        annotated_text: The original medical text
        audience: Target audience ('patient' or 'family')
        model: The loaded model instance
        tokenizer: The tokenizer instance
    
    Returns:
        Raw generated explanation
    """
    prompt = create_explanation_prompt(annotated_text, audience,keysummary)
    config = get_default_generation_config(tokenizer)
    return run_model(prompt, model, tokenizer, config)



def clean_response(raw_response: str, audience: str, model, tokenizer) -> str:
    """
    Clean the raw model response.
    
    Args:
        raw_response: Raw output from explanation generation
        audience: Target audience ('patient' or 'family')
        model: The loaded model instance
        tokenizer: The tokenizer instance
    
    Returns:
        Cleaned explanation letter
    """
    if len(raw_response.strip()) < 10 or raw_response.startswith("❌"):
        return raw_response
    
    try:
        prompt = create_cleaning_prompt(raw_response, audience)
        config = get_default_generation_config(tokenizer)
        cleaned = run_model(prompt, model, tokenizer, config)
        return cleaned
    except Exception as e:
        print(f"⚠️ Cleaning failed: {e}. Returning raw response.")
        return raw_response



# generation

In [49]:
all_results = []

for index, row in df.iterrows():
    text = row['note_text'] 
    
    print(f"\nProcessing row {index + 1}/{len(df)}...")
    
    audience = determine_audience(text, model, tokenizer)
    annotated_text = create_annotated_text(text, meddict)
    keysummary = parse_key_summary_response(text, model, tokenizer)
    explanation = generate_explanation(annotated_text, audience, keysummary, model, tokenizer)
    final_result = clean_response(explanation, audience, model, tokenizer)
    
    # Create result dictionary for this row
    row_result = {
        'original_text': text,
        'determined_audience': audience,
        'annotated_text': annotated_text,
        'keysummary': keysummary,
        'raw_explanation': explanation,
        'final_letter': final_result
    }
    
    # Print the result immediately after generation with same formatting
    print(f"\n{'='*60}")
    print(f"RESULTS FOR ROW {index + 1}")
    print(f"{'='*60}")
    
    for col in ['keysummary', 'raw_explanation', 'final_letter']:
        print(f"\n--- Column: {col} ---")
        original_text = row_result[col]
        wrapped_text = textwrap.fill(original_text, width=LINE_WIDTH)
        print(wrapped_text)
    
    all_results.append(row_result)



Processing row 84/2...



RESULTS FOR ROW 84

--- Column: keysummary ---
<s>[INST] You are an expert medical information extractor with exceptional attention to detail. Your task is to carefully read the medical
text below and extract ONLY the key factual details that are explicitly mentioned. You must be extremely precise and never infer, assume, or
add any information not directly stated in the text.  **Medical Text:** "We performed this consultation using real-time Telehealth tools,
including a live video connection between my location and the patient's location. Prior to initiating the consultation, we obtained informed
verbal consent to perform this consultation using Telehealth tools and answered all the questions about the Telehealth interaction.
Patient Name: ***** *****  ***** *****: 11/10/20    HPI: ***** ***** is a 60 y.o. female with a CHEK2 mutation, multiple sclerosis, and
metastatic breast cancer with disease in bone, soft tissue, liver, and possibly meninges who is seeking recommendations for o

KeyboardInterrupt: 

In [None]:

# Convert to DataFrame
results_df = pd.DataFrame(all_results)

# Save to CSV
base_filename = 'output'
extension = '.csv'
output_filename = base_filename + extension
counter = 1

while os.path.exists(output_filename):
    output_filename = f"{base_filename}_{counter}{extension}"
    counter += 1

results_df.to_csv(output_filename, index=False)
print(f"DataFrame saved to '{output_filename}'")

DataFrame saved to 'output_38.csv'
