In [2]:
# Install the notebook's third-party dependencies (versions are not pinned).
!pip install sentencepiece protobuf datasets transformers trl textstat peft bitsandbytes nltk --quiet
# Second pass upgrades bitsandbytes and accelerate to their latest releases.
!pip install -U bitsandbytes accelerate --quiet

In [3]:
# Read the Hugging Face access token from a local file; surrounding
# whitespace/newlines are stripped so the token can be passed to login().
with open("hf.token", "r") as f:
    hftoken = f.read().strip()  

import os
# Point the HF_HOME environment variable at a fixed cache directory.
# NOTE(review): this is an absolute, machine-specific path — confirm it exists
# on the machine running the notebook.
cache_dir = "/mnt/c/Users/yc/.cache/huggingface"
os.environ['HF_HOME'] = cache_dir

## import

In [4]:
# Standard library imports
import csv
import json
import random
import re
import sys
import time
from collections import Counter
from collections import defaultdict
from collections import defaultdict
import re
import torch
from typing import Dict, List, Tuple, Optional


# Third-party data and ML libraries
import pandas as pd
import torch
import nltk
from nltk.tokenize import word_tokenize
import re
import torch
# Hugging Face ecosystem
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    pipeline,
    logging,
)

# Fine-tuning and optimization
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# Text analysis and readability metrics
from textstat import (
    flesch_kincaid_grade, 
    flesch_reading_ease,
    smog_index, 
    gunning_fog, 
    dale_chall_readability_score,
    text_standard, 
    syllable_count
)

# Optional: Uncomment if needed
from huggingface_hub import login
login(token=hftoken)  # Move token to environment variable

  from .autonotebook import tqdm as notebook_tqdm


## load model

In [5]:
# Model Configuration
# model_name = "NousResearch/Llama-2-7b-chat-hf"
# Recommended upgrade - Llama 3.1 8B
model_name = "meta-llama/Llama-3.1-8B-Instruct"

# # Or try Mistral 7B
# model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# Passed as device_map when loading the model.
# NOTE(review): presumably places the whole model on GPU 0 — confirm.
device_map = {"": 0}

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                        #   cache_dir=cache_dir
                                          )
# 4-bit NF4 quantization settings. In this notebook they are only referenced
# by the commented-out 4-bit loading call; the active load does not use them.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# Load model with quantization
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map=device_map,
#     quantization_config=bnb_config
# )

# # full precision
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map=device_map,
#     torch_dtype=torch.float16,  # Use half precision instead
#     # cache_dir=cache_dir
# )

# 8-bit quantized load.
# The bare `load_in_8bit=True` keyword is deprecated (see the warning this
# cell used to print); the supported spelling is to pass the same flag
# through a BitsAndBytesConfig in `quantization_config`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 4/4 [00:31<00:00,  7.89s/it]


## data

In [22]:

def txt_to_dict(file_path):
    """
    Build a dictionary from a text file laid out as alternating key/value lines.

    Lines 1, 3, 5, ... (1-based) become keys and the line following each one
    becomes its value; both are stripped of surrounding whitespace. A trailing
    line without a partner is ignored, and a repeated key keeps the last value.

    Args:
        file_path: Path of the text file to read.

    Returns:
        Dict mapping each stripped key line to its stripped value line.
    """
    with open(file_path, 'r') as handle:
        rows = handle.readlines()

    # zip() stops at the shorter slice, so an unpaired last line is dropped.
    return {
        key_line.strip(): value_line.strip()
        for key_line, value_line in zip(rows[0::2], rows[1::2])
    }

txt_file_path = 'formaldef.txt'
formaldic = txt_to_dict(txt_file_path)
len(formaldic)

# Build the lookup dictionary used for term matching: keep only the part of
# each raw key that precedes the 'Listen to pronunciation' marker and any
# opening parenthesis.
# NOTE(review): this assumes raw keys look like
# "term (pronunciation)Listen to pronunciation" — confirm against the file.
# The result is stripped because cutting at '(' otherwise leaves a trailing
# space ("term "), which would never match a word found in a note.
meddict = {}
for raw_term, definition in formaldic.items():
    term = raw_term.split('Listen to pronunciation')[0].split('(')[0].strip()
    if term:  # skip keys that are empty once the markers are removed
        meddict[term] = definition

In [88]:
# load data
# NOTE(review): absolute, machine-specific path — confirm before re-running elsewhere.
df = pd.read_csv('/mnt/c/Users/yc/Downloads/coral/unannotated/data/breastca_unannotated.csv')
# Keep only the first two rows; everything downstream sees just these notes.
df=df.head(2) 

In [89]:
df

Unnamed: 0,coral_idx,Sex,UCSFDerivedRaceEthnicity_X,BirthDate,note_text
0,140,Female,Native Hawaiian or Other Pacific Islander,1964-03-25,Medical Oncology Consult Note Patient Name:...
1,141,Female,Native Hawaiian or Other Pacific Islander,1975-03-29,This is a shared visit for services provided b...


In [90]:
# Take the note text of the first row as the working example and display it.
text=df.iloc[0]['note_text']
text

'Medical Oncology Consult Note    Patient Name: ***** *****   Patient MRN:  *****   Patient DOB:  11/23/1963   Date of Visit:  12/30/2019  Provider:  ***** ***** *****  Primary Care Provider:  None Per Patient Provider  Referring MD:        Reason for visit:   Chief Complaint   Patient presents with   \x07 New Patient Evaluation       Diagnosis:    1. Malignant neoplasm of overlapping sites of right breast in female, estrogen receptor positive (CMS code)  NM Whole Body Bone Scan    MR Brain with and without Contrast    Complete Blood Count with Differential    Comprehensive Metabolic Panel (BMP, AST, ALT, T.BILI, ALKP, TP, ALB)    Cancer Antigen 15-3    Carcinoembryonic Antigen    Activated Partial Thromboplastin Time    Prothrombin Time    Ambulatory Referral to Integrative Medicine       History of Present Illness:   56 year old female diagnosed in May 2013 with a multifocal Stage IIA right breast cancer.  She had a mastectomy with sentinel node and implant reconstruction in June 201

audience --> audience str
key details, summary --> a summary
extraction terms --> a dict?
main prompt, takes in 123, out a str
check with key details and main Gen, takes 
final clean

# prompt, paragraph


In [91]:

# =============================================================================
# CORE EXECUTION FUNCTION
# =============================================================================

def run_model(prompt: str, model, tokenizer, generation_config: Dict) -> str:
    """
    Run one generation pass for `prompt` and return only the newly generated text.

    The prompt is tokenized (with truncation enabled), moved to the model's
    device, and passed to `model.generate` together with its attention mask.
    The prompt tokens are then sliced off the front of the first returned
    sequence, and the remainder is decoded without special tokens.

    Args:
        prompt: The formatted prompt to send to the model.
        model: The loaded model instance (must expose `.device` and `.generate`).
        tokenizer: The tokenizer instance (callable, and exposing `.decode`).
        generation_config: Keyword arguments forwarded to `model.generate`.

    Returns:
        The decoded, whitespace-stripped completion. Failures are NOT raised:
        any exception is caught and reported as the string
        "Error generating response: <message>", so callers must inspect the
        returned text to detect errors.
    """
    try:
        encoded = tokenizer(prompt, return_tensors="pt", truncation=True)
        encoded = encoded.to(model.device)
        prompt_token_count = encoded["input_ids"].shape[1]

        # Inference only: no gradients are needed.
        with torch.no_grad():
            generated = model.generate(
                encoded["input_ids"],
                attention_mask=encoded["attention_mask"],
                **generation_config
            )

        # The returned sequence starts with the prompt tokens; keep only what
        # comes after them.
        completion_tokens = generated[0][prompt_token_count:]
        completion = tokenizer.decode(completion_tokens, skip_special_tokens=True)
        return completion.strip()
    except Exception as e:
        return f"Error generating response: {str(e)}"



In [92]:

# =============================================================================
# MEDICAL TERM EXTRACTION FUNCTIONS
# =============================================================================

def extract_medical_terms(text: str, meddict: Dict[str, str]) -> Dict[str, str]:
    """
    Collect every candidate term in `text` that has a definition in `meddict`.

    Candidates are produced, in this order, by:
      1. single words (letters, optionally joined by '-' or an apostrophe),
      2. lower-cased n-grams of 2 to 5 words (via `get_n_grams`),
      3. all-caps tokens of 2 to 8 letters (abbreviations),
      4. words ending in common procedure/condition suffixes,
      5. words ending in common medication suffixes.
    Each candidate is looked up with `find_term_in_dict`; candidates with a
    truthy definition are kept.

    Args:
        text: Input medical text.
        meddict: Medical dictionary for term lookup.

    Returns:
        Dictionary mapping each matched candidate (spelled as it was produced
        by the strategy that found it) to its definition, in first-seen order.
    """
    # Suffix-driven patterns, matched case-insensitively.
    suffix_patterns = [
        # Procedures and conditions
        r'\b\w+oscopy\b',          # bronchoscopy, endoscopy, etc.
        r'\b\w+ectomy\b',          # appendectomy, etc.
        r'\b\w+itis\b',            # bronchitis, arthritis, etc.
        r'\b\w+osis\b',            # fibrosis, stenosis, etc.
        r'\b\w+emia\b',            # anemia, septicemia, etc.
        r'\b\w+pathy\b',           # myopathy, neuropathy, etc.
        r'\b\w+malacia\b',         # tracheomalacia, etc.
        # Medications
        r'\b\w+cillin\b',          # penicillin, amoxicillin, etc.
        r'\b\w+mycin\b',           # streptomycin, etc.
        r'\b\w+floxacin\b',        # levofloxacin, ciprofloxacin, etc.
        r'\b\w+sone\b',            # prednisone, cortisone, etc.
        r'\b\w+pam\b',             # lorazepam, etc.
    ]

    def iter_candidates():
        """Yield candidate terms strategy by strategy, in the documented order."""
        yield from re.findall(r'\b[A-Za-z]+(?:[-\'][A-Za-z]+)*\b', text)
        for size in range(2, 6):
            yield from get_n_grams(text, size)
        yield from re.findall(r'\b[A-Z]{2,8}\b', text)
        for suffix_pattern in suffix_patterns:
            yield from re.findall(suffix_pattern, text, re.IGNORECASE)

    matched = {}
    for candidate in iter_candidates():
        definition = find_term_in_dict(candidate, meddict)
        if definition:
            matched[candidate] = definition
    return matched


def get_n_grams(text: str, n: int) -> List[str]:
    """
    Return every run of `n` consecutive words in `text` as a space-joined phrase.

    The text is lower-cased first and only alphabetic runs count as words, so
    digits and punctuation are skipped and never separate two neighbouring
    words in the result.
    """
    tokens = re.findall(r'\b[A-Za-z]+\b', text.lower())
    return [
        ' '.join(tokens[start:start + n])
        for start in range(len(tokens) - n + 1)
    ]


def find_term_in_dict(term: str, meddict: Dict[str, str]) -> Optional[str]:
    """
    Look up `term` in the medical dictionary, ignoring letter case.

    Exact hits are tried first for the term as given and for its lower-case,
    upper-case, title-case and capitalized spellings. If none is a key, the
    dictionary is scanned for the first key whose lower-cased form equals the
    lower-cased term. This is a case-insensitive exact match, not a substring
    match.

    Returns:
        The matching definition, or None when no key matches.
    """
    for variant in (term, term.lower(), term.upper(), term.title(), term.capitalize()):
        if variant in meddict:
            return meddict[variant]

    # Fallback for mixed-case keys that none of the spellings above reproduce.
    folded = term.lower()
    return next(
        (definition for key, definition in meddict.items() if key.lower() == folded),
        None,
    )

def create_annotated_text(text: str, meddict: Dict[str, str]) -> str:
    """
    Annotate a medical text with definitions for the medical terms it contains.

    Terms are found with `extract_medical_terms`. The first occurrence of each
    term (case-insensitive) is followed by " [DEFINITION: <definition>]"; later
    occurrences are left untouched.

    The annotation is done in a single regex pass over the ORIGINAL text, which
    guarantees that:
      * the note's own spelling/casing of each term is preserved,
      * a shorter term ("cancer") is never annotated inside a longer term that
        matched at the same place ("breast cancer"), nor inside a definition
        that was just inserted,
      * definitions are inserted literally, so backslashes in a definition are
        not interpreted as regex replacement escapes.

    Args:
        text: The original medical text.
        meddict: The medical dictionary for term lookup.

    Returns:
        The annotated text string (the input unchanged if no term is found).
    """
    found_terms = extract_medical_terms(text, meddict)

    # Terms that differ only by case share one entry; longest terms first.
    definitions_by_lower: Dict[str, str] = {}
    for term in sorted(found_terms, key=len, reverse=True):
        if term:
            definitions_by_lower.setdefault(term.lower(), found_terms[term])

    if not definitions_by_lower:
        return text

    # Longest-first alternation: at any position the regex engine tries the
    # alternatives in order, so the longest overlapping term wins.
    alternation = '|'.join(
        re.escape(term)
        for term in sorted(definitions_by_lower, key=len, reverse=True)
    )
    pattern = re.compile(r'\b(?:' + alternation + r')\b', re.IGNORECASE)

    already_annotated = set()

    def _annotate(match):
        """Append the definition on a term's first occurrence only."""
        surface = match.group(0)
        key = surface.lower()
        if key in already_annotated or key not in definitions_by_lower:
            return surface
        already_annotated.add(key)
        return f"{surface} [DEFINITION: {definitions_by_lower[key]}]"

    # A callable replacement is used so the definition text is never parsed
    # as a replacement template.
    return pattern.sub(_annotate, text)

In [93]:
# =============================================================================
# PROMPT CREATION FUNCTIONS
# =============================================================================

def create_key_summary_prompt(original_text: str) -> str:
    """
    Build the instruction prompt that asks the model for a structured key-facts summary.

    The medical text is embedded verbatim (inside double quotes) and the model
    is told to fill six fixed sections: treatments received, medical
    conditions, cancer stage, referrals, current status and next steps/plan.

    Args:
        original_text: The original medical text.

    Returns:
        The complete prompt string, wrapped in [INST] ... [/INST] markers.
    """
    return f"""
[INST] You are an expert medical information extractor with exceptional attention to detail. Your task is to carefully read the medical text below and extract ONLY the key factual details that are explicitly mentioned. You must be extremely precise and never infer, assume, or add any information not directly stated in the text.

**Medical Text:**
"{original_text}"

**Instructions:**

Carefully analyze the text and extract information for each category below. For each category, provide ONLY what is explicitly mentioned in the original text. If a category is not mentioned or unclear, write "Not mentioned" for that section.

**CRITICAL RULES:**
1. Extract ONLY facts explicitly stated in the text
2. Use the EXACT wording from the original text when possible
3. Do not interpret, infer, or elaborate beyond what is written
4. If multiple items exist in a category, separate them with semicolons
5. Keep each section concise but complete

**Format your response EXACTLY as follows:**

**TREATMENTS RECEIVED:**
[List only treatments, procedures, medications, surgeries, or therapeutic interventions explicitly mentioned as having been completed, given, or performed]

**MEDICAL CONDITIONS:**
[List only diagnoses, medical conditions, diseases, or pathological findings explicitly stated]

**CANCER STAGE:**
[Only if cancer staging information is explicitly mentioned - include exact stage notation like "Stage IV", "T2N1M0", etc.]

**REFERRALS:**
[Only if referrals to specialists, departments, other physicians, or healthcare facilities are explicitly mentioned]

**CURRENT STATUS:**
[Patient's current medical condition, discharge status, vital status, or clinical state as explicitly stated]

**NEXT STEPS/PLAN:**
[Only future medical plans, follow-up appointments, scheduled procedures, or treatment recommendations explicitly mentioned]

Extract the key information now:
[/INST]"""




In [94]:

def create_audience_determination_prompt(original_text: str) -> str:
    """
    Create prompt for determining the target audience (patient or family).

    The medical text is embedded verbatim via the f-string. The result is
    returned as-is: it must NOT be passed through `str.format()` afterwards,
    because any '{' or '}' contained in the note would then be parsed as a
    format field and raise (or be silently rewritten).

    Args:
        original_text: The original medical text

    Returns:
        Formatted prompt string for audience determination
    """
    return f"""
[INST] You are an expert medical text classifier. Read the following medical text and determine the appropriate audience for a summary letter.

**Medical Text:**
"{original_text}"

**Instructions:**
Based on the text, who is the audience for the explanation letter?
- If the text describes a patient recovering, Discharge Condition says much improved, or having a positive or follow ongoing treatment plan, the audience is the **patient**.
- If the text mentions "died", "passed away," "deceased," or describes a fatal outcome such "comfort care" "hospice care" "pallliative care", "palliative extubate", the audience is the **patient's family**.

Respond with a single word ONLY: **patient** or **family**.
[/INST]
"""



In [95]:

def create_explanation_prompt(annotated_text: str, audience: str, keysummary: str) -> str:
    """
    Create the main explanation prompt.

    All three inputs are embedded verbatim via the f-string. The result is
    returned as-is rather than being passed through `str.format()`: a second
    formatting pass would treat any '{' or '}' in the note or the summary as a
    format field and raise.

    Args:
        annotated_text: The medical text, annotated with term definitions
        audience: Target audience ('family' selects the family wording; any
            other value is treated as 'patient')
        keysummary: Key-facts summary of the note, given to the model as an
            internal fact-checking reference

    Returns:
        Formatted prompt string for generating explanation
    """
    if audience == 'family':
        audience_instruction = "The determined audience for this letter is the **patient's family**. You must address them directly as 'you' and refer to the patient in the third person (e.g., 'your loved one,' 'he/she')."
        audience_label = "patient's family"
    else:  # patient
        audience_instruction = "The determined audience for this letter is the **patient**. You must address them directly as 'you' throughout the entire letter."
        audience_label = "patient"

    # `audience_label` keeps the "Audience" directive further down consistent
    # with `audience_instruction`; it used to say "patient" unconditionally,
    # contradicting the instruction above when the letter is for the family.
    return f"""
<s>[INST] 
### Persona
You are an experienced and compassionate Oncologist (cancer specialist) and a skilled medical educator. Your primary role is to translate complex medical information into clear, understandable, and supportive explanations for patients and their families. Your tone should always be professional, empathetic, and honest, balancing realism with hope.

### Golden Rule: Radical Simplicity - Translate, Don't Transfer
Your single most important task is to convert medical terminology into simple, 8th-grade level English. Do not just define a medical term; replace it entirely with an easy-to-understand explanation.

**Examples of what you MUST do:**
* **INSTEAD OF:** "multifocal stage IIA breast cancer"
    * **WRITE:** "a type of breast cancer that was in an early stage and was found in more than one spot in the breast."
* **INSTEAD OF:** "a biopsy of the mass in your right axilla"
    * **WRITE:** "we will take a small sample of the lump in your right armpit to test it."
* **INSTEAD OF:** "mastectomy with sentinel node and implant reconstruction"
    * **WRITE:** "surgery to remove the breast, check the nearby glands to see if the cancer had spread, and rebuild the breast shape with an implant."

    
### Original Medical Text:
"{annotated_text}"

### Internal Fact-Checking Reference:
This technical summary is for your internal use ONLY to ensure your response is factually accurate.
**STRICT INSTRUCTION:** You must treat this summary as a list of facts to be **translated** into simple language. DO NOT copy the medical terminology from this summary directly into the patient letter. You must translate these facts into the simple, empathetic language required by your persona.
"{keysummary}"


### {audience_instruction}

### Your Task:
Your goal is to write a single, complete, and polished letter to the patient that explains the information from the medical text above. Imagine you are sitting with the patient and explaining this to them in person, then putting it in writing.
STRICT NEGATIVE CONSTRAINT: Under NO circumstances should you say something not exist in the note.

**1. Letter Structure and Flow:** Organize your letter logically to guide the patient through the information without overwhelming them. Follow this structure:
    * **Empathetic Opening:** Start with a short warm and supportive salutation. Acknowledge the reason for their recent visit and the stress they might be feeling. 
    * **STRICT NEGATIVE CONSTRAINT:** AVOID LENGTHY OPENINGS. 1 sentence maxiumm.
    * **Summary of Past History:** Briefly recap their initial diagnosis and treatment. **Crucial rule: When discussing past treatment decisions, use neutral, non-judgmental language. Avoid direct phrases like "you chose not to" or "you refused." Instead, use objective phrasing like "At that time, the treatment plan did not include..." or "The decision was made not to proceed with..." to describe past events.**
    * **Explanation of Current Findings:** Describe what the recent tests (like the CT scan) have shown. Use simple, everyday language. Use analogies if helpful. **After providing a simple explanation (e.g., 'the cancer has spread'), avoid immediately following it with blunt, technical classifications. However, we want to highight the cancer stage if the note mentioned, and explain the stage meaning immediately after mentioning. Prioritize the compassionate explanation over the clinical label in the letter's flow.**
    * **The Main Diagnosis (Assessment):** Clearly and gently explain the main conclusion. Explain what terms like "metastatic" or "recurrence" mean.
    * **STRICT NEGATIVE CONSTRAINT:** If the cancer has spread (metastasis), absolutely DO NOT list the specific organs affected. Instead, just say "the cancer has spread to other parts of your body."
    * **The Go-Forward Plan:** Detail the next steps you have planned. For each step (like a biopsy, MRI, or bone scan), explain **WHAT** it is, and more importantly, **WHY** you are doing it.
    * **Treatment Goals and Philosophy:** This is the most important section. If the medical note mentions the term "palliative," explain this concept with extreme care. define it as an active and positive treatment approach focused on controlling the cancer, managing symptoms, and improving quality of life. The entire focus must be on the quality and extension of life.    
    * **STRICT NEGATIVE CONSTRAINT:** Under NO circumstances use fatalistic or terminal language. Absolutely AVOID phrases like "until the end of your life," "preparing for the end," "however many that may be", or any language that focuses on dying. The entire focus of this paragraph MUST be on the quality and extension of LIFE.**
    * **STRICT NEGATIVE CONSTRAINT:** AVOID LENGTHY comforting sentences and too much sympathy. The letter should be concise and focused on key information.
    * **Closing with Support:** End the letter by reinforcing that your team is there to support them through every step.
    For the "Explanation of Current Findings & Diagnosis" section:

**2. Language and Tone Directives:**
    * **Define Key Terms:** Do not assume the patient knows any medical jargon. Clearly define medical terms (beyond 8th grade) as they appear.
    * **Maintain Your Persona:** Use "we" to refer to the medical team. Write with empathy and clarity. The goal is to inform, not to scare.
    * **Audience:** The letter is strictly for the **{audience_label}**.


### Strict Output Formatting:
Provide ONLY the extracted letter text. Your output must start directly with the salutation (e.g., "Dear [Patient],") and end immediately after the signature. There should be absolutely no other text before or after the letter. You can use this format:
Dear [patient name]
We hope you are doing well. We're writing this letter to help you understand what happened during your recent visit on [Data] with [Physician name]
Why did I come to the clinic?
Why was discussed?
What tests were done or ordered?
What treatment or medication were changed?
What is the plan?
Thank you for choosing [Institution] for your care. We are committed to supporting you every step of the way on your journey to better health and well-being.
Please feel free to contact us if you have any questions.
Sincerely
[Institution]

Please provide the patient-focused explanation now: Your output must start directly with the salutation.
[/INST]
"""



In [96]:

def create_cleaning_prompt(raw_response: str, audience: str) -> str:
    """
    Build the prompt that asks the model to revise a generated letter.

    Both inputs are embedded verbatim via the f-string. The result is returned
    as-is rather than being passed through `str.format()`: a second formatting
    pass would treat any '{' or '}' in the generated letter as a format field
    and raise.

    NOTE(review): the prompt's conditional rules refer to the audiences
    'Patient/Layperson' and 'Clinical Professional', while the callers in this
    notebook pass 'patient' or 'family' — confirm the model maps these as
    intended. The prompt also requests a "Summary of Changes" section, so the
    model's reply is presumably more than the letter alone.

    Args:
        raw_response: The generated text to revise.
        audience: Target audience label inserted under "Target Audience".

    Returns:
        Formatted prompt string for the cleaning pass.
    """
    return f"""
[INST]
### Persona
You are an expert medical writer and editor. Your unique skill is communicating complex clinical information with absolute precision and clarity, and you are adept at tailoring your language for different audiences, from senior physicians to concerned patients. Your primary directive is to preserve the original meaning without fail.

### Target Audience:
{audience}

### Your Task
Your primary task is to revise the provided medical text. Based on the specified **Target Audience**, you will improve its quality in the following areas:

1.  **Clean Up Language:** Improve sentence structure and use professional language appropriate for the target audience.
2.  **Reduce Repetition:** Eliminate redundant words and phrases without losing critical information.
3.  **Improve Flow:** Enhance the logical flow and transitions to make the narrative easier to follow.
4.  **Define Medical Terms (Conditional Task):**
    * **IF the Target Audience is 'Patient/Layperson'**, you MUST perform this task: For any medical term or jargon a non-medical person would not understand, provide a simple, brief explanation in parentheses immediately after its first appearance.
    * **Example:** "The patient presented with tachycardia (a heart rate over 100 beats per minute) and pedal edema (swelling in the feet)."
    * **IF the Target Audience is 'Clinical Professional'**, you MUST NOT perform this task. Do not define standard medical terms.

### The Golden Rule: Preserve Clinical Meaning at All Costs
This is the most important rule. The revised text MUST be semantically and factually identical to the original.

**STRICT PROHIBITIONS:**
* **DO NOT** alter, add, or remove any clinical facts, diagnoses, measurements, dosages, or timelines.
* **DO NOT** change the certainty of a statement. A possibility ("suggests," "possible") must remain a possibility. A certainty ("diagnosed with," "confirmed") must remain a certainty.
* **DO NOT** reorder information in a way that changes the chronological or logical sequence of events.

---
### Instructions for Output

Please provide your response in two parts:
1.  **Revised Medical Text:** The complete, revised version of the text, tailored for the specified audience.
2.  **Summary of Changes:** A brief, bulleted list explaining the key changes you made.

### Medical Text to Revise:

{raw_response}

Now, please proceed with the revision based on the specified Target Audience.
[/INST]
"""





In [97]:

# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================

def get_default_generation_config(tokenizer) -> Dict:
    """
    Return the generation settings used for writing the explanation letter.

    Sampling is enabled (temperature 0.3, top-p 0.85) with a repetition
    penalty of 1.15 and a budget of 999 new tokens. The tokenizer's EOS token
    id is used both as the stop token and as the padding token.
    NOTE(review): `early_stopping` is presumably only meaningful for beam
    search — confirm whether it has any effect with these settings.
    """
    eos_id = tokenizer.eos_token_id
    return dict(
        max_new_tokens=999,
        temperature=0.3,
        top_p=0.85,
        repetition_penalty=1.15,
        do_sample=True,
        pad_token_id=eos_id,
        eos_token_id=eos_id,
        early_stopping=True,
    )


def get_audience_generation_config(tokenizer) -> Dict:
    """
    Return the generation settings used for the audience classification step.

    Only 5 new tokens are allowed (the expected answer is a single word) and
    sampling is switched off. The tokenizer's EOS token id is used for padding.
    NOTE(review): with `do_sample=False` the `temperature` value is presumably
    ignored — confirm.
    """
    settings = dict(max_new_tokens=5, temperature=0.01, do_sample=False)
    settings["pad_token_id"] = tokenizer.eos_token_id
    return settings


def get_cleaning_generation_config(tokenizer) -> Dict:
    """
    Return the generation settings used for the letter-cleaning step.

    Sampling is enabled at a low temperature (0.1, top-p 0.9) with a
    repetition penalty of 1.1 and a budget of 600 new tokens. The tokenizer's
    EOS token id is used both as the stop token and as the padding token.
    """
    eos_id = tokenizer.eos_token_id
    return dict(
        max_new_tokens=600,
        temperature=0.1,
        top_p=0.9,
        repetition_penalty=1.1,
        do_sample=True,
        pad_token_id=eos_id,
        eos_token_id=eos_id,
        early_stopping=True,
    )


def parse_audience_response(raw_response: str) -> str:
    """
    Map the model's free-text answer onto one of the two audience labels.

    Returns "family" when the word appears anywhere in the answer (case is
    ignored); every other answer, including an empty one, falls back to
    "patient".
    """
    mentions_family = "family" in raw_response.lower()
    return "family" if mentions_family else "patient"



In [98]:

# =============================================================================
# HIGH-LEVEL PIPELINE FUNCTIONS
# =============================================================================

def determine_audience(original_text: str, model, tokenizer) -> str:
    """
    Ask the model whether the letter should address the patient or the family.

    Builds the audience prompt, runs it with the audience generation settings
    and parses the answer. If any step raises, a warning is printed and the
    audience defaults to 'patient'.

    Args:
        original_text: The original medical text
        model: The loaded model instance
        tokenizer: The tokenizer instance

    Returns:
        Audience type ('patient' or 'family')
    """
    try:
        raw_output = run_model(
            create_audience_determination_prompt(original_text),
            model,
            tokenizer,
            get_audience_generation_config(tokenizer),
        )
        return parse_audience_response(raw_output)
    except Exception as e:
        print(f"⚠️ Audience determination failed: {e}. Defaulting to 'patient'.")
        return "patient"



def parse_key_summary_response(original_text: str, model, tokenizer) -> str:
    """
    Extract key medical summary using LLM from original medical text.

    Args:
        original_text: The original medical text
        model: The loaded model instance
        tokenizer: The tokenizer instance

    Returns:
        The model's key-summary text. If building the prompt or running the
        model raises, a warning is printed and a placeholder summary is
        returned instead: a string with the same six section headers the
        prompt asks for, each marked "Extraction failed". The result is always
        a string, so it can be interpolated into the explanation prompt.
    """
    try:
        # Create the extraction prompt
        prompt = create_key_summary_prompt(original_text)
        # Use precise generation config for factual extraction
        config = {
            "max_new_tokens": 500,
            "temperature": 0.1,  # Very low temperature for factual accuracy
            "top_p": 0.9,
            "repetition_penalty": 1.1,
            "do_sample": True,
            "pad_token_id": tokenizer.eos_token_id,
            "eos_token_id": tokenizer.eos_token_id,
            "early_stopping": True,
        }

        # Generate the key summary
        return run_model(prompt, model, tokenizer, config)

    except Exception as e:
        print(f"⚠️ Key summary extraction failed: {e}")
        # Keep the declared return type: a dict here would end up in the next
        # prompt as a Python repr.
        section_headers = (
            "TREATMENTS RECEIVED",
            "MEDICAL CONDITIONS",
            "CANCER STAGE",
            "REFERRALS",
            "CURRENT STATUS",
            "NEXT STEPS/PLAN",
        )
        return "\n\n".join(
            f"**{header}:**\nExtraction failed" for header in section_headers
        )



In [99]:

# parse_key_summary_response(text, model, tokenizer)

def generate_explanation(annotated_text: str, 
                        audience: str, keysummary: str, model, tokenizer) -> str:
    """
    Produce the raw explanation letter for one medical note.

    Builds the explanation prompt from the annotated text, the audience and
    the key summary, then runs it with the default generation settings.

    Args:
        annotated_text: The medical text passed to the explanation prompt
        audience: Target audience ('patient' or 'family')
        keysummary: Key-facts summary passed to the explanation prompt
        model: The loaded model instance
        tokenizer: The tokenizer instance

    Returns:
        Whatever `run_model` returns for the explanation prompt
    """
    return run_model(
        create_explanation_prompt(annotated_text, audience, keysummary),
        model,
        tokenizer,
        get_default_generation_config(tokenizer),
    )



In [100]:

def clean_response(raw_response: str, audience: str, model, tokenizer) -> str:
    """
    Clean the raw model response.

    The raw response is returned unchanged, without calling the model, when it
    is too short to be a letter (fewer than 10 non-whitespace-trimmed
    characters) or when it is a failure message: one starting with "❌" or
    with "Error generating response", the prefix `run_model` uses to report
    errors. The raw response is also returned when the cleaning call itself
    raises or reports an error in that way.

    Args:
        raw_response: Raw output from explanation generation
        audience: Target audience ('patient' or 'family')
        model: The loaded model instance
        tokenizer: The tokenizer instance

    Returns:
        Cleaned explanation letter, or `raw_response` in the cases above
    """
    # run_model reports failures as text instead of raising, so both markers
    # have to be recognised here.
    failure_prefixes = ("❌", "Error generating response")

    if len(raw_response.strip()) < 10 or raw_response.startswith(failure_prefixes):
        return raw_response

    try:
        prompt = create_cleaning_prompt(raw_response, audience)
        config = get_cleaning_generation_config(tokenizer)
        cleaned = run_model(prompt, model, tokenizer, config)
    except Exception as e:
        print(f"⚠️ Cleaning failed: {e}. Returning raw response.")
        return raw_response

    # An error message must never replace a usable letter.
    if cleaned.startswith(failure_prefixes):
        print(f"⚠️ Cleaning failed: {cleaned}. Returning raw response.")
        return raw_response

    return cleaned


def explain_medical_text_functional(original_text: str, model, tokenizer, 
                                  meddict: Dict[str, str], 
                                  steps: Optional[List[str]] = None) -> Dict:
    """
    Complete functional pipeline for medical text explanation.

    Args:
        original_text: The original medical text to explain
        model: The loaded model instance
        tokenizer: The tokenizer instance
        meddict: Medical dictionary for term lookup
        steps: Steps to execute, any of 'determine_audience', 'extract',
            'generate', 'clean'. Defaults to all four when None.

    Returns:
        Dictionary containing all results from the pipeline. Always holds
        'original_text' and 'steps_run'; each executed step adds its own
        keys ('determined_audience', 'annotated_text', 'keysummary',
        'raw_output', 'cleaned_output', 'cleaned_length').

    Raises:
        ValueError: If 'clean' is requested but no raw output was produced
            (i.e. 'generate' was not run).
    """
    # Build the default per call so no list object is shared between calls.
    if steps is None:
        steps = ['determine_audience', 'extract', 'generate', 'clean']

    result = {
        'original_text': original_text,
        'steps_run': list(steps)
    }
    
    # Step 1: Determine audience (if requested)
    if 'determine_audience' in steps:
        print("🔄 Running step: determine_audience")
        audience = determine_audience(original_text, model, tokenizer)
        result['determined_audience'] = audience
        print(f"   🎯 Determined audience: '{audience}'")
    
    # Step 2: Extract medical terms (if requested)
    if 'extract' in steps:
        print("🔄 Running step: extract")
        result['annotated_text'] = create_annotated_text(original_text, meddict)
    
    # Step 3: Generate explanation (if requested)
    if 'generate' in steps:
        print("🔄 Running step: generate")
        
        # Fill in prerequisites the caller chose not to run explicitly.
        if 'determined_audience' not in result:
            print("   ⚠️ Audience not determined, determining now...")
            result['determined_audience'] = determine_audience(original_text, model, tokenizer)
        
        if 'annotated_text' not in result:
            print("   ⚠️ Terms not extracted, extracting now...")
            result['annotated_text'] = create_annotated_text(original_text, meddict)
        
        # generate_explanation requires the key summary as its third
        # positional argument; omitting it raised TypeError on every call.
        keysummary = parse_key_summary_response(original_text, model, tokenizer)
        result['keysummary'] = keysummary
        
        raw_output = generate_explanation(
            result['annotated_text'], result['determined_audience'],
            keysummary, model, tokenizer
        )
        result['raw_output'] = raw_output
        print(f"   🤖 Generated {len(raw_output)} character response")
    
    # Step 4: Clean response (if requested)
    if 'clean' in steps:
        print("🔄 Running step: clean")
        
        if 'raw_output' not in result:
            raise ValueError("No raw_output found. Must run 'generate' step before 'clean' step.")
        
        # 'raw_output' only exists after 'generate', which also guarantees
        # 'determined_audience' is present.
        cleaned_output = clean_response(
            result['raw_output'], result['determined_audience'], model, tokenizer
        )
        result['cleaned_output'] = cleaned_output
        result['cleaned_length'] = len(cleaned_output)
        print(f"   🧹 Cleaned to {len(cleaned_output)} characters")
    
    print(f"✅ Completed {len(steps)} steps: {steps}")
    return result



# generation

In [101]:
# Run key-summary extraction on the note currently held in `text`.
# NOTE(review): `text`, `model` and `tokenizer` are not assigned in this cell;
# they must already exist in the kernel — confirm which cell defines `text`
# so the notebook survives Restart & Run All.
keysummary = parse_key_summary_response(text, model, tokenizer)



In [102]:
# Bare expression as the last line of the cell: the notebook displays its repr.
keysummary

'[INST] **TREATMENTS RECEIVED:**\n Mastectomy with sentinel node and implant reconstruction; decline of tamoxifen; no radiation or chemotherapy\n\n**MEDICAL CONDITIONS:**\n Malignant neoplasm of overlapping sites of right breast in female, estrogen receptor positive; anxiety; hyperlipidemia; breast cancer; lung cancer (mother); lung cancer (father); esophageal cancer (maternal uncle)\n\n**CANCER STAGE:**\n Stage II (pT2, pN0, pMx, G2, ER+, PR+, HER2-)\n\n**REFERRALS:**\n Ambulatory referral to Integrative Medicine; referral to the ***** center\n\n**CURRENT STATUS:**\n Widely metastatic cancer; involvement of the lungs, peritoneum, liver, and ovary; local recurrence near the right axilla and implant; hepatomegaly and omental masses; mildly enlarged and hyperdense ovaries\n\n**NEXT STEPS/PLAN:**\n Biopsy of the right axilla in the office; completion of staging workup; appointment with Dr. ***** on Thursday; formulation of a plan based on biopsy results; discussion of treatment options in

In [103]:

# One result dict per note is collected here and appended in row order.
all_results = []

# Run the full letter pipeline on every row of `df`.
# NOTE(review): `df`, `model`, `tokenizer` and `meddict` are not defined in
# this cell; they are expected to exist in the kernel already.
for index, row in df.iterrows():
    text = row['note_text'] 
    

    # Each stage feeds the next: audience + annotated text + key summary go
    # into the explanation, and the explanation goes into the cleaning pass.
    audience = determine_audience(text, model, tokenizer)
    annotated_text = create_annotated_text(text, meddict)
    # NOTE(review): presumably this can be a dict of 'Extraction failed'
    # placeholders instead of a string when extraction fails — confirm
    # against parse_key_summary_response.
    keysummary = parse_key_summary_response(text, model, tokenizer)
    explanation = generate_explanation(annotated_text, audience, keysummary, model, tokenizer)
    final_result = clean_response(explanation, audience, model, tokenizer)
    # Create result dictionary for this row
    row_result = {
        'original_text': text,
        'determined_audience': audience,
        'annotated_text': annotated_text,
        'keysummary': keysummary,
        'raw_explanation': explanation,
        'final_letter': final_result
    }
    
    all_results.append(row_result)





In [104]:

# Build one DataFrame row per collected result dict
results_df = pd.DataFrame(all_results)

# Pick a file name that does not overwrite an earlier run:
# 'output.csv' first, then 'output_1.csv', 'output_2.csv', ...
base_filename, extension = 'output', '.csv'
output_filename = f"{base_filename}{extension}"
counter = 1
while os.path.exists(output_filename):
    output_filename = f"{base_filename}_{counter}{extension}"
    counter += 1

# Write without the index column and report where the file went
results_df.to_csv(output_filename, index=False)
print(f"DataFrame saved to '{output_filename}'")

DataFrame saved to 'output_21.csv'


In [105]:
import pandas as pd
import textwrap
LINE_WIDTH = 80
# Assuming 'results_df' is your DataFrame

# Option 1: Print the whole frame with row/column/width limits lifted.
# option_context puts the previous option values back when the block exits,
# even if printing raises, so no manual reset_option calls are needed.
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.width', None):
    print(results_df)

# Option 2: Iterating through the columns and printing each one
# This gives you more control over the formatting
print("-" * 50)
print("Printing each column with a header:")
print("-" * 50)

if results_df.empty:
    # Nothing to wrap; looking up the first row would raise.
    print("(results_df has no rows to display)")
else:
    for col in results_df.columns:
        print(f"\n--- Column: {col} ---")
        
        # Get the cell from the first row of this column
        original_text = results_df[col].iloc[0]
        
        # textwrap.fill() only accepts strings; a cell may hold a non-string
        # value (e.g. a dict or NaN), so coerce before wrapping into lines
        # no longer than LINE_WIDTH characters.
        wrapped_text = textwrap.fill(str(original_text), width=LINE_WIDTH)
        
        # Print the nicely formatted text
        print(wrapped_text)

                                       original_text determined_audience  \
0  Medical Oncology Consult Note    Patient Name:...             patient   
1  This is a shared visit for services provided b...             patient   

                                      annotated_text  \
0  Medical Oncology [DEFINITION: A branch of Medi...   
1  This is a shared visit for services provided b...   

                                          keysummary  \
0  [INST] You are an expert medical information e...   
1  **TREATMENTS RECEIVED:**\nIrinotecan; Doxycycl...   

                                     raw_explanation  \
0  </s> \n\n---\n\nDear [Patient],\n\nI'm glad yo...   
1  </s>\n\nDear [Patient],\n\nWe're here to help ...   

                                        final_letter  
0  ### Revised Medical Text\n\nDear Patient,\n\nW...  
1  ### Revised Medical Text\n\nDear Patient,\n\nW...  
--------------------------------------------------
Printing each column with a header:
------------