In [1]:
!pip install sentencepiece protobuf datasets transformers trl textstat peft bitsandbytes nltk --quiet
!pip install -U bitsandbytes accelerate --quiet

In [2]:

with open("hf.token", "r") as f:
    hftoken = f.read().strip()  

import os
cache_dir = "/mnt/c/Users/yc/.cache/huggingface"
os.environ['HF_HOME'] = cache_dir

v4 is about oncology notes

## import

In [3]:
# Standard library imports
import csv
import json
import random
import re
import sys
import time
from collections import Counter
from collections import defaultdict

# Third-party data and ML libraries
import pandas as pd
import torch
import nltk
from nltk.tokenize import word_tokenize

# Hugging Face ecosystem
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    pipeline,
    logging,
)

# Fine-tuning and optimization
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# Text analysis and readability metrics
from textstat import (
    flesch_kincaid_grade, 
    flesch_reading_ease,
    smog_index, 
    gunning_fog, 
    dale_chall_readability_score,
    text_standard, 
    syllable_count
)

# Optional: Uncomment if needed
from huggingface_hub import login
login(token=hftoken)  # Move token to environment variable

  from .autonotebook import tqdm as notebook_tqdm


## load model

In [4]:
# Model Configuration
# model_name = "NousResearch/Llama-2-7b-chat-hf"
# Recommended upgrade - Llama 3.1 8B
model_name = "meta-llama/Llama-3.1-8B-Instruct"

# # Or try Mistral 7B
# model_name = "mistralai/Mistral-7B-Instruct-v0.3"

device_map = {"": 0}

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                        #   cache_dir=cache_dir
                                          )
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# Load model with quantization
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map=device_map,
#     quantization_config=bnb_config
# )

# # full precision
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map=device_map,
#     torch_dtype=torch.float16,  # Use half precision instead
#     # cache_dir=cache_dir
# )

# 8 precision
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    load_in_8bit=True
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 4/4 [00:32<00:00,  8.09s/it]


## data

In [5]:

def calculate_readability_scores(text, term=None):
    """Calculate readability scores for a piece of text, optionally removing a term"""
    scoring_text = text
    
    # Remove the term and its variations before scoring if provided
    if term:
        # Create variations of the term to remove (capitalized, lowercase)
        term_variations = [term, term.lower(), term.capitalize()]
        
        for variation in term_variations:
            scoring_text = scoring_text.replace(variation, "")
        
        # Clean up any double spaces created
        scoring_text = " ".join(scoring_text.split())
    
    fk_grade = flesch_kincaid_grade(scoring_text)
    readability = flesch_reading_ease(scoring_text)
    return {
        "fk_grade": fk_grade,
        "readability": readability,
    } 
def txt_to_dict(file_path):
    data_dict = {}
    with open(file_path, 'r') as file:
        lines = file.readlines()
        for i in range(0, len(lines) - 1, 2):
            key = lines[i].strip()    # Odd line are key
            value = lines[i + 1].strip()  # Even line are value
            data_dict[key] = value

    return data_dict

txt_file_path = 'formaldef.txt' 
formaldic = txt_to_dict(txt_file_path)
len(formaldic)

meddict={}
for k,v in formaldic.items():
    meddict[k.split('Listen to pronunciation')[0].split('(')[0]]=v

In [6]:
# load data
df = pd.read_csv('/mnt/c/Users/yc/Downloads/coral/unannotated/data/breastca_unannotated.csv')
df=df.head(10)

# prompt, paragraph


In [7]:
# Modular Medical Text Explanation System
# Returns clean dict with configurable steps: extract, generate, clean

import re
import torch
from collections import defaultdict

class MedicalTextExplainer:
    def __init__(self, model, tokenizer, meddict):
        self.model = model
        self.tokenizer = tokenizer
        self.meddict = meddict
        
        # Improved generation config for better outputs
        self.generation_config = {
            "max_new_tokens": 999,              # Increased for fuller explanations
            "temperature": 0.3,
            "top_p": 0.85,
            "repetition_penalty": 1.15,
            "do_sample": True,
            "pad_token_id": tokenizer.eos_token_id,
            "eos_token_id": tokenizer.eos_token_id,  # Ensure proper stopping
            "early_stopping": True,              # Stop when EOS token is generated
        }
        
        # Define available steps
        self.available_steps = ['determine_audience', 'extract', 'generate', 'clean'] # 增加新步骤
        self.step_functions = {
            'determine_audience': self._step_determine_audience, # 增加新映射
            'extract': self._step_extract_terms,
            'generate': self._step_generate_explanation,
            'clean': self._step_clean_response
        }
    def create_audience_prompt(self, original_text):
            """
            Creates a simple, direct prompt to classify the audience.
            """
            prompt = f"""
        [INST] You are an expert medical text classifier. Read the following medical text and determine the appropriate audience for a summary letter.

        **Medical Text:**
        "{original_text}"

        **Instructions:**
        Based on the text, who is the audience for the explanation letter?
        - If the text describes a patient recovering, Discharge Condition says much improved, or having a positive or follow ongoing treatment plan, the audience is the **patient**.
        - If the text mentions "died", "passed away," "deceased," or describes a fatal outcome such "comfort care" "hospice care" "pallliative care", "palliative extubate", the audience is the **patient's family**.

        Respond with a single word ONLY: **patient** or **family**.
        [/INST]
        """
            return prompt
    

    def _step_determine_audience(self, result):
        """Step 0: Determine the correct audience (patient or family)."""
        text = result['original_text']
        
        # 1. 创建分类任务的Prompt
        audience_prompt = self.create_audience_prompt(text)
        
        # 2. 调用模型进行分类 (使用一个轻量的生成配置)
        try:
            inputs = self.tokenizer(audience_prompt, return_tensors="pt").to(self.model.device)
            input_length = inputs["input_ids"].shape[1]
            
            # 使用一个非常短的生成配置，因为我们只需要一个词
            audience_config = {
                "max_new_tokens": 5,
                "temperature": 0.01, # 温度极低，追求最确定的答案
                "do_sample": False, # 不进行采样
                "pad_token_id": self.tokenizer.eos_token_id
            }

            with torch.no_grad():
                outputs = self.model.generate(inputs["input_ids"], **audience_config)
            
            response_tokens = outputs[0][input_length:]
            raw_label = self.tokenizer.decode(response_tokens, skip_special_tokens=True).lower().strip()
            
            # 3. 清理和验证标签
            if "family" in raw_label:
                audience = "family"
            else:
                audience = "patient" # 默认为 patient
                
        except Exception as e:
            print(f"   ⚠️ Audience determination failed: {e}. Defaulting to 'patient'.")
            audience = "patient"

        result['determined_audience'] = audience
        print(f"   🎯 Determined audience: '{audience}'")
        return result


    def explain_medical_text(self, text, steps=['extract', 'generate', 'clean']):
        
        # Validate steps
        invalid_steps = [step for step in steps if step not in self.available_steps]
        if invalid_steps:
            raise ValueError(f"Invalid steps: {invalid_steps}. Available steps: {self.available_steps}")
        
        # Initialize result with original text
        result = {
            'original_text': text,
            'steps_run': steps.copy(),
            'available_steps': self.available_steps.copy()
        }
        
        # Run each step in sequence, passing result between steps
        for step in steps:
            print(f"🔄 Running step: {step}")
            result = self.step_functions[step](result)
        
        print(f"✅ Completed {len(steps)} steps: {steps}")
        return result
    
    def _step_extract_terms(self, result):
        """Step 1: Extract medical terms from text."""
        text = result['original_text']
        found_terms = self.extract_medical_terms(text)
        
        result.update({
            'found_terms': found_terms,
            'terms_count': len(found_terms)
        })
        
        print(f"   📋 Found {len(found_terms)} medical terms")
        return result
    
    
    def _step_generate_explanation(self, result):
        """Step 2: Generate explanation using model."""
        text = result['original_text']
        if 'determined_audience' not in result:
            raise ValueError("Audience not determined. Please run 'determine_audience' step before 'generate'.")
        
        audience = result['determined_audience']
        # Use found terms if extract step was run, otherwise extract them now
        if 'found_terms' in result:
            found_terms = result['found_terms']
        else:
            print("   ⚠️  Terms not extracted yet, extracting now...")
            found_terms = self.extract_medical_terms(text)
            result['found_terms'] = found_terms
        
        # Create prompt and generate
        prompt = self.create_prompt(text, found_terms, audience)
        raw_output = self._generate_raw_response(prompt)
        
        result.update({
            'prompt': prompt,
            'raw_output': raw_output,
            'prompt_length': len(prompt),
        })
        
        print(f"   🤖 Generated {len(raw_output)} character response")
        return result
    
    
    def _step_clean_response(self, result):
        """Step 3: Clean the raw model output."""
        
        # Check if we have raw output to clean
        if 'raw_output' not in result:
            raise ValueError("No raw_output found. Must run 'generate' step before 'clean' step.")
        
        raw_output = result['raw_output']
        
        cleaned_output = self.model_clean_response(raw_output, result['determined_audience'])
        
        result.update({
            'cleaned_output': cleaned_output,
            'cleaned_length': len(cleaned_output)
        })
        
        print(f"   🧹 Cleaned to {len(cleaned_output)} characters")
        return result
    
    def extract_medical_terms(self, text):
        """
        Enhanced medical term extraction to catch more terms from complex texts.
        """
        found_terms = {}
        
        # Strategy 1: Single words (improved regex for medical terms)
        words = re.findall(r'\b[A-Za-z]+(?:[-\'][A-Za-z]+)*\b', text)
        for word in words:
            definition = self.find_term_in_dict(word)
            if definition:
                found_terms[word] = definition
        
        # Strategy 2: Multi-word terms (expanded range for complex medical phrases)
        for n in range(2, 6):  # Increased from 5 to 6 for longer medical phrases
            n_grams = self.get_n_grams(text, n)
            for phrase in n_grams:
                definition = self.find_term_in_dict(phrase)
                if definition:
                    found_terms[phrase] = definition
        
        # Strategy 3: Medical abbreviations (enhanced pattern)
        abbreviations = re.findall(r'\b[A-Z]{2,8}\b', text)  # Increased from 6 to 8
        for abbrev in abbreviations:
            definition = self.find_term_in_dict(abbrev)
            if definition:
                found_terms[abbrev] = definition
        
        # Strategy 4: Medical procedures and conditions with specific patterns
        medical_patterns = [
            r'\b\w+oscopy\b',          # bronchoscopy, endoscopy, etc.
            r'\b\w+ectomy\b',          # appendectomy, etc.
            r'\b\w+itis\b',            # bronchitis, arthritis, etc.
            r'\b\w+osis\b',            # fibrosis, stenosis, etc.
            r'\b\w+emia\b',            # anemia, septicemia, etc.
            r'\b\w+pathy\b',           # myopathy, neuropathy, etc.
            r'\b\w+malacia\b',         # tracheomalacia, etc.
        ]
        
        for pattern in medical_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            for match in matches:
                definition = self.find_term_in_dict(match)
                if definition:
                    found_terms[match] = definition
        
        # Strategy 5: Medication names (often end in specific suffixes)
        medication_patterns = [
            r'\b\w+cillin\b',          # penicillin, amoxicillin, etc.
            r'\b\w+mycin\b',           # streptomycin, etc.
            r'\b\w+floxacin\b',        # levofloxacin, ciprofloxacin, etc.
            r'\b\w+sone\b',            # prednisone, cortisone, etc.
            r'\b\w+pam\b',             # lorazepam, etc.
        ]
        
        for pattern in medication_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            for match in matches:
                definition = self.find_term_in_dict(match)
                if definition:
                    found_terms[match] = definition
        
        return found_terms
    
    
    def get_n_grams(self, text, n):
        """Generate n-grams from text."""
        words = re.findall(r'\b[A-Za-z]+\b', text.lower())
        n_grams = []
        for i in range(len(words) - n + 1):
            phrase = ' '.join(words[i:i+n])
            n_grams.append(phrase)
        return n_grams
    
    def find_term_in_dict(self, term):
        """Find term in medical dictionary."""
        search_formats = [
            term, term.lower(), term.upper(), term.title(), term.capitalize()
        ]
        
        for search_term in search_formats:
            if search_term in self.meddict:
                return self.meddict[search_term]
        
        # Partial matching
        for key in self.meddict.keys():
            if key.lower() == term.lower():
                return self.meddict[key]
        
        return None
    
    def create_prompt(self, original_text, found_terms, determined_audience):

        # Format found terms for inclusion in prompt
        terms_section = ""
        if found_terms:
            terms_section = "Found medical terms and their definitions:\n"
            for term, definition in found_terms.items():
                terms_section += f"- {term}: {definition}\n"
        else:
            terms_section = "No medical terms found in dictionary.\n"
        
        if determined_audience == 'family':
            audience_instruction = "The determined audience for this letter is the **patient's family**. You must address them directly as 'you' and refer to the patient in the third person (e.g., 'your loved one,' 'he/she')."
        else: # patient
            audience_instruction = "The determined audience for this letter is the **patient**. You must address them directly as 'you' throughout the entire letter."

        prompt = f"""
<s>[INST] 
### Persona
You are an experienced and compassionate Oncologist (cancer specialist) and a skilled medical educator. Your primary role is to translate complex medical information into clear, understandable, and supportive explanations for patients and their families. Your tone should always be professional, empathetic, and honest, balancing realism with hope.

### Original Medical Text:
"{original_text}"

### Your Task:
Your goal is to write a single, complete, and polished letter to the patient that explains the information from the medical text above. Imagine you are sitting with the patient and explaining this to them in person, then putting it in writing.
STRICT NEGATIVE CONSTRAINT: Under NO circumstances should you say something not exist in the note.

**1. Letter Structure and Flow:** Organize your letter logically to guide the patient through the information without overwhelming them. Follow this structure:
    * **Empathetic Opening:** Start with a short warm and supportive salutation. Acknowledge the reason for their recent visit and the stress they might be feeling. 
    STRICT NEGATIVE CONSTRAINT: AVOID LENGTHY OPENINGS. 1 sentence maxiumm.
    * **Summary of Past History:** Briefly recap their initial diagnosis and treatment. **Crucial rule: When discussing past treatment decisions, use neutral, non-judgmental language. Avoid direct phrases like "you chose not to" or "you refused." Instead, use objective phrasing like "At that time, the treatment plan did not include..." or "The decision was made not to proceed with..." to describe past events.**
    * **Explanation of Current Findings:** Describe what the recent tests (like the CT scan) have shown. Use simple, everyday language. Use analogies if helpful. **After providing a simple explanation (e.g., 'the cancer has spread'), avoid immediately following it with blunt, technical classifications. However, we want to highight the cancer stage if the note mentioned, and explain the stage meaning immediately after mentioning. Prioritize the compassionate explanation over the clinical label in the letter's flow.**
    * **The Main Diagnosis (Assessment):** Clearly and gently explain the main conclusion. Explain what terms like "metastatic" or "recurrence" mean.
    STRICT NEGATIVE CONSTRAINT: If the cancer has spread (metastasis), absolutely DO NOT list the specific organs affected. Instead, just say "the cancer has spread to other parts of your body."
    * **The Go-Forward Plan:** Detail the next steps you have planned. For each step (like a biopsy, MRI, or bone scan), explain **WHAT** it is, and more importantly, **WHY** you are doing it.
    * **Treatment Goals and Philosophy:** This is the most important section. If the medical note mentions the term "palliative," explain this concept with extreme care. define it as an active and positive treatment approach focused on controlling the cancer, managing symptoms, and improving quality of life. The entire focus must be on the quality and extension of life.    
    * **STRICT NEGATIVE CONSTRAINT: Under NO circumstances use fatalistic or terminal language. Absolutely AVOID phrases like "until the end of your life," "preparing for the end," "however many that may be", or any language that focuses on dying. The entire focus of this paragraph MUST be on the quality and extension of LIFE.**
    STRICT NEGATIVE CONSTRAINT: AVOID LENGTHY comforting sentences and too much sympathy. The letter should be concise and focused on key information.
    * **Closing with Support:** End the letter by reinforcing that your team is there to support them through every step.
    For the "Explanation of Current Findings & Diagnosis" section:


**2. Language and Tone Directives:**
    * **Define Key Terms:** Do not assume the patient knows any medical jargon. Clearly define necessary terms like "metastatic" or "biopsy" as they appear.
    * **Maintain Your Persona:** Use "we" to refer to the medical team. Write with empathy and clarity. The goal is to inform, not to scare.
    * **Audience:** The letter is strictly for the **patient**.

### Strict Output Formatting:
Provide ONLY the extracted letter text. Your output must start directly with the salutation (e.g., "Dear [Patient],") and end immediately after the signature. There should be absolutely no other text before or after the letter. You can use this format:
Dear [patient name]
We hope you are doing well. We're writing this letter to help you understand what happened during your recent visit on [Data] with [Physician name]
Why did I come to the clinic?
Why was discussed?
What tests were done or ordered?
What treatment or medication were changed?
What is the plan?
Thank you for choosing [Institution] for your care. We are committed to supporting you every step of the way on your journey to better health and well-being.
Please feel free to contact us if you have any questions.
Sincerely
[Institution]
Please provide the patient-focused explanation now: Your output must start directly with the salutation.
[/INST]
    """
        return prompt
    
    def _generate_raw_response(self, prompt):
        """
        Generate raw response from model (separated from cleaning).
        """
        try:
            # Tokenize input
            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True).to(self.model.device)
            input_length = inputs["input_ids"].shape[1]
            
            # Generate response
            with torch.no_grad():
                outputs = self.model.generate(
                    inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    **self.generation_config
                )
            
            # Extract only the NEW tokens (response), not the input prompt
            response_tokens = outputs[0][input_length:]  # Skip input tokens
            raw_output = self.tokenizer.decode(response_tokens, skip_special_tokens=False)
            
            return raw_output
            
        except Exception as e:
            return f"Error generating response: {str(e)}"
    
    def model_clean_response(self, raw_response, aduience):
        """
        Enhanced cleaning with better instructions for patient-focused, structured output.
        """
        
        # If response is very short or error, return as-is
        if len(raw_response.strip()) < 10 or raw_response.startswith("❌"):
            return raw_response
        
        cleaning_prompt = f"""
 
[INST]
### Persona
You are a specialized text-formatting AI. Your sole purpose is to extract a clean, complete letter from a raw text block, removing all surrounding artifacts, preambles, and post-scripts. You do not edit or change the content of the letter itself.

### Raw Text Block:
"{raw_response}"

### Your Task:
Your task is to analyze the provided Raw Text Block and isolate ONLY the formal letter contained within it. You must follow these rules precisely:

1.  **Identify the Letter's Boundaries:**
    * The letter starts with a salutation (e.g., "Dear [Patient]," or similar).
    * The letter ends with the final line of the signature (e.g., "Oncologist" or the doctor's name).

2.  **Strip All Prefixes:**
    * You MUST remove any and all text that appears **before** the letter's starting salutation.
    * **Examples of prefixes to remove:** "The final answer is:", "Here is the rewritten letter:", "Certainly, here is the polished version:", conversational filler, blank lines, etc.

3.  **Strip All Suffixes:**
    * You MUST remove any and all text that appears **after** the final line of the signature.
    * **Examples of suffixes to remove:** "---", "Note:", "<|eot_id|>", explanations about the edits, meta-commentary, etc.

4.  **Strict No-Edit Rule:**
    * **Do not change the wording, tone, or content of the letter itself.** Your only job is to surgically extract the letter from the surrounding text. You are pruning, not rewriting.



Please provide the cleaned letter now:
[/INST]

"""
        
        try:
            inputs = self.tokenizer(cleaning_prompt, return_tensors="pt", truncation=True).to(self.model.device)
            input_length = inputs["input_ids"].shape[1]
            
            # Use enhanced generation config for cleaning
            cleaning_config = {
                "max_new_tokens": 600,  # Increased for more complete cleaning
                "temperature": 0.1,
                "top_p": 0.9,
                "repetition_penalty": 1.1,
                "do_sample": True,
                "pad_token_id": self.tokenizer.eos_token_id,
                "eos_token_id": self.tokenizer.eos_token_id,
                "early_stopping": True,
            }
            
            with torch.no_grad():
                outputs = self.model.generate(
                    inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    **cleaning_config
                )
            
            # Extract only the cleaned response
            response_tokens = outputs[0][input_length:]
            cleaned = self.tokenizer.decode(response_tokens, skip_special_tokens=True)
            

            
            return cleaned
            
        except Exception as e:
     
            return raw_response
    

In [8]:
explainer = MedicalTextExplainer(model, tokenizer, meddict)
allres=[]

In [9]:
df.columns

Index(['coral_idx', 'Sex', 'UCSFDerivedRaceEthnicity_X', 'BirthDate',
       'note_text'],
      dtype='object')

In [10]:
for index, row in df.iterrows():
    text=row['note_text'] 
    result = explainer.explain_medical_text(text, steps=['determine_audience','extract', 'generate','clean'])
    allres.append(result)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


🔄 Running step: determine_audience
   🎯 Determined audience: 'patient'
🔄 Running step: extract
   📋 Found 143 medical terms
🔄 Running step: generate




   🤖 Generated 2041 character response
🔄 Running step: clean
   🧹 Cleaned to 2045 characters
✅ Completed 4 steps: ['determine_audience', 'extract', 'generate', 'clean']
🔄 Running step: determine_audience
   🎯 Determined audience: 'family'
🔄 Running step: extract
   📋 Found 265 medical terms
🔄 Running step: generate
   🤖 Generated 2941 character response
🔄 Running step: clean
   🧹 Cleaned to 3086 characters
✅ Completed 4 steps: ['determine_audience', 'extract', 'generate', 'clean']
🔄 Running step: determine_audience
   🎯 Determined audience: 'patient'
🔄 Running step: extract
   📋 Found 87 medical terms
🔄 Running step: generate
   🤖 Generated 3090 character response
🔄 Running step: clean
   🧹 Cleaned to 2139 characters
✅ Completed 4 steps: ['determine_audience', 'extract', 'generate', 'clean']
🔄 Running step: determine_audience
   🎯 Determined audience: 'patient'
🔄 Running step: extract
   📋 Found 141 medical terms
🔄 Running step: generate
   🤖 Generated 4015 character response
🔄 Running

KeyboardInterrupt: 

In [None]:
allres[0].keys()
# allres[0]['terms_count']
allres[0]['raw_output']


" Dear [Patient],\nWe hope you are doing well. We're writing this letter to help you understand what happened during your recent visit on December 30, 2019, with Dr. [Physician name].\n\nFirst, let's talk about your original diagnosis. In May 2013, you were diagnosed with a type of breast cancer called Stage IIA. At that time, the treatment plan included a mastectomy with sentinel node removal and implant reconstruction. Unfortunately, the cancer came back, and we've recently learned that it has spread to other parts of your body.\n\nThe recent tests showed that the cancer has spread to several areas, including your lungs, liver, and other places. This means that your cancer is considered metastatic. Don't worry; this doesn't mean that the cancer is untreatable. It simply means that we'll need to take a different approach to manage it.\n\nTo do this, we'll first confirm the type of cancer cells present using a biopsy. This involves taking a small sample of tissue from one of the affect

In [12]:

allres[0]['cleaned_output']

"### Cleaned Letter\n\nDear [Patient],\n\nWe hope you are doing well. We're writing this letter to help you understand what happened during your recent visit on December 30, 2019, with our oncologist.\n\nFirst, let's review your original diagnosis. In May 2013, you were diagnosed with a type of breast cancer called invasive ductal carcinoma. At that time, the treatment plan included a mastectomy with sentinel node removal and implant reconstruction. Unfortunately, the cancer came back, and we've been monitoring its progression ever since.\n\nRecently, you underwent some additional testing, including a CT scan of your chest, abdomen, and pelvis. These scans showed that the cancer has spread to other parts of your body. Specifically, the cancer has spread to several areas, including your lungs, liver, and ovaries. It's also caused a tumor to grow in your right armpit.\n\nThis means that your cancer is considered advanced, and we'll need to adjust our treatment plan accordingly. Our goal 

In [11]:
allres[0]['prompt_length']

12584

# gen

In [13]:
results_df = pd.DataFrame(allres)
results_df.columns

Index(['original_text', 'steps_run', 'available_steps', 'determined_audience',
       'found_terms', 'terms_count', 'prompt', 'raw_output', 'prompt_length',
       'cleaned_output', 'cleaned_length'],
      dtype='object')

In [14]:


base_filename = 'output'
extension = '.csv'
output_filename = base_filename + extension
counter = 1

# Check if the file exists and find a new name if it does
while os.path.exists(output_filename):
    output_filename = f"{base_filename}_{counter}{extension}"
    counter += 1

# Save the DataFrame to the new, unique filename
results_df.to_csv(output_filename, index=False)

print(f"DataFrame saved to '{output_filename}'")

DataFrame saved to 'output_12.csv'
