In [2]:
!pip install sentencepiece protobuf datasets transformers trl textstat peft bitsandbytes --quiet
!pip install -U bitsandbytes accelerate --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pathos 0.3.4 requires dill>=0.4.0, but you have dill 0.3.8 which is incompatible.
pathos 0.3.4 requires multiprocess>=0.70.18, but you have multiprocess 0.70.16 which is incompatible.[0m[31m
[0m

## import

In [8]:
# Standard library imports
import csv
import json
import os
import random
import re
import sys
import time
from collections import Counter
from collections import defaultdict

# Third-party data and ML libraries
import pandas as pd
import torch
import nltk
from nltk.tokenize import word_tokenize

# Hugging Face ecosystem
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    pipeline,
    logging,
)

# Fine-tuning and optimization
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# Text analysis and readability metrics
from textstat import (
    flesch_kincaid_grade, 
    flesch_reading_ease,
    smog_index, 
    gunning_fog, 
    dale_chall_readability_score,
    text_standard, 
    syllable_count
)

# Optional: Uncomment if needed
from huggingface_hub import login
login(token="")  # Move token to environment variable

## load model

In [4]:
# Model Configuration
# model_name = "NousResearch/Llama-2-7b-chat-hf"
# Recommended upgrade - Llama 3.1 8B
# model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Or try Mistral 7B
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

device_map = {"": 0}

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# Load model with quantization
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map=device_map,
#     quantization_config=bnb_config
# )

# full precision
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    torch_dtype=torch.float16  # Use half precision instead
)

# # 8 precision
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map=device_map,
#     load_in_8bit=True
# )

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

## data

In [5]:

def calculate_readability_scores(text, term=None):
    """Calculate readability scores for a piece of text, optionally removing a term"""
    scoring_text = text
    
    # Remove the term and its variations before scoring if provided
    if term:
        # Create variations of the term to remove (capitalized, lowercase)
        term_variations = [term, term.lower(), term.capitalize()]
        
        for variation in term_variations:
            scoring_text = scoring_text.replace(variation, "")
        
        # Clean up any double spaces created
        scoring_text = " ".join(scoring_text.split())
    
    fk_grade = flesch_kincaid_grade(scoring_text)
    readability = flesch_reading_ease(scoring_text)
    return {
        "fk_grade": fk_grade,
        "readability": readability,
    }
def txt_to_dict(file_path):
    data_dict = {}
    with open(file_path, 'r') as file:
        lines = file.readlines()
        for i in range(0, len(lines) - 1, 2):
            key = lines[i].strip()    # Odd line are key
            value = lines[i + 1].strip()  # Even line are value
            data_dict[key] = value

    return data_dict


txt_file_path = 'formaldef.txt' 
formaldic = txt_to_dict(txt_file_path)
len(formaldic)

meddict={}
for k,v in formaldic.items():
    meddict[k.split('Listen to pronunciation')[0].split('(')[0]]=v

# promt, single term


In [7]:
# Medical Term Explanation System
# Converts complex medical terminology into simple, accessible language

# Improved prompt template with clearer instruction formatting
explanation_template = """<s>[INST] You are a medical educator who explains complex medical terms in simple language.

Medical term: {term}
Medical definition: {definition}

Instructions:
1. Explain this term using everyday language that anyone can understand
2. Avoid all medical jargon and technical terms
3. Write at a middle school reading level (grades 7-8)
4. Keep it concise - maximum 3 short sentences
5. Be medically accurate but accessible
6. Start immediately with the explanation (no introductory phrases)

[/INST]"""

# Optimized generation parameters for clear, concise explanations
generation_config = {
    "max_new_tokens": 150,           # Reduced for concise responses
    "temperature": 0.3,              # Lower for more consistent, focused output
    "top_p": 0.85,                   # Slightly reduced for better coherence
    "repetition_penalty": 1.15,      # Higher to avoid repetition
    "do_sample": True,               # Enable sampling for natural language
    "pad_token_id": tokenizer.eos_token_id  # Handle padding properly
}

def find_term_in_dict(term):
    """
    Find a term in the medical dictionary using various case formats.
    
    Args:
        term (str): The term to search for
    
    Returns:
        str or None: The definition if found, None otherwise
    """
    # Try different case formats
    search_formats = [
        term,                           # Original case
        term.lower(),                   # lowercase
        term.upper(),                   # UPPERCASE  
        term.title(),                   # Title Case
        term.capitalize(),              # Capitalized
    ]
    
    for search_term in search_formats:
        if search_term in meddict:
            return meddict[search_term]
    
    # If still not found, try partial matching for terms with special characters
    for key in meddict.keys():
        if key.lower() == term.lower():
            return meddict[key]
    
    return None

def explain_medical_term(term, definition=None):
    """
    Generate a simple explanation for a medical term.
    
    Args:
        term (str): The medical term to explain
        definition (str, optional): Medical definition. If None, looks up in meddict
    
    Returns:
        str: Simple explanation suitable for general audience
    """
    # Look up definition if not provided
    if definition is None:
        definition = find_term_in_dict(term)
        if definition is None:
            return f"❌ Term '{term}' not found in medical dictionary."
    
    # Create formatted prompt
    prompt = explanation_template.format(term=term, definition=definition)
    
    try:
        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
        
        # Generate explanation
        with torch.no_grad():
            outputs = model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                **generation_config
            )
        
        # Decode and clean response
        full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)
        
        # Extract just the explanation part
        explanation = extract_explanation(full_response, prompt)
        
        return explanation.strip()
        
    except Exception as e:
        return f"❌ Error generating explanation: {str(e)}"

def extract_explanation(full_response, prompt):
    """
    Extract the model's explanation from the full response.
    
    Args:
        full_response (str): Complete model output
        prompt (str): Original prompt
    
    Returns:
        str: Cleaned explanation text
    """
    explanation = full_response
    
    # Remove the prompt part
    if "[/INST]" in explanation:
        explanation = explanation.split("[/INST]", 1)[1]
    else:
        # Fallback: remove original prompt
        explanation = explanation.replace(prompt, "")
    
    # Clean up special tokens and artifacts
    cleanup_patterns = [
        "</s>",
        "<s>",
        "[INST]",
        "[/INST]"
    ]
    
    for pattern in cleanup_patterns:
        explanation = explanation.replace(pattern, "")
    
    # Remove common response artifacts
    artifacts = [
        "Here's the explanation:",
        "Let me explain:",
        "The explanation is:",
        "Okay!",
        "Sure!",
        "Here's my explanation:"
    ]
    
    for artifact in artifacts:
        if explanation.lower().startswith(artifact.lower()):
            explanation = explanation[len(artifact):].strip()
    
    return explanation

def demo_explanations(num_terms=5):
    """
    Demonstrate the explanation system with sample medical terms.
    
    Args:
        num_terms (int): Number of terms to demonstrate
    """
    print("🏥 Medical Term Explanation Demo")
    print("=" * 50)
    
    # Get sample terms that actually exist
    available_terms = []
    for term in list(meddict.keys())[:20]:  # Check more terms to find valid ones
        if find_term_in_dict(term) is not None:
            available_terms.append(term)
        if len(available_terms) >= num_terms:
            break
    
    if len(available_terms) == 0:
        print("❌ No valid terms found in dictionary")
        return
    
    for i, term in enumerate(available_terms[:num_terms], 1):
        print(f"\n{i}. Term: {term.title()}")
        print("-" * 30)
        
        explanation = explain_medical_term(term)
        print(f"{explanation}")
        
        if i < len(available_terms[:num_terms]):
            print("=" * 50)

# Advanced function with readability scoring
def explain_with_metrics(term, definition=None):
    """
    Generate explanation with readability metrics.
    
    Args:
        term (str): Medical term
        definition (str, optional): Medical definition
    
    Returns:
        dict: Explanation with readability scores
    """
    explanation = explain_medical_term(term, definition)
    
    if explanation.startswith("❌"):
        return {"explanation": explanation, "metrics": None}
    
    # Calculate readability metrics
    try:
        metrics = {
            "flesch_reading_ease": flesch_reading_ease(explanation),
            "flesch_kincaid_grade": flesch_kincaid_grade(explanation),
            "word_count": len(explanation.split()),
            "sentence_count": explanation.count('.') + explanation.count('!') + explanation.count('?')
        }
        
        # Determine reading level
        grade_level = metrics["flesch_kincaid_grade"]
        if grade_level <= 8:
            level_assessment = "✅ Appropriate (Middle school level)"
        elif grade_level <= 10:
            level_assessment = "⚠️ Slightly high (Early high school)"
        else:
            level_assessment = "❌ Too complex (Advanced high school+)"
            
        metrics["level_assessment"] = level_assessment
        
    except Exception as e:
        metrics = {"error": f"Could not calculate metrics: {str(e)}"}
    
    return {
        "explanation": explanation,
        "metrics": metrics
    }

# Run demonstration
def debug_dictionary(sample_size=10):
    """
    Debug function to examine the structure of meddict.
    
    Args:
        sample_size (int): Number of sample keys to examine
    """
    print("🔍 Medical Dictionary Debug Info")
    print("=" * 40)
    
    keys = list(meddict.keys())
    print(f"Total terms in dictionary: {len(keys)}")
    print(f"\nFirst {sample_size} keys:")
    
    for i, key in enumerate(keys[:sample_size]):
        print(f"{i+1:2d}. '{key}' -> {len(meddict[key])} chars")
    
    # Check for specific problematic terms
    problematic_terms = ['A33', 'A6', 'AAP', 'AAT deficiency', 'abarelix']
    print(f"\nChecking problematic terms:")
    
    for term in problematic_terms:
        result = find_term_in_dict(term)
        if result:
            print(f"✅ Found '{term}': {result[:50]}...")
        else:
            print(f"❌ Not found: '{term}'")
            # Look for similar keys
            similar = [k for k in keys if term.lower() in k.lower()][:3]
            if similar:
                print(f"   Similar keys: {similar}")

if __name__ == "__main__":
    demo_explanations(5)

🏥 Medical Term Explanation Demo

1. Term: A33
------------------------------
A33 is a special kind of protein made in labs, similar to our immune system's natural defense against germs. It helps find and stick to cancer cells, like a magnet. This tool can help doctors detect or fight cancer more effectively.

2. Term: A6
------------------------------
A6 is an experimental substance we're looking into for treating cancer. It's like a tiny part of a natural protein that helps break down blood clots. In simpler terms, it stops new blood vessels from growing (antiangiogenesis) and prevents cancer cells from spreading (antimetastatic). We call it urokinase plasminogen activator (uPA)-derived peptide A6.

3. Term: Aap
------------------------------
The AAP, or alanine aminopeptidase, is an enzyme usually found in healthy kidneys. If you have issues with your kidneys, this enzyme might show up more than usual in your pee. Doctors use its presence to check for harm done to the kidneys from me

chatgpt baseline.

A33
A33 is a protein found on the surface of certain cells in the intestines and some types of cancer. Doctors can target it to help find or treat these cancers. It acts like a flag that helps identify where the cancer cells are.

A6
A6 is a small part of a protein that can help stop cancer from spreading. It works by blocking signals that tell cancer cells to move. Scientists are studying it to see if it can be used as a treatment.

AAP
AAP stands for American Academy of Pediatrics. It’s a group of doctors who focus on keeping children healthy. They create guidelines to help parents and doctors care for kids.

AAT deficiency
AAT deficiency is a condition where the body doesn't make enough of a protein that protects the lungs. Without it, the lungs can get damaged more easily, especially by smoking or pollution. It can also affect the liver in some people.

Abarelix
Abarelix is a medicine that lowers certain hormones in the body. It is mainly used to treat prostate cancer by stopping the cancer from growing. It works by blocking signals from the brain that tell the body to make these hormones.

# prompt, paragraph


In [36]:
text='''"Admission Date:  [**2118-6-2**]       Discharge Date:  [**2118-6-14**]

Date of Birth:                    Sex:  F

Service:  MICU and then to [**Doctor Last Name **] Medicine

HISTORY OF PRESENT ILLNESS:  This is an 81-year-old female
with a history of emphysema (not on home O2), who presents
with three days of shortness of breath thought by her primary
care doctor to be a COPD flare.  Two days prior to admission,
she was started on a prednisone taper and one day prior to
admission she required oxygen at home in order to maintain
oxygen saturation greater than 90%.  She has also been on
levofloxacin and nebulizers, and was not getting better, and
presented to the [**Hospital1 18**] Emergency Room.

In the [**Hospital3 **] Emergency Room, her oxygen saturation was
100% on CPAP.  She was not able to be weaned off of this
despite nebulizer treatment and Solu-Medrol 125 mg IV x2.

Review of systems is negative for the following:  Fevers,
chills, nausea, vomiting, night sweats, change in weight,
gastrointestinal complaints, neurologic changes, rashes,
palpitations, orthopnea.  Is positive for the following:
Chest pressure occasionally with shortness of breath with
exertion, some shortness of breath that is positionally
related, but is improved with nebulizer treatment.

PAST MEDICAL HISTORY:
1. COPD.  Last pulmonary function tests in [**2117-11-3**]
demonstrated a FVC of 52% of predicted, a FEV1 of 54% of
predicted, a MMF of 23% of predicted, and a FEV1:FVC ratio of
67% of predicted, that does not improve with bronchodilator
treatment.  The FVC, however, does significantly improve with
bronchodilator treatment consistent with her known reversible
air flow obstruction in addition to an underlying restrictive
ventilatory defect.  The patient has never been on home
oxygen prior to this recent episode.  She has never been on
steroid taper or been intubated in the past.
2. Lacunar CVA.  MRI of the head in [**2114-11-4**]
demonstrates "mild degree of multiple small foci of high T2
signal within the white matter of both cerebral hemispheres
as well as the pons, in the latter region predominantly to
the right of midline.  The abnormalities, while nonspecific
in etiology, are most likely secondary to chronic
microvascular infarction.  There is no mass, lesion, shift of
the normal midline strictures or hydrocephalus.  The major
vascular flow patterns are preserved.  There is moderate
right maxillary, moderate bilateral ethmoid, mild left
maxillary, minimal right sphenoid, and frontal sinus mucosal
thickening.  These abnormalities could represent an allergic
or some other type of inflammatory process.  Additionally
noted is a moderately enlarged subtotally empty sella
turcica".
3. Angina:  Most recent stress test was in [**2118-1-3**]
going for four minutes with a rate pressure product of
10,000, 64% of maximum predicted heart rate without evidence
of ischemic EKG changes or symptoms.  The imaging portion of
the study demonstrated no evidence of myocardial ischemia and
a calculated ejection fraction of 84%.  The patient denies
angina at rest and gets angina with walking a few blocks.
Are alleviated by sublingual nitroglycerin.
4. Hypothyroidism on Synthroid.
5. Depression on Lexapro.
6. Motor vehicle accident with head injury approximately 10
years ago.

MEDICATIONS ON ADMISSION:
1. Hydrochlorothiazide 25 q.d.
2. Prednisone 60 mg, 50 mg, 40 mg, 20 mg.
3. Levofloxacin 500 mg q.d.
4. Imdur 60 mg q.d.
5. Synthroid 75 mcg q.d.
6. Pulmicort nebulizer b.i.d.
7. Albuterol nebulizer q.4. prn.
8. Lexapro 10 mg q.d.
9. Protonix 40 mg q.d.
10. Aspirin 81 mg q.d.

ALLERGIES:  Norvasc leads to lightheadedness and headache.

FAMILY HISTORY:  Noncontributory.

SOCIAL HISTORY:  Lives with her husband, Dr. [**Known lastname 1809**] an
eminent Pediatric Neurologist at [**Hospital3 1810**].  The
patient is a prior smoker, but has not smoked in over 10
years.  She has no known alcohol use and she is a full code.

PHYSICAL EXAM AT TIME OF ADMISSION:  Blood pressure 142/76,
heart rate 100 and regular, respirations at 17-21, and 97%
axillary temperature.  She was saturating at 100% on CPAP
with dry mucous membranes.  An elderly female in no apparent
distress.  Pupils are equal, round, and reactive to light and
accommodation.  Extraocular movements are intact.  Oropharynx
difficult to assess due to CPAP machine.  No evidence of
jugular venous pressure, however, the strap from the CPAP
machine obscures the neck exam.  Cranial nerves II through
XII are grossly intact.  Neck is supple without
lymphadenopathy.  Heart exam:  Tachycardic, regular, obscured
by loud bilateral wheezing with increase in the expiratory
phase as well as profuse scattered rhonchi throughout the
lung fields.  Positive bowel sounds, soft, nontender,
nondistended, obese, no masses.  Mild edema of the lower
extremities without clubbing or cyanosis, no rashes.  There
is a right hand hematoma.  Strength is assessed as [**5-9**] in the
lower extremities, [**5-9**] in the upper extremities with a normal
mental status and cognition.

LABORATORY STUDIES:  White count 19, hematocrit 41, platelets
300.  Chem-7:  127, 3.6, 88, 29, 17, 0.6, 143.  Troponin was
negative.  CKs were negative times three.  Initial blood gas
showed a pH of 7.4, pO2 of 66, pCO2 of 54.

Chest x-ray demonstrates a moderate sized hiatal hernia,
segmental atelectasis, left lower lobe infiltrate versus
segmental atelectasis.

EKG shows normal sinus rhythm at 113 beats per minute, normal
axis, no evidence of ST-T wave changes.

BRIEF SUMMARY OF HOSPITAL COURSE:
1. COPD/dyspnea/pneumonia:  The patient was initially placed
on an aggressive steroid taper and admitted to the Medical
Intensive Care Unit due to her difficulty with oxygenation
despite CPAP machine.  She was also given nebulizer
treatments q.4h. as well as chest PT.  The nebulizers were
increased to q.1h. due to the fact that she continued to have
labored breathing.

Due to persistent respiratory failure and labored breathing,
the patient was intubated on [**2118-6-7**] in order to improve
oxygenation, ventilation, and ability to suction.  A
bronchoscopy was performed on [**2118-6-7**], which demonstrated
marked narrowing of the airways with expiration consistent
with tracheomalacia.

On [**2118-6-9**], two silicone stents were placed, one in the left
main stem (12 x 25 and one in the trachea 16 x 40) by Dr.
[**First Name (STitle) **] [**Name (STitle) **] under rigid bronchoscopy with general anesthesia.

On [**2118-6-11**], the patient was extubated to a cool mist shovel
mask and her oxygen was titrated down to 2 liters nasal
cannula at which time she was transferred to the medical
floor.  On the medical floor, the steroids were weaned to off
on [**2118-6-14**], and the patient was saturating at 97% on 2
liters, 92% on room air.

On [**2118-6-14**], the patient was seen again by the Interventional
Pulmonology service, who agreed that she looked much improved
and recommended that she go to pulmonary rehabilitation with
followup within six weeks' time status post placement of
stents in respiratory failure.

2. Cardiovascular:  The patient was ruled out for a MI.  She
did have another episode on the medical floor of chest pain,
which showed no evidence of EKG changes and negative
troponin, negative CKs x3.  She was continued on aspirin,
Imdur, and diltiazem for rate control per her outpatient
regimen.

3. Hypertension:  She was maintained on diltiazem and
hydrochlorothiazide with adequate blood pressure control and
normalization of electrolytes.

4. Hematuria:  The patient had intermittent hematuria likely
secondary to Foley placement.  The Foley catheter was
discontinued on [**2118-6-14**].  She had serial urinalyses, which
were all negative for signs of infection.

5. Hyperglycemia:  Patient was placed on insulin-sliding
scale due to hyperglycemia, which was steroid induced.  This
worked quite well and her glucose came back to normal levels
once the steroids were tapered to off.

6. Leukocytosis:  Patient did have a profound leukocytosis of
20 to 22 during much of her hospital course.  As the steroids
were tapered to off, her white blood cell count on [**2118-6-14**]
was 15,000.  It was felt that the leukocytosis was secondary
to both steroids as well as question of a left lower lobe
pneumonia.

7. For the left lower lobe pneumonia, the patient had
initially received a course of levofloxacin 500 p.o. q.d.
from [**2118-6-4**] to [**2118-6-10**].  This was restarted on [**2118-6-12**]
for an additional seven day course given the fact that she
still had the leukocytosis and still had marked rales at the
left lower lobe.

8. Hypothyroidism:  The patient was continued on outpatient
medical regimen.

9. Depression:  The patient was continued on Lexapro per
outpatient regimen.  It is recommended that she follow up
with a therapist as an outpatient due to the fact that she
did have a blunted affect throughout much of the hospital
course, and did appear clinically to be depressed.

10. Prophylaxis:  She was maintained on proton-pump inhibitor
with subQ Heparin.

11. Sore throat:  The patient did have a sore throat for much
of the hospital course post extubation.  This was treated
with Cepacol lozenges as well as KBL liquid (a solution
containing Kaopectate, Bismuth, and lidocaine) at bedtime.

12. Communication/code status:  The patient was full code
throughout her hospital course, and communication was
maintained with the patient and her husband.

13. Muscle weakness:  The patient did have profound muscle
weakness and was evaluated by Physical Therapy, and was found
to have impaired functional mobility, impaired
musculoskeletal performance, impaired gas exchange, impaired
endurance, impaired ventilation, and needed help with supine
to sit.  However, she was able to tolerate sitting in a chair
for approximately one hour.

On motor exam, her flexors and extensors of the lower
extremities were [**4-8**] at the knee, [**4-8**] at the ankle, [**4-8**] at
the elbows, and [**4-8**] hips.  It was felt that this weakness was
most likely due to a combination of steroid myopathy as well
as muscle atrophy secondary to deconditioning after a
prolonged hospital course.

14. Speech/swallow:  The patient had a Speech and Swallow
evaluation showing no evidence of dysphagia, no evidence of
vocal cord damage status post tracheal stent placement.

DISCHARGE CONDITION:  The patient was able to oxygenate on
room air at 93% at the time of discharge.  She was profoundly
weak, but was no longer tachycardic and had a normal blood
pressure.  Her respirations were much improved albeit with
transmitted upper airway sounds.

DISCHARGE STATUS:  The patient will be discharged to [**Hospital1 **]
for both pulmonary and physical rehabilitation.

DISCHARGE MEDICATIONS:
1. Levothyroxine 75 mcg p.o. q.d.
2. Citalopram 10 mg p.o. q.d.
3. Aspirin 81 mg p.o. q.d.
4. Fluticasone 110 mcg two puffs inhaled b.i.d.
5. Salmeterol Diskus one inhalation b.i.d.
6. Acetaminophen 325-650 mg p.o. q.4-6h. prn.
7. Ipratropium bromide MDI two puffs inhaled q.2h. prn.
8. Albuterol 1-2 puffs inhaled q.2h. prn.
9. Zolpidem tartrate 5 mg p.o. q.h.s. prn.
10. Isosorbide dinitrate 10 mg p.o. t.i.d.
11. Diltiazem 60 mg p.o. q.i.d.
12. Pantoprazole 40 mg p.o. q.24h.
13. Trazodone 25 mg p.o. q.h.s. prn.
14. SubQ Heparin 5000 units subcutaneous b.i.d. until such
time that the patient is able to get out of bed twice a day.
15. Cepacol lozenges q.2h. prn.
16. Levofloxacin 500 mg p.o. q.d. for a seven day course to
be completed on [**2118-6-21**].
17. Kaopectate/Benadryl/lidocaine 5 mL p.o. b.i.d. prn, not
to be given around mealtimes for concern of dysphagia induced
by lidocaine.
18. Lorazepam 0.5-2 mg IV q.6h. prn.

FOLLOW-UP PLANS:  The patient is recommended to followup with
Dr. [**First Name4 (NamePattern1) **] [**Last Name (NamePattern1) 1407**], [**Telephone/Fax (1) 1408**] within two weeks of leaving
of the hospital.  She is also recommended to followup with
the Interventional Pulmonary service for followup status post
stent placement.  She is also recommended to followup with a
neurologist if her muscle weakness does not improve within
one week on physical therapy with concern for steroid-induced
myopathy.

FINAL DIAGNOSES:
1. Tracheomalacia status post tracheal and left main stem
bronchial stent placement.
2. Hypertension.
3. Hypothyroidism.
4. Restrictive lung defect.
5. Depression.


                     DR.[**Last Name (STitle) **],[**First Name3 (LF) **] 12-207


Dictated By:[**Last Name (NamePattern1) 1811**]
MEDQUIST36

D:  [**2118-6-14**]  11:30
T:  [**2118-6-14**]  11:33
JOB#:  [**Job Number 1812**]"'''

In [74]:
# Modular Medical Text Explanation System
# Returns clean dict with configurable steps: extract, generate, clean

import re
import torch
from collections import defaultdict

class MedicalTextExplainer:
    def __init__(self, model, tokenizer, meddict):
        self.model = model
        self.tokenizer = tokenizer
        self.meddict = meddict
        
        # Improved generation config for better outputs
        self.generation_config = {
            "max_new_tokens": 999,              # Increased for fuller explanations
            "temperature": 0.3,
            "top_p": 0.85,
            "repetition_penalty": 1.15,
            "do_sample": True,
            "pad_token_id": tokenizer.eos_token_id,
            "eos_token_id": tokenizer.eos_token_id,  # Ensure proper stopping
            "early_stopping": True,              # Stop when EOS token is generated
        }
        
        # Define available steps
        self.available_steps = ['extract', 'generate', 'clean']
        self.step_functions = {
            'extract': self._step_extract_terms,
            'generate': self._step_generate_explanation,
            'clean': self._step_clean_response
        }
    
    def explain_medical_text(self, text, steps=['extract', 'generate', 'clean']):
        """
        Modular main function that runs configurable steps.
        
        Args:
            text (str): Input medical text
            steps (list): List of steps to run. Options: ['extract', 'generate', 'clean']
                         - 'extract': Extract medical terms from text
                         - 'generate': Generate explanation from model  
                         - 'clean': Clean the raw output
        
        Returns:
            dict: Contains results from each step that was run
        """
        
        # Validate steps
        invalid_steps = [step for step in steps if step not in self.available_steps]
        if invalid_steps:
            raise ValueError(f"Invalid steps: {invalid_steps}. Available steps: {self.available_steps}")
        
        # Initialize result with original text
        result = {
            'original_text': text,
            'steps_run': steps.copy(),
            'available_steps': self.available_steps.copy()
        }
        
        # Run each step in sequence, passing result between steps
        for step in steps:
            print(f"🔄 Running step: {step}")
            result = self.step_functions[step](result)
        
        print(f"✅ Completed {len(steps)} steps: {steps}")
        return result
    
    def _step_extract_terms(self, result):
        """Step 1: Extract medical terms from text."""
        text = result['original_text']
        found_terms = self.extract_medical_terms(text)
        
        result.update({
            'found_terms': found_terms,
            'terms_count': len(found_terms)
        })
        
        print(f"   📋 Found {len(found_terms)} medical terms")
        return result
    
    def _step_generate_explanation(self, result):
        """Step 2: Generate explanation using model."""
        text = result['original_text']
        
        # Use found terms if extract step was run, otherwise extract them now
        if 'found_terms' in result:
            found_terms = result['found_terms']
        else:
            print("   ⚠️  Terms not extracted yet, extracting now...")
            found_terms = self.extract_medical_terms(text)
            result['found_terms'] = found_terms
        
        # Create prompt and generate
        prompt = self.create_prompt(text, found_terms)
        raw_output = self._generate_raw_response(prompt)
        
        result.update({
            'prompt': prompt,
            'raw_output': raw_output,
            'prompt_length': len(prompt),
            'raw_output_length': len(raw_output)
        })
        
        print(f"   🤖 Generated {len(raw_output)} character response")
        return result
    
    def _step_clean_response(self, result):
        """Step 3: Clean the raw model output."""
        
        # Check if we have raw output to clean
        if 'raw_output' not in result:
            raise ValueError("No raw_output found. Must run 'generate' step before 'clean' step.")
        
        raw_output = result['raw_output']
        original_prompt = result.get('prompt', '')
        
        # Clean the response
        cleaned_output = self.model_clean_response(raw_output, original_prompt)
        cleaning_prompt = self.create_cleaning_prompt(raw_output, original_prompt)
        
        result.update({
            'cleaning_prompt': cleaning_prompt,
            'cleaned_output': cleaned_output,
            'cleaned_length': len(cleaned_output)
        })
        
        print(f"   🧹 Cleaned to {len(cleaned_output)} characters")
        return result
    
    def extract_medical_terms(self, text):
        """
        Enhanced medical term extraction to catch more terms from complex texts.
        """
        found_terms = {}
        
        # Strategy 1: Single words (improved regex for medical terms)
        words = re.findall(r'\b[A-Za-z]+(?:[-\'][A-Za-z]+)*\b', text)
        for word in words:
            definition = self.find_term_in_dict(word)
            if definition:
                found_terms[word] = definition
        
        # Strategy 2: Multi-word terms (expanded range for complex medical phrases)
        for n in range(2, 6):  # Increased from 5 to 6 for longer medical phrases
            n_grams = self.get_n_grams(text, n)
            for phrase in n_grams:
                definition = self.find_term_in_dict(phrase)
                if definition:
                    found_terms[phrase] = definition
        
        # Strategy 3: Medical abbreviations (enhanced pattern)
        abbreviations = re.findall(r'\b[A-Z]{2,8}\b', text)  # Increased from 6 to 8
        for abbrev in abbreviations:
            definition = self.find_term_in_dict(abbrev)
            if definition:
                found_terms[abbrev] = definition
        
        # Strategy 4: Medical procedures and conditions with specific patterns
        medical_patterns = [
            r'\b\w+oscopy\b',          # bronchoscopy, endoscopy, etc.
            r'\b\w+ectomy\b',          # appendectomy, etc.
            r'\b\w+itis\b',            # bronchitis, arthritis, etc.
            r'\b\w+osis\b',            # fibrosis, stenosis, etc.
            r'\b\w+emia\b',            # anemia, septicemia, etc.
            r'\b\w+pathy\b',           # myopathy, neuropathy, etc.
            r'\b\w+malacia\b',         # tracheomalacia, etc.
        ]
        
        for pattern in medical_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            for match in matches:
                definition = self.find_term_in_dict(match)
                if definition:
                    found_terms[match] = definition
        
        # Strategy 5: Medication names (often end in specific suffixes)
        medication_patterns = [
            r'\b\w+cillin\b',          # penicillin, amoxicillin, etc.
            r'\b\w+mycin\b',           # streptomycin, etc.
            r'\b\w+floxacin\b',        # levofloxacin, ciprofloxacin, etc.
            r'\b\w+sone\b',            # prednisone, cortisone, etc.
            r'\b\w+pam\b',             # lorazepam, etc.
        ]
        
        for pattern in medication_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            for match in matches:
                definition = self.find_term_in_dict(match)
                if definition:
                    found_terms[match] = definition
        
        return found_terms
    
    def get_n_grams(self, text, n):
        """Generate n-grams from text."""
        words = re.findall(r'\b[A-Za-z]+\b', text.lower())
        n_grams = []
        for i in range(len(words) - n + 1):
            phrase = ' '.join(words[i:i+n])
            n_grams.append(phrase)
        return n_grams
    
    def find_term_in_dict(self, term):
        """Find term in medical dictionary."""
        search_formats = [
            term, term.lower(), term.upper(), term.title(), term.capitalize()
        ]
        
        for search_term in search_formats:
            if search_term in self.meddict:
                return self.meddict[search_term]
        
        # Partial matching
        for key in self.meddict.keys():
            if key.lower() == term.lower():
                return self.meddict[key]
        
        return None
    
    def create_prompt(self, original_text, found_terms):
        """
        Create the exact prompt that will be sent to the model.
        Enhanced with better instructions for patient-focused explanations.
        """
        
        # Format found terms for inclusion in prompt
        terms_section = ""
        if found_terms:
            terms_section = "Found medical terms and their definitions:\n"
            for term, definition in found_terms.items():
                terms_section += f"- {term}: {definition}\n"
        else:
            terms_section = "No medical terms found in dictionary.\n"
        
        # Enhanced prompt with direct patient addressing
        prompt = f"""<s>[INST] You are a medical doctor and educator who explains medical information directly to patients in simple, caring language.

Original medical text:
"{original_text}"

{terms_section}

Your task:
1. Address the patient directly using "you" and "your" (e.g., "You have a condition called..." not "This is a story about...")
2. Explain the medical content in simple, everyday language that anyone can understand
3. For EVERY medical term found above, provide a clear explanation using middle school level words
4. Be accurate, reassuring, and caring in tone
5. Organize your explanation clearly: condition → what happened → treatments → outcome
6. Include key procedures and treatments mentioned in the text (e.g., code status related procedures, procedures, medications, and test results)
7. Explain why treatments were necessary and how they helped
8. End with encouraging words about recovery and next steps

Please provide your patient-focused explanation now:

[/INST]"""
        
        return prompt
    
    def _generate_raw_response(self, prompt):
        """
        Generate raw response from model (separated from cleaning).
        """
        try:
            # Tokenize input
            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True).to(self.model.device)
            input_length = inputs["input_ids"].shape[1]
            
            # Generate response
            with torch.no_grad():
                outputs = self.model.generate(
                    inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    **self.generation_config
                )
            
            # Extract only the NEW tokens (response), not the input prompt
            response_tokens = outputs[0][input_length:]  # Skip input tokens
            raw_output = self.tokenizer.decode(response_tokens, skip_special_tokens=False)
            
            return raw_output
            
        except Exception as e:
            return f"❌ Error generating response: {str(e)}"
    
    def model_clean_response(self, raw_response, original_prompt):
        """
        Enhanced cleaning with better instructions for patient-focused, structured output.
        """
        
        # If response is very short or error, return as-is
        if len(raw_response.strip()) < 10 or raw_response.startswith("❌"):
            return raw_response
        
        cleaning_prompt = f"""<s>[INST] You are a medical text editor. Clean up this patient explanation to make it perfect.

ORIGINAL PROMPT GIVEN TO MODEL:
{original_prompt}

RAW MODEL OUTPUT TO CLEAN:
"{raw_response}"

CLEANING INSTRUCTIONS:
1. Remove any special tokens like <s>, </s>, [INST], [/INST]
2. Fix any anonymization issues - use "you" and "your" consistently instead of "[**Patient Name**]" 
3. Remove any incomplete sentences at the end
4. Remove repetitive or garbled text
5. Ensure direct patient addressing throughout ("You have..." not "The patient has...")
6. Organize clearly: Your condition → What happened → Treatments → Outcome → Next steps
7. Make sure all medical terms mentioned are properly explained in simple language
8. Keep the caring, reassuring tone
9. Ensure it flows naturally and is easy to read
10. End on an encouraging, positive note about recovery
11. Do not skip medical procedures related to code status (intubation, extubation, and Cardiopulmonary Resuscitation)

Return only the clean, patient-focused explanation:

[/INST]"""
        
        try:
            inputs = self.tokenizer(cleaning_prompt, return_tensors="pt", truncation=True).to(self.model.device)
            input_length = inputs["input_ids"].shape[1]
            
            # Use enhanced generation config for cleaning
            cleaning_config = {
                "max_new_tokens": 600,  # Increased for more complete cleaning
                "temperature": 0.1,
                "top_p": 0.9,
                "repetition_penalty": 1.1,
                "do_sample": True,
                "pad_token_id": self.tokenizer.eos_token_id,
                "eos_token_id": self.tokenizer.eos_token_id,
                "early_stopping": True,
            }
            
            with torch.no_grad():
                outputs = self.model.generate(
                    inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    **cleaning_config
                )
            
            # Extract only the cleaned response
            response_tokens = outputs[0][input_length:]
            cleaned = self.tokenizer.decode(response_tokens, skip_special_tokens=True)
            
            # Post-process to fix common issues
            cleaned = self.post_process_patient_text(cleaned.strip())
            
            return cleaned
            
        except Exception as e:
            # Fallback to enhanced cleaning if model cleaning fails
            return self.enhanced_fallback_clean(raw_response)
    
    def post_process_patient_text(self, text):
        """
        Post-process the cleaned text to ensure patient-focused language.
        """
        # Fix anonymization placeholders
        anonymization_fixes = [
            (r'\[?\*?\*?Patient Name\*?\*?\]?', 'you'),
            (r'\[?\*?\*?Patient\*?\*?\]?', 'you'),
            (r'the patient', 'you'),
            (r'The patient', 'You'),
            (r'This patient', 'You'),
            (r'this patient', 'you'),
            (r'A patient', 'You'),
            (r'a patient', 'you'),
        ]
        
        for pattern, replacement in anonymization_fixes:
            text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
        
        # Fix common grammar issues after pronoun replacement
        text = re.sub(r'\byou was\b', 'you were', text, flags=re.IGNORECASE)
        text = re.sub(r'\byou has\b', 'you have', text, flags=re.IGNORECASE)
        text = re.sub(r'\byou is\b', 'you are', text, flags=re.IGNORECASE)
        
        # Remove story-like introductions
        story_patterns = [
            r'^This is a story about.*?\.',
            r'^Let me tell you about.*?\.',
            r'^Here\'s what happened.*?\.',
        ]
        
        for pattern in story_patterns:
            text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.DOTALL)
        
        # Clean up extra whitespace
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()
        
        # Ensure it starts with "You" if it doesn't already
        if not text.lower().startswith('you'):
            text = 'You ' + text.lower()
        
        return text
    
    def enhanced_fallback_clean(self, raw_output):
        """
        Enhanced fallback cleaning with patient-focused improvements.
        """
        cleaned = raw_output
        
        # Remove special tokens
        cleanup_patterns = ["</s>", "<s>", "[INST]", "[/INST]"]
        for pattern in cleanup_patterns:
            cleaned = cleaned.replace(pattern, "")
        
        # Apply patient-focused post-processing
        cleaned = self.post_process_patient_text(cleaned)
        
        # Remove incomplete sentences more intelligently
        sentences = re.split(r'[.!?]+', cleaned)
        complete_sentences = []
        
        for sentence in sentences:
            sentence = sentence.strip()
            if len(sentence) > 10 and not sentence.endswith(('and', 'or', 'but', 'the', 'a', 'an')):
                complete_sentences.append(sentence)
        
        if complete_sentences:
            result = '. '.join(complete_sentences) + '.'
            # Fix double periods
            result = re.sub(r'\.+', '.', result)
            return result
        
        return cleaned
    
    def create_cleaning_prompt(self, raw_response, original_prompt):
        """
        Create the enhanced cleaning prompt for transparency.
        """
        return f"""<s>[INST] You are a medical text editor. Clean up this patient explanation to make it perfect.

ORIGINAL PROMPT GIVEN TO MODEL:
{original_prompt}

RAW MODEL OUTPUT TO CLEAN:
"{raw_response}"

CLEANING INSTRUCTIONS:
1. Remove any special tokens like <s>, </s>, [INST], [/INST]
2. Fix any anonymization issues - use "you" and "your" consistently instead of "[**Patient Name**]" 
3. Remove any incomplete sentences at the end
4. Remove repetitive or garbled text
5. Ensure direct patient addressing throughout ("You have..." not "The patient has...")
6. Organize clearly: Your condition → What happened → Treatments → Outcome → Next steps
7. Make sure all medical terms mentioned are properly explained in simple language
8. Keep the caring, reassuring tone
9. Ensure it flows naturally and is easy to read
10. End on an encouraging, positive note about recovery

Return only the clean, patient-focused explanation:

[/INST]"""

    def clean_raw_output(self, raw_output, original_prompt=""):
        """
        Directly clean a raw output with optional original prompt context.
        
        Args:
            raw_output (str): Raw model output to clean
            original_prompt (str): Original prompt for context (optional)
            
        Returns:
            dict: Cleaning results
        """
        print("🧹 Cleaning raw output directly...")
        
        cleaned_output = self.model_clean_response(raw_output, original_prompt)
        cleaning_prompt = self.create_cleaning_prompt(raw_output, original_prompt)
        
        result = {
            'raw_output': raw_output,
            'original_prompt': original_prompt,
            'cleaning_prompt': cleaning_prompt,
            'cleaned_output': cleaned_output,
            'cleaned_length': len(cleaned_output)
        }
        
        print(f"   ✅ Cleaned to {len(cleaned_output)} characters")
        return result
        """
        Create the enhanced cleaning prompt for transparency.
        """
        return f"""<s>[INST] You are a medical text editor. Clean up this patient explanation to make it perfect.

ORIGINAL PROMPT GIVEN TO MODEL:
{original_prompt}

RAW MODEL OUTPUT TO CLEAN:
"{raw_response}"

CLEANING INSTRUCTIONS:
1. Remove any special tokens like <s>, </s>, [INST], [/INST]
2. Fix any anonymization issues - use "you" and "your" consistently instead of "[**Patient Name**]" 
3. Remove any incomplete sentences at the end
4. Remove repetitive or garbled text
5. Ensure direct patient addressing throughout ("You have..." not "The patient has...")
6. Organize clearly: Your condition → What happened → Treatments → Outcome → Next steps
7. Make sure all medical terms mentioned are properly explained in simple language
8. Keep the caring, reassuring tone
9. Ensure it flows naturally and is easy to read
10. End on an encouraging, positive note about recovery

Return only the clean, patient-focused explanation:

[/INST]"""

# Demo and usage examples
def demo_modular_system():
    """Demonstrate the new modular step system."""
    
    print("🔧 MODULAR MEDICAL EXPLANATION SYSTEM")
    print("=" * 60)
    
    print("📋 Available Step Combinations:")
    print()
    
    print("1️⃣ FULL PIPELINE - All steps (default)")
    print("   result = explainer.explain_medical_text(text)")
    print("   result = explainer.explain_medical_text(text, steps=['extract', 'generate', 'clean'])")
    print()
    
    print("2️⃣ SKIP CLEANING - Raw model output")
    print("   result = explainer.explain_medical_text(text, steps=['extract', 'generate'])")
    print()
    
    print("3️⃣ ONLY EXTRACT - Just find medical terms")
    print("   result = explainer.explain_medical_text(text, steps=['extract'])")
    print()
    
    print("4️⃣ GENERATE ONLY - Skip extraction")
    print("   result = explainer.explain_medical_text(text, steps=['generate'])")
    print()
    
    print("5️⃣ CUSTOM PIPELINE - Generate then clean")
    print("   result = explainer.explain_medical_text(text, steps=['generate', 'clean'])")
    print()
    
    print("✅ BENEFITS:")
    print("   • Run only the steps you need")
    print("   • Debug individual steps") 
    print("   • Compare raw vs cleaned outputs")
    print("   • Skip expensive cleaning for quick tests")

if __name__ == "__main__":
    demo_modular_system()

🔧 MODULAR MEDICAL EXPLANATION SYSTEM
📋 Available Step Combinations:

1️⃣ FULL PIPELINE - All steps (default)
   result = explainer.explain_medical_text(text)
   result = explainer.explain_medical_text(text, steps=['extract', 'generate', 'clean'])

2️⃣ SKIP CLEANING - Raw model output
   result = explainer.explain_medical_text(text, steps=['extract', 'generate'])

3️⃣ ONLY EXTRACT - Just find medical terms
   result = explainer.explain_medical_text(text, steps=['extract'])

4️⃣ GENERATE ONLY - Skip extraction
   result = explainer.explain_medical_text(text, steps=['generate'])

5️⃣ CUSTOM PIPELINE - Generate then clean
   result = explainer.explain_medical_text(text, steps=['generate', 'clean'])

✅ BENEFITS:
   • Run only the steps you need
   • Debug individual steps
   • Compare raw vs cleaned outputs
   • Skip expensive cleaning for quick tests


In [75]:
explainer = MedicalTextExplainer(model, tokenizer, meddict)

In [76]:
result = explainer.explain_medical_text(text, steps=['extract', 'generate'])
cleaned_result = explainer.clean_raw_output(
    raw_output=result['raw_output'], 
    original_prompt=result['prompt']
)

🔄 Running step: extract


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


   📋 Found 131 medical terms
🔄 Running step: generate


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


   🤖 Generated 2973 character response
✅ Completed 2 steps: ['extract', 'generate']
🧹 Cleaning raw output directly...
   ✅ Cleaned to 2603 characters


In [77]:
for k,v in result.items():
    if k not in ['original_text', 'found_terms','prompt']:
        print('='*44)
        print(k, v)

steps_run ['extract', 'generate']
available_steps ['extract', 'generate', 'clean']
terms_count 131
raw_output Dear [Patient's Name],

I want to explain your recent stay in the hospital and what happened to you. You are an 81-year-old woman who has been living with emphysema, a condition that affects your lungs and makes it difficult to breathe. About three days before coming to the hospital, you started feeling short of breath, and your primary care doctor thought it might be a flare-up of your emphysema.

To help with your breathing, your doctor prescribed prednisone, a medication that reduces inflammation in your lungs. Unfortunately, your condition didn't improve, and you needed oxygen at home to keep your oxygen levels above 90%. Despite receiving oxygen, nebulizer treatments, and a strong dose of solumedrol, a type of steroid, in the emergency room, your oxygen levels remained low.

When you arrived at the hospital, your oxygen levels were 100% on continuous positive airway pressu