## imports

In [1]:
import json
import csv
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
# from huggingface_hub import login

# login(token="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")

from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from transformers import Trainer, TrainingArguments
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


## predef

In [2]:
from textstat import flesch_kincaid_grade, flesch_reading_ease

def calculate_readability_scores(text, term=None):
    """Calculate readability scores for a piece of text, optionally removing a term"""
    scoring_text = text
    
    # Remove the term and its variations before scoring if provided
    if term:
        # Create variations of the term to remove (capitalized, lowercase)
        term_variations = [term, term.lower(), term.capitalize()]
        
        for variation in term_variations:
            scoring_text = scoring_text.replace(variation, "")
        
        # Clean up any double spaces created
        scoring_text = " ".join(scoring_text.split())
    
    fk_grade = flesch_kincaid_grade(scoring_text)
    readability = flesch_reading_ease(scoring_text)
    return {
        "fk_grade": fk_grade,
        "readability": readability,
    }

## data

In [3]:
def txt_to_dict(file_path):
    data_dict = {}
    with open(file_path, 'r') as file:
        lines = file.readlines()
        for i in range(0, len(lines) - 1, 2):
            key = lines[i].strip()    # Odd line are key
            value = lines[i + 1].strip()  # Even line are value
            data_dict[key] = value

    return data_dict


txt_file_path = 'formaldef.txt' 
formaldic = txt_to_dict(txt_file_path)
len(formaldic)

9369

In [4]:
meddict={}
for k,v in formaldic.items():
    meddict[k.split('Listen to pronunciation')[0].split('(')[0]]=v

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import random

# Model Configuration
model_name = "NousResearch/Llama-2-7b-chat-hf"
device_map = {"": 0}

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# Load model with quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config
)


Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.75s/it]


# promt

In [6]:

# Improved prompt template with clearer instruction formatting
explanation_template = """<s>[INST] You are a medical educator explaining complex medical terms.

The term is: {term}

The medical definition is: {definition}

Your task:
1. Explain this term in simple language a non-medical person would understand
2. Avoid using technical jargon entirely
3. Write at a middle school reading level (grades 7-8)
4. Do not include examples
5. Be concise (maximum 3 short sentences)
6. Maintain medical accuracy

IMPORTANT: Begin your answer with the actual explanation. Do not write any introductory text like "Okay!" or "Here's my attempt..." or "Let me explain..."
[/INST]
"""

# Generation parameters
generation_config = {
    "max_new_tokens": 512,
    "temperature": 0.7,
    "top_p": 0.9,
    "repetition_penalty": 1.1,
    "do_sample": True  # Enable sampling for more creative responses
}

# Function to explain a medical term
def explain_medical_term(term):
    # Look up the term in the dictionary
    if term in meddict:
        definition = meddict[term]
    else:
        return f""
    
    # Create prompt with the definition
    prompt = explanation_template.format(term=term, definition=definition)
    
    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    # Generate explanation
    with torch.no_grad():
        output = model.generate(
            inputs["input_ids"],
            max_new_tokens=generation_config["max_new_tokens"],
            temperature=generation_config["temperature"],
            top_p=generation_config["top_p"],
            repetition_penalty=generation_config["repetition_penalty"],
            do_sample=generation_config["do_sample"]
        )
    
    # Decode the response - don't skip special tokens to handle chat format
    explanation = tokenizer.decode(output[0], skip_special_tokens=False)
    
    # Extract just the model's response part using string manipulation
    # For Llama-2-chat models, responses typically follow [/INST]
    if "[/INST]" in explanation:
        explanation = explanation.split("[/INST]")[1].strip()
    else:
        # Fallback - just remove the prompt
        explanation = explanation.replace(prompt, "").strip()

    # Remove any ending tokens
    if "</s>" in explanation:
        explanation = explanation.split("</s>")[0].strip()
    
    return explanation


In [7]:
# torch.cuda.empty_cache() # reset context when try new prompt

## generate

In [8]:
from textstat import (flesch_kincaid_grade, flesch_reading_ease, 
                     smog_index, gunning_fog, dale_chall_readability_score,
                     text_standard, syllable_count)
import re
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

# Download NLTK resources (run once)
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

def comprehensive_readability_analysis(formal_text, simplified_text, term=None):
    """
    Calculate multiple readability and complexity metrics for medical texts
    
    Args:
        formal_text: The original medical definition
        simplified_text: The simplified explanation
        term: The medical term to exclude from analysis (optional)
    
    Returns:
        Dictionary containing all metrics and comparisons
    """
    # Clean texts for analysis (remove the term if provided)
    formal_clean = formal_text
    simplified_clean = simplified_text
    
    if term:
        term_variations = [term, term.lower(), term.capitalize()]
        for variation in term_variations:
            formal_clean = re.sub(r'\b' + re.escape(variation) + r'\b', '', formal_clean)
            simplified_clean = re.sub(r'\b' + re.escape(variation) + r'\b', '', simplified_clean)
        
        # Clean up double spaces
        formal_clean = ' '.join(formal_clean.split())
        simplified_clean = ' '.join(simplified_clean.split())
    
    # Tokenize for word-level analysis
    formal_words = word_tokenize(formal_clean.lower())
    simplified_words = word_tokenize(simplified_clean.lower())
    
    # Calculate traditional readability metrics
    metrics = {
        "formal": {
            "fk_grade": flesch_kincaid_grade(formal_clean),
            "flesch_ease": flesch_reading_ease(formal_clean),
            "smog": smog_index(formal_clean),
            "gunning_fog": gunning_fog(formal_clean),
            "dale_chall": dale_chall_readability_score(formal_clean),
            "text_standard": text_standard(formal_clean),
            "word_count": len(formal_words),
            "avg_word_length": sum(len(word) for word in formal_words) / len(formal_words) if formal_words else 0,
            "avg_syllables_per_word": syllable_count(formal_clean) / len(formal_words) if formal_words else 0,
        },
        "simplified": {
            "fk_grade": flesch_kincaid_grade(simplified_clean),
            "flesch_ease": flesch_reading_ease(simplified_clean),
            "smog": smog_index(simplified_clean),
            "gunning_fog": gunning_fog(simplified_clean),
            "dale_chall": dale_chall_readability_score(simplified_clean),
            "text_standard": text_standard(simplified_clean),
            "word_count": len(simplified_words),
            "avg_word_length": sum(len(word) for word in simplified_words) / len(simplified_words) if simplified_words else 0,
            "avg_syllables_per_word": syllable_count(simplified_clean) / len(simplified_words) if simplified_words else 0,
        }
    }
    
    # Calculate lexical complexity measures
    
    # Type-Token Ratio (vocabulary diversity)
    metrics["formal"]["type_token_ratio"] = len(set(formal_words)) / len(formal_words) if formal_words else 0
    metrics["simplified"]["type_token_ratio"] = len(set(simplified_words)) / len(simplified_words) if simplified_words else 0
    
    # Long word percentage (words with 3+ syllables)
    formal_long_words = sum(1 for word in formal_words if syllable_count(word) >= 3)
    simplified_long_words = sum(1 for word in simplified_words if syllable_count(word) >= 3)
    
    metrics["formal"]["long_word_percentage"] = formal_long_words / len(formal_words) * 100 if formal_words else 0
    metrics["simplified"]["long_word_percentage"] = simplified_long_words / len(simplified_words) * 100 if simplified_words else 0
    
    # Calculate improvements
    metrics["improvements"] = {
        "fk_grade_reduction": metrics["formal"]["fk_grade"] - metrics["simplified"]["fk_grade"],
        "flesch_ease_improvement": metrics["simplified"]["flesch_ease"] - metrics["formal"]["flesch_ease"],
        "smog_reduction": metrics["formal"]["smog"] - metrics["simplified"]["smog"],
        "gunning_fog_reduction": metrics["formal"]["gunning_fog"] - metrics["simplified"]["gunning_fog"],
        "dale_chall_reduction": metrics["formal"]["dale_chall"] - metrics["simplified"]["dale_chall"],
        "word_length_reduction": metrics["formal"]["avg_word_length"] - metrics["simplified"]["avg_word_length"],
        "syllable_reduction": metrics["formal"]["avg_syllables_per_word"] - metrics["simplified"]["avg_syllables_per_word"],
        "long_word_percentage_reduction": metrics["formal"]["long_word_percentage"] - metrics["simplified"]["long_word_percentage"],
    }
    
    return metrics

def print_readability_analysis(term, formal_definition, simplified_explanation):
    """Print detailed readability analysis with consistent formatting"""
    
    # Format both texts with consistent line breaks
    char_limit = 80
    
    # Format formal definition with line breaks
    formatted_definition = ""
    for i in range(0, len(formal_definition), char_limit):
        formatted_definition += formal_definition[i:i+char_limit] + "\n"
    
    # Format simplified explanation with the same line breaks
    formatted_explanation = ""
    for i in range(0, len(simplified_explanation), char_limit):
        formatted_explanation += simplified_explanation[i:i+char_limit] + "\n"
    
    # Calculate metrics
    metrics = comprehensive_readability_analysis(formal_definition, simplified_explanation, term)
    
    # Print term and texts
    print(f"\nTERM: {term}")
    print(f"{'-'*50}")
    print(f"FORMAL DEFINITION:")
    print(formatted_definition)
    print(f"{'-'*50}")
    print(f"SIMPLIFIED EXPLANATION:")
    print(formatted_explanation)
    print(f"{'-'*50}")
    
    # Print basic metrics table
    print(f"READABILITY METRICS:")
    print(f"{'Metric':<25} {'Formal':<10} {'Simplified':<10} {'Improvement':<10}")
    print(f"{'-'*60}")
    
    # Traditional readability scores
    print(f"{'Flesch-Kincaid Grade':<25} {metrics['formal']['fk_grade']:<10.1f} {metrics['simplified']['fk_grade']:<10.1f} {metrics['improvements']['fk_grade_reduction']:<10.1f}")
    print(f"{'Flesch Reading Ease':<25} {metrics['formal']['flesch_ease']:<10.1f} {metrics['simplified']['flesch_ease']:<10.1f} {metrics['improvements']['flesch_ease_improvement']:<10.1f}")
    print(f"{'SMOG Index':<25} {metrics['formal']['smog']:<10.1f} {metrics['simplified']['smog']:<10.1f} {metrics['improvements']['smog_reduction']:<10.1f}")
    print(f"{'Gunning Fog':<25} {metrics['formal']['gunning_fog']:<10.1f} {metrics['simplified']['gunning_fog']:<10.1f} {metrics['improvements']['gunning_fog_reduction']:<10.1f}")
    print(f"{'Dale-Chall Score':<25} {metrics['formal']['dale_chall']:<10.1f} {metrics['simplified']['dale_chall']:<10.1f} {metrics['improvements']['dale_chall_reduction']:<10.1f}")
    
    # Lexical complexity
    print(f"{'-'*60}")
    print(f"{'Avg Word Length':<25} {metrics['formal']['avg_word_length']:<10.2f} {metrics['simplified']['avg_word_length']:<10.2f} {metrics['improvements']['word_length_reduction']:<10.2f}")
    print(f"{'Avg Syllables/Word':<25} {metrics['formal']['avg_syllables_per_word']:<10.2f} {metrics['simplified']['avg_syllables_per_word']:<10.2f} {metrics['improvements']['syllable_reduction']:<10.2f}")
    print(f"{'Long Words (%)':<25} {metrics['formal']['long_word_percentage']:<10.1f} {metrics['simplified']['long_word_percentage']:<10.1f} {metrics['improvements']['long_word_percentage_reduction']:<10.1f}")
    print(f"{'Type-Token Ratio':<25} {metrics['formal']['type_token_ratio']:<10.3f} {metrics['simplified']['type_token_ratio']:<10.3f} {'N/A':<10}")
    
    # Summary
    print(f"{'-'*60}")
    print(f"{'Educational Level':<25} {metrics['formal']['text_standard']:<25} {metrics['simplified']['text_standard']}")
    print(f"{'-'*60}")
    
    return metrics

# Usage in your explain_random_term function
def explain_random_term():
    random_term = random.choice(list(meddict.keys()))
    formal_definition = f"{random_term} is {meddict[random_term]}"
    explanation = explain_medical_term(random_term)
    
    # Print comprehensive analysis
    metrics = print_readability_analysis(random_term, formal_definition, explanation)
    
    return random_term, formal_definition, explanation, metrics

# Test with a random term
explain_random_term()


TERM: hypoxic
--------------------------------------------------
FORMAL DEFINITION:
hypoxic is Having too little oxygen.

--------------------------------------------------
SIMPLIFIED EXPLANATION:
Hypoxic means having too little oxygen in the body. When our bodies don't get en
ough oxygen, it can cause problems like headaches, fatigue, and even organ damag
e. It's important to make sure we breathe in enough oxygen so our bodies can wor
k properly.

--------------------------------------------------
READABILITY METRICS:
Metric                    Formal     Simplified Improvement
------------------------------------------------------------
Flesch-Kincaid Grade      7.6        8.8        -1.2      
Flesch Reading Ease       49.5       57.3       7.8       
SMOG Index                0.0        10.5       -10.5     
Gunning Fog               10.0       7.5        2.5       
Dale-Chall Score          7.0        6.2        0.8       
----------------------------------------------------------

('hypoxic',
 'hypoxic is Having too little oxygen.',
 "Hypoxic means having too little oxygen in the body. When our bodies don't get enough oxygen, it can cause problems like headaches, fatigue, and even organ damage. It's important to make sure we breathe in enough oxygen so our bodies can work properly.",
 {'formal': {'fk_grade': 7.6,
   'flesch_ease': 49.48,
   'smog': 0.0,
   'gunning_fog': 10.0,
   'dale_chall': 7.04,
   'text_standard': '7th and 8th grade',
   'word_count': 6,
   'avg_word_length': 4.0,
   'avg_syllables_per_word': 1.5,
   'type_token_ratio': 1.0,
   'long_word_percentage': 16.666666666666664},
  'simplified': {'fk_grade': 8.8,
   'flesch_ease': 57.27,
   'smog': 10.5,
   'gunning_fog': 7.5,
   'dale_chall': 6.21,
   'text_standard': '8th and 9th grade',
   'word_count': 50,
   'avg_word_length': 4.04,
   'avg_syllables_per_word': 1.34,
   'type_token_ratio': 0.76,
   'long_word_percentage': 10.0},
  'improvements': {'fk_grade_reduction': -1.200000000000001,
   '

# gen and save


In [9]:
def csv_to_dict(file_path):
    data_dict = {}
    with open(file_path, mode='r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader)
        for row in reader:
            if len(row) >= 3: 
                key, value = row[1].strip(), row[2].strip()
                data_dict[key] = value
    return data_dict

csv_file_path = 'easydef.csv' 
easydic = csv_to_dict(csv_file_path)
csv_file_path = 'easydef2.csv' 
easydic2 = csv_to_dict(csv_file_path)

easydic=easydic|easydic2
len(easydic)

691

In [10]:
easydic.keys()

dict_keys(['adolescents and young adults (AYA)', 'biomarker testing', 'biopsy', 'chromosome', 'clinical trial', 'ejaculate', 'fertility', 'fertility preservation', 'gene', 'genetic mutation', 'gynecologic oncologist', 'hormone', 'inherited mutation', 'in vitro fertilization (IVF)', 'medical oncologist', 'menopause', 'mutation', 'oncofertility', 'oncologist', 'oocyte preservation', 'ovarian suppression', 'ovary', 'pathologist', 'prognosis', 'psychosocial', 'psychosocial support', 'radiation oncologist', 'radiation therapy', 'recurrence', 'refractory cancer', 'relapse', 'reproductive system', 'semen', 'side effect', 'sperm', 'sperm aspiration', 'sperm banking', 'sperm count', 'stem cell transplant (SCT)', '3D-CRT', 'adenocarcinoma', 'adjuvant treatment', 'alveoli', 'board certified', 'body plethysmograph', 'bronchioli', 'bronchoscope', 'bronchoscopy', 'bronchus', 'cancer screening', 'cancer stage', 'carcinoma', 'chemistry profile', 'chemoimmunotherapy', 'chemoradiation', 'chemotherapy', 

In [11]:
sharedmeddict={}
for k in easydic.keys():
# for k in ['adolescents and young adults (AYA)', 'biomarker testing', 'biopsy', 'chromosome', 'clinical trial']:
    if k in meddict.keys():
        sharedmeddict[k]=meddict[k]
len(sharedmeddict)


374

In [12]:
from textstat import (flesch_kincaid_grade, flesch_reading_ease, 
                     smog_index, gunning_fog, dale_chall_readability_score,
                     text_standard, syllable_count)
import re
import nltk
import pandas as pd
import os
from nltk.tokenize import word_tokenize
from collections import Counter
import time
import sys

# Download NLTK resources (run once)
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

def comprehensive_readability_analysis(formal_text, simplified_text, term=None):
    """
    Calculate multiple readability and complexity metrics for medical texts
    
    Args:
        formal_text: The original medical definition
        simplified_text: The simplified explanation
        term: The medical term to exclude from analysis (optional)
    
    Returns:
        Dictionary containing all metrics and comparisons
    """
    # Clean texts for analysis (remove the term if provided)
    formal_clean = formal_text
    simplified_clean = simplified_text
    
    if term:
        term_variations = [term, term.lower(), term.capitalize()]
        for variation in term_variations:
            formal_clean = re.sub(r'\b' + re.escape(variation) + r'\b', '', formal_clean)
            simplified_clean = re.sub(r'\b' + re.escape(variation) + r'\b', '', simplified_clean)
        
        # Clean up double spaces
        formal_clean = ' '.join(formal_clean.split())
        simplified_clean = ' '.join(simplified_clean.split())
    
    # Tokenize for word-level analysis
    formal_words = word_tokenize(formal_clean.lower())
    simplified_words = word_tokenize(simplified_clean.lower())
    
    # Calculate traditional readability metrics
    metrics = {
        "formal": {
            "fk_grade": flesch_kincaid_grade(formal_clean),
            "flesch_ease": flesch_reading_ease(formal_clean),
            "smog": smog_index(formal_clean),
            "gunning_fog": gunning_fog(formal_clean),
            "dale_chall": dale_chall_readability_score(formal_clean),
            "text_standard": text_standard(formal_clean),
            "word_count": len(formal_words),
            "avg_word_length": sum(len(word) for word in formal_words) / len(formal_words) if formal_words else 0,
            "avg_syllables_per_word": syllable_count(formal_clean) / len(formal_words) if formal_words else 0,
        },
        "simplified": {
            "fk_grade": flesch_kincaid_grade(simplified_clean),
            "flesch_ease": flesch_reading_ease(simplified_clean),
            "smog": smog_index(simplified_clean),
            "gunning_fog": gunning_fog(simplified_clean),
            "dale_chall": dale_chall_readability_score(simplified_clean),
            "text_standard": text_standard(simplified_clean),
            "word_count": len(simplified_words),
            "avg_word_length": sum(len(word) for word in simplified_words) / len(simplified_words) if simplified_words else 0,
            "avg_syllables_per_word": syllable_count(simplified_clean) / len(simplified_words) if simplified_words else 0,
        }
    }
    
    # Calculate lexical complexity measures
    
    # Type-Token Ratio (vocabulary diversity)
    metrics["formal"]["type_token_ratio"] = len(set(formal_words)) / len(formal_words) if formal_words else 0
    metrics["simplified"]["type_token_ratio"] = len(set(simplified_words)) / len(simplified_words) if simplified_words else 0
    
    # Long word percentage (words with 3+ syllables)
    formal_long_words = sum(1 for word in formal_words if syllable_count(word) >= 3)
    simplified_long_words = sum(1 for word in simplified_words if syllable_count(word) >= 3)
    
    metrics["formal"]["long_word_percentage"] = formal_long_words / len(formal_words) * 100 if formal_words else 0
    metrics["simplified"]["long_word_percentage"] = simplified_long_words / len(simplified_words) * 100 if simplified_words else 0
    
    # Calculate improvements
    metrics["improvements"] = {
        "fk_grade_reduction": metrics["formal"]["fk_grade"] - metrics["simplified"]["fk_grade"],
        "flesch_ease_improvement": metrics["simplified"]["flesch_ease"] - metrics["formal"]["flesch_ease"],
        "smog_reduction": metrics["formal"]["smog"] - metrics["simplified"]["smog"],
        "gunning_fog_reduction": metrics["formal"]["gunning_fog"] - metrics["simplified"]["gunning_fog"],
        "dale_chall_reduction": metrics["formal"]["dale_chall"] - metrics["simplified"]["dale_chall"],
        "word_length_reduction": metrics["formal"]["avg_word_length"] - metrics["simplified"]["avg_word_length"],
        "syllable_reduction": metrics["formal"]["avg_syllables_per_word"] - metrics["simplified"]["avg_syllables_per_word"],
        "long_word_percentage_reduction": metrics["formal"]["long_word_percentage"] - metrics["simplified"]["long_word_percentage"],
    }
    
    return metrics

def get_readability_analysis_row(term, formal_definition, simplified_explanation):
    """
    Get readability analysis as a row for a dataframe
    
    Args:
        term: The medical term
        formal_definition: The original medical definition
        simplified_explanation: The simplified explanation
    
    Returns:
        Dictionary containing all metrics as a flat structure for dataframe row
    """
    
    # Calculate metrics
    metrics = comprehensive_readability_analysis(formal_definition, simplified_explanation, term)
    
    # Create a flat dictionary for the dataframe row
    row = {
        "term": term,
        "formal_definition": formal_definition,
        "simplified_explanation": simplified_explanation,
        
        # Formal metrics
        "formal_fk_grade": metrics["formal"]["fk_grade"],
        "formal_flesch_ease": metrics["formal"]["flesch_ease"],
        "formal_smog": metrics["formal"]["smog"],
        "formal_gunning_fog": metrics["formal"]["gunning_fog"],
        "formal_dale_chall": metrics["formal"]["dale_chall"],
        "formal_text_standard": metrics["formal"]["text_standard"],
        "formal_word_count": metrics["formal"]["word_count"],
        "formal_avg_word_length": metrics["formal"]["avg_word_length"],
        "formal_avg_syllables_per_word": metrics["formal"]["avg_syllables_per_word"],
        "formal_type_token_ratio": metrics["formal"]["type_token_ratio"],
        "formal_long_word_percentage": metrics["formal"]["long_word_percentage"],
        
        # Simplified metrics
        "simplified_fk_grade": metrics["simplified"]["fk_grade"],
        "simplified_flesch_ease": metrics["simplified"]["flesch_ease"],
        "simplified_smog": metrics["simplified"]["smog"],
        "simplified_gunning_fog": metrics["simplified"]["gunning_fog"],
        "simplified_dale_chall": metrics["simplified"]["dale_chall"],
        "simplified_text_standard": metrics["simplified"]["text_standard"],
        "simplified_word_count": metrics["simplified"]["word_count"],
        "simplified_avg_word_length": metrics["simplified"]["avg_word_length"],
        "simplified_avg_syllables_per_word": metrics["simplified"]["avg_syllables_per_word"],
        "simplified_type_token_ratio": metrics["simplified"]["type_token_ratio"],
        "simplified_long_word_percentage": metrics["simplified"]["long_word_percentage"],
        
        # Improvements
        "improvement_fk_grade": metrics["improvements"]["fk_grade_reduction"],
        "improvement_flesch_ease": metrics["improvements"]["flesch_ease_improvement"],
        "improvement_smog": metrics["improvements"]["smog_reduction"],
        "improvement_gunning_fog": metrics["improvements"]["gunning_fog_reduction"],
        "improvement_dale_chall": metrics["improvements"]["dale_chall_reduction"],
        "improvement_word_length": metrics["improvements"]["word_length_reduction"],
        "improvement_syllables": metrics["improvements"]["syllable_reduction"],
        "improvement_long_word_percentage": metrics["improvements"]["long_word_percentage_reduction"],
    }
    
    return row

def explain_all_terms(meddict, explain_medical_term_func, output_file="readability_resultsv2.pkl"):
    """
    Analyze all terms in the medical dictionary and save results to a pickle file
    
    Args:
        meddict: Dictionary of medical terms and their definitions
        explain_medical_term_func: Function that generates simplified explanations
        output_file: Path to save the results pickle file
    """
    # Create empty list to store results
    results = []
    
    # Create empty dataframe
    df = pd.DataFrame()
    
    # Get timestamp for unique backup files
    timestamp = int(time.time())
    
    # Total number of terms
    total_terms = len(meddict)
    print(f"Starting analysis of {total_terms} terms...")
    
    # Function to print progress bar
    def print_progress_bar(iteration, total, prefix='Progress:', suffix='Complete', length=50, fill='█'):
        percent = ("{0:.1f}").format(100 * (iteration / float(total)))
        filled_length = int(length * iteration // total)
        bar = fill * filled_length + '-' * (length - filled_length)
        sys.stdout.write(f'\r{prefix} |{bar}| {percent}% {suffix} ({iteration}/{total})')
        sys.stdout.flush()
        # Print new line on complete
        if iteration == total:
            print()
    
    # Initialize progress bar
    print_progress_bar(0, total_terms)
    
    # Process each term
    for i, (term, definition) in enumerate(meddict.items()):
        try:
            if term.lower() in meddict: continue

            # Generate formal definition and simplified explanation
            formal_definition = f"{term} is {definition}"
            simplified_explanation = explain_medical_term_func(term)
            
            # Get analysis as a row
            row = get_readability_analysis_row(term, formal_definition, simplified_explanation)
            
            # Append to results
            results.append(row)
            
            # Update dataframe
            df = pd.DataFrame(results)
            
            # Save after each term (for robustness)
            df.to_pickle(output_file)
            
            # Create a backup every 10 terms
            if (i + 1) % 10 == 0:
                backup_file = f"readability_results_backup_{timestamp}_{i+1}.pkl"
                df.to_pickle(backup_file)
                print(f"\nBackup saved to {backup_file}")
            
            # Update progress bar
            print_progress_bar(i + 1, total_terms)
                
        except Exception as e:
            print(f"Error processing term '{term}': {str(e)}")
            # Save current results before continuing
            if results:
                error_backup = f"readability_results_error_{timestamp}.pkl"
                pd.DataFrame(results).to_pickle(error_backup)
                print(f"\nError encountered! Saved progress before error to {error_backup}")
    
    # Final save
    df = pd.DataFrame(results)
    df.to_pickle(output_file)
    
    print(f"Analysis complete. Results saved to {output_file}")
    
    return df

# Usage example:
# First, make sure meddict and explain_medical_term are defined
# meddict = {...}  # Your medical dictionary
# Then run:
# results_df = explain_all_terms(meddict, explain_medical_term)

In [13]:
results_df = explain_all_terms(sharedmeddict, explain_medical_term,output_file="noeg_missing.pkl")
results_df

Starting analysis of 374 terms...
Progress: |███████████████████████████-----------------------| 54.5% Complete (204/374)
Backup saved to readability_results_backup_1743720014_210.pkl
Progress: |█████████████████████████████████████████████████-| 98.1% Complete (367/374)Analysis complete. Results saved to noeg_missing.pkl


Unnamed: 0,term,formal_definition,simplified_explanation,formal_fk_grade,formal_flesch_ease,formal_smog,formal_gunning_fog,formal_dale_chall,formal_text_standard,formal_word_count,...,simplified_type_token_ratio,simplified_long_word_percentage,improvement_fk_grade,improvement_flesch_ease,improvement_smog,improvement_gunning_fog,improvement_dale_chall,improvement_word_length,improvement_syllables,improvement_long_word_percentage
0,3D-CRT,3D-CRT is A procedure that uses a computer to ...,3D-CRT stands for three-dimensional conformal ...,13.1,29.86,15.0,11.4,10.35,12th and 13th grade,52,...,0.666667,13.095238,2.3,22.03,2.4,-0.48,2.07,0.709707,0.304945,11.904762
1,FDA,FDA is An agency in the U.S. federal governmen...,FDA stands for Food and Drug Administration. I...,9.3,55.95,13.4,13.99,10.09,10th and 11th grade,69,...,0.76,10.666667,1.7,15.19,2.2,4.25,2.13,0.467826,0.154783,8.173913
2,Cushing syndrome,Cushing syndrome is A condition in which there...,Cushing syndrome is when there is too much of ...,9.4,61.06,10.8,10.45,9.31,10th and 11th grade,109,...,0.625,6.818182,0.9,7.75,0.7,0.65,1.44,0.199229,0.054525,1.438699
3,DNA,DNA is The molecule inside cells that contains...,DNA is like a special set of instructions insi...,9.6,60.35,11.5,11.51,9.22,9th and 10th grade,138,...,0.640449,8.988764,1.5,9.27,0.3,1.95,1.89,-0.04014,0.063345,0.431526
4,Breslow thickness,Breslow thickness is A measure of how deeply a...,Breslow thickness is a way doctors measure how...,7.1,67.04,8.3,7.71,8.84,7th and 8th grade,87,...,0.657895,3.947368,-1.7,6.17,-0.5,-2.18,1.57,0.158046,0.065336,1.799758
5,B cell,B cell is A type of white blood cell that make...,"B cells, also known as B lymphocytes, are spec...",4.9,77.94,8.8,7.99,9.75,4th and 5th grade,34,...,0.708333,6.25,-3.2,-2.91,-1.3,-2.94,2.11,0.034926,0.026961,2.573529
6,T cell,T cell is A type of white blood cell. T cells ...,T cells are special types of white blood cells...,5.0,77.74,8.8,8.01,8.67,8th and 9th grade,46,...,0.780488,3.658537,-2.2,-0.17,0.7,-1.17,0.97,-0.121951,0.019618,5.037116
7,CT scan,CT scan is A procedure that uses a computer li...,A CT scan is a way for doctors to take picture...,10.7,52.19,12.7,11.81,9.9,9th and 10th grade,107,...,0.8,3.076923,3.0,24.06,4.9,2.38,1.89,0.216104,0.174407,10.007189
8,PET scan,PET scan is A procedure in which a small amoun...,A PET scan is a way for doctors to see inside ...,12.0,48.84,12.5,13.7,8.98,13th and 14th grade,75,...,0.671233,1.369863,2.2,16.51,6.1,4.03,1.43,0.335525,0.166393,9.296804
9,FOLFIRI,FOLFIRI is An abbreviation for a chemotherapy ...,FOLFIRI is a combination of three medicines us...,9.9,43.39,11.7,12.58,11.36,11th and 12th grade,52,...,0.698413,12.698413,1.1,13.88,0.5,3.41,2.24,0.281441,0.160867,4.60928


In [15]:
olddf = pd.read_pickle('readability_resultsv2.pkl')
olddf

Unnamed: 0,term,formal_definition,simplified_explanation,formal_fk_grade,formal_flesch_ease,formal_smog,formal_gunning_fog,formal_dale_chall,formal_text_standard,formal_word_count,...,simplified_type_token_ratio,simplified_long_word_percentage,improvement_fk_grade,improvement_flesch_ease,improvement_smog,improvement_gunning_fog,improvement_dale_chall,improvement_word_length,improvement_syllables,improvement_long_word_percentage
0,biomarker testing,biomarker testing is A laboratory method that ...,Biomarker testing is a way for doctors to chec...,11.5,55.58,12.3,13.60,9.30,11th and 12th grade,137,...,0.717172,2.020202,3.0,18.44,5.1,3.71,1.21,0.082504,0.106245,7.468849
1,biopsy,biopsy is The removal of cells or tissues for ...,Biopsy is when a doctor takes a small piece of...,10.2,53.51,12.5,11.61,8.69,8th and 9th grade,126,...,0.717949,6.410256,3.4,25.08,3.0,2.19,1.97,0.122711,0.115385,6.288156
2,chromosome,chromosome is Part of a cell that contains gen...,Chromosomes are like the instruction manuals i...,7.2,61.33,0.0,10.00,12.03,7th and 8th grade,23,...,0.658537,15.853659,-3.0,-7.92,-13.4,-0.50,5.29,0.238070,-0.005832,-2.810180
3,clinical trial,clinical trial is A type of research study tha...,Clinical trials are research studies that test...,7.7,60.01,10.5,10.40,10.70,10th and 11th grade,40,...,0.833333,6.666667,-0.6,9.10,0.8,-0.01,1.41,-0.083333,0.041667,5.833333
4,fertility,fertility is The ability to produce children.,Fertility refers to the ability of an individu...,8.0,48.47,0.0,9.07,9.20,7th and 8th grade,7,...,0.695652,13.043478,-4.4,-6.06,-13.0,-4.95,-2.43,-0.192547,0.064182,1.242236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,intravenous urogram,intravenous urogram is An x-ray image of the k...,An intravenous urogram is a special kind of X-...,9.4,61.06,11.2,10.88,10.33,10th and 11th grade,110,...,0.731959,2.061856,1.1,13.67,4.0,1.24,1.62,0.218182,0.115651,7.029053
370,stage 0 transitional cell carcinoma in situ of...,stage 0 transitional cell carcinoma in situ of...,Stage 0 transitional cell carcinoma in situ of...,9.8,60.04,10.4,10.29,8.31,9th and 10th grade,107,...,0.818182,10.909091,1.2,2.94,-0.8,0.41,-0.89,-0.386576,-0.036024,-3.432455
371,vacuum-assisted core biopsy,vacuum-assisted core biopsy is A procedure in ...,A vacuum-assisted core biopsy is when a doctor...,7.3,66.33,9.6,8.76,8.84,8th and 9th grade,101,...,0.722222,1.388889,1.1,13.78,3.6,1.55,1.75,0.671892,0.239274,7.522002
372,vasoactive intestinal peptide,vasoactive intestinal peptide is A hormone fou...,Vasoactive intestinal peptide (VIP) is a speci...,8.4,58.28,9.3,8.28,9.75,8th and 9th grade,92,...,0.676768,10.101010,-0.3,4.60,-1.9,-1.30,0.77,0.161506,0.023825,-2.492314


In [None]:
df_combined = pd.concat([results_df, olddf], ignore_index=True)



In [19]:

df_combined.to_pickle('fixednoeg.pkl')
df_combined.to_csv('fixednoeg.csv')