In [35]:
import os
import shutil  # For deleting directories
import json
import pandas as pd
from whoosh import index
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
from nltk.tokenize import word_tokenize
import nltk
import logging
nltk.download('punkt')


[nltk_data] Downloading package punkt to /Users/vince/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [38]:
# Data directory and Excel file
data_dir = '/Users/vince/Salk/PaperGeneration/data'
csv_file = "/Users/vince/Salk/PaperGeneration/notebooks/condition_revised.csv"
base_dir = "/Users/vince/Salk/NeuroCircadia/data/uptodate_markdownify/table-of-contents"
output_file = "search_results_revised.json"

In [39]:
# Whoosh index directory (change this if needed)
index_dir = "uptodate_index"

# 1. Create Whoosh Schema
schema = Schema(path=ID(unique=True, stored=True), content=TEXT)

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger(__name__)


def setup_index(index_dir, schema):
    """
    Creates a new index or clears the existing one.
    """
    if os.path.exists(index_dir):
        logger.info(f"Clearing existing index directory: {index_dir}")
        shutil.rmtree(index_dir)
    os.mkdir(index_dir)
    ix = index.create_in(index_dir, schema)
    logger.info(f"Created new index in directory: {index_dir}")
    return ix

ix = setup_index(index_dir, schema)

INFO: Clearing existing index directory: uptodate_index
INFO: Created new index in directory: uptodate_index


In [14]:
import markdown
from bs4 import BeautifulSoup

# Function to extract plain text from Markdown
def extract_text_from_markdown(markdown_content):
    html = markdown.markdown(markdown_content)
    soup = BeautifulSoup(html, 'html.parser')
    return soup.get_text()


def index_markdown_files(base_dir, ix):
    """
    Indexes all Markdown files in the specified base directory.
    """
    # Accurately count all .md files
    total_files = sum(
        1 for root, dirs, files in os.walk(base_dir) for file in files if file.lower().endswith(".md")
    )
    logger.info(f"Total Markdown files to index: {total_files}")

    if total_files == 0:
        logger.warning("No Markdown files found to index.")
        return

    pbar = tqdm(total=total_files, desc="Indexing Markdown Files", unit="file")

    try:
        with ix.writer() as writer:
            for root, _, files in os.walk(base_dir):
                for file in files:
                    if file.lower().endswith(".md"):
                        file_path = os.path.join(root, file)
                        try:
                            with open(file_path, 'r', encoding='utf-8') as f:
                                markdown_content = f.read()
                            content = extract_text_from_markdown(markdown_content)
                            writer.update_document(path=file_path, content=content)
                            pbar.update(1)
                        except Exception as e:
                            logger.error(f"Error indexing {file_path}: {e}")
    except Exception as e:
        logger.critical(f"Failed to write to index: {e}")
    finally:
        pbar.close()
        logger.info("Indexing complete.")


index_markdown_files(base_dir, ix)

INFO: Total Markdown files to index: 42061
Indexing Markdown Files: 100%|██████████| 42061/42061 [28:44<00:00, 24.39file/s]  
INFO: Indexing complete.


In [15]:

# 4. Search Function
def search_index(query, ix, top_n=100):
    with ix.searcher() as searcher:
        parser = QueryParser("content", ix.schema)
        q = parser.parse(query)
        results = searcher.search(q, limit=top_n)
        return [{"path": r["path"], "score": r.score} for r in results]


results = search_index("PTSD", ix)
results


[{'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/neurology/primary-care-psychiatry/posttraumatic-stress-disorder-in-adults-psychotherapy-and-psychosocial-interventions.md',
  'score': 12.397324148483142},
 {'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/primary-care-adult/primary-care-psychiatry/posttraumatic-stress-disorder-in-adults-psychotherapy-and-psychosocial-interventions.md',
  'score': 12.397324148483142},
 {'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/family-medicine-and-general-practice/primary-care-psychiatry/posttraumatic-stress-disorder-in-adults-psychotherapy-and-psychosocial-interventions.md',
  'score': 12.397324148483142},
 {'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/psychiatry/trauma-related-disorders/posttraumatic-stress-disorder-in-adults-psychotherapy-and-psychosocial-interventions.md',
  'score': 12.397324148483142},
 {'path': '/Users/vince/Salk/NeuroCirc

In [20]:

# 5. Process Queries and Save Results
df = pd.read_csv(csv_file)
df = df[['Condition', 'Alternative Name', 'Category']].fillna("")

results_list = []
for i, row in df.iterrows():
    query = row['Condition']
    query_alt = row['Alternative Name']
    print(f"Processing Query: {query}")
    results = search_index(query, ix)
    formatted_results = {
        "query": query,
        "results": [{"path": result['path'], "score": result['score']} for result in results]
    }
    results_list.append(formatted_results)

    if query_alt:
        print(f"Processing Alternative Query: {query_alt}")
        results = search_index(query_alt, ix)
        formatted_results = {
            "query": query_alt,
            "results": [{"path": result['path'], "score": result['score']} for result in results]
        }
        results_list.append(formatted_results)

with open(output_file, "w") as json_file:
    json.dump(results_list, json_file, indent=4)
print(f"Results saved to {output_file}")

Processing Query: Food Allergy
Processing Query: Allergic Rhinitis
Processing Alternative Query: Hay Fever
Processing Query: Drug Allergies
Processing Query: Atopic Dermatitis
Processing Alternative Query: Eczema
Processing Query: Contact Dermatitis
Processing Query: Ankylosing Spondylitis
Processing Query: Gout
Processing Query: Guillain-Barre Syndrome
Processing Query: Multiple Sclerosis
Processing Alternative Query: MS
Processing Query: Osteoarthritis
Processing Query: Rheumatoid Arthritis
Processing Query: Sarcoidosis
Processing Query: Scleroderma
Processing Query: Systemic Lupus Erythematosus
Processing Alternative Query: SLE
Processing Query: Vasculitis
Processing Query: Ant Bites
Processing Query: Bedbug Bites
Processing Query: Flea Bites
Processing Query: Horsefly Bites
Processing Query: Mosquito Bites
Processing Query: Spider Bites
Processing Query: Tick Bites
Processing Query: Anemia
Processing Query: Hemophilia
Processing Query: Thrombocytopenia
Processing Alternative Query:

In [34]:
from rich import print as rprint
import pprint as pp

def extract_top_unique_files(results, top_n=5):
    """
    Extracts the top N unique markdown file paths from the results.

    Args:
        results (List[Dict]): List of result dictionaries with 'path' and 'score' keys.
        top_n (int): Number of top unique files to return.

    Returns:
        List[str]: List of top N unique file paths.
    """
    seen = set()
    unique_files = []

    for item in results:
        path = item.get("path")
        score = item.get("score")
        if path and score is not None:
            file_name = os.path.basename(path)
            key = (file_name, score)
            if key not in seen:
                seen.add(key)
                unique_files.append(item)
                if len(unique_files) == top_n:
                    break

    return unique_files


print(len(results_list))
for result in results_list[:10]:
    # rprint(result)
    print(result['query'])
    pp.pprint(extract_top_unique_files(result['results'], 10))
    

371
Food Allergy
[{'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/allergy-and-immunology/pediatric-allergy/history-and-physical-examination-in-the-patient-with-possible-food-allergy.md',
  'score': 11.593656134767937},
 {'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/pediatrics/pediatric-allergy/history-and-physical-examination-in-the-patient-with-possible-food-allergy.md',
  'score': 11.593480239683114},
 {'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/allergy-and-immunology/pediatric-allergy/management-of-ige-mediated-food-allergy-an-overview.md',
  'score': 11.592376503805076},
 {'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/allergy-and-immunology/pediatric-allergy/diagnostic-evaluation-of-ige-mediated-food-allergy.md',
  'score': 11.588274052453869},
 {'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/pediatrics/pediatric-allergy/diagnostic-evaluation-of-i

In [2]:
data_dir = '/Users/vince/Salk/PaperGeneration/data'

df = pd.read_excel(os.path.join(data_dir, 'Conditions.xlsx'))
df = df[['Condition', 'Disease type']]
df

Unnamed: 0,Condition,Disease type
0,Mosquito Bites,Bites
1,Tick Bites,Bites
2,Spider Bites,Bites
3,Bedbug Bites,Bites
4,Flea Bites,Bites
...,...,...
266,Strabismus,Vision
267,Amblyopia,Vision
268,Visual Migraine (Ocular Migraine),Vision
269,Photophobia,Vision


In [1]:
import os

def index_markdown_files(base_dir, batch_size=1000):
    index = []
    batch = []
    for root, _, files in os.walk(base_dir):
        for file in files:
            if file.endswith(".md"):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                batch.append({"path": file_path, "content": content})
                
                if len(batch) >= batch_size:
                    index.extend(batch)
                    batch = []  # Clear the batch to save memory
    
    if batch:  # Add any remaining files
        index.extend(batch)
    return index

base_dir = "/Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents"
markdown_index = index_markdown_files(base_dir)


In [2]:
import sqlite3

def create_index_db(db_path, markdown_index):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    
    # Create table for indexing
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS markdown_files (
            id INTEGER PRIMARY KEY,
            path TEXT UNIQUE,
            content TEXT
        )
    """)
    
    # Insert markdown files into the database
    for entry in markdown_index:
        cursor.execute("""
            INSERT OR IGNORE INTO markdown_files (path, content)
            VALUES (?, ?)
        """, (entry['path'], entry['content']))
    
    conn.commit()
    conn.close()

db_path = "markdown_index.db"
create_index_db(db_path, markdown_index)


In [6]:
def query_index(db_path, query):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    
    cursor.execute("""
        SELECT path, content FROM markdown_files
        WHERE content LIKE ?
        LIMIT 10
    """, (f"%{query}%",))
    
    results = cursor.fetchall()
    conn.close()
    return results

db_path = "markdown_index.db"
query = "Tooth Sensitivity"
results = query_index(db_path, query)
for path, content in results:
    print(f"Path: {path}\nContent: {content[:200]}...\n")


Path: /Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/rheumatology/sjogrens-disease/treatment-of-dry-mouth-and-other-non-ocular-sicca-symptoms-in-sjogrens-disease.md
Content: # Treatment of dry mouth and other non-ocular sicca symptoms in Sjögren's disease

## INTRODUCTION

Sjögren's disease (SjD) is a chronic multisystem inflammatory disorder characterized by lymphocytic ...

Path: /Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/rheumatology/pediatric-rheumatology/treatment-of-dry-mouth-and-other-non-ocular-sicca-symptoms-in-sjogrens-disease.md
Content: # Treatment of dry mouth and other non-ocular sicca symptoms in Sjögren's disease

## INTRODUCTION

Sjögren's disease (SjD) is a chronic multisystem inflammatory disorder characterized by lymphocytic ...

Path: /Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/patient-education/childrens-health/tooth-decay-in-children-the-basics.md
Content: # Patient education: Tooth decay in children (The 

In [5]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import pickle

def create_bm25_index(markdown_index):
    # Tokenize content for BM25
    tokenized_docs = [word_tokenize(entry['content'].lower()) for entry in markdown_index]
    bm25 = BM25Okapi(tokenized_docs)
    
    # Save precomputed BM25 index
    with open("bm25_index.pkl", "wb") as f:
        pickle.dump((bm25, markdown_index), f)

create_bm25_index(markdown_index)


In [12]:
import pickle
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import pickle

def search_bm25(query, bm25_index_path, top_n=100):
    with open(bm25_index_path, "rb") as f:
        bm25, markdown_index = pickle.load(f)
    
    tokenized_query = word_tokenize(query.lower())
    scores = bm25.get_scores(tokenized_query)
    ranked_results = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)[:top_n]
    
    results = [{"path": markdown_index[i]["path"], "score": score} for i, score in ranked_results]
    return results

query = "Tooth Sensitivity"
results = search_bm25(query, "bm25_index.pkl")
for result in results:
    print(f"Path: {result['path']}, Score: {result['score']}")


Path: /Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/patient-education/dental-health/tooth-decay-in-adults-the-basics.md, Score: 11.105324269489078
Path: /Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/patient-education/adult-general-health/tooth-decay-in-adults-the-basics.md, Score: 11.105324269489078
Path: /Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/patient-education/childrens-health/tooth-decay-in-children-the-basics.md, Score: 11.043121658056481
Path: /Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/patient-education/dental-health/tooth-decay-in-children-the-basics.md, Score: 11.043121658056481
Path: /Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/patient-education/dental-health/toothache-the-basics.md, Score: 11.012488299062513
Path: /Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/primary-care-adult/primary-care-infectious-disease/epidemiology-pathogenesis-and-clinical-manifestation

In [13]:
import json

# Function to save results to JSON
def save_results_to_json(queries, output_file, bm25_index_path):
    results_list = []  # To store results for all queries

    for i, row in queries.iterrows():
        query = row['Condition']
        print(f"Processing Query: {query}")
        
        # Fetch results
        results = search_bm25(query, bm25_index_path)
        
        # Format results for JSON
        formatted_results = {
            "query": query,
            "results": [{"path": result['path'], "score": result['score']} for result in results]
        }
        results_list.append(formatted_results)
        

    # Save to JSON
    with open(output_file, "w") as json_file:
        json.dump(results_list, json_file, indent=4)
    print(f"Results saved to {output_file}")

# Call the function
output_file = "search_results.json"
bm25_index_path = "bm25_index.pkl"
save_results_to_json(df, output_file, bm25_index_path)


Processing Query: Mosquito Bites
Processing Query: Tick Bites
Processing Query: Spider Bites
Processing Query: Bedbug Bites
Processing Query: Flea Bites
Processing Query: Horsefly Bites
Processing Query: Ant Bites
Processing Query: Anemia
Processing Query: Stroke
Processing Query: Anxiety
Processing Query: Bipolar syndrome
Processing Query: PTSD
Processing Query: Postpartum depression
Processing Query: Autism
Processing Query: Autism Spectrum Disorder
Processing Query: Asperger's Syndrome
Processing Query: ADHD
Processing Query: Seasonal Affective Disorders
Processing Query: ICU delirium
Processing Query: Panic disorder
Processing Query: Loneliness
Processing Query: Lung cancer
Processing Query: Prostate cancer
Processing Query: Colorectal cancer
Processing Query: Liver cancer
Processing Query: Breast cancer
Processing Query: Cervical Cancer
Processing Query: Ovarian Cancer
Processing Query: Melanoma
Processing Query: Squamous Cell Carcinoma
Processing Query: Basal Cell Carcinoma
Proce

In [None]:

# Step 1: Craft a detailed outline
def generate_outline(topic: str, model: GenerativeModel) -> str:
    prompt = f"""
    Craft a detailed outline for an article on the topic: '{topic}'. 
    Ensure the outline includes:
    - A structured introduction
    - Major sections with subsections
    - A logical flow of ideas
    """
    response = model.predict(prompt=prompt)
    return response.text