In [20]:
import os
import shutil  # For deleting directories
import json
import pandas as pd
from whoosh import index
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
from nltk.tokenize import word_tokenize
import nltk
import tqdm
import logging
# Download the NLTK 'punkt' package (presumably for the word_tokenize import above;
# no visible cell calls word_tokenize — TODO confirm this download is still needed).
nltk.download('punkt')


[nltk_data] Downloading package punkt to /Users/vince/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
# Input/output locations. These are absolute paths on the author's machine.
# NOTE(review): consider deriving them from one configurable root so the notebook runs elsewhere.
data_dir = '/Users/vince/Salk/PaperGeneration/data'  # not referenced by name in the visible cells
csv_file = "/Users/vince/Salk/PaperGeneration/notebooks/condition_revised.csv"  # CSV of conditions (read with pd.read_csv)
base_dir = "/Users/vince/Salk/NeuroCircadia/data/uptodate_markdownify/table-of-contents"  # root directory walked for JSON chunk files
output_file = "search_results_revised.json"  # relative path, so it is written to the current working directory

In [28]:
# Directory that holds the Whoosh index (relative to the current working directory).
index_dir = "uptodate_index"

from whoosh.fields import Schema, TEXT, ID, STORED, KEYWORD

# Index schema: one document per chunk of a JSON file.
schema = Schema(
    path=ID(unique=True, stored=True),           # Unique key per chunk; the indexer uses "<json file path>::chunk<i>"
    title=TEXT(stored=True),                    # 'title' from the chunk metadata
    topic=TEXT(stored=True),                    # 'topic' from the chunk metadata
    subtopic=TEXT(stored=True),                 # 'subtopic' from the chunk metadata
    content=TEXT(stored=True),                  # Plain text of the chunk (Markdown already stripped by the indexer)
    references=STORED,                          # Stored only; the indexer passes a JSON-encoded string
    keywords=KEYWORD(commas=True, stored=True)  # Comma-separated keywords; the indexer fills in topic and subtopic
)
# Root logging at INFO level with a "LEVEL: message" format; `logger` is used by the functions below.
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger(__name__)


def setup_index(index_dir, schema):
    """
    Create a fresh, empty Whoosh index in ``index_dir``.

    WARNING: this is destructive. If ``index_dir`` already exists it is
    removed, together with everything inside it, before the new index is
    created.

    Args:
        index_dir: Path of the directory that will hold the index. Missing
            parent directories are created as needed.
        schema: Whoosh ``Schema`` describing the fields of the index.

    Returns:
        The index object returned by ``index.create_in``.
    """
    if os.path.exists(index_dir):
        logger.info(f"Clearing existing index directory: {index_dir}")
        shutil.rmtree(index_dir)
    # makedirs rather than mkdir: a nested path such as "indexes/uptodate"
    # must work even when its parent directory does not exist yet.
    os.makedirs(index_dir)
    ix = index.create_in(index_dir, schema)
    logger.info(f"Created new index in directory: {index_dir}")
    return ix

# Rebuild the index from scratch: setup_index deletes any existing index_dir before creating the new index.
ix = setup_index(index_dir, schema)

INFO: Clearing existing index directory: uptodate_index
INFO: Created new index in directory: uptodate_index


In [29]:
import tqdm
import markdown
from bs4 import BeautifulSoup

def extract_text_from_markdown(markdown_content):
    """
    Convert Markdown source to plain text.

    The Markdown is first rendered to HTML, then the HTML is parsed and only
    its text nodes are returned.

    Args:
        markdown_content: Markdown source string.

    Returns:
        The text content of the rendered document.
    """
    rendered_html = markdown.markdown(markdown_content)
    return BeautifulSoup(rendered_html, 'html.parser').get_text()

def _chunk_to_document(file_path, position, chunk):
    """Build the index field values for the chunk at ``position`` in ``file_path``."""
    # `or` fallbacks also cover keys that are present but null in the JSON,
    # which would otherwise break the Markdown conversion or the keyword join.
    metadata = chunk.get("metadata") or {}
    title = metadata.get("title") or ""
    topic = metadata.get("topic") or ""
    subtopic = metadata.get("subtopic") or ""

    references = chunk.get("references")
    if references is None:
        references = []

    return {
        # Unique path for each chunk
        "path": f"{file_path}::chunk{position}",
        "title": title,
        "topic": topic,
        "subtopic": subtopic,
        "content": extract_text_from_markdown(chunk.get("content") or ""),
        # Stored as a JSON string; readers must json.loads it.
        "references": json.dumps(references),
        # Skip empty parts so the keyword field never holds a bare ",".
        "keywords": ','.join(part for part in (topic, subtopic) if part),
    }


def index_json_chunks(base_dir, ix):
    """
    Index every chunk of every ``.json`` file found under ``base_dir``.

    Each file is expected to hold an array of chunk objects with 'content',
    'references' and 'metadata' fields. Every chunk becomes one document whose
    ``path`` is ``"<file path>::chunk<i>"``.

    All documents go through a single ``ix.writer()`` context. A failure while
    reading or indexing one file is logged and the remaining files are still
    processed; chunks of that file that were already passed to the writer are
    not rolled back individually.

    Args:
        base_dir: Directory that is walked recursively for ``.json`` files
            (extension match is case-insensitive).
        ix: Index object providing ``writer()``.

    Returns:
        None.
    """
    json_files = [
        os.path.join(root, name)
        for root, _, names in os.walk(base_dir)
        for name in names
        if name.lower().endswith(".json")
    ]

    total_files = len(json_files)
    logger.info(f"Total JSON files to index: {total_files}")

    if total_files == 0:
        logger.warning("No JSON files found to index.")
        return

    pbar = tqdm.tqdm(total=total_files, desc="Indexing JSON Files", unit="file")
    completed = False

    try:
        with ix.writer() as writer:
            for file_path in json_files:
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        # Each JSON file is an array of chunks
                        chunks = json.load(f)

                    # Index each chunk individually
                    for i, chunk in enumerate(chunks):
                        writer.update_document(**_chunk_to_document(file_path, i, chunk))
                except Exception as e:
                    logger.error(f"Error indexing {file_path}: {e}")
                finally:
                    pbar.update(1)
        # Only reached when the writer context exited without raising.
        completed = True
    except Exception as e:
        logger.critical(f"Failed to write to index: {e}")
    finally:
        pbar.close()

    # Report success only when it actually happened; previously this message
    # was logged even after the critical failure above.
    if completed:
        logger.info("Indexing complete.")


# Run the indexing over every JSON file under base_dir.
# The run recorded below took about 27 minutes for 42,029 files.
index_json_chunks(base_dir, ix)

INFO: Total JSON files to index: 42029
Indexing JSON Files: 100%|██████████| 42029/42029 [27:31<00:00, 25.45file/s]  
INFO: Indexing complete.


In [30]:
# NOTE(review): leftover smoke-test cell; nothing else in the notebook uses it, so it can be removed.
print("test")

test


In [25]:

# 4. Search Function
def search_index(query, ix, top_n=100):
    """
    Search the ``content`` field of the index.

    Args:
        query: Query string, parsed with a ``QueryParser`` bound to "content".
        ix: Index object providing ``searcher()`` and ``schema``.
        top_n: Maximum number of hits to request from the searcher.

    Returns:
        list[dict]: One ``{"path": ..., "score": ...}`` dict per hit, in the
        order the searcher returned them.
    """
    with ix.searcher() as searcher:
        parsed_query = QueryParser("content", ix.schema).parse(query)
        hits = searcher.search(parsed_query, limit=top_n)
        matches = []
        for hit in hits:
            matches.append({"path": hit["path"], "score": hit.score})
        return matches


# Quick sanity check of the search; the bare `results` expression displays the list of hits.
results = search_index("PTSD", ix)
results


[{'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate_markdownify/table-of-contents/family-medicine-and-general-practice/child-and-adolescent-mental-disorders/posttraumatic-stress-disorder-in-children-and-adolescents-epidemiology-clinical-features-assessment-and-diagnosis.json::chunk17',
  'score': 16.644248087873407},
 {'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate_markdownify/table-of-contents/family-medicine-and-general-practice/child-and-adolescent-mental-disorders/posttraumatic-stress-disorder-in-children-and-adolescents-trauma-focused-psychotherapy.json::chunk15',
  'score': 16.58394078208086},
 {'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate_markdownify/table-of-contents/family-medicine-and-general-practice/child-and-adolescent-mental-disorders/posttraumatic-stress-disorder-in-children-and-adolescents-epidemiology-clinical-features-assessment-and-diagnosis.json::chunk4',
  'score': 16.51485215493983},
 {'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate_m

In [38]:
from whoosh.qparser import MultifieldParser

# Open the existing on-disk index in index_dir (rebinds the notebook-level `ix`).
ix = index.open_dir(index_dir)

# Parser that matches the query against content, title, topic and subtopic.
parser = MultifieldParser(["content", "title", "topic", "subtopic"], schema=ix.schema)

# Demo query: print the stored fields of the ten best hits for "immunotherapy".
# Note that `query` and `results` are notebook-level names and are rebound here.
with ix.searcher() as searcher:
    query = parser.parse("immunotherapy")
    results = searcher.search(query, limit=10)
    
    for result in results:
        print(f"Score: {result.score:.2f}")
        # print(f"Score: {result['score']}")
        print(f"Path: {result['path']}")
        print(f"Title: {result['title']}")
        print(f"Topic: {result['topic']}")
        print(f"Subtopic: {result['subtopic']}")
        # 'references' is printed as the stored JSON string, not as a decoded list.
        print(f"References: {result['references']}")
        # print(f"Snippet: {result.highlights('content')}")
        print(f"Content: {result['content']}")
        print("-" * 40)


Score: 29.07
Path: /Users/vince/Salk/NeuroCircadia/data/uptodate_markdownify/table-of-contents/allergy-and-immunology/allergen-immunotherapy/sublingual-immunotherapy-beyond-the-basics.json::chunk6
Title: Patient education: Sublingual immunotherapy (Beyond the Basics)
Topic: Allergy and immunology
Subtopic: Allergen immunotherapy
References: []
Content: FREQUENTLY ASKED QUESTIONS
What are the other ways to treat my allergies?
Other types of immunotherapy
Subcutaneous immunotherapy (SCIT) is a long-established, traditional form of allergen immunotherapy. In SCIT, a mixture of the allergens to which you are allergic is prepared, and injections of the extract are gradually increased (over 16 to 20 weeks) as tolerated to a maintenance regimen that is then administered at your physician's office every two to four weeks over three to five years. In contrast to SLIT-tablets, another form of allergen immunotherapy is that of sublingual drops (SLIT-drops). This method involves putting drops of a

In [None]:
import json
import pandas as pd
from whoosh.qparser import MultifieldParser

# Open the existing on-disk index in index_dir (rebinds the notebook-level `ix`).
ix = index.open_dir(index_dir)

def search_index(query, ix, top_n=500):
    """
    Run ``query`` against the content, title, topic and subtopic fields.

    Note: this definition shares its name with the earlier content-only
    ``search_index`` cell; whichever cell ran last is the one in effect.

    Args:
        query: Query string, parsed with a ``MultifieldParser``.
        ix: Index object providing ``searcher()`` and ``schema``.
        top_n: Maximum number of hits to request from the searcher.

    Returns:
        list[dict]: One dict per hit, in the order the searcher returned
        them, with the keys path, title, topic, subtopic, score, content and
        references (the stored field values plus the hit score).
    """
    searched_fields = ["content", "title", "topic", "subtopic"]
    with ix.searcher() as searcher:
        multi_parser = MultifieldParser(searched_fields, schema=ix.schema)
        hits = searcher.search(multi_parser.parse(query), limit=top_n)

        # Copy the needed values into plain dicts before leaving the searcher context.
        return [
            {
                "path": hit["path"],
                "title": hit["title"],
                "topic": hit["topic"],
                "subtopic": hit["subtopic"],
                "score": hit.score,
                "content": hit["content"],
                "references": hit["references"],
            }
            for hit in hits
        ]

def remove_duplicates(results):
    """
    Drop results whose 'content' value was already seen, keeping the first.

    Args:
        results: Iterable of result dicts; each must have a hashable
            'content' value.

    Returns:
        list: The first result for every distinct content, in input order.

    Raises:
        KeyError: If a result has no 'content' key.
    """
    seen_contents = set()
    unique_results = []
    for result in results:
        content = result["content"]
        # Track the content itself rather than hash(content): two different
        # values can share a hash, and comparing bare hashes would then drop a
        # result that is not actually a duplicate. The set hashes internally,
        # so membership checks remain constant time on average.
        if content not in seen_contents:
            seen_contents.add(content)
            unique_results.append(result)
    return unique_results

# Run every condition (and its alternative name, when present) through the index
# and save all result sets to output_file as one JSON list.
df = pd.read_csv(csv_file)
# fillna("") turns missing alternative names into empty strings, so `if query_alt` below skips them.
df = df[['Condition', 'Alternative Name', 'Category']].fillna("")

results_list = []
for i, row in df.iterrows():
    query = row['Condition']
    query_alt = row['Alternative Name']
    print(f"[{i+1}] Processing Query: {query}")
    
    # Main query
    results = search_index(query, ix)
    a = len(results)  # hit count before de-duplication, used only for the message below
    results = remove_duplicates(results)
    if len(results) < a: 
        print(f"Removed duplicates: {a - len(results)}")
    # Keep at most 100 de-duplicated results for the main query.
    results = results[:100]
    formatted_results = {
        "query": query,
        "results": [{"path": r["path"], "title": r["title"], "topic": r["topic"], 
                     "subtopic": r["subtopic"], "score": r["score"], 
                     "content": r["content"], "references": r["references"]} for r in results]
    }
    results_list.append(formatted_results)
    
    # Alternative query
    # NOTE(review): unlike the main query, these results are not capped at 100 and the
    # number of removed duplicates is not reported — confirm this difference is intended.
    if query_alt:
        print(f"Processing Alternative Query: {query_alt}")
        results = search_index(query_alt, ix)
        results = remove_duplicates(results)
        formatted_results = {
            "query": query_alt,
            "results": [{"path": r["path"], "title": r["title"], "topic": r["topic"], 
                         "subtopic": r["subtopic"], "score": r["score"], 
                         "content": r["content"], "references": r["references"]} for r in results]
        }
        results_list.append(formatted_results)

# Save results to JSON
# output_file is a relative path, so the file lands in the current working directory.
with open(output_file, "w") as json_file:
    json.dump(results_list, json_file, indent=4)
print(f"Results saved to {output_file}")


[1] Processing Query: Food Allergy
[2] Processing Query: Allergic Rhinitis
Removed duplicates: 14
Processing Alternative Query: Hay Fever
[3] Processing Query: Drug Allergies
Removed duplicates: 47
[4] Processing Query: Atopic Dermatitis
Removed duplicates: 48
Processing Alternative Query: Eczema
[5] Processing Query: Contact Dermatitis
Removed duplicates: 52
[6] Processing Query: Ankylosing Spondylitis
Removed duplicates: 43
[7] Processing Query: Gout
Removed duplicates: 58
[8] Processing Query: Guillain-Barre Syndrome
Removed duplicates: 18
[9] Processing Query: Multiple Sclerosis
Removed duplicates: 38
Processing Alternative Query: MS
[10] Processing Query: Osteoarthritis
Removed duplicates: 52
[11] Processing Query: Rheumatoid Arthritis
Removed duplicates: 25
[12] Processing Query: Sarcoidosis
Removed duplicates: 50
[13] Processing Query: Scleroderma
Removed duplicates: 80
[14] Processing Query: Systemic Lupus Erythematosus
Removed duplicates: 52
Processing Alternative Query: SLE
[

In [None]:
# 5. Process Queries and Save Results (path and score only)
# NOTE(review): this cell is a near-duplicate of the other query-processing cell in this
# notebook and writes to the same output_file, so running it replaces that file with
# entries that contain only 'path' and 'score'. It also uses whichever search_index
# definition ran last. Consider deleting it or giving it its own output path.
df = pd.read_csv(csv_file)
# fillna("") turns missing alternative names into empty strings, so `if query_alt` below skips them.
df = df[['Condition', 'Alternative Name', 'Category']].fillna("")

results_list = []
for i, row in df.iterrows():
    query = row['Condition']
    query_alt = row['Alternative Name']
    print(f"Processing Query: {query}")
    results = search_index(query, ix)
    formatted_results = {
        "query": query,
        "results": [{"path": result['path'], "score": result['score']} for result in results]
    }
    results_list.append(formatted_results)

    if query_alt:
        print(f"Processing Alternative Query: {query_alt}")
        results = search_index(query_alt, ix)
        formatted_results = {
            "query": query_alt,
            "results": [{"path": result['path'], "score": result['score']} for result in results]
        }
        results_list.append(formatted_results)

with open(output_file, "w") as json_file:
    json.dump(results_list, json_file, indent=4)
print(f"Results saved to {output_file}")

Processing Query: Food Allergy
Processing Query: Allergic Rhinitis
Processing Alternative Query: Hay Fever
Processing Query: Drug Allergies
Processing Query: Atopic Dermatitis
Processing Alternative Query: Eczema
Processing Query: Contact Dermatitis
Processing Query: Ankylosing Spondylitis
Processing Query: Gout
Processing Query: Guillain-Barre Syndrome
Processing Query: Multiple Sclerosis
Processing Alternative Query: MS
Processing Query: Osteoarthritis
Processing Query: Rheumatoid Arthritis
Processing Query: Sarcoidosis
Processing Query: Scleroderma
Processing Query: Systemic Lupus Erythematosus
Processing Alternative Query: SLE
Processing Query: Vasculitis
Processing Query: Ant Bites
Processing Query: Bedbug Bites
Processing Query: Flea Bites
Processing Query: Horsefly Bites
Processing Query: Mosquito Bites
Processing Query: Spider Bites
Processing Query: Tick Bites
Processing Query: Anemia
Processing Query: Hemophilia
Processing Query: Thrombocytopenia
Processing Alternative Query:

In [None]:
# Load the saved search results and print the first entry as a sanity check.
# NOTE(review): this reads an absolute path under PaperGeneration/data, while the cells that
# write results use the relative `output_file` — confirm both refer to the same file.
with open("/Users/vince/Salk/PaperGeneration/data/search_results_revised.json", 'r', encoding='utf-8') as file:
    search_results = json.load(file)
    print(search_results[0])
    

{'query': 'Food Allergy', 'results': [{'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate_markdownify/table-of-contents/allergy-and-immunology/food-allergy-and-food-intolerance/pathogenesis-of-food-allergy.json::chunk43', 'title': 'Pathogenesis of food allergy', 'topic': 'Allergy and immunology', 'subtopic': 'Food allergy and food intolerance', 'score': 54.42908898520413, 'content': 'FACTORS INFLUENCING SENSITIZATION OR TOLERANCE\nGenetics\nGenetic susceptibility plays a role in the development of food allergy. This has been demonstrated in mouse models by the strain-dependent variability in induction of oral tolerance or hypersensitivity [173,174].\nAn atopic predisposition is associated with an increased risk for food allergy. There is a significant familial aggregation of food allergy and food allergen sensitization [175]. The association is strongest among siblings: A child is over two-and-a-half times more likely to develop a food allergy if a sibling has a food allergy. Regar

In [56]:
# 'references' is saved as a JSON-encoded string (json.dumps in index_json_chunks),
# so it has to be decoded once more to get the list of reference strings back.
json.loads(search_results[0].get('results')[0].get('references'))

['[173] Li X, Huang CK, Schofield BH, et al. Strain-dependent induction of allergic sensitization caused by peanut allergen DNA immunization in mice. J Immunol 1999; 162:3045.',
 '[174] Morafo V, Srivastava K, Huang CK, et al. Genetic susceptibility to food allergy is linked to differential TH2-TH1 responses in C3H/HeJ and BALB/c mice. J Allergy Clin Immunol 2003; 111:1122.',
 '[175] Tsai HJ, Kumar R, Pongracic J, et al. Familial aggregation of food allergy and sensitization to food allergens: a family-based study. Clin Exp Allergy 2009; 39:101.',
 '[176] Howell WM, Turner SJ, Hourihane JO, et al. HLA class II DRB1, DQB1 and DPB1 genotypic associations with peanut allergy: evidence from a family-based and case-control study. Clin Exp Allergy 1998; 28:156.',
 '[177] Amoli MM, Hand S, Hajeer AH, et al. Polymorphism in the STAT6 gene encodes risk for nut allergy. Genes Immun 2002; 3:220.',
 '[178] Hourihane JO, Dean TP, Warner JO. Peanut allergy in relation to heredity, maternal diet, and

In [58]:
# Peek at the first three rows of the conditions CSV (the same path as `csv_file`).
# No fillna is applied here, so a missing 'Alternative Name' shows up as NaN.
conditions_df = pd.read_csv("/Users/vince/Salk/PaperGeneration/notebooks/condition_revised.csv")
for idx, row in conditions_df.head(3).iterrows():
    print(row)

Condition           Food Allergy
Alternative Name             NaN
Category               Allergies
Name: 0, dtype: object
Condition           Allergic Rhinitis
Alternative Name            Hay Fever
Category                    Allergies
Name: 1, dtype: object
Condition           Drug Allergies
Alternative Name               NaN
Category                 Allergies
Name: 2, dtype: object


In [34]:
from rich import print as rprint
import pprint as pp

def extract_top_unique_files(results, top_n=5):
    """
    Return the first ``top_n`` results that are unique by (file name, score).

    Results are scanned in order. Entries without a truthy 'path', or whose
    'score' is None, are skipped. Two entries count as duplicates only when
    both the base name of their path and their score are equal.

    NOTE(review): because the score is part of the key, the same file name
    reached through two different directories is kept twice whenever the two
    scores differ, even slightly — confirm that this is intended.

    Args:
        results (List[Dict]): Result dictionaries with 'path' and 'score' keys.
        top_n (int): Maximum number of results to return. Values <= 0 yield
            an empty list.

    Returns:
        List[Dict]: The selected result dictionaries (the original objects,
        not bare paths), in input order.
    """
    # Without this guard the length check below is never satisfied for
    # top_n <= 0 and every unique item would be returned.
    if top_n <= 0:
        return []

    seen = set()
    unique_files = []

    for item in results:
        path = item.get("path")
        score = item.get("score")
        if not path or score is None:
            continue

        key = (os.path.basename(path), score)
        if key in seen:
            continue

        seen.add(key)
        unique_files.append(item)
        if len(unique_files) >= top_n:
            break

    return unique_files


# Print how many query entries were collected, then the top 10 unique hits for the first 10 queries.
# Relies on `results_list` built by an earlier query-processing cell.
# NOTE(review): the recorded output lists .md paths, which do not match the ".json::chunk<i>"
# paths produced by the indexing cell — the output looks stale; re-run to refresh it.
print(len(results_list))
for result in results_list[:10]:
    # rprint(result)
    print(result['query'])
    pp.pprint(extract_top_unique_files(result['results'], 10))
    

371
Food Allergy
[{'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/allergy-and-immunology/pediatric-allergy/history-and-physical-examination-in-the-patient-with-possible-food-allergy.md',
  'score': 11.593656134767937},
 {'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/pediatrics/pediatric-allergy/history-and-physical-examination-in-the-patient-with-possible-food-allergy.md',
  'score': 11.593480239683114},
 {'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/allergy-and-immunology/pediatric-allergy/management-of-ige-mediated-food-allergy-an-overview.md',
  'score': 11.592376503805076},
 {'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/allergy-and-immunology/pediatric-allergy/diagnostic-evaluation-of-ige-mediated-food-allergy.md',
  'score': 11.588274052453869},
 {'path': '/Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/pediatrics/pediatric-allergy/diagnostic-evaluation-of-i