In [5]:
import os 
import pandas as pd
import nltk
import sqlite3
nltk.download('punkt')  # Required for word_tokenize


[nltk_data] Downloading package punkt to /Users/vince/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
data_dir = '/Users/vince/Salk/PaperGeneration/data'

df = pd.read_excel(os.path.join(data_dir, 'Conditions.xlsx'))
df = df[['Condition', 'Disease type']]
df

Unnamed: 0,Condition,Disease type
0,Mosquito Bites,Bites
1,Tick Bites,Bites
2,Spider Bites,Bites
3,Bedbug Bites,Bites
4,Flea Bites,Bites
...,...,...
266,Strabismus,Vision
267,Amblyopia,Vision
268,Visual Migraine (Ocular Migraine),Vision
269,Photophobia,Vision


In [1]:
import os

def index_markdown_files(base_dir, batch_size=1000):
    index = []
    batch = []
    for root, _, files in os.walk(base_dir):
        for file in files:
            if file.endswith(".md"):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                batch.append({"path": file_path, "content": content})
                
                if len(batch) >= batch_size:
                    index.extend(batch)
                    batch = []  # Clear the batch to save memory
    
    if batch:  # Add any remaining files
        index.extend(batch)
    return index

base_dir = "/Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents"
markdown_index = index_markdown_files(base_dir)


In [2]:
import sqlite3

def create_index_db(db_path, markdown_index):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    
    # Create table for indexing
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS markdown_files (
            id INTEGER PRIMARY KEY,
            path TEXT UNIQUE,
            content TEXT
        )
    """)
    
    # Insert markdown files into the database
    for entry in markdown_index:
        cursor.execute("""
            INSERT OR IGNORE INTO markdown_files (path, content)
            VALUES (?, ?)
        """, (entry['path'], entry['content']))
    
    conn.commit()
    conn.close()

db_path = "markdown_index.db"
create_index_db(db_path, markdown_index)


In [6]:
def query_index(db_path, query):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    
    cursor.execute("""
        SELECT path, content FROM markdown_files
        WHERE content LIKE ?
        LIMIT 10
    """, (f"%{query}%",))
    
    results = cursor.fetchall()
    conn.close()
    return results

db_path = "markdown_index.db"
query = "Tooth Sensitivity"
results = query_index(db_path, query)
for path, content in results:
    print(f"Path: {path}\nContent: {content[:200]}...\n")


Path: /Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/rheumatology/sjogrens-disease/treatment-of-dry-mouth-and-other-non-ocular-sicca-symptoms-in-sjogrens-disease.md
Content: # Treatment of dry mouth and other non-ocular sicca symptoms in Sjögren's disease

## INTRODUCTION

Sjögren's disease (SjD) is a chronic multisystem inflammatory disorder characterized by lymphocytic ...

Path: /Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/rheumatology/pediatric-rheumatology/treatment-of-dry-mouth-and-other-non-ocular-sicca-symptoms-in-sjogrens-disease.md
Content: # Treatment of dry mouth and other non-ocular sicca symptoms in Sjögren's disease

## INTRODUCTION

Sjögren's disease (SjD) is a chronic multisystem inflammatory disorder characterized by lymphocytic ...

Path: /Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/patient-education/childrens-health/tooth-decay-in-children-the-basics.md
Content: # Patient education: Tooth decay in children (The 

In [5]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import pickle

def create_bm25_index(markdown_index):
    # Tokenize content for BM25
    tokenized_docs = [word_tokenize(entry['content'].lower()) for entry in markdown_index]
    bm25 = BM25Okapi(tokenized_docs)
    
    # Save precomputed BM25 index
    with open("bm25_index.pkl", "wb") as f:
        pickle.dump((bm25, markdown_index), f)

create_bm25_index(markdown_index)


In [12]:
import pickle
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import pickle

def search_bm25(query, bm25_index_path, top_n=100):
    with open(bm25_index_path, "rb") as f:
        bm25, markdown_index = pickle.load(f)
    
    tokenized_query = word_tokenize(query.lower())
    scores = bm25.get_scores(tokenized_query)
    ranked_results = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)[:top_n]
    
    results = [{"path": markdown_index[i]["path"], "score": score} for i, score in ranked_results]
    return results

query = "Tooth Sensitivity"
results = search_bm25(query, "bm25_index.pkl")
for result in results:
    print(f"Path: {result['path']}, Score: {result['score']}")


Path: /Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/patient-education/dental-health/tooth-decay-in-adults-the-basics.md, Score: 11.105324269489078
Path: /Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/patient-education/adult-general-health/tooth-decay-in-adults-the-basics.md, Score: 11.105324269489078
Path: /Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/patient-education/childrens-health/tooth-decay-in-children-the-basics.md, Score: 11.043121658056481
Path: /Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/patient-education/dental-health/tooth-decay-in-children-the-basics.md, Score: 11.043121658056481
Path: /Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/patient-education/dental-health/toothache-the-basics.md, Score: 11.012488299062513
Path: /Users/vince/Salk/NeuroCircadia/data/uptodate/table-of-contents/primary-care-adult/primary-care-infectious-disease/epidemiology-pathogenesis-and-clinical-manifestation

In [13]:
import json

# Function to save results to JSON
def save_results_to_json(queries, output_file, bm25_index_path):
    results_list = []  # To store results for all queries

    for i, row in queries.iterrows():
        query = row['Condition']
        print(f"Processing Query: {query}")
        
        # Fetch results
        results = search_bm25(query, bm25_index_path)
        
        # Format results for JSON
        formatted_results = {
            "query": query,
            "results": [{"path": result['path'], "score": result['score']} for result in results]
        }
        results_list.append(formatted_results)
        

    # Save to JSON
    with open(output_file, "w") as json_file:
        json.dump(results_list, json_file, indent=4)
    print(f"Results saved to {output_file}")

# Call the function
output_file = "search_results.json"
bm25_index_path = "bm25_index.pkl"
save_results_to_json(df, output_file, bm25_index_path)


Processing Query: Mosquito Bites
Processing Query: Tick Bites
Processing Query: Spider Bites
Processing Query: Bedbug Bites
Processing Query: Flea Bites
Processing Query: Horsefly Bites
Processing Query: Ant Bites
Processing Query: Anemia
Processing Query: Stroke
Processing Query: Anxiety
Processing Query: Bipolar syndrome
Processing Query: PTSD
Processing Query: Postpartum depression
Processing Query: Autism
Processing Query: Autism Spectrum Disorder
Processing Query: Asperger's Syndrome
Processing Query: ADHD
Processing Query: Seasonal Affective Disorders
Processing Query: ICU delirium
Processing Query: Panic disorder
Processing Query: Loneliness
Processing Query: Lung cancer
Processing Query: Prostate cancer
Processing Query: Colorectal cancer
Processing Query: Liver cancer
Processing Query: Breast cancer
Processing Query: Cervical Cancer
Processing Query: Ovarian Cancer
Processing Query: Melanoma
Processing Query: Squamous Cell Carcinoma
Processing Query: Basal Cell Carcinoma
Proce

In [None]:

# Step 1: Craft a detailed outline
def generate_outline(topic: str, model: GenerativeModel) -> str:
    prompt = f"""
    Craft a detailed outline for an article on the topic: '{topic}'. 
    Ensure the outline includes:
    - A structured introduction
    - Major sections with subsections
    - A logical flow of ideas
    """
    response = model.predict(prompt=prompt)
    return response.text