In [1]:
FILE = "../dataset/Symptom2Disease.csv"

In [2]:
import pandas as pd

df_s2d = pd.read_csv(
    FILE,
    index_col=0
)

In [3]:
df_s2d.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
Acne,50
Arthritis,50
Bronchial Asthma,50
Cervical spondylosis,50
Chicken pox,50
Common Cold,50
Dengue,50
Dimorphic Hemorrhoids,50
Fungal infection,50
Hypertension,50


In [12]:
from datasets import load_dataset

ds = load_dataset("qiaojin/PubMedQA", "pqa_artificial")
ds2 = load_dataset("qiaojin/PubMedQA", "pqa_labeled")


In [13]:
df = ds['train'].to_pandas()
df2 = ds2['train'].to_pandas()

In [16]:
combined_df = pd.concat([df, df2], ignore_index=True).drop_duplicates(subset=['pubid'])
combined_df.shape

(212269, 5)

In [21]:
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter


docs = []
for _, row in combined_df.iterrows():
    doc = Document(
        page_content=f"Question: {row['question']}\nContext: {row['context']}\nAnswer: {row.get('answer', row.get('long_answer', ''))}\nDecision: {row.get('final_decision', 'N/A')}",
        metadata={"pubid": row['pubid'], "decision": row['final_decision']}
    )
    docs.append(doc)

print(f"Documents created: {len(docs)}")  


splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
    separators=["\n\n", "\n", ". ", " ", ""],
    keep_separator=True
)

chunks = splitter.split_documents(docs) 
print(f"Chunks created: {len(chunks)}")  


Documents created: 212269
Chunks created: 693012


In [26]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'mps'}
)
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="chroma_index"  
)


In [27]:
store = Chroma(persist_directory="chroma_index", embedding_function=embeddings)

In [29]:
results = store.similarity_search("fever and headache", k=3)
for doc in results:
    print(f"Score: {doc.metadata}, Text: {doc.page_content[:100]}...")

Score: {'decision': 'yes', 'pubid': 21797861}, Text: 'Forty-one patients (46.6%) reported onset of headaches just after treatment. Headache incidences we...
Score: {'pubid': 20457341, 'decision': 'yes'}, Text: dtype=object)}
Answer: These findings imply that, although fever is not generally associated with mo...
Score: {'pubid': 16010060, 'decision': 'yes'}, Text: Question: Is fever associated with third ventricular shift after intracerebral hemorrhage : pathophy...


In [30]:
from langchain_community.retrievers import PubMedRetriever

In [32]:
retriever = PubMedRetriever(
    top_k_results=5,              
    load_all_available_meta=False,
    doc_content_chars_max=3000    
)

In [35]:
docs = retriever.invoke("fever and headache symptoms")
print("Retrieved docs:")
for i, doc in enumerate(docs):
    print(f"Doc {i+1}: {doc.page_content[:200]}...")
    print(f"Metadata: {doc.metadata}\n")

Retrieved docs:
Doc 1: 3
Mycobacterium tuberculosis
A 22-year-old man presented with headache, night sweats, intermittent fever, tremors, sleep disturbances, agitation, and hallucinations for 2 months. Thoracic computed tom...
Metadata: {'uid': '41221143', 'Title': 'Tuberculous Meningitis With Paradoxical Reaction in an Immunocompetent Young Male Treated by Interleukin-1 Receptor Antagonist.', 'Published': '2025-11-03', 'Copyright Information': 'Copyright © 2025 Azade Kanat et al. Case Reports in Infectious Diseases published by John Wiley & Sons Ltd.'}

Doc 2: BACKGROUND: Coccidioidomycosis is a fungal infection endemic to the southwestern United States. Central nervous system (CNS) coccidioidomycosis is a severe manifestation that requires lifelong antifun...
Metadata: {'uid': '41217140', 'Title': 'Intravenous Liposomal Amphotericin as an Adjunct to Fluconazole in Pediatric Patients with Central Nervous System Coccidioidomycosis: A Single-Center Case Series.', 'Published': '2025-11-1

In [34]:
import langchain, langchain_community
print(langchain.__version__, langchain_community.__version__)


1.0.3 0.4.1


In [36]:
def fetch_ctx(q: str, k: int = 5) -> str:
    docs = retriever.invoke(q)  
    ctx = []
    for d in docs[:k]:
        title = d.metadata.get("title") or ""
        src = d.metadata.get("source") or d.metadata.get("doi") or d.metadata.get("pmid")
        ctx.append(f"TITLE: {title}\n{d.page_content}\nSOURCE: {src}")
    return "\n\n---\n\n".join(ctx)