In [2]:
import os
import pandas as pd

# Raw patient CSV that is passed to ingest_synthea_csv further down in this cell.
DATA_CSV = "../Data/raw/synthea_patients.csv"         
# Folder that receives one .txt file per patient row.
PATIENT_TEXT_PATH = "../Data/processed/patient_text"  
# Second name for the same folder; the Pathway cell builds its file glob from DATA_PATH.
DATA_PATH=PATIENT_TEXT_PATH
# Create the output folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(PATIENT_TEXT_PATH, exist_ok=True)

def ingest_synthea_csv(csv_file, output_folder):
    """Write one plain-text summary file per patient row of a CSV export.

    Args:
        csv_file: Path to a CSV containing the columns ``id``, ``name``,
            ``gender``, ``birthDate``, ``conditions`` and ``medications``.
        output_folder: Directory that receives ``<id>.txt`` for every row.
            It is created (including parents) if it does not exist yet, so
            the function no longer depends on the caller having made it.

    Raises:
        KeyError: If any of the required columns is missing from the CSV.
    """
    required_columns = (
        "id", "name", "gender", "birthDate", "conditions", "medications",
    )
    df = pd.read_csv(csv_file)

    # Fail once, up front, with the full list of missing columns instead of
    # a bare per-row KeyError naming only the first one encountered.
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise KeyError(f"{csv_file} is missing required column(s): {missing}")

    os.makedirs(output_folder, exist_ok=True)

    for _, row in df.iterrows():
        patient_id = row["id"]
        # The literal's leading newline and per-line indentation are written
        # to the file exactly as they appear here.
        patient_text = f"""
        Patient ID: {row['id']}
        Name: {row['name']}
        Gender: {row['gender']}
        Birth Date: {row['birthDate']}
        Conditions: {row['conditions']}
        Medications: {row['medications']}
        """
        # NOTE(review): the id is used verbatim as the file name - confirm ids
        # never contain path separators.
        filename = os.path.join(output_folder, f"{patient_id}.txt")
        with open(filename, "w", encoding="utf-8") as f:
            f.write(patient_text)

# Materialise the per-patient text files from the raw CSV.
ingest_synthea_csv(csv_file=DATA_CSV, output_folder=PATIENT_TEXT_PATH)


In [None]:
import os
import pathway as pw
from pathway.stdlib.indexing import BruteForceKnnFactory, HybridIndexFactory
from pathway.stdlib.indexing.bm25 import TantivyBM25Factory
from pathway.xpacks.llm import embedders, llms, parsers, splitters
from pathway.xpacks.llm.document_store import DocumentStore
from pathway.xpacks.llm.question_answering import BaseRAGQuestionAnswerer
from pathway.udfs import DiskCache
from dotenv import load_dotenv

# Load environment variables (python-dotenv) before any component reads them.
load_dotenv()
hf_token = os.getenv("HF_API_TOKEN")  # NOTE(review): not referenced again in this cell

# --- Input Source ---
# Read every patient text file under DATA_PATH as raw bytes, with file metadata.
folder = pw.io.fs.read(f"{DATA_PATH}/*.txt", format="binary", with_metadata=True)
sources = [folder]

# --- Pipeline Components ---
parser = parsers.UnstructuredParser()
text_splitter = splitters.TokenCountSplitter(min_tokens=50, max_tokens=200)
embedder = embedders.GeminiEmbedder(cache_strategy=DiskCache())  # Example non-OpenAI embedder

# Hybrid retrieval: keyword (BM25) index combined with an embedding KNN index.
index = HybridIndexFactory([
    TantivyBM25Factory(),
    BruteForceKnnFactory(embedder=embedder),
])

document_store = DocumentStore(
    docs=sources,
    parser=parser,
    splitter=text_splitter,
    retriever_factory=index
)

# Caching is configured through the constructor, the same way as for the
# embedder above, rather than by assigning an ad-hoc `cache` attribute later.
# NOTE(review): LiteLLM may expect a provider prefix on the model name
# (e.g. "gemini/...") depending on which Gemini API is used - confirm.
llm = llms.LiteLLMChat(
    model="gemini-pro",  # or "mistralai/Mistral-7B-Instruct-v0.2"
    cache_strategy=DiskCache(),
)

rag_app = BaseRAGQuestionAnswerer(
    llm=llm,
    indexer=document_store,
)

# --- HTTP Serving Route ---
# NOTE(review): `get_relevant_documents` and `page_content` look like the
# LangChain retriever API; confirm that DocumentStore exposes them before
# relying on this route.
@rag_app.serve_callable(route="/patient_query")
async def patient_query(query: str) -> str:
    """Return the retrieved documents for ``query`` joined into one text reply."""
    response = rag_app.indexer.get_relevant_documents(query)
    patient_info = "\n\n".join([doc.page_content for doc in response])
    return f"Query: {query}\n\nRelevant Patients:\n{patient_info}"

# --- Run Server ---
# No bare pw.run() here: in a notebook `__name__ == "__main__"` is always true,
# so it would execute before the HTTP server is built. The pipeline is started
# by `rag_app.run_server(...)` in the serving cell instead.


AttributeError: 'TokenCountSplitter' object has no attribute 'apply'

In [None]:
# Address the REST server listens on: every interface, port 8000.
pathway_host = "0.0.0.0"
pathway_port = 8000

# Attach the HTTP server to the RAG app, then start it.
# threaded=True presumably keeps the notebook cell from blocking - TODO confirm.
rag_app.build_server(pathway_host, pathway_port)
rag_app.run_server(threaded=True)



In [6]:
from pathway.xpacks.llm import embedders
import pickle

# NOTE(review): this rebinds `embedder`, the name the pipeline cell used for the
# embedder it handed to BruteForceKnnFactory.
embedder = embedders.SentenceTransformerEmbedder("all-MiniLM-L6-v2")

# Precompute embeddings
# NOTE(review): `sources` was defined earlier as `[folder]`, i.e. a one-element
# list holding the result of pw.io.fs.read, so this loop runs once over that
# object rather than once per document. Confirm that it exposes `page_content`
# and that the embedder offers an `embed` method.
all_embeddings = []
for doc in sources:
    print("1")  # prints the literal "1" once per iteration
    embedding = embedder.embed(doc.page_content)
    all_embeddings.append(embedding)

# Save to disk
# Serialises the collected list into embeddings.pkl in the current working directory.
with open("embeddings.pkl", "wb") as f:
    pickle.dump(all_embeddings, f)


KeyboardInterrupt: 