In [1]:
import os
import json
import requests
import pandas as pd
import logging
from sentence_transformers import SentenceTransformer
import chromadb
from rich.console import Console
from rich.panel import Panel
import re
from dotenv import load_dotenv

load_dotenv()

console = Console()

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class Retriever:
    def __init__(self, data_directory):
        self.data_directory = data_directory
        self.patient_data_files = {
            "patients": "patients.csv",
            "conditions": "conditions.csv",
            "medications": "medications.csv",
            "observations": "observations.csv",
            "procedures": "procedures.csv",
            "encounters": "encounters.csv",
            "imaging_studies": "imaging_studies.csv",
            "allergies": "allergies.csv",
            "careplans": "careplans.csv",
            "devices": "devices.csv",
            "immunizations": "immunizations.csv",
            "organizations": "organizations.csv",
            "payer_transitions": "payer_transitions.csv",
            "payers": "payers.csv",
            "providers": "providers.csv",
        }
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='mps')  # Fast on Mac M1/M2, or use 'cuda' on GPU
        self.chroma_client = chromadb.PersistentClient(path="./chromadb")
        self.collection = self.chroma_client.get_or_create_collection(name="patient_data")

        if not os.path.exists(self.data_directory):
            raise FileNotFoundError(f"Error: The directory {self.data_directory} does not exist.")

    def fetch_patient_history(self, patient_id):
        console.print(Panel("[bold cyan]Step 1: Retrieval[/bold cyan]"))
        logging.info("Searching for patient data in multiple sources...")

        patient_data = {key: [] for key in self.patient_data_files.keys()}

        for key, filename in self.patient_data_files.items():
            file_path = os.path.join(self.data_directory, filename)
            if os.path.exists(file_path):
                df = pd.read_csv(file_path, chunksize=50000)  # Large datasets optimized
                records = []
                for chunk in df:
                    chunk.columns = chunk.columns.str.upper()
                    if "PATIENT" in chunk.columns:
                        filtered = chunk[chunk["PATIENT"] == patient_id]
                    elif "ID" in chunk.columns and key == "patients":
                        filtered = chunk[chunk["ID"] == patient_id]
                    else:
                        filtered = chunk

                    records.extend(filtered.to_dict(orient='records'))
                patient_data[key] = records

        self.prepare_text_chunks(patient_data)
        return patient_data

    def prepare_text_chunks(self, patient_data):
        # Instead of deleting entire collection, we upsert data to save time
        ids_to_upsert = []
        embeddings = []
        documents = []
        metadatas = []

        for key, records in patient_data.items():
            for idx, record in enumerate(records):
                chunk_text = f"{key}: {json.dumps(record, ensure_ascii=False)[:500]}"  # Limit JSON size
                embedding = self.embedding_model.encode(chunk_text, show_progress_bar=False).tolist()
                ids_to_upsert.append(f"{key}_{idx}")
                embeddings.append(embedding)
                documents.append(chunk_text)
                metadatas.append({"key": key})

        # Upsert all at once (bulk insert is faster)
        if ids_to_upsert:
            self.collection.upsert(
                ids=ids_to_upsert,
                embeddings=embeddings,
                documents=documents,
                metadatas=metadatas
            )

    def retrieve_relevant_chunks(self, query, top_n=5):
        query_embedding = self.embedding_model.encode(query, show_progress_bar=False).tolist()
        results = self.collection.query(query_embeddings=[query_embedding], n_results=top_n)
        relevant_chunks = results.get('documents', [[]])[0] if 'documents' in results else []
        return "\n".join(relevant_chunks)


class Validator:
    def validate(self, patient_data):
        logging.info("Validating data consistency...")
        issues = []
        for key, records in patient_data.items():
            for record in records:
                if 'DESCRIPTION' in record and not record['DESCRIPTION']:
                    issues.append(f"Missing description in {key}")
        if issues:
            console.print(Panel(f"[red]Validation Issues Found: {issues}[/red]"))
        return issues


class Analyzer:
    def analyze_history(self, patient_data, query):
        logging.info("Analyzing patient history based on query...")
        return query


class Contextualizer:
    def add_context(self, analysis):
        logging.info("Adding medical context to the analysis...")
        return analysis + " | Contextualized insights based on medical guidelines."


class DecisionSupportAgent:
    def suggest_actions(self, analysis):
        logging.info("Suggesting potential next steps...")
        return analysis + " | Suggested next steps: Review medication plan, Order blood test."


class Presenter:
    def generate_summary(self, relevant_text, query):
        console.print(Panel("[bold cyan]Generating Summary...[/bold cyan]"))
        logging.info("Generating AI-enhanced summary...")
        try:
            url = "https://api.groq.com/openai/v1/chat/completions"
            

            GROQ_API_KEY = os.getenv("GROQ_API_KEY")
            headers = {"Authorization": f"Bearer {GROQ_API_KEY}", "Content-Type": "application/json"}
        
            payload = {
                "model": "deepseek-r1-distill-llama-70b",
                "messages": [
                    {"role": "system", "content": "You are a medical assistant. Provide a concise and accurate summary based on the patient history. Do not explain your thought process."},
                    {"role": "user", "content": f"Relevant patient data: {relevant_text[:3000]}. Query: {query}"}
                ]
            }
            response = requests.post(url, headers=headers, json=payload)
            response_json = response.json()

            summary_raw = response_json.get("choices", [{}])[0].get("message", {}).get("content", "AI response error")
            summary_cleaned = re.sub(r"<think>.*?</think>", "", summary_raw, flags=re.DOTALL).strip()

            console.print(Panel(f"[bold green]{summary_cleaned}[/bold green]"))
        except Exception as e:
            logging.error(f"AI enhancement failed: {e}")


class PatientHistoryAnalyzer:
    def __init__(self, data_directory):
        self.retriever = Retriever(data_directory)
        self.validator = Validator()
        self.analyzer = Analyzer()
        self.contextualizer = Contextualizer()
        self.decision_support = DecisionSupportAgent()
        self.presenter = Presenter()

    def process_query(self, patient_id, query):
        patient_data = self.retriever.fetch_patient_history(patient_id)
        self.validator.validate(patient_data)

        relevant_text = self.retriever.retrieve_relevant_chunks(query)

        analysis = self.analyzer.analyze_history(patient_data, query)
        analysis_with_context = self.contextualizer.add_context(analysis)
        suggestions = self.decision_support.suggest_actions(analysis_with_context)

        self.presenter.generate_summary(relevant_text, suggestions)


if __name__ == "__main__":
    data_directory = "./data/"
    analyzer = PatientHistoryAnalyzer(data_directory)
    patient_id = input("Enter patient ID: ")
    while True:
        query = input("Enter your medical query (or type 'exit' to quit): ")
        if query.lower() == "exit":
            break
        analyzer.process_query(patient_id, query)


2025-02-16 20:21:48,290 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-02-16 20:21:49,951 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


Enter patient ID: cfee79fc-df05-476e-b274-43e09ea345db
Enter your medical query (or type 'exit' to quit): List all past medical encounters


2025-02-16 20:22:33,126 - INFO - Searching for patient data in multiple sources...
2025-02-16 20:25:07,362 - INFO - Validating data consistency...
2025-02-16 20:25:07,709 - INFO - Analyzing patient history based on query...
2025-02-16 20:25:07,710 - INFO - Adding medical context to the analysis...
2025-02-16 20:25:07,710 - INFO - Suggesting potential next steps...


2025-02-16 20:25:07,711 - INFO - Generating AI-enhanced summary...


Enter your medical query (or type 'exit' to quit): Has the patient been prescribed any pain management medications?


2025-02-16 20:29:37,212 - INFO - Searching for patient data in multiple sources...
2025-02-16 20:32:16,783 - INFO - Validating data consistency...
2025-02-16 20:32:17,638 - INFO - Analyzing patient history based on query...
2025-02-16 20:32:17,643 - INFO - Adding medical context to the analysis...
2025-02-16 20:32:17,643 - INFO - Suggesting potential next steps...


2025-02-16 20:32:17,653 - INFO - Generating AI-enhanced summary...


Enter your medical query (or type 'exit' to quit): What follow-up appointments are scheduled for the patient?


2025-02-16 20:36:41,255 - INFO - Searching for patient data in multiple sources...
2025-02-16 20:39:49,190 - INFO - Validating data consistency...
2025-02-16 20:39:50,064 - INFO - Analyzing patient history based on query...
2025-02-16 20:39:50,065 - INFO - Adding medical context to the analysis...
2025-02-16 20:39:50,065 - INFO - Suggesting potential next steps...


2025-02-16 20:39:50,070 - INFO - Generating AI-enhanced summary...


Enter your medical query (or type 'exit' to quit): Show recent lab results for the patient


2025-02-16 20:42:04,881 - INFO - Searching for patient data in multiple sources...
2025-02-16 20:45:06,577 - INFO - Validating data consistency...
2025-02-16 20:45:07,276 - INFO - Analyzing patient history based on query...
2025-02-16 20:45:07,277 - INFO - Adding medical context to the analysis...
2025-02-16 20:45:07,278 - INFO - Suggesting potential next steps...


2025-02-16 20:45:07,285 - INFO - Generating AI-enhanced summary...


Enter your medical query (or type 'exit' to quit): List all medications prescribed to the patient


2025-02-16 20:47:02,751 - INFO - Searching for patient data in multiple sources...
2025-02-16 20:49:26,446 - INFO - Validating data consistency...
2025-02-16 20:49:26,778 - INFO - Analyzing patient history based on query...
2025-02-16 20:49:26,780 - INFO - Adding medical context to the analysis...
2025-02-16 20:49:26,780 - INFO - Suggesting potential next steps...


2025-02-16 20:49:26,786 - INFO - Generating AI-enhanced summary...


Enter your medical query (or type 'exit' to quit): Give me a summary of the patient's medical history


2025-02-16 20:51:54,448 - INFO - Searching for patient data in multiple sources...
2025-02-16 20:54:28,133 - INFO - Validating data consistency...
2025-02-16 20:54:28,384 - INFO - Analyzing patient history based on query...
2025-02-16 20:54:28,385 - INFO - Adding medical context to the analysis...
2025-02-16 20:54:28,385 - INFO - Suggesting potential next steps...


2025-02-16 20:54:28,387 - INFO - Generating AI-enhanced summary...


Enter your medical query (or type 'exit' to quit): List all past hospital admissions


2025-02-16 21:03:20,430 - INFO - Searching for patient data in multiple sources...
2025-02-16 21:05:32,613 - INFO - Validating data consistency...
2025-02-16 21:05:32,750 - INFO - Analyzing patient history based on query...
2025-02-16 21:05:32,750 - INFO - Adding medical context to the analysis...
2025-02-16 21:05:32,751 - INFO - Suggesting potential next steps...


2025-02-16 21:05:32,754 - INFO - Generating AI-enhanced summary...


Enter your medical query (or type 'exit' to quit): Does the patient have any cardiovascular risk factors?


2025-02-16 21:08:01,859 - INFO - Searching for patient data in multiple sources...
2025-02-16 21:10:12,304 - INFO - Validating data consistency...
2025-02-16 21:10:12,807 - INFO - Analyzing patient history based on query...
2025-02-16 21:10:12,808 - INFO - Adding medical context to the analysis...
2025-02-16 21:10:12,808 - INFO - Suggesting potential next steps...


2025-02-16 21:10:12,812 - INFO - Generating AI-enhanced summary...


Enter your medical query (or type 'exit' to quit): exit
