In [1]:
from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
import weaviate

In [2]:
# connect to your weaviate instance

from weaviate.embedded import EmbeddedOptions

# Embedded mode: no server URL is given, only EmbeddedOptions with its defaults.
client = weaviate.Client(embedded_options=EmbeddedOptions())


Started /Users/vinayak/.cache/weaviate-embedded: process ID 15481


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2023-08-24T14:15:58-07:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2023-08-24T14:15:58-07:00"}
{"action":"hnsw_vector_cache_prefill","count":3000,"index_id":"kaiser_vk_data_XZ11Gl2OGZs4","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2023-08-24T14:15:58-07:00","took":2098403}
{"action":"hnsw_vector_cache_prefill","count":3000,"index_id":"kaiser_vk_hospital_encounters_data_dDYcXtmUpR47","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2023-08-24T14:15:58-07:00","took":4735684}
{"action":"hnsw_vector_cache_prefill","count":3000,"index_id":"normal_path_data_anthropic_MmeJlEhLODBs","level":"info","limit":1000000000000

![alternative text](../docs/images/PXL_20230726_203549965.jpg)

## This is a pathology report containing some 40 pages of assorted blood work

In [3]:
# Load everything under the sample EMR directory; the result feeds the node parser.
path_data = SimpleDirectoryReader(
    "/Users/vinayak/projects/kaiser/data/mid_sample_emr/"
).load_data()

In [4]:
# Split the loaded documents (`path_data`) into nodes; `nodes` is what the
# vector index is built from in a later cell.
parser = SimpleNodeParser()
nodes = parser.get_nodes_from_documents(path_data)

In [5]:
from llama_index.vector_stores import WeaviateVectorStore
from llama_index import VectorStoreIndex, StorageContext
from llama_index.storage.storage_context import StorageContext


# Vector store backed by the Weaviate client created earlier.
vector_store = WeaviateVectorStore(
    weaviate_client=client,
    index_name="Mid_emr_data",
    text_key="content",
)

# Storage context that routes storage through that vector store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the index over the parsed nodes.
index = VectorStoreIndex(nodes, storage_context=storage_context)


In [6]:
# Classify the report into one of a fixed set of types listed in the prompt.
# This could also be done via enums in "class" code.
query_engine = index.as_query_engine()
question = """
What kind of report is this? 
Your choices are pathology report, lab report, genetic test report, clinical notes or radiology report.
"""
response = query_engine.query(question)
print(response)
# Note: the classification comes from prompting alone, with no extra code.

This is a lab report.


In [8]:
# Ask for a table of all hospital visits in the EMR dump; the column names
# (provider_name, department, visit_date, visit_reason, visit_summary) are
# spelled out in the prompt itself.
query_engine = index.as_query_engine()
question = """
This is an EMR dump from the hospital and contains all my visits to the hospital in chronological order please make a table of all the visits with columns provider_name, department, visit_date, visit_reason, visit_summary
"""
response = query_engine.query(question)
print(response)
# Note: again this relies on prompting alone.


Provider Name | Department | Visit Date | Visit Reason | Visit Summary
Kulkarni, Vinayak V | INTERNAL MEDICINE | 4/20/2013 | Scheduled Telephone Encounter | Clinical Notes
Kulkarni, Vinayak V | INTERNAL MEDICINE | 8/28/2013 | Patient Secure Message | Messages


In [None]:
# Display the source nodes attached to the response; the next cell reads
# their text and metadata.
response.source_nodes

In [None]:
# How did it come to its conclusion? Pull the text and metadata of the first
# source node on `response`.
# NOTE(review): `response` is whatever query ran last, which is not necessarily
# the report-type question — confirm the intended cell order.
evidence, location = (
    response.source_nodes[0].node.text,
    response.source_nodes[0].node.metadata,
)
(evidence, location)

In [None]:
# Let's highlight the text in the PDF so the user can see the rationale for the decision

import fitz

def highlight_sentence(pdf_path, output_path, sentences):
    """Highlight every occurrence of the given sentences in a PDF and save a copy.

    Args:
        pdf_path: Path of the PDF to open with ``fitz.open``.
        output_path: Path the annotated PDF is saved to.
        sentences: Iterable of strings to search for on every page. Empty or
            whitespace-only entries are skipped.

    Returns:
        None. The result is the file written to ``output_path``.
    """
    doc = fitz.open(pdf_path)
    try:
        for sentence in sentences:
            # Splitting text on "\n" yields blank fragments; there is nothing
            # meaningful to search for in those.
            if not sentence or not sentence.strip():
                continue

            for page in doc:
                # Add one highlight annotation per match on this page.
                for inst in page.search_for(sentence):
                    page.add_highlight_annot(inst)

        # Save the PDF with the highlighted sentences.
        doc.save(output_path, garbage=4, deflate=True, clean=True)
    finally:
        # Release the document handle even if searching or saving fails.
        doc.close()

# Run the highlighter over the evidence text, one search string per line.
# NOTE(review): this PDF lives under normal_pathology_report/, while the index
# was loaded from mid_sample_emr/ — confirm the evidence belongs to this file.
pdf_path = "/Users/vinayak/projects/kaiser/data/normal_pathology_report/report-4d3e60c1fbf22977fa883241f5766cb5156c7f39.pdf"
output_path = "/Users/vinayak/projects/kaiser/data/temp/output.pdf"
sentences = evidence.split("\n")
highlight_sentence(
    pdf_path=pdf_path,
    output_path=output_path,
    sentences=sentences,
)


### Some harder queries

In [None]:
# One broad request through the default query engine 🚀
query_engine = index.as_query_engine()
response = query_engine.query(
    "Give me a table with all the tests, the result, unit and normal range."
)
print(response)

## Why does this not work???

In [None]:
# Same request, but the prompt now describes the document and narrows the
# question to the hematology section.
# `question` is reused by the retriever-based cells further down.
# NOTE(review): the prompt text ends with a stray double quote and contains
# "Its has"; left as-is because editing it changes what is sent to the model.
query_engine = index.as_query_engine()
question = """This is a pathology lab blood report. Its has various sections about different tests conducted. For the section on hematology, please give me the test, result, unit and reference interval in a table."
"""
response = query_engine.query(question)
print(response)

## CONCERN: It is missing some values that appear on other pages!

## Attempt to configure the retriever so I can get *all* results!

## Making it more complex, since we need the top 'n' results, not only the top 2

In [None]:
from llama_index.llms import OpenAI

# NOTE(review): `llm` is not passed to any query engine or synthesizer in the
# cells visible here, so it may have no effect on the queries below — confirm
# and wire it in if that was the intent.
# NOTE(review): max_tokens=4000 looks close to the full context window of
# text-davinci-003 — TODO confirm there is room left for the prompt.
llm = OpenAI(model="text-davinci-003", temperature=0, max_tokens=4000)

In [None]:
from llama_index import (
    VectorStoreIndex,
    get_response_synthesizer,
)
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine


# Retriever over the index, asking for the 10 most similar nodes.
retriever = VectorIndexRetriever(index=index, similarity_top_k=10)

# Synthesizer that uses the "tree_summarize" response mode.
response_synthesizer = get_response_synthesizer(response_mode="tree_summarize")

# Combine retriever and synthesizer into a query engine.
query_engine = RetrieverQueryEngine(
    retriever=retriever, response_synthesizer=response_synthesizer
)

# Re-run the prompt currently held in `question`.
response = query_engine.query(question)
print(response)

In [None]:
from llama_index import (
    VectorStoreIndex,
    get_response_synthesizer,
)
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine


# Same pipeline as the previous cell; only similarity_top_k differs (20 here).
retriever = VectorIndexRetriever(index=index, similarity_top_k=20)

# Synthesizer that uses the "tree_summarize" response mode.
response_synthesizer = get_response_synthesizer(response_mode="tree_summarize")

# Combine retriever and synthesizer into a query engine.
query_engine = RetrieverQueryEngine(
    retriever=retriever, response_synthesizer=response_synthesizer
)

# Re-run the prompt currently held in `question`.
response = query_engine.query(question)
print(response)

In [None]:
# Count the source nodes on the last response, to compare against the
# similarity_top_k requested from the retriever.
len(response.source_nodes)