In [None]:
from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
import weaviate

In [None]:
# connect to your weaviate instance

from weaviate.embedded import EmbeddedOptions

# Client backed by Weaviate's embedded options (default EmbeddedOptions()).
client = weaviate.Client(embedded_options=EmbeddedOptions())


![alternative text](../docs/images/PXL_20230726_203549965.jpg)

## This is a pathology report containing some 40 pages of assorted blood work

In [None]:
# Load the documents found under the sample EMR directory.
path_data = SimpleDirectoryReader(
    '/Users/vinayak/projects/kaiser/data/mid_sample_emr//'
).load_data()

In [None]:
# Chunk the loaded documents (path_data) into nodes using a default SimpleNodeParser.
parser = SimpleNodeParser()
nodes = parser.get_nodes_from_documents(path_data)

In [None]:
from llama_index.vector_stores import WeaviateVectorStore
from llama_index import VectorStoreIndex, StorageContext
from llama_index.storage.storage_context import StorageContext


# Vector store on top of the Weaviate client created earlier.
vector_store = WeaviateVectorStore(
    weaviate_client=client,
    index_name="VK_Medium_Report",
    text_key="content",
)

# Storage context that holds the embeddings in that vector store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the index over the parsed nodes.
index = VectorStoreIndex(nodes, storage_context=storage_context)


In [None]:
# Classify the report into one of a fixed set of types.
# The same could also be done via enums in "class" code.
question = """
What kind of report is this? 
Your choices are pathology report, lab report, EMR records, genetic test report, clinical notes or radiology report.
"""
query_engine = index.as_query_engine()
response = query_engine.query(question)
print(response)
# Author's note: surprisingly, prompting alone is enough here.

In [None]:
# How did it reach its conclusion about the report type?
# Take the first source node of the response: its text is kept as the
# evidence and its metadata as the location.
top_source = response.source_nodes[0]
evidence, location = top_source.node.text, top_source.node.metadata
evidence, location

In [None]:
# Ask which PHI appears in the document so it can be masked out.
question = """
This is EMR data and I want to mask out all the PHI data from the documents. Can you give me the phi found in this document?
"""
query_engine = index.as_query_engine()
response = query_engine.query(question)
print(response)

In [None]:
# Same masking request, phrased in terms of HIPAA fields instead of PHI.
question = """
This is EMR data and I want to mask out all the HIPAA fields from the documents. Can you give me the HIPAA fields found in this document?
"""
query_engine = index.as_query_engine()
response = query_engine.query(question)
print(response)

In [None]:
# HIPAA-field masking request.
# NOTE(review): this cell repeats the HIPAA query of the preceding cell
# verbatim; consider keeping only one of the two.
question = """
This is EMR data and I want to mask out all the HIPAA fields from the documents. Can you give me the HIPAA fields found in this document?
"""
query_engine = index.as_query_engine()
response = query_engine.query(question)
print(response)

In [None]:
# Classify the report into one of a fixed set of types and also ask for the
# exact line that led to the answer.
# The same could also be done via enums in "class" code.
question = """
What kind of report is this? 
Your choices are pathology report, lab report, genetic test report, clinical notes or radiology report.
Also in a new line please give the exact line from the document which led you to the answer.
"""
query_engine = index.as_query_engine()
response = query_engine.query(question)
print(response)
# Author's note: surprisingly, prompting alone is enough here.

In [None]:
# Let's highlight the text in the PDF so the user can see the rationale for the decision

import fitz

def highlight_sentence(pdf_path, output_path, sentences):
    """Highlight every occurrence of the given sentences in a PDF and save a copy.

    Args:
        pdf_path: Path of the PDF to open.
        output_path: Path the annotated PDF is saved to.
        sentences: Iterable of strings to search for on every page. Entries
            that are empty or whitespace-only are skipped.

    Returns:
        None. The result is the file saved at ``output_path``.
    """
    # Callers typically build `sentences` by splitting a text chunk on
    # newlines, which yields blank entries; those carry nothing to search for.
    needles = [sentence for sentence in sentences if sentence and sentence.strip()]

    # The context manager closes the document even if searching or saving fails.
    with fitz.open(pdf_path) as doc:
        for needle in needles:
            for page in doc:
                # Add a highlight annotation for each instance found on the page
                for inst in page.search_for(needle):
                    page.add_highlight_annot(inst)

        # Save the PDF with the highlighted sentences
        doc.save(output_path, garbage=4, deflate=True, clean=True)

# Highlight each line of the evidence text in the report PDF and write the
# annotated copy to the temp folder.
sentences = evidence.split("\n")
pdf_path = "/Users/vinayak/projects/kaiser/data/normal_pathology_report/report-4d3e60c1fbf22977fa883241f5766cb5156c7f39.pdf"
output_path = "/Users/vinayak/projects/kaiser/data/temp/output.pdf"
highlight_sentence(pdf_path, output_path, sentences)


### Some harder queries

In [None]:
# Ask for a table of every test in the report, using the default query engine.
query_engine = index.as_query_engine()
response = query_engine.query("This is a lab report. You are an expert medical transcriber. Give me a table with all the tests")
print(response)
## Why does this not work???

In [None]:
# Ask for a 4-column table of all tests, retrieving the 10 most similar chunks.
# The retriever's top-k setting is named `similarity_top_k` (the same keyword
# VectorIndexRetriever takes); `k` is not that parameter.
query_engine = index.as_query_engine(similarity_top_k=10)
question = """
This is a lab report. You are an expert medical transcriber. 
For all the tests performed please return a 4 column table as test_name, date, lab and result. The result is either positive or negative.
"""
response = query_engine.query(question)
print(response)

## Why does this not work???

In [None]:
# Count the source nodes attached to `response`.
len(response.source_nodes)

In [None]:
# List the measured analytes, this time with the tree_summarize response mode
# and verbose output switched on.
query_engine = index.as_query_engine(
    response_mode='tree_summarize',
    verbose=True,
)
response = query_engine.query(
    "This is a lab report. You are an expert medical transcriber. Give me a list of all the analytes that were measured."
)
print(response)

## Why does this not work???

In [None]:
# Same request as before, but the prompt gives more context about the report.
# The prompt previously ended with a stray, unbalanced double quote left over
# from quoting; it is replaced by a full stop.
query_engine = index.as_query_engine()
question = """This is a lab blood report. Its has many entries and various sections about different tests conducted. 
Please give me the test, result, unit and reference interval in a table. Make sure you do not miss any test.
"""
response = query_engine.query(question)
print(response)

## CONCERN: It is missing some values that appear on other pages!

In [None]:
# Ask for a three-paragraph summary: patient, key observations, diagnosis.
question = """
Please summarize the document for me in 3 paragraphs. First, describing the patient, Second noting the key observations and Third the diagnosis.
"""
query_engine = index.as_query_engine()
response = query_engine.query(question)
print(response)

In [None]:
# Ask for the patient's name and date of birth.
question = """
What is the name of the patient and date of birth?"""
query_engine = index.as_query_engine()
response = query_engine.query(question)
print(response)

In [None]:
# Ask for a dated list of all hospital visits and tests in the document.
question = """
This document is a concatentaion of various visits to the hospital by the patient. Please give me a list of all the visits and tests along with the date"""
query_engine = index.as_query_engine()
response = query_engine.query(question)
print(response)

In [None]:
# Dated list of all hospital visits and tests in the document.
# NOTE(review): this cell repeats the visits query of the preceding cell
# verbatim; consider keeping only one of the two.
question = """
This document is a concatentaion of various visits to the hospital by the patient. Please give me a list of all the visits and tests along with the date"""
query_engine = index.as_query_engine()
response = query_engine.query(question)
print(response)

## Attempt to configure the retriever so I can get *all* results!

## Making it more complex, since we need the top 'n' results, not only the top 2

In [None]:
from llama_index.llms import OpenAI

# OpenAI completion model handle with temperature 0 and a 4000-token output cap.
# NOTE(review): no visible cell passes `llm` to the index or to a query engine,
# so this presumably does not affect the queries in this notebook — confirm.
# NOTE(review): max_tokens=4000 looks close to this model's total context
# window, which would leave little room for the prompt — verify the limit.
llm = OpenAI(model="text-davinci-003", temperature=0, max_tokens=4000)

In [None]:
from llama_index import (
    VectorStoreIndex,
    get_response_synthesizer,
)
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine


# Retriever over the index, asking for the 10 most similar nodes.
retriever = VectorIndexRetriever(index=index, similarity_top_k=10)

# Response synthesizer running in "refine" mode.
response_synthesizer = get_response_synthesizer(response_mode="refine")

# Query engine assembled from the two pieces above.
query_engine = RetrieverQueryEngine(retriever=retriever, response_synthesizer=response_synthesizer)

# `question` is not assigned in this cell; it keeps the value set by an
# earlier cell.
response = query_engine.query(question)
print(response)

In [None]:
# `res` was never defined, so the original call raised NameError on execution.
# Show the factory's signature and docstring instead, which lists the
# arguments it accepts.
help(get_response_synthesizer)

In [None]:
# Count the source nodes attached to `response`.
len(response.source_nodes)

In [None]:

# Print the metadata (as its repr) of every source node behind the response.
for source_node in response.source_nodes:
    print(f"From: {source_node.node.metadata!r}")
    

In [None]:
# Metadata of the first source node of `response`.
response.source_nodes[0].node.metadata


In [None]:
from llama_index.llms import OpenAI

In [None]:
# IPython `?` help lookup on OpenAI (interactive exploration; not plain Python syntax).
# NOTE(review): the `llm =` prefix looks unintended here — confirm before re-running.
llm = OpenAI?

In [None]:
# IPython `?` help lookup on VectorIndexRetriever.retrieve (interactive exploration; not plain Python syntax).
VectorIndexRetriever.retrieve?