In [None]:
from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
import weaviate

In [None]:
# Connect to Weaviate. The client is configured through EmbeddedOptions(),
# i.e. Weaviate's embedded mode, rather than the URL of a separately hosted instance.

from weaviate.embedded import EmbeddedOptions

client = weaviate.Client(
  embedded_options=EmbeddedOptions()
)


![alternative text](../docs/images/PXL_20230726_203549965.jpg)

## Extracting data from a scanned pathology report image

In [None]:
# Read the scanned-report directory with SimpleDirectoryReader and keep the loaded documents.
# NOTE(review): absolute, machine-specific path — point it at your own copy of the data.
surgery_image = SimpleDirectoryReader('/Users/vinayak/projects/kaiser/data/tcga_scanned_image/').load_data()

In [None]:
surgery_image
# Parsing has a few issues because some of the text is garbled, e.g. the date of birth is missing a slash,
# "tubule" is parsed as "Tuelle",
# and "score" shows up as "soce".

In [None]:
# Split the loaded report documents into nodes (chunks) with the default SimpleNodeParser.
parser = SimpleNodeParser()
nodes = parser.get_nodes_from_documents(surgery_image)

In [None]:
from llama_index.vector_stores import WeaviateVectorStore
from llama_index import VectorStoreIndex, StorageContext
# NOTE(review): StorageContext is already imported on the line above; this second
# import rebinds the same name and looks redundant.
from llama_index.storage.storage_context import StorageContext


# Construct the vector store on top of the Weaviate client created earlier,
# using the "Surgery_Img_Scanned" index name and "content" as the text key.
vector_store = WeaviateVectorStore(weaviate_client = client, index_name="Surgery_Img_Scanned", text_key="content")

# Storage context that uses the vector store above for the embeddings.
storage_context = StorageContext.from_defaults(vector_store = vector_store)

# Build the index over the parsed nodes with that storage context.
index = VectorStoreIndex(nodes, storage_context = storage_context)


In [None]:
# Simple one question answer 🚀
query_engine = index.as_query_engine()
response = query_engine.query("Who is this report about? What is it about?")
print(response)

In [None]:
#    Larger extraction 🚀
##   Trying to get semantic search by asking for sex instead of gender
###  Hoping it does NOT hallucinate when asked for something that does not exist!

query_engine = index.as_query_engine()
response = query_engine.query("Please give me the patient name, age, sex, social security number, examining doctor, location, date of birth, height and report date. Please double check your work carefully. If any of these items are not present please say NA.")
print(response)

In [None]:
# Summarize just the diagnosis 🚀
query_engine = index.as_query_engine()
response = query_engine.query("What is the diagnosis?")
print(response)


## CONCERN: It missed some details — it picked up only one diagnosis and skipped the 3 others!

In [None]:
# Ask for the details of all the diagnoses 🚀
query_engine = index.as_query_engine()
response = query_engine.query("Give me a detail of all the diagnosis in the document?")
print(response)

## MILD CONCERN? Looks like now it got all the details and was able to reference 
## "please see syntopic report" and extract text from there as well.
## However some of the words are garbled, e.g. Score is Soce?


In [None]:
# Let's see if it is able to get specific Score parsed correctly? 🚀
query_engine = index.as_query_engine()
response = query_engine.query("What is the mitotic count score?")
print(response)

## CONCERN: Not useful! Lie!

### Here is another complementary approach using langchain and unstructured.io

In [None]:
from langchain.document_loaders.image import UnstructuredImageLoader

In [None]:
# Create a langchain UnstructuredImageLoader for the report image.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
loader = UnstructuredImageLoader("/Users/vinayak/projects/kaiser/data/surgery_image/surg_path.jpeg")

In [None]:
data = loader.load()

In [None]:
data

# This parsing seems much better, it does not have broken/garbled text.

In [None]:
## At this point I don't know how to directly connect this to llama index to get its other goodies, since the
## two objects are incompatible. So I am taking the quicker hack: giving this parsed information directly to GPT to see what I get.


## This one seems promising via llama hub: https://llamahub.ai/l/file-unstructured


In [None]:
from pathlib import Path  # NOTE(review): not referenced anywhere in the visible notebook
from llama_hub.file.unstructured.base import UnstructuredReader

# Parse a single scanned image with llama_hub's UnstructuredReader and print the result.
# NOTE(review): `loader` was bound to a langchain UnstructuredImageLoader in an earlier
# cell; it is rebound to a different kind of object here.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
loader = UnstructuredReader()
documents = loader.load_data(file='/Users/vinayak/projects/kaiser/data/tcga_scanned_image/TCGA4.png')
print(documents)

In [None]:
# Split the documents returned by UnstructuredReader into nodes (chunks).
# This rebinds `parser` and `nodes` from the earlier parsing cell.
parser = SimpleNodeParser()
nodes = parser.get_nodes_from_documents(documents)

In [None]:
from llama_index.vector_stores import WeaviateVectorStore
from llama_index import VectorStoreIndex, StorageContext
# NOTE(review): StorageContext is already imported on the line above; this second
# import rebinds the same name and looks redundant.
from llama_index.storage.storage_context import StorageContext


# Construct a second vector store on the same Weaviate client, this time under the
# "TCGA_Img_Scanned_Unstructured" index name.
vector_store = WeaviateVectorStore(weaviate_client = client, index_name="TCGA_Img_Scanned_Unstructured", text_key="content")

# Storage context that uses the vector store above for the embeddings.
storage_context = StorageContext.from_defaults(vector_store = vector_store)

# Build the index over the current `nodes`. This rebinds `index`, so every query
# cell executed after this one runs against this index.
index = VectorStoreIndex(nodes, storage_context = storage_context)


In [None]:
# Check whether the document mentions a HIPAA discrepancy 🚀
query_engine = index.as_query_engine()
response = query_engine.query("Is there a HIPAA discrepancy?")
print(response)

##Parsed better but did not infer the connected extension like the previous case!

In [None]:
# Ask for the details of all the diagnoses, including references to additional details 🚀
query_engine = index.as_query_engine()
response = query_engine.query("Give me a detail of all the diagnosis in the document? Please also include any references of additional details")
print(response)

##Parsed better but did not infer the connected extension like the previous case!

In [None]:
# Let's see if it is able to get specific Score parsed correctly? 🚀
query_engine = index.as_query_engine()
response = query_engine.query("What is the mitotic count score?")
print(response)

## That is much better, because it parsed better with unstructured model!

In [None]:
print(response.get_formatted_sources())

## the fragment size is too large hence the whole thing comes up not just the short blurb.
## 

In [None]:
# Let's see if it is able to get specific Score parsed correctly? 🚀
query_engine = index.as_query_engine()
response = query_engine.query("What is the mitotic count score? Also In a new line starting with word  Source: Give me the exact line which you used to give the answer.")
print(response)

# this is surprising to me, the LLM does a better job at returning the source than the parser itself!

In [None]:
#    Larger extraction 🚀
##   Trying to get semantic search by asking for sex instead of gender
###  Hoping it does NOT hallucinate when asked for something that does not exist!

query_engine = index.as_query_engine()
response = query_engine.query("Please give me the patient name, age, sex, social security number, examining doctor, location, date of birth, height and report date. Please double check your work. If any of these items are not present please say NA. Also on a new line starting with # give the exact lines from the document you used to give the answer.")
print(response)

## Better since it got the SSN and DOB without missing dashes and slashes

In [None]:
response.source_nodes

## Now mark the answer in the image.

In [None]:
import cv2
import pytesseract
from PIL import Image

def _find_phrase_start(ocr_words, phrase_words):
    """Return the index where ``phrase_words`` first occurs as a contiguous run in ``ocr_words``, else None."""
    run_length = len(phrase_words)
    if run_length == 0:
        return None
    for start in range(len(ocr_words) - run_length + 1):
        if ocr_words[start:start + run_length] == phrase_words:
            return start
    return None


def highlight_sentence(image_path, sentence):
    """Draw a green rectangle around the first occurrence of ``sentence`` in an image.

    The image is OCR'd with pytesseract and the sentence is matched word by word,
    case-insensitively, against the recognised words. The words must appear
    consecutively in the OCR output; a partial or scattered match is not highlighted.

    Args:
        image_path: Path of the image file, passed to ``cv2.imread``.
        sentence: Text to look for; it is split on whitespace into words.

    Returns:
        A PIL image (RGB). It carries the rectangle when the sentence was found and
        is otherwise an unmodified copy of the input image.

    Raises:
        ValueError: if ``cv2.imread`` could not load an image from ``image_path``.
    """
    # Load image with opencv; imread signals failure by returning None instead of raising.
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"cv2.imread could not load an image from {image_path!r}")

    # Convert the image to gray scale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Perform OCR using pytesseract
    d = pytesseract.image_to_data(gray, output_type=pytesseract.Output.DICT)

    # Keep only entries that carry text: the OCR table also contains rows with an
    # empty text field, which would otherwise break up a contiguous match.
    words = [
        (left, top, width, height, text.strip().lower())
        for left, top, width, height, text in zip(d['left'], d['top'], d['width'], d['height'], d['text'])
        if text.strip()
    ]

    # Convert sentence to lower case and split into words (an empty sentence matches nothing)
    sentence_words = sentence.lower().split()

    start = _find_phrase_start([word[4] for word in words], sentence_words)

    # If we found the sentence, draw one rectangle enclosing every matched word.
    # Taking the union of the boxes keeps the rectangle valid when the sentence
    # wraps onto another line or the word heights differ.
    if start is not None:
        matched = words[start:start + len(sentence_words)]
        x_min = min(left for left, _, _, _, _ in matched)
        y_min = min(top for _, top, _, _, _ in matched)
        x_max = max(left + width for left, _, width, _, _ in matched)
        y_max = max(top + height for _, top, _, height, _ in matched)
        cv2.rectangle(img, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)

    # Convert the image back to PIL image for better IO in python
    img_pil = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    return img_pil

# Use the function
# NOTE(review): absolute, machine-specific path — adjust for your environment.
image_path = "/Users/vinayak/projects/kaiser/data/surgery_image/surg_path.jpeg"
sentence = 'Patient Name: DARROUGH, WINDY CAROLE'
highlighted_img = highlight_sentence(image_path, sentence)
# End the cell with the image so Jupyter renders it inline and stores it in the
# notebook output; Image.show() would instead hand it to an external viewer.
highlighted_img

In [None]:
# Ask for the patient name together with the document line that supports the answer.
# NOTE(review): `index` is whichever index the most recently executed index-building cell created.
query_engine = index.as_query_engine()
response = query_engine.query("What is the name of the patient? Give me the exact line used to give me the answer on a seperate line.")
print(response)

In [None]:
# Ask for a three-bullet summary of the indexed document.
query_engine = index.as_query_engine()
response = query_engine.query("Can you give me a three bullet point summary of key points in the document?")
print(response)

In [None]:
# Ask for a one-line summary of the indexed document.
query_engine = index.as_query_engine()
response = query_engine.query("Can you give me a one line summary of the document?")
print(response)

In [None]:
from PIL import Image
import pytesseract

# Upscale the image before OCR: each side is multiplied by this factor.
UPSCALE_FACTOR = 10

# Open the image file and resize it.
# The context manager closes the underlying file once the resized copy exists;
# resize() returns a new image, so `img` stays usable after the block.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
with Image.open('/Users/vinayak/Desktop/TCGA1.png') as source_img:
    width, height = source_img.size
    img = source_img.resize((width * UPSCALE_FACTOR, height * UPSCALE_FACTOR), Image.BICUBIC)

# Apply OCR to the image
text = pytesseract.image_to_string(img)

# Print the text
print(text)