In [None]:
# System packages installed via apt: the Tesseract OCR engine, its development
# headers, and the Poppler PDF utilities.
# NOTE(review): presumably these are the native dependencies of the PDF
# partitioning step further down — confirm against the unstructured install docs.
!sudo apt install tesseract-ocr -y
!sudo apt install libtesseract-dev -y
!sudo apt install poppler-utils -y

In [None]:
!pip install langchain unstructured[all-docs] pydantic lxml openai faiss-cpu tiktoken opencv-python


In [4]:
import os
import uuid
import base64
from IPython import display
from unstructured.partition.pdf import partition_pdf
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage, SystemMessage
from langchain.schema.document import Document
from langchain.vectorstores import FAISS
from langchain.retrievers.multi_vector import MultiVectorRetriever

In [19]:
# Credentials are read from Colab's per-user secret store instead of being
# hard-coded in the notebook.
from google.colab import userdata
# NOTE(review): the secret is looked up as 'OPEN_API_KEY' (not 'OPENAI_API_KEY');
# confirm this matches the name the secret was saved under in Colab.
openai_api_key = userdata.get('OPEN_API_KEY')
# AWS credentials and region handed to the boto3 S3 client.
AWS_ACCESS_KEY_ID =  userdata.get('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = userdata.get('AWS_SECRET_ACCESS_KEY')
AWS_REGION = userdata.get('AWS_REGION')
# Bucket name used as the 'Bucket' parameter when presigning image URLs.
S3_BUCKET_NAME = 'crickbook'

In [6]:
# Directory passed to partition_pdf as extract_image_block_output_dir and later
# listed with os.listdir to enumerate the extracted images.
# NOTE(review): this is an absolute path at the filesystem root — confirm it is
# writable in the environment where the notebook runs.
output_path = '/images'

In [None]:
# Partition the source PDF into elements with unstructured's partition_pdf.
# The keyword arguments request image extraction (image blocks are written to
# output_path), table structure inference, and "by_title" chunking with the
# character limits given below.
# NOTE(review): the exact meaning of max_characters / new_after_n_chars /
# combine_text_under_n_chars is defined by unstructured — confirm against its docs.
raw_pdf_elements = partition_pdf(
    filename="/media/AC-Aids-for-Dogs_Monitoring-for-Periodontal-Disease-in-Dogs.pdf",
    extract_images_in_pdf=True,
    infer_table_structure=True,
    chunking_strategy="by_title",
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    extract_image_block_output_dir=output_path,
)

In [None]:
!pip install boto3
import boto3

In [14]:
# S3 client built from the AWS credentials and region loaded earlier.
s3 = boto3.client(
    's3',
    region_name=AWS_REGION,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)

In [18]:
# Directory intended for uploaded images.
# NOTE(review): the previous comment here named /home/uploaded_images while the
# value is /media/uploaded_images, and no other visible cell reads upload_path —
# confirm which path is intended, or remove the variable.
upload_path = '/media/uploaded_images'
# Presigned image URLs, filled in by read_img_from_s3 and reset each time this
# cell is re-run.
img_urls = []

# def upload_img_to_s3(image_path):
#     with open(image_path, "rb") as f:
#       s3.upload_fileobj(f, S3_BUCKET_NAME, image_path)

# for i in os.listdir(output_path):
#     image_path = os.path.join(output_path, i)
#     upload_img_to_s3(image_path)

def read_img_from_s3(image_path):
    """Create a presigned GET URL for one S3 object and record it in ``img_urls``.

    Args:
        image_path: Key of the object inside ``S3_BUCKET_NAME``. It is used
            exactly as passed in; the function no longer rebuilds it from a
            global loop variable, so it can be called from anywhere.

    Returns:
        The presigned URL, which is also appended to the module-level
        ``img_urls`` list.

    Notes:
        - The URL is requested with ``ExpiresIn=3600`` (seconds). Later cells
          store these URLs as the ``original_content`` of image documents and
          save them with the FAISS index, so links read back from a saved index
          will have expired once that lifetime has passed.
        - NOTE(review): keys built with ``os.path.join(output_path, name)``
          start with a slash ('/images/...'), which shows up as '//images/' in
          the generated URLs. This must match the key used when the objects
          were uploaded — confirm before normalising it.
    """
    image_url = s3.generate_presigned_url(
        ClientMethod='get_object',
        Params={'Bucket': S3_BUCKET_NAME, 'Key': image_path},
        ExpiresIn=3600,  # URL lifetime in seconds
    )
    img_urls.append(image_url)
    return image_url


# Presign one URL per file that partition_pdf extracted into output_path.
for file_name in os.listdir(output_path):
    read_img_from_s3(os.path.join(output_path, file_name))

# NOTE: presigned URLs embed the AWS access key id and a request signature;
# clear this cell's output before sharing or committing the notebook.
print(img_urls)


['https://crickbook.s3.amazonaws.com//images/figure-1-3.jpg?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVRUVVFJD2SCX2WNV%2F20240509%2Fap-south-1%2Fs3%2Faws4_request&X-Amz-Date=20240509T053308Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=65c75ec28b3d4b427411adaef23cf80cc19c84ac1636a63646ffa7b0768967e7', 'https://crickbook.s3.amazonaws.com//images/figure-1-1.jpg?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVRUVVFJD2SCX2WNV%2F20240509%2Fap-south-1%2Fs3%2Faws4_request&X-Amz-Date=20240509T053308Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=0fdd8b3f23142a1aad2c0cc8fef6765c82486f42fc2b809574f40b0c5cb6e023', 'https://crickbook.s3.amazonaws.com//images/figure-1-2.jpg?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVRUVVFJD2SCX2WNV%2F20240509%2Fap-south-1%2Fs3%2Faws4_request&X-Amz-Date=20240509T053308Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=aa9c7564db75e13ce071512ca55a07a3e95fbe815a14ca8e94de0c6bcfd250d5', 'https://cri

In [None]:
# Summarise every text chunk and table produced by partition_pdf.
# The *_elements lists keep the original text; the *_summaries lists keep the
# model's summary for the element at the same index.
text_elements, text_summaries = [], []
table_elements, table_summaries = [], []

summary_prompt = """
Summarize the following {element_type}:
{element}
"""

summary_chain = LLMChain(
    prompt=PromptTemplate.from_template(summary_prompt),
    llm=ChatOpenAI(max_tokens=1024, openai_api_key = openai_api_key, model="gpt-3.5-turbo"),
)

for element in raw_pdf_elements:
    # Elements are classified by the class name appearing in their repr.
    element_repr = repr(element)
    if 'CompositeElement' in element_repr:
        element_type, originals, summaries = 'text', text_elements, text_summaries
    elif 'Table' in element_repr:
        element_type, originals, summaries = 'table', table_elements, table_summaries
    else:
        # Any other element kind is not summarised.
        continue

    originals.append(element.text)
    summaries.append(summary_chain.run({'element_type': element_type, 'element': element}))

In [21]:
# Get image summaries
# image_elements collects each image reference sent to the vision model (in this
# notebook, the entries of img_urls); image_summaries collects the description
# returned for it, in the same order.
image_elements = []
image_summaries = []

def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as a base64 string.

    Args:
        image_path: Path of the file to read in binary mode.

    Returns:
        The base64 encoding of the file's contents, decoded to a UTF-8 ``str``.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

def summarize_image(encoded_image):
    """Ask the vision model to describe one image and return its reply text.

    Args:
        encoded_image: Value placed unchanged into the ``image_url.url`` field
            of the request. In this notebook it is called with the presigned
            URLs from ``img_urls``; a ``data:image/jpeg;base64,...`` URI built
            from ``encode_image`` output was the earlier alternative.

    Returns:
        The ``content`` attribute of the model's response.
    """
    system_message = SystemMessage(
        content="You are a bot that is good at analyzing images related to Dog's health."
    )
    instruction_part = {
        "type": "text",
        "text": "Describe the contents of this image.",
    }
    image_part = {
        "type": "image_url",
        "image_url": {"url": encoded_image},
    }
    human_message = HumanMessage(content=[instruction_part, image_part])

    vision_model = ChatOpenAI(
        model="gpt-4-vision-preview",
        openai_api_key=openai_api_key,
        max_tokens=1024,
    )
    return vision_model.invoke([system_message, human_message]).content

# Describe every non-empty image reference, keeping image_elements and
# image_summaries aligned index by index.
for i in img_urls:
    if not i:
        continue
    image_elements.append(i)
    image_summaries.append(summarize_image(i))

# for i in os.listdir(output_path):
#     if i.endswith(('.png', '.jpg', '.jpeg')):
#         image_path = os.path.join(output_path, i)
#         encoded_image = encode_image(image_path)
#         image_elements.append(encoded_image)
#         summary = summarize_image(encoded_image)
#         image_summaries.append(summary)

In [25]:
def _build_document(summary, original_content, element_type):
    """Wrap one summary in a Document tagged with its own freshly generated UUID.

    Args:
        summary: Text stored as the Document's ``page_content``.
        original_content: Source material stored in ``metadata['original_content']``.
        element_type: One of 'text', 'table' or 'image'; stored in ``metadata['type']``.

    Returns:
        A ``(doc_id, document)`` tuple.
    """
    doc_id = str(uuid.uuid4())
    document = Document(
        page_content=summary,
        metadata={
            'id': doc_id,
            'type': element_type,
            'original_content': original_content,
        },
    )
    return doc_id, document


# One Document per summary. Every document now gets a unique id: previously only
# the text loop generated a UUID, so all table and image documents reused
# whatever value the variable `i` happened to hold.
documents = []
retrieve_contents = []

for original, summary in zip(text_elements, text_summaries):
    doc_id, doc = _build_document(summary, original, 'text')
    retrieve_contents.append((doc_id, original))
    documents.append(doc)

for original, summary in zip(table_elements, table_summaries):
    doc_id, doc = _build_document(summary, original, 'table')
    retrieve_contents.append((doc_id, original))
    documents.append(doc)

for image_ref, summary in zip(image_elements, image_summaries):
    doc_id, doc = _build_document(summary, image_ref, 'image')
    # For images the summary (not the image reference) is what gets recorded in
    # retrieve_contents; the reference itself stays in the document metadata.
    retrieve_contents.append((doc_id, summary))
    documents.append(doc)

print(documents)

# vectorstore = FAISS.from_documents(documents=documents, embedding=OpenAIEmbeddings(openai_api_key=openai_api_key))


[Document(page_content="The text provides information on how to monitor for Periodontal Disease (PD) in dogs by regularly examining their teeth and gums. It outlines steps to visually assess the dog's oral health and categorizes different grades of PD based on symptoms. It also emphasizes the importance of handling the dog gently during examinations to ensure cooperation and decrease the risk of bites. If any abnormal conditions are detected, the text advises consulting with a veterinarian for treatment options.", metadata={'id': '7f287435-67ba-48b8-b1b6-997a644ad1f8', 'type': 'text', 'original_content': 'USDA a\n\nUnited States Department of Agriculture\n\nMonitoring for Periodontal Disease (PD) in Dogs\n\nExamining each dog’s teeth and gums regularly, such as once a month, is the best way to detect and treat disease or injury early.\n\nSTEP 1\n\nGently pull back the lips and cheeks in order to see the teeth and gums. Inspect both upper and lower jaw on both sides of the mouth. Don’t 

In [23]:
# Build the FAISS index from the summary Documents before saving it. Without
# this the cell raises NameError on a fresh kernel, because no other active cell
# defines `vectorstore`.
vectorstore = FAISS.from_documents(
    documents=documents,
    embedding=OpenAIEmbeddings(openai_api_key=openai_api_key),
)
vectorstore.save_local("/home/faiss_index")


In [24]:
# Embedding model handed to FAISS.load_local together with the saved index.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
# Reload the index from the same path that vectorstore.save_local wrote to above.
# NOTE(review): allow_dangerous_deserialization=True is an explicit opt-in to
# loading serialized Python objects from disk; only use it for index directories
# this notebook produced itself.
db = FAISS.load_local("/home/faiss_index", embeddings, allow_dangerous_deserialization=True)

In [26]:
db

<langchain_community.vectorstores.faiss.FAISS at 0x7a0915236170>

In [27]:

# Prompt used for the final answer. {context} and {question} are its two
# template variables; answer() supplies both when it calls qa_chain.run.
prompt_template = """
You are a vet doctor and an expert in analyzing dog's health.
Answer the question based only on the following context, which can include text, images and tables:
{context}
Question: {question}
Don't answer if you are not sure and decline to answer and say "Sorry, I don't have much information about it."
Just return the helpful answer in as much as detailed possible.
Answer:
"""

In [28]:
# Question-answering chain: fills prompt_template and sends it to GPT-4.
qa_chain = LLMChain(
    prompt=PromptTemplate.from_template(prompt_template),
    llm=ChatOpenAI(max_tokens=1024, openai_api_key = openai_api_key, model="gpt-4"),
)

In [29]:
def answer(question):
    """Answer ``question`` from the documents most similar to it in ``db``.

    Each retrieved document contributes one tagged fragment to the context:
    text and table documents contribute their ``original_content`` metadata,
    image documents contribute their ``page_content`` (the image summary).
    Documents of any other type are ignored. Fragments are concatenated with no
    separator between them.

    Args:
        question: The question passed to the similarity search and to the prompt.

    Returns:
        A ``(result, relevant_images)`` tuple: the output of ``qa_chain.run``
        and the ``original_content`` of every retrieved image document.
    """
    context_parts = []
    relevant_images = []

    for doc in db.similarity_search(question):
        doc_type = doc.metadata['type']
        if doc_type == 'text':
            context_parts.append('[text]' + doc.metadata['original_content'])
        elif doc_type == 'table':
            context_parts.append('[table]' + doc.metadata['original_content'])
        elif doc_type == 'image':
            context_parts.append('[image]' + doc.page_content)
            relevant_images.append(doc.metadata['original_content'])

    result = qa_chain.run({'context': "".join(context_parts), 'question': question})
    return result, relevant_images

In [39]:
# Ask a sample question. answer() returns the model's reply together with the
# original_content of any image documents among the retrieved results.
result, relevant_images = answer("What is Severe Periodontitis ?")
print(result)

Severe Periodontitis, also referred to as Grade IV Periodontitis in the context provided, is a stage of gum disease in dogs that is characterized by severe inflammation of the gums. The gums may exhibit severe redness, inflammation, and may bleed easily. In some cases, pus may also be present indicating an infection. The teeth affected by this stage of the disease often have a large amount of tartar buildup, leading to loose or possibly missing teeth. This condition requires immediate veterinary attention to manage pain, control infection, and prevent further progression of the disease. Regular dental check-ups are crucial to detect such issues early and start appropriate treatment.


In [37]:
relevant_images[0]

'https://crickbook.s3.amazonaws.com//images/figure-1-8.jpg?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVRUVVFJD2SCX2WNV%2F20240509%2Fap-south-1%2Fs3%2Faws4_request&X-Amz-Date=20240509T053308Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=879a87e86937a011456f7b2d0c5cc31b68eb9d3485f17068416b6850f36a2cf6'

In [34]:
from IPython.display import Image, display

In [40]:
# display.display(display.Image(base64.b64decode(relevant_images[0])))
# Show the first retrieved image. The search may return only text or table
# documents, in which case relevant_images is empty and indexing it would raise
# IndexError.
if relevant_images:
    display(Image(url=relevant_images[0]))
else:
    print("No images were retrieved for this question.")
