In [None]:
!sudo apt install tesseract-ocr -y
!sudo apt install libtesseract-dev -y
!sudo apt install poppler-utils -y

In [None]:
!pip install langchain unstructured[all-docs] pydantic lxml faiss-cpu google-generativeai

In [None]:
!pip install langchain_google_genai

In [None]:
%pip install -U langchain-community

In [None]:
import base64
import os
import uuid

from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.schema.messages import HumanMessage, SystemMessage
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from unstructured.partition.pdf import partition_pdf

In [None]:
from google.colab import userdata

In [None]:
# Read the Gemini API key from Colab's `userdata` store (entry name: GOOGLE_API_KEY)
# so the key is never written into the notebook itself.
gemini_api_key = userdata.get('GOOGLE_API_KEY')

In [None]:
# Directory where images extracted from the PDF are written and later read back.
output_path = "./images"

In [None]:
# Unstructured PDF parser: partition the PDF into title-based chunks, keep table
# structure, and write the images found in the document to `output_path`.
raw_pdf_elements = partition_pdf(
    filename="/content/AC-Aids-for-Dogs_Canine-Periodontal-Disease_0.pdf",
    extract_images_in_pdf=True,       # save embedded images to disk
    infer_table_structure=True,       # keep tables as structured elements
    chunking_strategy="by_title",     # start a new chunk at each section title
    max_characters=4000,              # hard upper bound on chunk size
    new_after_n_chars=3800,           # preferred (soft) chunk size
    combine_text_under_n_chars=2000,  # merge small sections into one chunk
    extract_image_block_output_dir=output_path,
)

In [None]:
# Original content taken from the PDF, one list per element kind.
text_elements, table_elements = [], []

# LLM-written summaries; filled in step with the lists above by the
# summarisation loop, so entry k of a summary list describes entry k of the
# matching element list.
text_summaries, table_summaries = [], []

# Prompt template: `{element_type}` names the kind of content being summarised
# and `{element}` is the content itself.
summary_prompt = "\nSummarize the following {element_type}:\n{element}\n"

In [None]:

# Summarisation chain: fills `summary_prompt` with an element and sends the
# result to the Gemini text model.
_summary_llm = ChatGoogleGenerativeAI(google_api_key=gemini_api_key, model="gemini-pro")
_summary_template = PromptTemplate.from_template(summary_prompt)
summary_chain = LLMChain(llm=_summary_llm, prompt=_summary_template)

In [None]:
# Walk the partitioned elements and, for text chunks and tables, store the raw
# text together with an LLM summary. The element kind is detected from its
# repr; elements matching neither kind are ignored.
for element in raw_pdf_elements:
    element_repr = repr(element)
    if 'CompositeElement' in element_repr:
        kind, originals, summaries = 'text', text_elements, text_summaries
    elif 'Table' in element_repr:
        kind, originals, summaries = 'table', table_elements, table_summaries
    else:
        continue

    # Record the raw text first, then its summary, so both lists grow in step.
    originals.append(element.text)
    summaries.append(summary_chain.run({'element_type': kind, 'element': element}))


In [None]:
# Base64-encoded images and their LLM-written descriptions, filled in step by
# the image loop at the end of this cell.
image_elements, image_summaries = [], []

def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 string.

    The file is read in binary mode; the base64 bytes are decoded as UTF-8 so
    the result is a plain ``str``.

    Requires ``base64`` to be imported at the top of the notebook.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

def summarize_image(encoded_image):
    """Ask Gemini to describe a base64-encoded image and return the reply content.

    ``encoded_image`` is placed in a ``data:image/jpeg;base64,...`` URL and sent,
    together with a fixed text instruction, as one multimodal human message
    after a system message. A new ``ChatGoogleGenerativeAI`` client is created
    on every call.
    """
    system_message = SystemMessage(
        content="You are a bot that is good in analyzing image related to Dog's health"
    )
    text_part = {"type": "text", "text": "Describe the content of the image."}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
    }
    human_message = HumanMessage(content=[text_part, image_part])

    vision_model = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0.4,
        google_api_key=gemini_api_key,
    )
    return vision_model.invoke([system_message, human_message]).content

# Encode and summarise every image file in `output_path`.
# The directory listing is sorted because os.listdir() returns entries in
# arbitrary order, which would make the document order differ between runs.
# The extension check is case-insensitive so files such as "FIG1.JPG" are kept.
for image_name in sorted(os.listdir(output_path)):
    if not image_name.lower().endswith(('.png', '.jpg', '.jpeg')):
        continue
    image_path = os.path.join(output_path, image_name)
    encoded_image = encode_image(image_path)
    # Keep the encoded image and its description at the same index.
    image_elements.append(encoded_image)
    image_summaries.append(summarize_image(encoded_image))


In [None]:
# Build one Document per summarised element. The summary becomes the searchable
# page_content; the original text / table text / base64 image travels along in
# the metadata under 'original_content'.
documents = []
# (id, original content) pairs, in the same order as `documents`.
retrieve_contents = []

for element_type, originals, summaries in (
    ('text', text_elements, text_summaries),
    ('table', table_elements, table_summaries),
    ('image', image_elements, image_summaries),
):
    for original, summary in zip(originals, summaries):
        # A fresh id for every document. Previously only text documents got a
        # new id, so every table and image reused the last text document's id.
        doc_id = str(uuid.uuid4())
        documents.append(
            Document(
                page_content=summary,
                metadata={
                    'id': doc_id,
                    'type': element_type,
                    'original_content': original,
                },
            )
        )
        retrieve_contents.append((doc_id, original))

# Embedding model used to vectorise the documents for the index.
embedding_model = GoogleGenerativeAIEmbeddings(
    google_api_key=gemini_api_key,
    model="models/embedding-001",
)

# Build the in-memory FAISS index from the summary documents.
vectorstore = FAISS.from_documents(documents=documents, embedding=embedding_model)

In [None]:
# Persist the index to the local "faiss_index" folder so it can be reloaded later.
vectorstore.save_local("faiss_index")

In [None]:
# Same embedding configuration as the one used to build the index; it is passed
# to FAISS.load_local in the next cell.
# NOTE(review): this repeats the earlier definition, presumably so the load step
# can run without rebuilding the index — confirm, or drop the duplicate.
embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=gemini_api_key
)

In [None]:
# Reload the index written by `vectorstore.save_local("faiss_index")`.
# SECURITY: allow_dangerous_deserialization=True opts in to deserializing the
# saved index files; only do this for an index this notebook produced itself,
# never for files from an untrusted source.
db = FAISS.load_local("faiss_index", embedding_model, allow_dangerous_deserialization=True)

In [None]:
# Display the reloaded vector store object as the cell output.
db