In [None]:
!pip install --quiet --upgrade google_cloud_firestore google_cloud_aiplatform langchain langchain-google-vertexai langchain_community langchain_experimental pymupdf

In [None]:
import vertexai
from vertexai.language_models import TextEmbeddingModel
from vertexai.generative_models import GenerativeModel

import pickle
from IPython.display import display, Markdown

from langchain_google_vertexai import VertexAIEmbeddings
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_experimental.text_splitter import SemanticChunker

from google.cloud import firestore
from google.cloud.firestore_v1.vector import Vector
from google.cloud.firestore_v1.base_vector_query import DistanceMeasure

# Google Cloud project and region handed to vertexai.init() below.
# NOTE(review): the project id looks like a temporary lab project — replace it
# with your own project before running elsewhere.
PROJECT_ID = "qwiklabs-gcp-00-7877708d10e9"
LOCATION = "us-central1"

# Initialise the Vertex AI SDK once for the whole notebook.
vertexai.init(project=PROJECT_ID, location=LOCATION)

# Embedding model shared by the semantic chunker and the vector search below.
embedding_model = VertexAIEmbeddings(model_name="text-embedding-004")

In [None]:
def clean_page(page):
  """Return the text of ``page`` with PDF layout artifacts removed.

  The substitutions are applied in the order listed, which matters: line
  breaks are turned into spaces before the running header/footer strings are
  looked up, so a header broken across lines can still match.

  Args:
    page: Any object exposing the page text as a ``page_content`` string.

  Returns:
    The cleaned text as a single string.
  """
  # (old, new) pairs, applied top to bottom.
  substitutions = (
    ("-\n", ""),   # re-join words hyphenated across a line break
    ("\n", " "),   # remaining line breaks become spaces
    ("\x02", ""),  # stray control characters left in the extracted text
    ("\x03", ""),
    # Running header / footer text repeated on the manual's pages.
    ("fo d P R O T E C T I O N  T R A I N I N G  M A N U A L", ""),
    ("N E W  Y O R K  C I T Y  D E P A R T M E N T  O F  H E A L T H  &  M E N T A L  H Y G I E N E", ""),
  )

  text = page.page_content
  for old, new in substitutions:
    text = text.replace(old, new)
  return text


from langchain_experimental.text_splitter import SemanticChunker

from langchain_community.document_loaders import PyMuPDFLoader
loader = PyMuPDFLoader("nyc_food_safety_manual.pdf")
data = loader.load()

cleaned_pages = [clean_page(page) for page in data]


text_splitter = SemanticChunker(embedding_model)

docs = text_splitter.create_documents(cleaned_pages[:5])

In [None]:
!gcloud storage cp gs://partner-genai-bucket/genai069/nyc_food_safety_manual.pdf .

!gsutil cp gs://partner-genai-bucket/genai069/chunked_content.pkl .
!gsutil cp gs://partner-genai-bucket/genai069/chunked_embeddings.pkl .

chunked_content = pickle.load(open("chunked_content.pkl", "rb"))
chunked_embeddings = pickle.load(open("chunked_embeddings.pkl", "rb"))

In [None]:
from google.cloud import firestore
from google.cloud.firestore_v1.vector import Vector
import uuid

# Firestore client and the collection that stores the chunks with their vectors.
db = firestore.Client()
collection = db.collection("food-safety")

# Each chunk must have exactly one embedding; fail before writing anything
# rather than part-way through the upload.
if len(chunked_content) != len(chunked_embeddings):
  raise ValueError(
    f"chunked_content has {len(chunked_content)} items but "
    f"chunked_embeddings has {len(chunked_embeddings)}"
  )

# NOTE: every run of this cell adds a new document per chunk under a fresh
# random id, so re-running it duplicates the collection's contents.
for content, embedding in zip(chunked_content, chunked_embeddings):
  collection.add({
    "id": str(uuid.uuid4()),
    "content": content,
    "embedding": Vector(embedding),
  })

In [None]:
  def search_vector_database(query: str, limit: int = 5) -> str:
      """Return the stored text most similar to ``query`` as one string.

      The query is embedded with the notebook's ``embedding_model`` and the
      ``collection`` is searched on its "embedding" vector field using
      Euclidean distance.

      Args:
          query: Natural-language question to search for.
          limit: Maximum number of nearest documents to retrieve (default 5).

      Returns:
          The "content" fields of the matching documents joined with single
          spaces; documents without a "content" field contribute an empty
          string.
      """
      # 1. Embed the query; embed() takes a list, so unwrap the single result.
      query_embedding = embedding_model.embed([query])[0]

      # 2. Ask Firestore for the nearest neighbours of the query vector.
      #    distance_result_field is requested but not read below.
      nearest_query = collection.find_nearest(
          vector_field="embedding",
          query_vector=Vector(query_embedding),
          distance_measure=DistanceMeasure.EUCLIDEAN,
          limit=limit,
          distance_result_field="vector_distance",
      )
      snapshots = nearest_query.get()

      # 3. Merge the text of every snapshot into a single context string.
      return " ".join(
          snapshot.to_dict().get("content", "") for snapshot in snapshots
      )

# Quick check of the retrieval step: as the last expression in the cell, the
# returned context string is shown as the cell output.
search_vector_database("How should I store food?")

In [None]:
from vertexai.generative_models import (
        GenerationConfig,
        GenerativeModel,
        SafetySetting
)

from vertexai.preview.prompts import Prompt

def ask_gemini(question):
    """Answer ``question`` with Gemini, grounded on retrieved manual text.

    The context comes from ``search_vector_database(question)``. Question and
    context are substituted into the prompt template, and the assembled prompt
    is sent with temperature 0 and all four listed safety categories set to
    OFF.

    Args:
        question: The question to answer.

    Returns:
        The text of the model's response.
    """
    # Instructions for the model; {query} and {context} are filled in below.
    prompt_template = """
        Role: You work for Cymbal Shops, a chain offering prepared meals to-go in busy downtown areas.
        The company's employees in the New York area needs to meet the New York City Department of Health and Mental Hygiene's food safety guidelines as provided in this Food Protection Training Manual.
        You are an assistant for question-answering tasks.
        Use the following pieces of retrieved context to answer the question.
        If you don't know the answer, just say that you don't know or NA.
        Keep the answer to the point follow the guidelines below.
        Guidelines:        
        - For yes/no questions, answer with either 'Yes' or 'No' based on the context.
        - If no relevant information is available, return 'NA'.
        - Do not provide any additional commentary or filler text. Focus on precision and brevity.
        Use the following pieces of retrieved context to answer.
        Question: {query}
        Context: {context}
    """

    # Fetch the passages relevant to the question from the vector database.
    retrieved_context = search_vector_database(question)

    # One set of template variables: the question and its retrieved context.
    template_variables = [{"query": [question], "context": [retrieved_context]}]

    deterministic_config = GenerationConfig(temperature=0)

    # Same threshold for every category, so build the settings in one pass.
    harm_categories = (
        SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
    )
    unfiltered_settings = [
        SafetySetting(
            category=harm_category,
            threshold=SafetySetting.HarmBlockThreshold.OFF,
        )
        for harm_category in harm_categories
    ]

    prompt = Prompt(
        prompt_data=prompt_template,
        model_name="gemini-pro",
        variables=template_variables,
        generation_config=deterministic_config,
        safety_settings=unfiltered_settings,
        system_instruction=["""Respond to the question concisely"""],
    )

    # Fill the template with the first (only) variable set and call the model.
    assembled_contents = prompt.assemble_contents(**prompt.variables[0])
    gemini_response = prompt.generate_content(contents=assembled_contents)
    return gemini_response.text

# End-to-end check of retrieval plus generation: the answer text returned by
# ask_gemini is shown as the cell output.
ask_gemini("What temperature range do Mesophilic Bacteria grow best in?")