In [1]:
!pip install --quiet --upgrade google-cloud-logging google_cloud_firestore google_cloud_aiplatform langchain langchain-google-vertexai langchain_community langchain_experimental pymupdf


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/229.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m229.5/229.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/7.9 MB[0m [31m82.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.9/7.9 MB[0m [31m121.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m80.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.0/101.0 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

In [1]:
import vertexai
import logging
import google.cloud.logging
from vertexai.language_models import TextEmbeddingModel
from vertexai.generative_models import GenerativeModel

import pickle
from IPython.display import display, Markdown

from langchain_google_vertexai import VertexAIEmbeddings
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_experimental.text_splitter import SemanticChunker

from google.cloud import firestore
from google.cloud.firestore_v1.vector import Vector
from google.cloud.firestore_v1.base_vector_query import DistanceMeasure


In [2]:
# Configuration a reader is likely to change when running in another project.
PROJECT_ID = "qwiklabs-gcp-04-41f829ec5739"
LOCATION = "us-central1"
EMBEDDING_MODEL_NAME = "text-embedding-005"

vertexai.init(project=PROJECT_ID, location=LOCATION)

# Shared embedding model: used for semantic chunking, for embedding the chunks
# and for embedding search queries.
embedding_model = VertexAIEmbeddings(model_name=EMBEDDING_MODEL_NAME)




In [3]:
# Copy the source PDF from Cloud Storage into the current working directory.
!gcloud storage cp gs://partner-genai-bucket/genai069/nyc_food_safety_manual.pdf .


Copying gs://partner-genai-bucket/genai069/nyc_food_safety_manual.pdf to file://./nyc_food_safety_manual.pdf

Average throughput: 175.3MiB/s


In [4]:
# Load the downloaded PDF. The result is iterated item-by-item below and each
# item exposes `.page_content` (presumably one item per PDF page — confirm).
data = PyMuPDFLoader("nyc_food_safety_manual.pdf").load()


In [5]:
def clean_page(page):
    """Return ``page.page_content`` with extraction artifacts removed.

    The substitutions are applied strictly in the order listed, because the
    first one must see the raw newlines before they are turned into spaces:

    1. ``"-\\n"`` is dropped, re-joining words hyphenated across a line break.
    2. Remaining newlines become single spaces.
    3. The control characters ``\\x02`` and ``\\x03`` are dropped.
    4. The two letter-spaced running header/footer strings are dropped.

    Args:
        page: Any object with a string ``page_content`` attribute.

    Returns:
        The cleaned text as a single string.
    """
    substitutions = (
        ("-\n", ""),
        ("\n", " "),
        ("\x02", ""),
        ("\x03", ""),
        ("fo d P R O T E C T I O N  T R A I N I N G  M A N U A L", ""),
        ("N E W  Y O R K  C I T Y  D E P A R T M E N T  O F  H E A L T H  &  M E N T A L  H Y G I E N E", ""),
    )

    text = page.page_content
    for old, new in substitutions:
        text = text.replace(old, new)
    return text

# Cleaned text for every loaded item, in the same order as `data`.
cleaned_pages = list(map(clean_page, data))


In [6]:
# Only the first few pages are chunked in this cell.
PAGES_TO_CHUNK = 5

semantic_chunker = SemanticChunker(embedding_model)
chunked_docs = []

# Slice instead of indexing so a document with fewer than PAGES_TO_CHUNK pages
# is processed as far as it goes rather than raising IndexError.
for page_text in cleaned_pages[:PAGES_TO_CHUNK]:
    # Each page is chunked on its own, so no chunk spans two pages.
    chunked_docs.extend(semantic_chunker.create_documents([page_text]))

# Plain-text content of every chunk, in page order.
chunked_content = [doc.page_content for doc in chunked_docs]


In [7]:
# Embed every chunk's text with the same model that drove the semantic chunker.
chunked_embeddings = embedding_model.embed_documents(chunked_content)


In [8]:
!gcloud storage cp gs://partner-genai-bucket/genai069/chunked_content.pkl .
!gcloud storage cp gs://partner-genai-bucket/genai069/chunked_embeddings.pkl .

import pickle
chunked_content = pickle.load(open("chunked_content.pkl", "rb"))
chunked_embeddings = pickle.load(open("chunked_embeddings.pkl", "rb"))


Copying gs://partner-genai-bucket/genai069/chunked_content.pkl to file://./chunked_content.pkl
Copying gs://partner-genai-bucket/genai069/chunked_embeddings.pkl to file://./chunked_embeddings.pkl

Average throughput: 158.3MiB/s


In [9]:
# Create a Cloud Logging client and call its setup_logging() hook (presumably
# this attaches a Cloud Logging handler to the standard `logging` module —
# confirm against the client library docs).
client = google.cloud.logging.Client()
client.setup_logging()
# Log only a short preview: the first 20 characters of the first chunk.
log_message = f"chunked contents are: {chunked_content[0][:20]}"
logging.info(log_message)


INFO:root:chunked contents are: The Health Code Thes


In [22]:
# Firestore client created with no explicit arguments, plus a reference to the
# 'food-safety' collection that stores the chunks and is queried by the search.
db = firestore.Client()
collection = db.collection('food-safety')


In [23]:
# NOTE(review): `random` is not used anywhere in this cell; if no later cell
# needs it, the import can be dropped (imports belong in the top imports cell).
import random
# Store one Firestore document per chunk, pairing its text with its embedding.
# NOTE(review): document() is called without an ID, so each call presumably
# creates a new auto-ID document; re-running this cell would then write the
# whole corpus again as duplicates — confirm before re-executing.
for content, embedding in zip(chunked_content, chunked_embeddings):
    doc_ref = collection.document()
    doc_ref.set({
        'content': content,
        # Wrapped in Vector before being written to the 'embedding' field.
        'embedding': Vector(embedding)
    })


In [24]:
def search_vector_database(query: str):
    """Return the text of the stored chunks most similar to ``query``.

    The query is embedded with ``embedding_model``, wrapped in a Firestore
    ``Vector`` and used for a nearest-neighbour search over the ``embedding``
    field of ``collection``.

    Args:
        query: Natural-language question to search for.

    Returns:
        The ``content`` values of the retrieved documents (at most 5) joined
        with newlines; an empty string when no document is returned.
    """
    # 1. Generate the embedding of the query and wrap it in Firestore's Vector.
    query_vector = Vector(embedding_model.embed_query(query))

    # 2. Get the 5 nearest neighbors. `vector_field` names the document field
    #    holding the stored embeddings; the search cannot run without it.
    docs = collection.find_nearest(
        vector_field="embedding",
        query_vector=query_vector,
        distance_measure=DistanceMeasure.DOT_PRODUCT,
        limit=5,
    ).get()

    # 3. Compile context from retrieved docs, skipping any without 'content'.
    pieces = []
    for doc in docs:
        fields = doc.to_dict()
        if 'content' in fields:
            pieces.append(fields['content'])

    return "\n".join(pieces)


In [31]:
def search_vector_database(query: str):
    """Retrieve context for ``query`` from the Firestore vector index.

    Embeds the query, runs a dot-product nearest-neighbour search (limit 5)
    against the ``embedding`` field of ``collection``, and returns the
    ``content`` of each hit joined with newlines. Hits without a ``content``
    key are skipped.
    """
    nearest_query = collection.find_nearest(
        "embedding",
        query_vector=Vector(embedding_model.embed_query(query)),
        distance_measure=DistanceMeasure.DOT_PRODUCT,
        limit=5,
    )

    # Convert each returned snapshot to a plain dict as it is consumed.
    payloads = (snapshot.to_dict() for snapshot in nearest_query.get())
    return "\n".join(
        payload["content"] for payload in payloads if "content" in payload
    )

# Test the function: run one sample question and print the retrieved context
# (the newline-joined 'content' of the matching documents).
result = search_vector_database("How should I store food?")
print(result)

 Store foods away from dripping condensate , at least six inches above the floor and with enough space between items to encourage air circulation. Freezer Storage Freezing is an excellent method for prolonging the shelf life of foods. By keeping foods frozen solid, the bacterial growth is minimal at best. However, if frozen foods are thawed and then refrozen, then harmful bacteria can reproduce to dangerous levels when thawed for the second time. In addition to that, the quality of the food is also affected. Never refreeze thawed foods, instead use them immediately. Keep the following rules in mind for freezer storage:  Use First In First Out method of stock rotation. All frozen foods should be frozen solid with temperature at 0°F or lower. Always use clean containers that are clearly labeled and marked, and have proper and secure lids. Allow adequate spacing between food containers to allow for proper air circulation. Never use the freezer for cooling hot foods. * * Tip: When receivin