In [None]:
# Install the Vertex AI SDK and LangChain (all versions are pinned).
# The kernel has to be restarted after this cell so the new packages are picked up.
! pip install --user --upgrade google-cloud-aiplatform==1.44.0 langchain==0.1.12 langchain-google-vertexai==0.1.1 typing_extensions==4.9.0

# Dependencies required by the Unstructured PDF loader:
# system packages (Tesseract OCR, Poppler utilities) first, then the Python packages.
! sudo apt -y -qq install tesseract-ocr libtesseract-dev
! sudo apt-get -y -qq install poppler-utils
! pip install --user --upgrade unstructured==0.12.4 pdf2image==1.17.0 pytesseract==0.3.10 pdfminer.six==20221105
! pip install --user --upgrade pillow-heif==0.15.0 opencv-python==4.9.0.80 unstructured-inference==0.7.24 pikepdf==8.13.0 pypdf==4.0.1

# Dependencies for the Matching Engine integration (default embeddings)
! pip install --user --upgrade tensorflow_hub==0.16.1 tensorflow_text==2.15.0

Collecting google-cloud-aiplatform==1.44.0
  Downloading google_cloud_aiplatform-1.44.0-py2.py3-none-any.whl (4.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain==0.1.12
  Downloading langchain-0.1.12-py3-none-any.whl (809 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m809.1/809.1 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-google-vertexai==0.1.1
  Downloading langchain_google_vertexai-0.1.1-py3-none-any.whl (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.9/48.9 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typing_extensions==4.9.0
  Downloading typing_extensions-4.9.0-py3-none-any.whl (32 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain==0.1.12)
  Downloading dataclasses_json-0.6.5-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain==0.1.12)
  Downloading json

The following additional packages will be installed:
  libarchive-dev libleptonica-dev tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  libarchive-dev libleptonica-dev libtesseract-dev tesseract-ocr
  tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 6 newly installed, 0 to remove and 45 not upgraded.
Need to get 8,560 kB of archives.
After this operation, 31.6 MB of additional disk space will be used.
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 6.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package libarchive-dev:amd64.
(Reading database ... 121752 files and directories

Collecting tensorflow_text==2.15.0
  Downloading tensorflow_text-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow_text
Successfully installed tensorflow_text-2.15.0


In [None]:
# Restart the kernel so that the freshly installed packages become importable.

import IPython

# do_shutdown(True) shuts the running kernel down and requests a restart;
# its status dict is displayed as the cell result.
IPython.Application.instance().kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [None]:
import sys

# The google.colab module is only loaded inside a Colab runtime;
# everywhere else this cell does nothing.
running_in_colab = "google.colab" in sys.modules

if running_in_colab:
    from google.colab import auth

    auth.authenticate_user()

In [None]:
import os
import urllib.request

# Fetch the helper modules that later cells import from the local "utils" package.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs("utils", exist_ok=True)

# NOTE(review): the files come from the "main" branch, so their contents can change
# between runs; pin a commit in the URL if the notebook must be reproducible.
url_prefix = "https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/language/use-cases/document-qa/utils"
files = ["__init__.py", "matching_engine.py", "matching_engine_utils.py"]

for fname in files:
    urllib.request.urlretrieve(f"{url_prefix}/{fname}", filename=f"utils/{fname}")

In [None]:
import json
import textwrap

# Utils
import time
import uuid
from typing import List

import bigframes.dataframe

import numpy as np
import vertexai

# Vertex AI
from google.cloud import aiplatform

print(f"Vertex AI SDK version: {aiplatform.__version__}")

# LangChain
import langchain

print(f"LangChain version: {langchain.__version__}")

from langchain.chains import RetrievalQA
from langchain.document_loaders import GCSDirectoryLoader
from langchain.embeddings import VertexAIEmbeddings
from langchain.llms import VertexAI
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Import custom Matching Engine packages
from utils.matching_engine import MatchingEngine
from utils.matching_engine_utils import MatchingEngineUtils

Vertex AI SDK version: 1.44.0
LangChain version: 0.1.12


In [None]:
# Google Cloud project and region used by the Vertex AI calls in this notebook.
# Change PROJECT_ID to your own project before running.
PROJECT_ID = "digital-gearing-411816"
REGION = "us-central1"

# Initialize Vertex AI SDK
vertexai.init(project=PROJECT_ID, location=REGION)

In [None]:
# Generator used to throttle calls to the Embeddings API
def rate_limit(max_per_minute):
    """Yield repeatedly, pausing so resumptions are spaced by a minimum interval.

    The minimum interval is ``60 / max_per_minute`` seconds. Each time the
    generator is resumed it measures how long it was suspended at ``yield``;
    if that is shorter than the interval it prints a "." and sleeps for the
    remainder. "Waiting" is printed once, when the generator is first advanced.

    Args:
        max_per_minute: Maximum number of resumptions allowed per minute.

    Yields:
        None, once per permitted call.
    """
    min_interval = 60 / max_per_minute
    print("Waiting")
    while True:
        suspended_at = time.time()
        yield
        remaining = min_interval - (time.time() - suspended_at)
        if remaining > 0:
            print(".", end="")
            time.sleep(remaining)

# Class to perform vector embeddings using Vertex AI services
# Class CustomVertexAIEmbeddings: Child of class VertexAIEmbeddings
# Class VertexAIEmbeddings: LangChain's wrapper around GCP Vertex AI text embedding models API
# This class handles vector embeddings using GCP: Vertex AI services and technologies

class CustomVertexAIEmbeddings(VertexAIEmbeddings):
    """VertexAIEmbeddings subclass that embeds documents in throttled batches."""

    # Upper bound on embedding requests per minute; handed to rate_limit().
    requests_per_minute: int
    # Number of texts sent to the embeddings client in one request.
    num_instances_per_batch: int

    def embed_documents(self, texts: List[str]):
        """Embed ``texts`` batch by batch and return one vector per result.

        Args:
            texts: The texts to embed.

        Returns:
            A list holding the ``values`` attribute of every embedding the
            client returned, in request order.
        """
        throttle = rate_limit(self.requests_per_minute)
        remaining = list(texts)
        embedded = []

        while remaining:
            # Requests are batched; the original author notes the API accepts
            # at most 5 documents per embeddings request.
            batch_size = self.num_instances_per_batch
            batch = remaining[:batch_size]
            remaining = remaining[batch_size:]
            embedded.extend(self.client.get_embeddings(batch))
            # Advance the limiter after every request so the next one is
            # delayed when calls arrive faster than the allowed rate.
            next(throttle)

        return [embedding.values for embedding in embedded]

In [None]:
# Text model instance integrated with LangChain:
# a Gemini LLM created through LangChain's VertexAI wrapper.

llm = VertexAI(
    model_name="gemini-1.0-pro",
    max_output_tokens=2048,
    # Sampling temperature must be non-negative; the earlier value of -.5 is
    # outside the range the model accepts.
    temperature=0.5,
    top_p=0.8,
    top_k=40,
    verbose=True,
)

# Embeddings API integrated with LangChain:
# an instance of CustomVertexAIEmbeddings limited to EMBEDDING_QPM requests
# per minute, sending EMBEDDING_NUM_BATCH texts per request.

EMBEDDING_QPM = 100
EMBEDDING_NUM_BATCH = 5
embeddings = CustomVertexAIEmbeddings(
    requests_per_minute=EMBEDDING_QPM,
    num_instances_per_batch=EMBEDDING_NUM_BATCH
)

  warn_deprecated(


In [None]:
# Matching Engine (Vector Search) settings used by the cells below.
ME_REGION = "us-central1"
# The index name and the staging bucket name are both derived from the project ID.
ME_INDEX_NAME = f"{PROJECT_ID}-me-index"
ME_EMBEDDING_DIR = f"{PROJECT_ID}-me-bucket"
# Number of dimensions of the vectors stored in the index.
# NOTE(review): presumably matches the output size of the embedding model - confirm.
ME_DIMENSIONS = 768

In [None]:
! set -x && gsutil mb -p $PROJECT_ID -l us-central1 gs://$ME_EMBEDDING_DIR

+ gsutil mb -p digital-gearing-411816 -l us-central1 gs://digital-gearing-411816-me-bucket
Creating gs://digital-gearing-411816-me-bucket/...


In [None]:
# dummy embeddings
init_embedding = {"id": str(uuid.uuid4()), "embedding": list(np.zeros(ME_DIMENSIONS))}

# Save dummy embeddings to a local JSON file
with open("embeddings_0.json", "w") as f:
    json.dump(init_embedding, f)

# Upload the dummy embeddings JSON file to cloud storage buckets
! set -x && gsutil cp embeddings_0.json gs://{ME_EMBEDDING_DIR}/init_index/embeddings_0.json

+ gsutil cp embeddings_0.json gs://digital-gearing-411816-me-bucket/init_index/embeddings_0.json
Copying file://embeddings_0.json [Content-Type=application/json]...
/ [1 files][  3.8 KiB/  3.8 KiB]                                                
Operation completed over 1 objects/3.8 KiB.                                      


In [None]:
# Helper from utils/matching_engine_utils.py, bound to this project, region and index name.
# The cells below use it to create, deploy, look up and delete the index.
mengine = MatchingEngineUtils(PROJECT_ID, ME_REGION, ME_INDEX_NAME)

In [None]:
# Create the Matching Engine index from the placeholder embeddings stored under
# gs://<ME_EMBEDDING_DIR>/init_index, requesting streaming updates and the "tree-ah" algorithm.

index = mengine.create_index(
    embedding_gcs_uri=f"gs://{ME_EMBEDDING_DIR}/init_index",
    dimensions=ME_DIMENSIONS,
    index_update_method="streaming",
    index_algorithm="tree-ah",
)

# Only print the resource name when the helper actually returned an index.
if index:
    print(index.name)

.projects/576632346903/locations/us-central1/indexes/2760763746182758400


In [None]:
# Deploy the Matching Engine (Vector Search) index.
# Per the original notes, deploy_index() creates an endpoint and then deploys
# the index to that endpoint.
index_endpoint = mengine.deploy_index()

# Report the endpoint details only when the helper returned an endpoint.
if index_endpoint:
    print(
        f"Index endpoint resource name: {index_endpoint.name}",
        f"Index endpoint public domain name: {index_endpoint.public_endpoint_domain_name}",
        "Deployed indexes on the index endpoint:",
        sep="\n",
    )

    for deployed in index_endpoint.deployed_indexes:
        print(f"    {deployed.id}")

.....Index endpoint resource name: projects/576632346903/locations/us-central1/indexEndpoints/860139150316142592
Index endpoint public domain name: 1172635686.us-central1-576632346903.vdb.vertexai.goog
Deployed indexes on the index endpoint:


In [None]:
# Ingest and pre-process PDF files

# GCS_BUCKET_DOCS is the name of the Cloud Storage bucket that holds the PDFs;
# folder_prefix is the folder path inside that bucket to load from.
GCS_BUCKET_DOCS = "adta5760-ejl-docs-folder-1"

folder_prefix = "documents/pdfs/"

print(f"Processing documents from {GCS_BUCKET_DOCS}")

# Create a loader for everything under the prefix, then load all the PDFs
# into the list named "documents".
loader = GCSDirectoryLoader(
    project_name=PROJECT_ID, bucket=GCS_BUCKET_DOCS, prefix=folder_prefix
)
documents = loader.load()

# Replace each document's metadata with its file name and its folder.
for document in documents:
    doc_md = document.metadata
    # The loader reports the source as gs://<bucket>/<folders...>/<file name>,
    # so splitting on "/" gives ["gs:", "", <bucket>, <folders...>, <file name>].
    source_parts = doc_md["source"].split("/")
    document_name = source_parts[-1]

    # Keep the bucket and every folder level so the value names the real
    # location (e.g. <bucket>/documents/pdfs); slicing from index 4 would
    # silently drop the first folder.
    source = "/".join(source_parts[2:-1])
    document.metadata = {"source": source, "document_name": document_name}

print(f"# of documents loaded (pre-chunking) = {len(documents)}")

Processing documents from adta5760-ejl-docs-folder-1


  warn_deprecated(
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


# of documents loaded (pre-chunking) = 9


In [None]:
# Inspect the metadata (source folder and file name) of the first loaded document

documents[0].metadata

{'source': 'adta5760-ejl-docs-folder-1/pdfs',
 'document_name': '2014_Sequence to Sequence Learning with Neural Networks.pdf'}

In [None]:
# Break the loaded documents into smaller chunks with LangChain's
# RecursiveCharacterTextSplitter document transformer.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    chunk_size=1000,
    chunk_overlap=50,
)

# Run the splitter over every document
doc_splits = text_splitter.split_documents(documents)

# Record each chunk's position in the overall list of chunks in its metadata
for chunk_number, chunk in enumerate(doc_splits):
    chunk.metadata["chunk"] = chunk_number

print(f"# of documents = {len(doc_splits)}")

# of documents = 1014


In [None]:
# Inspect the metadata of the first chunk (it now also carries the chunk number)

doc_splits[0].metadata

{'source': 'adta5760-ejl-docs-folder-1/pdfs',
 'document_name': '2014_Sequence to Sequence Learning with Neural Networks.pdf',
 'chunk': 0}

In [None]:
# Look up the IDs of the Matching Engine (Vector Search) index and of its endpoint

ME_INDEX_ID, ME_INDEX_ENDPOINT_ID = mengine.get_index_and_endpoint()

print(
    f"ME_INDEX_ID={ME_INDEX_ID}",
    f"ME_INDEX_ENDPOINT_ID={ME_INDEX_ENDPOINT_ID}",
    sep="\n",
)

ME_INDEX_ID=projects/576632346903/locations/us-central1/indexes/2760763746182758400
ME_INDEX_ENDPOINT_ID=projects/576632346903/locations/us-central1/indexEndpoints/860139150316142592


In [None]:
# Use the Matching Engine (Vector Search) index as the vector store

# Only the part of ME_EMBEDDING_DIR before the first "/" (the bucket name)
# is passed on as gcs_bucket_name.
me = MatchingEngine.from_components(
    project_id=PROJECT_ID,
    region=ME_REGION,
    index_id=ME_INDEX_ID,
    endpoint_id=ME_INDEX_ENDPOINT_ID,
    gcs_bucket_name=ME_EMBEDDING_DIR.split("/")[0],
    embedding=embeddings,
)

In [None]:
# Prepare the inputs for storing the chunks in the Matching Engine index


def _restricts_for(chunk):
    """Return the namespace/allow_list entries built from one chunk's metadata."""
    chunk_md = chunk.metadata
    return [
        {"namespace": "source", "allow_list": [chunk_md["source"]]},
        {"namespace": "document_name", "allow_list": [chunk_md["document_name"]]},
        # The chunk number is stored as an int, so it is converted to a string here.
        {"namespace": "chunk", "allow_list": [str(chunk_md["chunk"])]},
    ]


# Text content of every chunk
texts = [chunk.page_content for chunk in doc_splits]

# Matching metadata entries, one list per chunk, in the same order as texts
metadatas = [_restricts_for(chunk) for chunk in doc_splits]

In [None]:
# Embed every chunk and store the vectors in the Matching Engine (Vector Search) index.
# This can take a while because the embedding calls are rate limited
# (the original author notes 30 minutes or longer).

doc_ids = me.add_texts(texts=texts, metadatas=metadatas)

Waiting
..........................................................................................................................................................................................................

In [None]:
# Smoke test 1: check that semantic search against the Matching Engine
# (Vector Search) index returns relevant chunks.
# k=2 is the number of nearest neighbours requested from the ANN
# (Approximate Nearest Neighbor) search, analogous to K in K-Nearest Neighbors.

me.similarity_search("What is the Transformer?", k=2)

Waiting


[Document(page_content='Model Architecture BERT’s model architec- ture is a multi-layer bidirectional Transformer en- coder based on the original implementation de- scribed in Vaswani et al. (2017) and released in the tensor2tensor library.1 Because the use of Transformers has become common and our im- plementation is almost identical to the original, we will omit an exhaustive background descrip- tion of the model architecture and refer readers to Vaswani et al. (2017) as well as excellent guides such as “The Annotated Transformer.”2\n\n3 BERT', metadata={'source': 'adta5760-ejl-docs-folder-1/pdfs', 'document_name': '2018_11_BERT Pre-training of Deep Bidirectional Transformers for Language Understanding.pdf', 'chunk': '108', 'score': 0.7060161232948303}),
 Document(page_content='2.1 THE TRANSFORMER ARCHITECTURE\n\nTransformers (Vaswani et al., 2017) are neural network models that map a sequence of input vectors x = [x1, . . . , xn] to a sequence of output vectors y = [y1, . . . , yn].

In [None]:
# Smoke test 2: same check with a different question and an explicit search_distance.
# k=2 is the number of nearest neighbours requested from the ANN search.
# NOTE(review): search_distance is presumably a distance threshold applied by the
# custom MatchingEngine wrapper - confirm in utils/matching_engine.py.

me.similarity_search("What does the SIMA project aim to achieve?", k=2, search_distance=0.4)

Waiting


[Document(page_content='The Scalable, Instructable, Multiworld Agent (SIMA) project aims to build a system that can follow arbitrary language instructions to act in any virtual 3D environment via keyboard-and-mouse actions—from custom-built research environments to a broad range of commercial video games. There is a long history of research in creating agents that can interact with video games or simulated 3D environments (e.g., Mnih et al., 2015; Berner et al., 2019; Vinyals et al., 2019; Baker et al., 2022) and even follow language instructions in a limited range of environments (e.g., Abramson et al., 2020; Lifshitz et al., 2023). In SIMA, however, we are drawing inspiration from the lesson of large language models that training on a broad distribution of data is the most effective way to make progress in general AI (e.g., Brown et al., 2020; Hoffmann et al., 2022; OpenAI, 2023; Anil et al., 2023; Gemini\n\n2\n\nScaling Instructable Agents Across Many Simulated Worlds', metadata={'s

In [None]:
# Set to True to let the cleanup cells below delete the index endpoint,
# the index and the Cloud Storage bucket; set to False to keep them.
CLEANUP_RESOURCES = True

In [None]:
# Look up the Matching Engine (ME) index ID and endpoint ID again before cleaning up

ME_INDEX_ID, ME_INDEX_ENDPOINT_ID = mengine.get_index_and_endpoint()
print(
    f"ME_INDEX_ID={ME_INDEX_ID}",
    f"ME_INDEX_ENDPOINT_ID={ME_INDEX_ENDPOINT_ID}",
    sep="\n",
)

ME_INDEX_ID=projects/576632346903/locations/us-central1/indexes/2760763746182758400
ME_INDEX_ENDPOINT_ID=projects/576632346903/locations/us-central1/indexEndpoints/860139150316142592


In [None]:
# Undeploy all indexes and delete the Matching Engine index endpoint.
# Runs only when cleanup is enabled and the mengine helper exists in this session.

if CLEANUP_RESOURCES and "mengine" in globals():
    print(f"Undeploying all indexes and deleting the index endpoint {ME_INDEX_ENDPOINT_ID}")
    mengine.delete_index_endpoint()

Undeploying all indexes and deleting the index endpoint projects/576632346903/locations/us-central1/indexEndpoints/860139150316142592


In [None]:
# Delete the Matching Engine index.
# Runs only when cleanup is enabled and the mengine helper exists in this session.

if CLEANUP_RESOURCES and "mengine" in globals():
    print(f"Deleting the index {ME_INDEX_ID}")
    mengine.delete_index()

Deleting the index projects/576632346903/locations/us-central1/indexes/2760763746182758400


In [None]:
if CLEANUP_RESOURCES and "ME_EMBEDDING_DIR" in globals():
    print(f"Deleting contents from the Cloud Storage bucket {ME_EMBEDDING_DIR}")
    ME_EMBEDDING_BUCKET = "/".join(ME_EMBEDDING_DIR.split("/")[:3])

    shell_output = ! gsutil du -ash gs://$ME_EMBEDDING_BUCKET
    print(shell_output)
    print(f"Size of the bucket {ME_EMBEDDING_BUCKET} before deleting = {' '.join(shell_output[0].split()[:2])}")

    # uncomment below line to delete contents of the bucket
    ! gsutil -m rm -r gs://$ME_EMBEDDING_BUCKET

Deleting contents from the Cloud Storage bucket digital-gearing-411816-me-bucket
['781.89 KiB   gs://digital-gearing-411816-me-bucket']
Size of the bucket digital-gearing-411816-me-bucket before deleting = 781.89 KiB
Removing gs://digital-gearing-411816-me-bucket/documents/00000473-aaca-4b73-beee-548dca773b34#1714401653490998...
Removing gs://digital-gearing-411816-me-bucket/documents/0070047d-e790-4a12-9445-abf01b0b6cfd#1714401674449050...
Removing gs://digital-gearing-411816-me-bucket/documents/007df08a-fb54-42cb-a897-5217f2347ec8#1714401634428795...
Removing gs://digital-gearing-411816-me-bucket/documents/0087882f-c275-4ff9-8671-69944ca61a94#1714401673750995...
Removing gs://digital-gearing-411816-me-bucket/documents/0094e96b-f810-4fe6-8cbd-5508a2d79329#1714401679803664...
Removing gs://digital-gearing-411816-me-bucket/documents/00e6a7b8-2ff1-4f83-a623-40cf096e38f8#1714401688875913...
Removing gs://digital-gearing-411816-me-bucket/documents/00f97860-35de-444e-8060-430167e3d1bc#17144