# Welcome to Colab Enterprise <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">

Connect to a Runtime and begin!

In [None]:
!pip install google-cloud-storage --quiet
!pip install --user --upgrade google-cloud-aiplatform==1.31.0 langchain==0.0.201 --quiet
!pip install google-cloud-aiplatform>=1.25.0 --quiet
! pip install --user unstructured==0.7.5 pdf2image==1.16.3 pytesseract==0.3.10 pdfminer.six==20221105 --quiet



[0m

In [None]:
import IPython

app = IPython.Application.instance()
app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [None]:
import os
import urllib.request

if not os.path.exists("utils"):
    os.makedirs("utils")

url_prefix = "https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/language/use-cases/document-qa/utils"
files = ["__init__.py", "matching_engine.py", "matching_engine_utils.py"]

for fname in files:
    urllib.request.urlretrieve(f"{url_prefix}/{fname}", filename=f"utils/{fname}")

In [47]:
from google.cloud import vision
import vertexai
import requests
import subprocess
from vertexai.language_models import TextGenerationModel
import langchain
import time
from langchain.chains import RetrievalQA
from langchain.document_loaders import GCSDirectoryLoader
from langchain.embeddings import VertexAIEmbeddings
from langchain.llms import VertexAI
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pydantic import BaseModel
from typing import List
import uuid
import numpy as np
import json
import textwrap



# Import custom Matching Engine packages
from utils.matching_engine import MatchingEngine
from utils.matching_engine_utils import MatchingEngineUtils


In [None]:
!gcloud auth application-default login -q

Go to the following link in your browser:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fsdk.cloud.google.com%2Fapplicationdefaultauthcode.html&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=zNlpN3sfYadOJTsGAik6JPxnTXhIvU&prompt=consent&access_type=offline&code_challenge=ttXK0lF6f-FpnWfdXQyqOl9PgKBBEJvOeKxjPSQwS9k&code_challenge_method=S256

Enter authorization code: 4/0Adeu5BVKNHDQG0JYecMUV4Gr3n4QPR4vaY49KbxdKaddN4lXa4aDY0ZY9jwr5y1-ot0i3Q

Credentials saved to file: [/content/.config/application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).

Quota project "project-demo-389821" was added to ADC which can be used by Google client libraries

In [71]:
# Variable definition

PROJECT_ID = "project-demo-389821"  # @param {type:"string"}
REGION = "us-central1"  # @param {type:"string"}
GCS_BUCKET_DOCS = f"{PROJECT_ID}-documents"
folder_prefix = "documents/"

ME_REGION = REGION
ME_INDEX_NAME = f"{PROJECT_ID}-me-index"  # @param {type:"string"}
ME_EMBEDDING_DIR = f"{PROJECT_ID}-me-bucket"  # @param {type:"string"}
ME_DIMENSIONS = 768
EMBEDDING_QPM = 100
EMBEDDING_NUM_BATCH = 5

vertexai.init(project=PROJECT_ID, location=REGION)
url_img_cpt = "https://us-central1-aiplatform.googleapis.com/v1/projects/{}/locations/{}/publishers/google/models/imagetext:predict".format(PROJECT_ID, REGION)

In [50]:
def rate_limit(max_per_minute):
    period = 60 / max_per_minute
    print("Waiting")
    while True:
        before = time.time()
        yield
        after = time.time()
        elapsed = after - before
        sleep_time = max(0, period - elapsed)
        if sleep_time > 0:
            print(".", end="")
            time.sleep(sleep_time)


class CustomVertexAIEmbeddings(VertexAIEmbeddings, BaseModel):
    requests_per_minute: int
    num_instances_per_batch: int

    # Overriding embed_documents method
    def embed_documents(self, texts: List[str]):
        limiter = rate_limit(self.requests_per_minute)
        results = []
        docs = list(texts)

        while docs:
            # Working in batches because the API accepts maximum 5
            # documents per request to get embeddings
            head, docs = (
                docs[: self.num_instances_per_batch],
                docs[self.num_instances_per_batch :],
            )
            chunk = self.client.get_embeddings(head)
            results.extend(chunk)
            next(limiter)

        return [r.values for r in results]

In [51]:
llm = VertexAI(
    model_name="text-bison",
    max_output_tokens=1024,
    temperature=0.2,
    top_p=0.8,
    top_k=40,
    verbose=True,
)

# Embeddings API integrated with langChain

embeddings = CustomVertexAIEmbeddings(
    requests_per_minute=EMBEDDING_QPM,
    num_instances_per_batch=EMBEDDING_NUM_BATCH,
)

In [52]:
! set -x && gsutil mb -p $PROJECT_ID -l us-central1 gs://$ME_EMBEDDING_DIR
init_embedding = {"id": str(uuid.uuid4()), "embedding": list(np.zeros(ME_DIMENSIONS))}

# dump embedding to a local file
with open("embeddings_0.json", "w") as f:
    json.dump(init_embedding, f)

# write embedding to Cloud Storage
! set -x && gsutil cp embeddings_0.json gs://{ME_EMBEDDING_DIR}/init_index/embeddings_0.json

+ gsutil mb -p project-demo-389821 -l us-central1 gs://project-demo-389821-me-bucket
Creating gs://project-demo-389821-me-bucket/...
ServiceException: 409 A Cloud Storage bucket named 'project-demo-389821-me-bucket' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.
+ gsutil cp embeddings_0.json gs://project-demo-389821-me-bucket/init_index/embeddings_0.json
Copying file://embeddings_0.json [Content-Type=application/json]...
/ [1 files][  3.8 KiB/  3.8 KiB]                                                
Operation completed over 1 objects/3.8 KiB.                                      


In [53]:
mengine = MatchingEngineUtils(PROJECT_ID, ME_REGION, ME_INDEX_NAME)

In [54]:
index = mengine.create_index(
    embedding_gcs_uri=f"gs://{ME_EMBEDDING_DIR}/init_index",
    dimensions=ME_DIMENSIONS,
    index_update_method="streaming",
    index_algorithm="tree-ah",
)
if index:
    print(index.name)
index_endpoint = mengine.deploy_index()
if index_endpoint:
    print(f"Index endpoint resource name: {index_endpoint.name}")
    print(
        f"Index endpoint public domain name: {index_endpoint.public_endpoint_domain_name}"
    )
    print("Deployed indexes on the index endpoint:")
    for d in index_endpoint.deployed_indexes:
        print(f"    {d.id}")

projects/865280436803/locations/us-central1/indexes/3127318932650721280
Index endpoint resource name: projects/865280436803/locations/us-central1/indexEndpoints/8335731901704699904
Index endpoint public domain name: 1629608.us-central1-865280436803.vdb.vertexai.goog
Deployed indexes on the index endpoint:
    project_demo_389821_me_index_20230906235033


In [55]:
GCS_BUCKET_DOCS = f"{PROJECT_ID}-documents"
! set -x && gsutil mb -p $PROJECT_ID -l us-central1 gs://$GCS_BUCKET_DOCS

+ gsutil mb -p project-demo-389821 -l us-central1 gs://project-demo-389821-documents
Creating gs://project-demo-389821-documents/...
ServiceException: 409 A Cloud Storage bucket named 'project-demo-389821-documents' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


In [56]:

print(f"Processing documents from {GCS_BUCKET_DOCS}")
loader = GCSDirectoryLoader(
    project_name=PROJECT_ID, bucket=GCS_BUCKET_DOCS, prefix=folder_prefix
)
documents = loader.load()

# Add document name and source to the metadata
for document in documents:
    doc_md = document.metadata
    print(document)
    document_name = doc_md["source"].split("/")[-1]
    # derive doc source from Document loader
    doc_source_prefix = "/".join(GCS_BUCKET_DOCS.split("/")[:3])
    doc_source_suffix = "/".join(doc_md["source"].split("/")[4:-1])
    source = f"{doc_source_prefix}/{doc_source_suffix}"
    document.metadata = {"source": source, "document_name": document_name}

print(f"# of documents loaded (pre-chunking) = {len(documents)}")

Processing documents from project-demo-389821-documents
page_content='The model is wearing a light blue t-shirt tucked in white wide leg pants.' metadata={'source': '/tmp/tmp0rnfx1az/documents/images/1478224712_1_1_1.txt'}
page_content='The model is wearing a black and white striped sweater, tan wide leg pants, and black shoes.' metadata={'source': '/tmp/tmp_kconf0g/documents/images/1478431704_1_1_1.txt'}
page_content='The model is wearing a black sleeveless t-shirt and white wide leg denim pants.' metadata={'source': '/tmp/tmpen4p0p1q/documents/images/1_1.txt'}
page_content='The model is wearing a white shirt with short sleeves and a pair of white wide-leg pants. She has her hands in her pockets and is standing in front of a white wall.' metadata={'source': '/tmp/tmpdkxk60l7/documents/images/1_6.txt'}
page_content='The model is wearing a red t-shirt and blue wide leg jeans.' metadata={'source': '/tmp/tmp_k1t25xq/documents/images/1_7.txt'}
page_content='The model is wearing a grey coat

In [57]:
documents[0].metadata

{'source': 'project-demo-389821-documents/images',
 'document_name': '1478224712_1_1_1.txt'}

In [58]:
# split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
)
doc_splits = text_splitter.split_documents(documents)

# Add chunk number to metadata
for idx, split in enumerate(doc_splits):
    split.metadata["chunk"] = idx

print(f"# of documents = {len(doc_splits)}")

# of documents = 21


In [59]:
doc_splits[0].metadata

{'source': 'project-demo-389821-documents/images',
 'document_name': '1478224712_1_1_1.txt',
 'chunk': 0}

In [60]:
ME_INDEX_ID, ME_INDEX_ENDPOINT_ID = mengine.get_index_and_endpoint()
print(f"ME_INDEX_ID={ME_INDEX_ID}")
print(f"ME_INDEX_ENDPOINT_ID={ME_INDEX_ENDPOINT_ID}")

ME_INDEX_ID=projects/865280436803/locations/us-central1/indexes/3127318932650721280
ME_INDEX_ENDPOINT_ID=projects/865280436803/locations/us-central1/indexEndpoints/8335731901704699904


In [61]:
me = MatchingEngine.from_components(
    project_id=PROJECT_ID,
    region=ME_REGION,
    gcs_bucket_name=f"gs://{ME_EMBEDDING_DIR}".split("/")[2],
    embedding=embeddings,
    index_id=ME_INDEX_ID,
    endpoint_id=ME_INDEX_ENDPOINT_ID,
)

In [62]:
texts = [doc.page_content for doc in doc_splits]
metadatas = [
    [
        {"namespace": "source", "allow_list": [doc.metadata["source"]]},
        {"namespace": "document_name", "allow_list": [doc.metadata["document_name"]]},
        {"namespace": "chunk", "allow_list": [str(doc.metadata["chunk"])]},
    ]
    for doc in doc_splits
]

In [63]:
doc_ids = me.add_texts(texts=texts, metadatas=metadatas)

Waiting
....

In [64]:
NUMBER_OF_RESULTS = 5
SEARCH_DISTANCE_THRESHOLD = 0.6

# Expose index to the retriever
retriever = me.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": NUMBER_OF_RESULTS,
        "search_distance": SEARCH_DISTANCE_THRESHOLD,
    },
)

In [65]:
template = """SYSTEM: You are a personal shopper working for a fashion retailer helping users to pick the best combination of clothes based on the information you have of models


Question: {question}


Strictly use the information you have in thcontext to anser and think step by step.
Start the answer saying: According to our new collection:
=============
{context}
=============

"""

In [66]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    verbose=True,
    chain_type_kwargs={
        "prompt": PromptTemplate(
            template=template,
            input_variables=["context", "question"],
        ),
    },
)

In [67]:
qa.combine_documents_chain.verbose = True
qa.combine_documents_chain.llm_chain.verbose = True
qa.combine_documents_chain.llm_chain.llm.verbose = True

In [68]:
def formatter(result):
    print(f"Query: {result['query']}")
    print("." * 80)
    if "source_documents" in result.keys():
        for idx, ref in enumerate(result["source_documents"]):
            print("-" * 80)
            print(f"REFERENCE #{idx}")
            print("-" * 80)
            if "score" in ref.metadata:
                print(f"Matching Score: {ref.metadata['score']}")
            if "source" in ref.metadata:
                print(f"Document Source: {ref.metadata['source']}")
            if "document_name" in ref.metadata:
                print(f"Document Name: {ref.metadata['document_name']}")
            print("." * 80)
            print(f"Content: \n{wrap(ref.page_content)}")
    print("." * 80)
    print(f"Response: {wrap(result['result'])}")
    print("." * 80)


def wrap(s):
    return "\n".join(textwrap.wrap(s, width=120, break_long_words=False))


def ask(query, qa=qa, k=NUMBER_OF_RESULTS, search_distance=SEARCH_DISTANCE_THRESHOLD):
    qa.retriever.search_kwargs["search_distance"] = search_distance
    qa.retriever.search_kwargs["k"] = 10
    result = qa({"query": query})
    return (result)

In [70]:
ask("I have white trousers, what do combine with it")



[1m> Entering new  chain...[0m
Waiting


[1m> Entering new  chain...[0m


[1m> Entering new  chain...[0m
Prompt after formatting:
[32;1m[1;3mSYSTEM: You are a personal shopper working for a fashion retailer helping users to pick the best combination of clothes based on the information you have of models


Question: I have white trousers, what do combine with it


Strictly use the information you have in thcontext to anser and think step by step.
Start the answer saying: According to our new collection:








The model is wearing a black sleeveless t-shirt and white denim wide leg pants.

The model is wearing a black sleeveless t-shirt and white denim wide leg pants.









[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


{'query': 'I have white trousers, what do combine with it',
 'result': ' According to our new collection: You could combine white trousers with a black sleeveless t-shirt.',
 'source_documents': [Document(page_content='', metadata={'source': 'project-demo-389821-documents/images', 'document_name': '1478224712_1_1_1.txt', 'chunk': '0', 'score': 0.7282194495201111}),
  Document(page_content='', metadata={'source': 'project-demo-389821-documents/images', 'document_name': '1478224712_1_1_1.txt', 'chunk': '0', 'score': 0.7282194495201111}),
  Document(page_content='', metadata={'source': 'project-demo-389821-documents/images', 'document_name': '1478224712_1_1_1.txt', 'chunk': '0', 'score': 0.7282194495201111}),
  Document(page_content='', metadata={'source': 'project-demo-389821-documents/images', 'document_name': '1478224712_1_1_1.txt', 'chunk': '0', 'score': 0.7282194495201111}),
  Document(page_content='The model is wearing a black sleeveless t-shirt and white denim wide leg pants.', met