In [1]:
! pip3 install --upgrade google-cloud-aiplatform \
                        google-cloud-storage \
                        'google-cloud-bigquery[pandas]'

Collecting google-cloud-aiplatform
  Downloading google_cloud_aiplatform-1.52.0-py2.py3-none-any.whl.metadata (30 kB)
Collecting google-cloud-storage
  Downloading google_cloud_storage-2.16.0-py2.py3-none-any.whl.metadata (6.1 kB)
Collecting google-cloud-bigquery[pandas]
  Downloading google_cloud_bigquery-3.23.1-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.34.1 (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.34.1->google-cloud-aiplatform)
  Downloading google_api_core-2.19.0-py3-none-any.whl.metadata (2.7 kB)
Downloading google_cloud_aiplatform-1.52.0-py2.py3-none-any.whl (5.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading google_cloud_storage-2.16.0-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
PROJECT = !gcloud config get-value project
PROJECT_ID = PROJECT[0]
REGION = "us-central1"

In [2]:
import vertexai
vertexai.init(project = PROJECT_ID,
              location = REGION)

In [3]:
import math
from typing import Any, Generator

import pandas as pd
from google.cloud import bigquery

client = bigquery.Client(project=PROJECT_ID)

In [4]:
QUERY_TEMPLATE = """
        SELECT distinct q.id, q.title, q.body
        FROM (SELECT * FROM `bigquery-public-data.stackoverflow.posts_questions` where Score>0 ORDER BY View_Count desc) AS q
        LIMIT {limit} OFFSET {offset};
        """

In [5]:
def query_bigquery_chunks(
    max_rows: int, rows_per_chunk: int, start_chunk: int = 0
) -> Generator[pd.DataFrame, Any, None]:
    for offset in range(start_chunk, max_rows, rows_per_chunk):
        query = QUERY_TEMPLATE.format(limit=rows_per_chunk, offset=offset)
        query_job = client.query(query)
        rows = query_job.result()
        df = rows.to_dataframe()
        df["title_with_body"] = df.title + "\n" + df.body
        yield df

In [6]:
df = next(query_bigquery_chunks(max_rows=1000, rows_per_chunk=1000))

# Examine the data
df.head()

Unnamed: 0,id,title,body,title_with_body
0,13737261,Nexus 4 not showing files via MTP,<p>I'm trying to simply write a simple XML fil...,Nexus 4 not showing files via MTP\n<p>I'm tryi...
1,18194042,Delete spaces php,<p>I need delete all tags from string and make...,Delete spaces php\n<p>I need delete all tags f...
2,17885979,How to Check Whether an Angular $q promise Is ...,<p>I understand that typically one would just ...,How to Check Whether an Angular $q promise Is ...
3,17900485,Convert an output to string,<p>I'm trying do to a script to check the CA p...,Convert an output to string\n<p>I'm trying do ...
4,17857858,Fail to install lxml in MacOS 10.8.4,<p>I am having trouble installing lxml to my M...,Fail to install lxml in MacOS 10.8.4\n<p>I am ...


In [7]:
from typing import List, Optional
from vertexai.preview.language_models import TextEmbeddingModel

model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

In [8]:
def encode_texts_to_embeddings(sentences: List[str]) -> List[Optional[List[float]]]:
    try:
        embeddings = model.get_embeddings(sentences)
        return [embedding.values for embedding in embeddings]
    except Exception:
        return [None for _ in range(len(sentences))]

In [9]:
import functools
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Generator, List, Tuple

import numpy as np
from tqdm.auto import tqdm


# Generator function to yield batches of sentences
def generate_batches(
    sentences: List[str], batch_size: int
) -> Generator[List[str], None, None]:
    for i in range(0, len(sentences), batch_size):
        yield sentences[i : i + batch_size]

In [10]:
def encode_text_to_embedding_batched(
    sentences: List[str], api_calls_per_second: int = 10, batch_size: int = 5
) -> Tuple[List[bool], np.ndarray]:

    embeddings_list: List[List[float]] = []

    # Prepare the batches using a generator
    batches = generate_batches(sentences, batch_size)

    seconds_per_job = 1 / api_calls_per_second

    with ThreadPoolExecutor() as executor:
        futures = []
        for batch in tqdm(
            batches, total=math.ceil(len(sentences) / batch_size), position=0
        ):
            futures.append(
                executor.submit(functools.partial(encode_texts_to_embeddings), batch)
            )
            time.sleep(seconds_per_job)

        for future in futures:
            embeddings_list.extend(future.result())

    is_successful = [
        embedding is not None for sentence, embedding in zip(sentences, embeddings_list)
    ]
    embeddings_list_successful = np.squeeze(
        np.stack([embedding for embedding in embeddings_list if embedding is not None])
    )
    return is_successful, embeddings_list_successful

In [11]:
# Encode a subset of questions for validation
questions = df.title.tolist()[:500]
is_successful, question_embeddings = encode_text_to_embedding_batched(
    sentences=df.title.tolist()[:500]
)

# Filter for successfully embedded sentences
questions = np.array(questions)[is_successful]

  0%|          | 0/100 [00:00<?, ?it/s]

In [12]:
DIMENSIONS = len(question_embeddings[0])

print(DIMENSIONS)

768


In [13]:
import random

question_index = random.randint(0, 99)

print(f"Query question = {questions[question_index]}")

# Get similarity scores for each embedding by using dot-product.
scores = np.dot(question_embeddings[question_index], question_embeddings.T)

# Print top 20 matches
for index, (question, score) in enumerate(
    sorted(zip(questions, scores), key=lambda x: x[1], reverse=True)[:20]
):
    print(f"\t{index}: {question}: {score}")

Query question = How do I split a string ONLY after the first instance of the delimiter?
	0: How do I split a string ONLY after the first instance of the delimiter?: 0.9999992788956729
	1: How to split string by slash which is not between numbers?: 0.8126523582460095
	2: How to escape a previously unknown string in regular expression?: 0.7035428766788235
	3: How to replace a char in a string in C?: 0.6923757346970206
	4: Best way to replace a comma with a semicolon inside parenthesis of a string: 0.6724590412542899
	5: How could I do frequency analysis on a string without using a switch: 0.6639558076325156
	6: pyparsing: ignore any token that doesn't match: 0.656893454324172
	7: How to group all the first characters of a string in a list of string , all second character of a string and so on in a list of string in python: 0.6532163200122
	8: How to parse a JDBC url to get hostname,port etc?: 0.6486615560612097
	9: Parsing date string with different dateformats: 0.6483379863991672
	10: 

In [14]:
import tempfile
from pathlib import Path

# Create temporary file to write embeddings to
embeddings_file_path = Path(tempfile.mkdtemp())

print(f"Embeddings directory: {embeddings_file_path}")

Embeddings directory: /tmp/tmpplas6y1g


In [15]:
import gc
import json

BQ_NUM_ROWS = 5000
BQ_CHUNK_SIZE = 1000
BQ_NUM_CHUNKS = math.ceil(BQ_NUM_ROWS / BQ_CHUNK_SIZE)

START_CHUNK = 0

# Create a rate limit of 300 requests per minute. Adjust this depending on your quota.
API_CALLS_PER_SECOND = 300 / 60
# According to the docs, each request can process 5 instances per request
ITEMS_PER_REQUEST = 5

# Loop through each generated dataframe, convert
for i, df in tqdm(
    enumerate(
        query_bigquery_chunks(
            max_rows=BQ_NUM_ROWS, rows_per_chunk=BQ_CHUNK_SIZE, start_chunk=START_CHUNK
        )
    ),
    total=BQ_NUM_CHUNKS - START_CHUNK,
    position=-1,
    desc="Chunk of rows from BigQuery",
):
    # Create a unique output file for each chunk
    chunk_path = embeddings_file_path.joinpath(
        f"{embeddings_file_path.stem}_{i+START_CHUNK}.json"
    )
    with open(chunk_path, "a") as f:
        id_chunk = df.id

        # Convert batch to embeddings
        is_successful, question_chunk_embeddings = encode_text_to_embedding_batched(
            sentences=df.title_with_body.to_list(),
            api_calls_per_second=API_CALLS_PER_SECOND,
            batch_size=ITEMS_PER_REQUEST,
        )

        # Append to file
        embeddings_formatted = [
            json.dumps(
                {
                    "id": str(id),
                    "embedding": [str(value) for value in embedding],
                }
            )
            + "\n"
            for id, embedding in zip(id_chunk[is_successful], question_chunk_embeddings)
        ]
        f.writelines(embeddings_formatted)

        # Delete the DataFrame and any other large data structures
        del df
        gc.collect()

Chunk of rows from BigQuery:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [16]:
BUCKET_URI = f"gs://{PROJECT_ID}-unique"

In [17]:
! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}

Creating gs://qwiklabs-gcp-00-a2391e27ae2c-unique/...


In [18]:
remote_folder = f"{BUCKET_URI}/{embeddings_file_path.stem}/"
! gsutil -m cp -r {embeddings_file_path}/* {remote_folder}

E0528 11:52:06.007019300    2239 backup_poller.cc:127]                 Run client channel backup poller: UNKNOWN:pollset_work {created_time:"2024-05-28T11:52:06.006737125+00:00", children:[UNKNOWN:Bad file descriptor {syscall:"epoll_wait", os_error:"Bad file descriptor", errno:9, created_time:"2024-05-28T11:52:06.00666201+00:00"}]}
Copying file:///tmp/tmpplas6y1g/tmpplas6y1g_0.json [Content-Type=application/json]...
Copying file:///tmp/tmpplas6y1g/tmpplas6y1g_1.json [Content-Type=application/json]...
Copying file:///tmp/tmpplas6y1g/tmpplas6y1g_2.json [Content-Type=application/json]...
Copying file:///tmp/tmpplas6y1g/tmpplas6y1g_4.json [Content-Type=application/json]...
Copying file:///tmp/tmpplas6y1g/tmpplas6y1g_3.json [Content-Type=application/json]...
\ [5/5 files][ 88.5 MiB/ 88.5 MiB] 100% Done                                    
Operation completed over 5 objects/88.5 MiB.                                     


In [19]:
DISPLAY_NAME = "stack_overflow"
DESCRIPTION = "question titles and bodies from stackoverflow"

In [20]:
from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

DIMENSIONS = 768

tree_ah_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=DISPLAY_NAME,
    contents_delta_uri=remote_folder,
    dimensions=DIMENSIONS,
    approximate_neighbors_count=150,
    distance_measure_type="DOT_PRODUCT_DISTANCE",
    leaf_node_embedding_count=500,
    leaf_nodes_to_search_percent=80,
    description=DESCRIPTION,
)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/444707654279/locations/us-central1/indexes/3339898510764933120/operations/8568569191286702080
MatchingEngineIndex created. Resource name: projects/444707654279/locations/us-central1/indexes/3339898510764933120
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/444707654279/locations/us-central1/indexes/3339898510764933120')


In [21]:
INDEX_RESOURCE_NAME = tree_ah_index.resource_name
INDEX_RESOURCE_NAME

'projects/444707654279/locations/us-central1/indexes/3339898510764933120'

In [22]:
tree_ah_index = aiplatform.MatchingEngineIndex(index_name=INDEX_RESOURCE_NAME)

In [23]:
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=DISPLAY_NAME,
    description=DISPLAY_NAME,
    public_endpoint_enabled=True,
)

Creating MatchingEngineIndexEndpoint
Create MatchingEngineIndexEndpoint backing LRO: projects/444707654279/locations/us-central1/indexEndpoints/2591456547691298816/operations/8332341317082284032
MatchingEngineIndexEndpoint created. Resource name: projects/444707654279/locations/us-central1/indexEndpoints/2591456547691298816
To use this MatchingEngineIndexEndpoint in another session:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/444707654279/locations/us-central1/indexEndpoints/2591456547691298816')


In [24]:
DEPLOYED_INDEX_ID = "deployed_index_id_unique"

DEPLOYED_INDEX_ID


my_index_endpoint = my_index_endpoint.deploy_index(
    index=tree_ah_index, deployed_index_id=DEPLOYED_INDEX_ID
)

my_index_endpoint.deployed_indexes

Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/444707654279/locations/us-central1/indexEndpoints/2591456547691298816
Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/444707654279/locations/us-central1/indexEndpoints/2591456547691298816/operations/80972743553581056
MatchingEngineIndexEndpoint index_endpoint Deployed index. Resource name: projects/444707654279/locations/us-central1/indexEndpoints/2591456547691298816


[id: "deployed_index_id_unique"
index: "projects/444707654279/locations/us-central1/indexes/3339898510764933120"
create_time {
  seconds: 1716900227
  nanos: 785955000
}
index_sync_time {
  seconds: 1716901167
  nanos: 335993000
}
automatic_resources {
  min_replica_count: 2
  max_replica_count: 2
}
deployment_group: "default"
]

In [25]:
number_of_vectors = sum(
    aiplatform.MatchingEngineIndex(
        deployed_index.index
    )._gca_resource.index_stats.vectors_count
    for deployed_index in my_index_endpoint.deployed_indexes
)

print(f"Expected: {BQ_NUM_ROWS}, Actual: {number_of_vectors}")

Expected: 5000, Actual: 5000


In [26]:
NUM_NEIGHBOURS = 10

response = my_index_endpoint.find_neighbors(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=test_embeddings,
    num_neighbors=NUM_NEIGHBOURS,
)

response

NameError: name 'test_embeddings' is not defined

In [27]:
test_embeddings = encode_texts_to_embeddings(sentences=["Install GPU for Tensorflow"])

In [28]:
NUM_NEIGHBOURS = 10

response = my_index_endpoint.find_neighbors(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=test_embeddings,
    num_neighbors=NUM_NEIGHBOURS,
)

response

[[MatchNeighbor(id='43137828', distance=0.7202261090278625, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[]),
  MatchNeighbor(id='35270450', distance=0.7089452743530273, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[]),
  MatchNeighbor(id='69221077', distance=0.7077183723449707, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[]),
  MatchNeighbor(id='16438099', distance=0.7019845247268677, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[]),
  MatchNeighbor(id='56422601', distance=0.6997870802879333, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[]),
  MatchNeighbor(id='15823015', distance=0.6984872817993164, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[]),
  MatchNeighbor(id='53573434', distance=0.6949374675750732, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[]),
  MatchNeighbor(id='38549253', distance=0.6939564943313599, fe

In [29]:
for match_index, neighbor in enumerate(response[0]):
    print(f"https://stackoverflow.com/questions/{neighbor.id}")

https://stackoverflow.com/questions/43137828
https://stackoverflow.com/questions/35270450
https://stackoverflow.com/questions/69221077
https://stackoverflow.com/questions/16438099
https://stackoverflow.com/questions/56422601
https://stackoverflow.com/questions/15823015
https://stackoverflow.com/questions/53573434
https://stackoverflow.com/questions/38549253
https://stackoverflow.com/questions/44311244
https://stackoverflow.com/questions/73573738
