In [1]:
import vertexai
import os
from tqdm.notebook import tqdm
import pandas as pd
from google.api_core.exceptions import ResourceExhausted
import time
from vertexai.preview.language_models import TextEmbeddingModel
import json

In [2]:
# Notebook configuration: the project id comes from the environment, the
# region and bucket are fixed values used by the cells below.
PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")
LOCATION = "europe-west1"
BUCKET_NAME = "lloyds-genai24lon-2701-bucket"

# Initialise the Vertex AI SDK once for the whole notebook.
vertexai.init(location=LOCATION, project=PROJECT_ID)

In [3]:
max_len = None  # row cap passed to read_csv; None reads the whole file
# First CSV row handled by this run; rows before it are skipped here
# (presumably embedded by earlier runs of this notebook -- TODO confirm).
START_INDEX = 400000

decisions = pd.read_csv(f'gs://{BUCKET_NAME}/decisions_2023.csv',
                        na_filter=False, usecols=['text'], nrows=max_len)
model = TextEmbeddingModel.from_pretrained("textembedding-gecko@003")

# Pair each text with its row position in the full CSV so the id survives
# the slicing and batching done in later cells.
text_list = list(enumerate(decisions['text'].iloc[START_INDEX:], start=START_INDEX))
print(len(text_list))

31939


In [4]:
def get_embedding_single(text, max_attempts=4):
    """Embed one text, retrying with exponential backoff on quota errors.

    Args:
        text: The string to send to the embedding model.
        max_attempts: How many requests to make before giving up.

    Returns:
        The ``values`` of the first embedding returned by the model, or
        ``None`` when every attempt raised ``ResourceExhausted``.

    Only ``ResourceExhausted`` is retried; any other exception propagates
    to the caller.
    """
    for attempt in range(max_attempts):
        try:
            return model.get_embeddings([text])[0].values  # Send request to embedding model
        except ResourceExhausted:
            # Back off 1s, 2s, 4s, ... but do not sleep after the last
            # attempt: there is nothing left to wait for.
            if attempt < max_attempts - 1:
                time.sleep(2 ** attempt)
    print("Embedding failed:", text[:100])
    return None

In [5]:
def get_embedding(text_list_to_embed, max_attempts=4):
    """Embed one batch of ``(id, text)`` pairs with a single model request.

    Args:
        text_list_to_embed: List of ``(id, text)`` tuples.
        max_attempts: How many requests to make before giving up on
            quota errors.

    Returns:
        A list of ``(id, values)`` tuples in input order. If the batch
        could not be embedded, every id is paired with ``None`` instead so
        the caller can find and retry the batch later.

    ``ResourceExhausted`` is retried with exponential backoff. Any other
    exception is reported and ends the attempts immediately.
    """
    if not text_list_to_embed:
        return []  # nothing to embed; skip the API call

    text_only = [text for _, text in text_list_to_embed]
    for attempt in range(max_attempts):
        try:
            embeddings = model.get_embeddings(text_only)  # Send request to embedding model
            return [(i, embedding.values)
                    for (i, _), embedding in zip(text_list_to_embed, embeddings)]
        except ResourceExhausted:
            # Back off 1s, 2s, 4s, ... but not after the final attempt.
            if attempt < max_attempts - 1:
                time.sleep(2 ** attempt)
        except Exception as exc:
            # Stay best-effort, but say why the batch was dropped instead
            # of discarding the error silently.
            print(f"Embedding batch failed ({type(exc).__name__}): {exc}")
            break
    return [(i, None) for i, _ in text_list_to_embed]

In [6]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Slices are produced in order; only the last one can be shorter than ``n``.
    """
    total = len(lst)
    yield from (lst[start:start + n] for start in range(0, total, n))

# Number of texts sent per embedding request.
# NOTE(review): presumably the per-request input limit for this model/region -- confirm.
BATCH_SIZE = 5
text_list_batched = list(chunks(text_list, BATCH_SIZE))

In [None]:
# One request per batch; each result is a list of (id, vector) pairs, with
# None in place of the vector when the batch could not be embedded.
embeddings_list = [get_embedding(batch) for batch in tqdm(text_list_batched)]

  0%|          | 0/6388 [00:00<?, ?it/s]

In [None]:
# A failed batch comes back as [(id, None), ...], which is a non-empty
# (truthy) list, so testing the batch itself never flags anything. Check
# the first vector instead, exactly as the retry cell does.
failed_embeddings = [pos for pos, batch in enumerate(embeddings_list) if not batch[0][1]]
len(failed_embeddings)

In [None]:
# Re-request every batch whose first entry has no vector, for at most ten
# passes over the remaining failures.
retry = 0
failed_embeddings = [pos for pos, batch in enumerate(embeddings_list) if not batch[0][1]]
while failed_embeddings and retry < 10:
    # Overwrite each failed batch in place with a fresh attempt.
    for pos in tqdm(failed_embeddings):
        embeddings_list[pos] = get_embedding(text_list_batched[pos])
    retry += 1  # limit number of retries
    failed_embeddings = [pos for pos, batch in enumerate(embeddings_list) if not batch[0][1]]

In [None]:
# Flatten the per-batch results into a single list of (id, vector) pairs.
embeddings_flat = []
for batch in embeddings_list:
    embeddings_flat.extend(batch)

In [None]:
# Every input text must come back exactly once (embedded or not); a shorter
# list would mean a response was truncated when paired with its ids.
assert len(embeddings_flat) == len(text_list), "embedding count does not match input count"
len(embeddings_flat)

In [None]:
# One record per (id, vector) pair, keyed "id" and "embedding".
embeddings_dict = [dict(id=row_id, embedding=vector) for row_id, vector in embeddings_flat]

In [None]:
# Write all records to disk as a single JSON array.
with open("vector_search_dataset_formatted_4.json", "w") as out_file:
    json.dump(embeddings_dict, fp=out_file)

In [None]:
# Newline-delimited JSON: one {"id":"<row id>","embedding":[...]} object per line.
skipped = 0
with open("vector_search_dataset_4.json", "w") as f:
    for i, embedding in embeddings_flat:
        if embedding is None:
            # Still unembedded after the retries. Joining None would raise
            # TypeError and leave a half-written file, so leave the row out
            # and report the count below.
            skipped += 1
            continue
        record = {"id": str(i), "embedding": embedding}
        # Compact separators keep the same byte layout as the hand-built lines.
        f.write(json.dumps(record, separators=(",", ":")) + "\n")
if skipped:
    print(f"Skipped {skipped} rows without an embedding")

In [None]:
!gsutil copy ./vector_search_dataset_formatted_4.json gs://lloyds-genai24lon-2701-bucket/embeddings/vector_search_dataset_formatted_4.json
!gsutil copy ./vector_search_dataset_4.json gs://lloyds-genai24lon-2701-bucket/embeddings/vector_search_dataset_4.json

In [None]:
# # Disabled: go back and fix any entries that are still missing, one text at a time
# failed_embeddings = [pos for pos, (_, elem) in enumerate(embeddings_flat) if not elem]
# retry = 0
# while len(failed_embeddings) > 0 and retry < 5:
#     # refill failed embeddings
#     for j in failed_embeddings:
#         embeddings_flat[j] = (text_list[j][0], get_embedding_single(text_list[j][1]))
#     failed_embeddings = [pos for pos, (_, elem) in enumerate(embeddings_flat) if not elem]
#     retry+=1 # limit number of retries

In [None]:
# Read the JSON array back from disk for the round-trip check below.
with open("vector_search_dataset_formatted_4.json", "r") as in_file:
    loaded_json = json.loads(in_file.read())

In [None]:
# Round-trip check: True when the file content equals the in-memory records.
embeddings_dict == loaded_json