In [4]:
#pip install lshashing scikit-learn vertexai
from sklearn.neighbors import NearestNeighbors
from vertexai.language_models import TextEmbeddingModel
import numpy as np
from dotenv import load_dotenv
import os
import vertexai

In [5]:
# Read the project settings from the environment; load_dotenv() is called first
# so values kept in a local .env file (presumably next to the notebook) are visible.
load_dotenv()
projectID = os.environ.get('GOOGLE_VERTEX_PROJECT')  # None when the variable is unset
regionID = os.environ.get('GOOGLE_REGION')           # None when the variable is unset

# Initialise the Vertex AI SDK with that project/region before any model is loaded.
vertexai.init(
    project=projectID,
    location=regionID,
)

In [16]:
# Small demo corpus and the question we want to match against it.
textToSearch = [
    "John really likes his pizza",
    "The sky is blue",
    "John's pizza is a circle"
]
query = "what shape is the pizza John bought?"

# Embed the corpus and the query with the same model so the resulting vectors
# are comparable; these embeddings are what the nearest-neighbour search runs on.
model = TextEmbeddingModel.from_pretrained("textembedding-gecko@003")
embeddingOfSearchArea = np.array([embedded.values for embedded in model.get_embeddings(textToSearch)])
embeddedQuery = np.array(model.get_embeddings([query])[0].values)
print("Quick shape of embeddings to search")
print(embeddingOfSearchArea.shape)
print("Quick shape of embeddedQuery")
print(embeddedQuery.shape)

# kneighbors() takes a 2-D (n_queries, n_features) array, so wrap the single
# 1-D query vector once and reuse it for both searches.
queryBatch = np.expand_dims(embeddedQuery, axis=0)

# How many neighbours each search returns. Both values are clamped to the corpus
# size because scikit-learn raises ValueError when n_neighbors exceeds the number
# of fitted samples (e.g. if textToSearch is shortened).
Nneighbors = min(3, len(textToSearch))      # ball-tree search: up to 3 nearest sentences
bruteNeighbors = min(2, len(textToSearch))  # brute-force search: up to 2 nearest sentences
# NOTE(review): the brute-force search asks for fewer neighbours than the ball
# tree (2 vs Nneighbors) -- confirm whether the two are meant to use the same k.

# Exhaustive search: compares the query against every corpus embedding.
brute = NearestNeighbors(n_neighbors=bruteNeighbors, algorithm='brute').fit(embeddingOfSearchArea)
brute_distance, brute_index = brute.kneighbors(queryBatch)
print("Here we have a brute distance of ")
print(brute_distance)
print("And a brute index of ")
print(brute_index)

# Ball-tree search over the same embeddings.
ballTree = NearestNeighbors(n_neighbors=Nneighbors, algorithm='ball_tree').fit(embeddingOfSearchArea)
ball_distance, ball_index = ballTree.kneighbors(queryBatch)
print("Here we have a ball tree distance of ")
print(ball_distance)
print("And a ball index of ")
print(ball_index)
print("Meaning sentences ")
# ball_index has one row per query; row 0 holds the corpus positions of the
# neighbours of our single query, so map them back to the original sentences.
closestSentencesBall = [textToSearch[i] for i in ball_index[0]]
print(closestSentencesBall)

Quick shape of embeddings to search
(3, 768)
Quick shape of embeddedQuery
(768,)
Here we have a brute distance of 
[[0.49739864 0.53172328]]
And a brute index of 
[[2 0]]
Here we have a ball tree distance of 
[[0.49739864 0.53172328 0.90855822]]
And a ball index of 
[[2 0 1]]
Meaning sentences 
["John's pizza is a circle", 'John really likes his pizza', 'The sky is blue']
