# Vector Databases

In [None]:
import numpy as np
import json
from scipy.spatial.distance import cityblock, euclidean, cosine
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

from sentence_transformers import SentenceTransformer

## What Does It Mean That Vectors Are Similar?

After embedding sentences we have vectors, right? From university calculus or linear algebra courses, you may remember that we can measure how far apart (or how similar) two points are using different metrics, such as:
- Euclidean Distance
- Manhattan Distance
- Cosine Similarity
- Dot Product

#TODO put formula? and an image

Try it yourself! Change `vector_a` and `vector_b` to see how the different distances differ.

In [None]:
# Two toy 4-dimensional vectors; edit the values to see how each metric reacts.
vector_a = np.array([0.9, 0.1, 0.23, 0.15])
vector_b = np.array([0.9, 0.30, 0.23, 0.25])

# Apply the three SciPy distance functions to the same pair, in the order
# Manhattan (cityblock), Euclidean, cosine distance.
manhattan_dist, euclidean_dist, cosine_distance = (
    metric(vector_a, vector_b) for metric in (cityblock, euclidean, cosine)
)

report = f"Manhattan: {manhattan_dist}\nEuclidean: {euclidean_dist}\nCosine: {cosine_distance}"
print(report)

## Nearest Neighbors

In [None]:
# Load the movie records from disk. The encoding is given explicitly so the
# file is decoded the same way on every platform instead of relying on the
# locale's default encoding.
with open("../data/movie_data.json", encoding="utf-8") as f:
    movies = json.load(f)

# Keep only the "overview" text of each record; this is what gets embedded.
movies_overviews = [m["overview"] for m in movies]
len(movies_overviews)

In [None]:
# Name of the sentence-embedding model used to encode both the movie
# overviews and the query prompt.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
encoder = SentenceTransformer(MODEL_NAME)

In [None]:
# Encode every overview in a single call so the model can process the texts in
# batches rather than being invoked once per overview from a Python loop.
# The result is a 2-D array with one embedding row per overview; iterating it
# still yields one vector per movie, in the same order as `movies_overviews`.
encoded_overviews = encoder.encode(movies_overviews, show_progress_bar=True)

In [None]:
# Natural-language query to look up among the movie overviews.
prompt = "Which movies are similar to Star Wars?"

# Encode a one-element batch and unpack its single embedding vector.
(encoded_prompt,) = encoder.encode([prompt])

In [None]:
%time
nbrs = NearestNeighbors(n_neighbors=100, algorithm="brute").fit(encoded_overviews)
distances, indices = nbrs.kneighbors(encoded_prompt)
indices

TODO: small explanation of vector DB algorithms and why vectors are important

In [None]:
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

COLLECTION_NAME = "movies"

# Qdrant client created with the special ":memory:" location.
qdrant = QdrantClient(":memory:")

# Vector size is taken from the encoder so it always matches the embeddings;
# similarity between vectors is measured with cosine distance.
vector_params = models.VectorParams(
    size=encoder.get_sentence_embedding_dimension(),
    distance=models.Distance.COSINE,
)

# (Re)create the collection that will hold the movie embeddings.
qdrant.recreate_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=vector_params,
)

In [None]:
# Build one point per movie: the positional index is the point id, the overview
# embedding is the vector, and the full movie record is stored as payload.
# `PointStruct` is the model for points being uploaded; `Record` is the shape
# returned when points are read back from a collection.
records = [
    models.PointStruct(id=idx, vector=encoded_mov.tolist(), payload=mov)
    for idx, (encoded_mov, mov) in enumerate(zip(encoded_overviews, movies))
]

qdrant.upload_points(collection_name=COLLECTION_NAME, points=records)

In [None]:
%time
qdrant.search(
    collection_name=COLLECTION_NAME, query_vector=encoded_prompt.tolist(), limit=10
)