## Generate Embeddings


In [1]:
import pandas as pd

# Load the sentences that will be embedded and indexed.
# NOTE(review): the file's unnamed first column is read as a regular
# "Unnamed: 0" column (visible in the output below) — if it is just a saved
# index, confirm whether it should be loaded with index_col=0 instead.
dataset = pd.read_csv("sentences.csv")
dataset.head()

Unnamed: 0,sentence
0,A little girl is smiling and running outside
1,A man is drawing on a digital dry erase board
2,A black bird is sitting on a dead tree
3,An elderly man is sitting on a bench
4,A man and a woman are sitting comfortably on t...


In [2]:
from openai import OpenAI

# Shared API client used by get_embedding below. It is built with no explicit
# arguments, so credentials presumably come from the environment — confirm.
client = OpenAI()


def get_embedding(sentence, model="text-embedding-3-small"):
    """Return the embedding vector for `sentence` from the embeddings API.

    Args:
        sentence: Text to embed; forwarded as the request's `input`.
        model: Name of the embedding model to request. Defaults to
            "text-embedding-3-small", the value this function previously
            hard-coded, so existing one-argument calls behave the same.

    Returns:
        The `embedding` attribute of the first item in the response's `data`.
    """
    # Uses the module-level `client`; one request is made per call.
    response = client.embeddings.create(input=sentence, model=model)
    return response.data[0].embedding

In [3]:
import ast
import os

import numpy as np

# Embedding the whole dataset calls get_embedding once per row, so the result
# is cached on disk and reused whenever the cache file already exists.
if os.path.exists("embedded_sentences.csv"):
    dataset = pd.read_csv("embedded_sentences.csv")
    # The CSV stores each embedding as the text of a list literal. Parse it
    # with ast.literal_eval, which only accepts Python literals, rather than
    # eval, which would execute any expression found in the file.
    dataset["embedding"] = dataset.embedding.apply(ast.literal_eval).apply(np.array)
else:
    # NOTE(review): on this path the column holds whatever get_embedding
    # returns, whereas the cache-hit path above holds numpy arrays.
    dataset["embedding"] = dataset["sentence"].apply(get_embedding)
    # Save before any conversion so the cache contains parseable list text.
    dataset.to_csv("embedded_sentences.csv", index=False)

In [4]:
# Give every row a 1-based sequential id. It is stringified later and used as
# the vector id when the rows are upserted to Pinecone.
dataset["id"] = range(1, len(dataset) + 1)
dataset.head()

Unnamed: 0,sentence,embedding,id
0,A little girl is smiling and running outside,"[0.0436425618827343, 0.01375775970518589, 0.00...",1
1,A man is drawing on a digital dry erase board,"[-0.008048108778893948, 0.030766354873776436, ...",2
2,A black bird is sitting on a dead tree,"[0.027433251962065697, 1.8205369087809231e-06,...",3
3,An elderly man is sitting on a bench,"[-0.004122881218791008, -0.056238383054733276,...",4
4,A man and a woman are sitting comfortably on t...,"[0.021146269515156746, -0.032280709594488144, ...",5


In [7]:
# Vector length, measured on the first row's embedding. Every Faiss index and
# the Pinecone index below are created with this dimension.
embedding_dimension = len(dataset.iloc[0]["embedding"])
embedding_dimension

1536

## Faiss

Check [Faiss Indexes](https://github.com/facebookresearch/faiss/wiki/Faiss-indexes) for more information.


In [8]:
# Stack the per-row embeddings into a single 2-D matrix (one row per sentence).
# Faiss works on 32-bit floats, so cast once here instead of handing it a
# float64 matrix on every train/add call.
embeddings = np.array(dataset.embedding.tolist(), dtype="float32")

query = "I love soccer"
# xq is kept exactly as get_embedding returns it because the same value is
# also passed to the Pinecone query further down.
xq = get_embedding(query)

### IndexFlatL2 - Exact Search for L2


In [12]:
import faiss

# Exact-search L2 index sized to the embedding dimension. is_trained is already
# True (see output), so vectors are added without a training step.
index_l2 = faiss.IndexFlatL2(embedding_dimension)
index_l2.is_trained

True

In [13]:
# Add every sentence embedding; ntotal then shows how many vectors are stored.
index_l2.add(embeddings)
index_l2.ntotal

1000

In [14]:
# The single query vector is given a leading axis so it is passed as a batch of
# one. The first returned value (distances, per the Faiss API) is discarded;
# the second holds, for each query, the positions of the k=4 nearest rows,
# which are used to look up the matching sentences in `dataset`.
_, document_indices = index_l2.search(np.expand_dims(xq, axis=0), k=4)
dataset.iloc[document_indices[0]]

Unnamed: 0,sentence,embedding,id
684,A man is punching a soccer ball,"[-0.01688985712826252, 0.029744451865553856, 0...",685
950,A soccer player is sitting on the field and is...,"[-0.011246275156736374, 0.009713653475046158, ...",951
352,An opponent is tackling a soccer player,"[0.0006955061689950526, -0.02708514593541622, ...",353
25,A group of men is playing soccer on the beach,"[0.004342387896031141, 0.04711844399571419, 0....",26


### IndexIVFFlat - Inverted file with exact post-verification

<img src='images/ivf.png' width="1000">


In [15]:
# Build an IVF index: a flat L2 index is handed to IndexIVFFlat as its
# quantizer, together with the vector dimension and the centroid count (20).
# Unlike the flat index, is_trained starts out False (see output), so the index
# is trained in the next cell before any vectors are added.
ncentroids = 20
quantizer = faiss.IndexFlatL2(embedding_dimension)
index_ivf = faiss.IndexIVFFlat(quantizer, embedding_dimension, ncentroids)
index_ivf.is_trained

False

In [16]:
# Train on the same vectors that will be indexed; is_trained is True afterwards
# (see output).
index_ivf.train(embeddings)
index_ivf.is_trained

True

In [17]:
# Add every sentence embedding to the trained index and show the stored count.
index_ivf.add(embeddings)
index_ivf.ntotal

1000

In [18]:
# Same query as for the flat index, using the index's default search settings.
# In the saved output only row 352 is shared with the exact-search results, so
# the default settings miss most of the true nearest neighbours here.
_, document_indices = index_ivf.search(np.expand_dims(xq, axis=0), k=4)
dataset.iloc[document_indices[0]]

Unnamed: 0,sentence,embedding,id
352,An opponent is tackling a soccer player,"[0.0006955061689950526, -0.02708514593541622, ...",353
979,The crowd is watching a football game,"[-0.01140331570059061, 0.015461748465895653, -...",980
551,A football player is running past an official ...,"[0.03300335630774498, 0.017781982198357582, -0...",552
172,Two men are playing table football,"[-0.03752468526363373, 0.05197532847523689, -0...",173


In [19]:
# Raise nprobe (the number of clusters visited per query, per the Faiss docs)
# to 5 and repeat the search. In the saved output the first three hits now
# match the exact search (rows 684, 950, 352).
index_ivf.nprobe = 5
_, document_indices = index_ivf.search(np.expand_dims(xq, axis=0), k=4)
dataset.iloc[document_indices[0]]

Unnamed: 0,sentence,embedding,id
684,A man is punching a soccer ball,"[-0.01688985712826252, 0.029744451865553856, 0...",685
950,A soccer player is sitting on the field and is...,"[-0.011246275156736374, 0.009713653475046158, ...",951
352,An opponent is tackling a soccer player,"[0.0006955061689950526, -0.02708514593541622, ...",353
137,A group of boys is playing soccer on the seashore,"[0.023316802456974983, 0.0333450511097908, 0.0...",138


### IndexIVFPQ - IVF + Product Quantizer (PQ)

<img src='images/ivf-pq.png' width="1000">

In [20]:
# These two values are passed to IndexIVFPQ as its 4th and 5th arguments.
# NOTE(review): in the Faiss API those positions are the number of PQ
# sub-quantizers and the number of bits per sub-quantizer code, so the names
# used here are loose — confirm and consider renaming.
code_size = 8
bits_per_centroid = 4

# NOTE(review): `quantizer` is the same object that was already given to
# index_ivf above, so both indexes share it and this cell depends on that
# earlier cell having run. Confirm the sharing is intended; a dedicated
# quantizer would make this section self-contained.
index_ivf_pq = faiss.IndexIVFPQ(
    quantizer, embedding_dimension, ncentroids, code_size, bits_per_centroid
)
index_ivf_pq.is_trained

False

In [21]:
# Train on the sentence embeddings, add the same vectors, and show the count.
index_ivf_pq.train(embeddings)
index_ivf_pq.add(embeddings)
index_ivf_pq.ntotal

1000

In [22]:
# Search with the same nprobe=5 used for the IVF index above. In the saved
# output only row 352 matches the exact-search results, i.e. the compressed
# index returns less accurate neighbours for this query.
index_ivf_pq.nprobe = 5
_, document_indices = index_ivf_pq.search(np.expand_dims(xq, axis=0), k=4)
dataset.iloc[document_indices[0]]

Unnamed: 0,sentence,embedding,id
352,An opponent is tackling a soccer player,"[0.0006955061689950526, -0.02708514593541622, ...",353
551,A football player is running past an official ...,"[0.03300335630774498, 0.017781982198357582, -0...",552
979,The crowd is watching a football game,"[-0.01140331570059061, 0.015461748465895653, -...",980
469,A football player in a red and white uniform i...,"[-0.01379761379212141, -0.04831472039222717, 0...",470


## Pinecone


In [23]:
import os

from pinecone import Pinecone

# Read the API key from the environment instead of hard-coding it.
# os.environ.get returns None when the variable is unset, and that None is
# passed to Pinecone as-is.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
database = Pinecone(api_key=PINECONE_API_KEY)

  from tqdm.autonotebook import tqdm


In [24]:
from pinecone import ServerlessSpec

# Serverless deployment target (AWS, us-east-1), passed as `spec` when the
# index is created below.
serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")

In [25]:
import time

INDEX_NAME = "underfitted-random-sentences"

# Create the index only when it does not exist yet, so the cell can be re-run.
if INDEX_NAME not in database.list_indexes().names():
    database.create_index(
        name=INDEX_NAME,
        dimension=embedding_dimension,
        metric="cosine",
        spec=serverless_spec,
    )

    # Index creation is asynchronous: poll until the index reports that it is
    # ready instead of sleeping for one fixed second and hoping it is enough.
    while not database.describe_index(INDEX_NAME).status["ready"]:
        time.sleep(1)

pinecone_index = database.Index(INDEX_NAME)

In [26]:
# NOTE(review): the saved output already reports 1000 vectors although the
# upsert cell appears further down, and the execution counts that follow jump
# to 285-288. The output was captured out of order; re-run the notebook top to
# bottom on a fresh kernel to refresh it.
pinecone_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1000}},
 'total_vector_count': 1000}

In [285]:
def iterator(dataset, size):
    """Yield consecutive positional slices of `dataset`, `size` rows at a time.

    Args:
        dataset: Object supporting len() and `.iloc` slicing.
        size: Number of rows per slice; the final slice may be shorter.

    Yields:
        `dataset.iloc[start : start + size]` for start = 0, size, 2 * size, ...
    """
    total_rows = len(dataset)
    for start in range(0, total_rows, size):
        stop = start + size
        yield dataset.iloc[start:stop]


def vector(batch):
    """Turn a batch of rows into a list of (id, values, metadata) tuples.

    Args:
        batch: DataFrame with "id", "embedding" and "sentence" columns.

    Returns:
        One tuple per row: the id converted to a string, the embedding
        unchanged, and a metadata dict holding the sentence.
    """
    return [
        (str(row["id"]), row["embedding"], {"sentence": row["sentence"]})
        for row in batch.to_dict("records")
    ]

In [286]:
# Upload only when the index reports zero vectors, so re-running the cell does
# not upsert the data again. Rows are sent in batches of 100, each converted to
# (id, values, metadata) tuples by vector().
if pinecone_index.describe_index_stats()["total_vector_count"] == 0:
    for batch in iterator(dataset, 100):
        pinecone_index.upsert(vector(batch))

In [288]:
pinecone_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1000}},
 'total_vector_count': 1000}

In [27]:
# Run the same "I love soccer" query vector against Pinecone and print the
# sentence stored in each match's metadata. The saved output lists the same
# four sentences as the exact Faiss search above.
response = pinecone_index.query(vector=xq, top_k=4, include_metadata=True)
for match in response["matches"]:
    print(match["metadata"]["sentence"])

A man is punching a soccer ball
A soccer player is sitting on the field and is drinking water
An opponent is tackling a soccer player
A group of men is playing soccer on the beach


In [29]:
# Embed a second query and print the sentences of its five closest matches.
query2 = "I like animals that eat too much"
xq2 = get_embedding(query2)
response = pinecone_index.query(vector=xq2, top_k=5, include_metadata=True)
for match in response["matches"]:
    print(match["metadata"]["sentence"])

The animal with big eyes is voraciously eating
Some kittens are hungry
Someone is cleaning an animal
A lemur is eating quickly
A cat is eating corn on the cob


In [30]:
# Clean up by deleting the remote index. After this, the index-creation and
# upsert cells must be re-run before the Pinecone queries above work again.
database.delete_index(INDEX_NAME)