In [1]:
import os
import duckdb
from openai import OpenAI
import numpy as np
from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()
# Embedding model used by default when fetching embeddings in this notebook.
embedding_deployment = "text-embedding-3-small"

In [3]:
%%markdown
## Method to fetch OpenAI Embeddings

## Method to fetch OpenAI Embeddings


In [4]:
# Shared OpenAI client, created on first use so that defining this cell does
# not require credentials and repeated calls reuse one client.
_openai_client = None


def _get_openai_client():
    """Return the shared OpenAI client, constructing it on the first call."""
    global _openai_client
    if _openai_client is None:
        _openai_client = OpenAI()
    return _openai_client


def fetch_embedding(text: str, model: str = embedding_deployment) -> list:
    """Fetch the embedding vector for ``text`` from the OpenAI embeddings API.

    Args:
        text: The text to embed. Must not be empty.
        model: Name of the embedding model to request. Defaults to the
            notebook-level ``embedding_deployment`` setting.

    Returns:
        The embedding of ``text`` as returned by the API (the first, and only,
        entry of the response data).

    Raises:
        ValueError: If ``text`` is empty; checked locally so no request is
            sent for input that cannot be embedded.
    """
    if not text:
        raise ValueError("text must be a non-empty string")

    response = _get_openai_client().embeddings.create(
        input=text,
        model=model
    )
    # A single input yields a single result, so the vector is at index 0.
    return response.data[0].embedding

In [5]:
# Open the on-disk DuckDB database file in read-write mode.
conn = duckdb.connect("store.db", read_only=False)

In [6]:
%%markdown
- `text-embedding-3-small` returns 1536-dimensional vectors by default, which is why the `embeddings` column is declared as `FLOAT4[1536]`.

## Create Table And Insert Data

- `text-embedding-3-small` returns 1536-dimensional vectors by default, which is why the `embeddings` column is declared as `FLOAT4[1536]`.

## Create Table And Insert Data


In [7]:
# Schema for the stored embeddings: a category, the embedded text, and its
# fixed-length (1536 floats) embedding vector.
create_table_sql = """
        CREATE TABLE IF NOT EXISTS embedding_data (
            dim_name   VARCHAR,
            value      VARCHAR,
            embeddings FLOAT4[1536]
        );
    """
conn.execute(create_table_sql)

<duckdb.duckdb.DuckDBPyConnection at 0x100bec5b0>

In [8]:
# Sample rows to embed: each one pairs a category (dim_name) with a member of
# that category (value).
data = [
    {"dim_name": category, "value": member}
    for category, member in (
        ("Fruit", "Apple"),
        ("Fruit", "Banana"),
        ("Sport", "Basketball"),
        ("Place", "Eiffel Tower"),
        ("Animal", "Penguin"),
    )
]

In [9]:
# Embed each sample row and store it, skipping rows that are already present.
# store.db persists between runs and the table is created with IF NOT EXISTS,
# so without this check every re-run of the notebook would append duplicate
# rows and request the same embeddings again.
for item in data:
    dim_name = item["dim_name"]
    value_text = item["value"]

    already_stored = conn.execute(
        """
        SELECT 1
        FROM embedding_data
        WHERE dim_name = ? AND value = ?
        LIMIT 1
        """,
        (dim_name, value_text)
    ).fetchone()
    if already_stored is not None:
        # Already in the table: skip before making the embedding API call.
        continue

    embedding = fetch_embedding(value_text)
    conn.execute(
        """
        INSERT INTO embedding_data (dim_name, value, embeddings)
        VALUES (?, ?, ?)
        """,
        (dim_name, value_text, embedding)
    )

In [10]:
%%markdown
## Similarity Search

## Similarity Search


In [11]:
# Embed the search phrase with the same default model used for the stored
# rows, so the vectors can be compared.
query_vector = fetch_embedding(text="fruits")

In [12]:
# Rank every stored row by cosine similarity to the query vector.
# The vector is bound as a query parameter (as the INSERT cell does) instead of
# being formatted into the SQL text: that avoids relying on Python's list repr
# being valid SQL and keeps the statement short. The CAST turns the bound list
# into the fixed-size array type that array_cosine_similarity needs to match
# the `embeddings` column.
results = conn.execute(
        """
        SELECT 
            dim_name,
            value,
            array_cosine_similarity(embeddings, CAST(? AS FLOAT4[1536])) as cosim
        FROM embedding_data
        ORDER BY cosim DESC
        """,
        (query_vector,)
    ).fetchdf()

In [13]:
# Display the ranked matches; the query orders them by cosine similarity, highest first.
results

Unnamed: 0,dim_name,value,cosim
0,Fruit,Banana,0.416309
1,Fruit,Apple,0.36763
2,Sport,Basketball,0.164279
3,Animal,Penguin,0.147255
4,Place,Eiffel Tower,0.117469


In [14]:
# Close the DuckDB connection now that the notebook is done with the database.
conn.close()