# Using a vector database

- https://github.com/openai/openai-cookbook/blob/main/examples/vector_databases/qdrant/Getting_started_with_Qdrant_and_OpenAI.ipynb
- https://github.com/openai/openai-cookbook/blob/main/examples/vector_databases/qdrant/QA_with_Langchain_Qdrant_and_OpenAI.ipynb

## Which vector database shall we try?

- https://platform.openai.com/docs/guides/embeddings/how-can-i-retrieve-k-nearest-embedding-vectors-quickly

In [1]:
# Shell out to curl to check that the local Qdrant server answers on port 6333 before connecting.
! curl http://localhost:6333

{"title":"qdrant - vector search engine","version":"1.1.1"}

## Connect to Qdrant

In [2]:
import qdrant_client

# Connection settings for the Qdrant instance running on this machine;
# prefer_grpc=True asks the client to use gRPC where it can.
connection_options = {
    "host": "localhost",
    "prefer_grpc": True,
}

client = qdrant_client.QdrantClient(**connection_options)

In [3]:
# List the collections currently known to the server; doubles as a quick connectivity check.
client.get_collections()

CollectionsResponse(collections=[])

## Prepare embeddings

In [4]:
import pandas as pd

# Read the scraped dataset from a Parquet file (path is relative to the working directory).
scraped_path = "scraped.parquet"
df = pd.read_parquet(scraped_path)

# Preview the first rows to confirm what was loaded.
df.head()

Unnamed: 0,title,text
0,Titanic (1997 film),Titanic is a 1997 American epic romance and di...
1,Dilwale Dulhania Le Jayenge,"Dilwale Dulhania Le Jayenge (transl. ""The Big-..."
2,Kung Fu Hustle,Kung Fu Hustle (Chinese: 功夫; lit. 'Kung Fu') i...
3,Dil Chahta Hai,Dil Chahta Hai (transl. The Heart Desires) is ...
4,The Matrix,The Matrix is a 1999 science fiction action fi...


In [5]:
import tiktoken

# cl100k_base is the tokenizer encoding designed to work with the ada-002 model.
tokenizer = tiktoken.get_encoding("cl100k_base")

# Replace each text column with its token encoding, then record how many
# tokens the article body produced.
# NOTE(review): this overwrites the raw strings with token-ID lists whose
# length differs from row to row. These are token IDs, not fixed-size
# embedding vectors; confirm this is what the later cells expect.
for column in ("title", "text"):
    df[column] = df[column].apply(tokenizer.encode)
df["n_tokens"] = df["text"].apply(len)

df.head()

Unnamed: 0,title,text,n_tokens
0,"[95240, 292, 320, 2550, 22, 4632, 8]","[95240, 292, 374, 264, 220, 2550, 22, 3778, 25...",18377
1,"[35, 321, 86, 1604, 63715, 71, 9345, 2009, 194...","[35, 321, 86, 1604, 63715, 71, 9345, 2009, 194...",7191
2,"[42, 2234, 30353, 86550, 273]","[42, 2234, 30353, 86550, 273, 320, 46023, 25, ...",7002
3,"[35, 321, 28821, 76909, 63782]","[35, 321, 28821, 76909, 63782, 320, 1485, 75, ...",4123
4,"[791, 11892]","[791, 11892, 374, 264, 220, 2550, 24, 8198, 17...",12745


## Load into Qdrant

In [9]:
from qdrant_client.http import models as rest

# Use the length of the first row's "title" entry as the dimension for both
# named vectors.
# NOTE(review): this assumes every "title" and "text" entry in df has this
# same length; verify before inserting points.
vector_size = len(df["title"][0])

# One cosine-distance vector space per column, both with the same size.
vectors_config = {
    vector_name: rest.VectorParams(
        size=vector_size,
        distance=rest.Distance.COSINE,
    )
    for vector_name in ("title", "text")
}

# (Re)create the "Articles" collection with the named vectors defined above.
client.recreate_collection(
    collection_name="Articles",
    vectors_config=vectors_config,
)

True

In [10]:
# Build one point per DataFrame row: the row label is the point id, the
# "title" and "text" entries (cast to floats) are the named vectors, and the
# whole row is stored as the payload.
# NOTE(review): the recorded run of this cell failed with a dimension
# mismatch ("expected dim: 7, got 7191"), so the per-row vector lengths do
# not match the collection's configured size.
points = [
    rest.PointStruct(
        id=row_id,
        vector={
            vector_name: list(map(float, row[vector_name]))
            for vector_name in ("title", "text")
        },
        payload=row.to_dict(),
    )
    for row_id, row in df.iterrows()
]

client.upsert(
    collection_name="Articles",
    points=points,
)

_InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
	status = StatusCode.INVALID_ARGUMENT
	details = "Wrong input: Vector inserting error: expected dim: 7, got 7191"
	debug_error_string = "UNKNOWN:Error received from peer ipv4:127.0.0.1:6334 {created_time:"2023-04-12T09:25:27.970195765+00:00", grpc_status:3, grpc_message:"Wrong input: Vector inserting error: expected dim: 7, got 7191"}"
>