In [1]:
import os
import chromadb
from chromadb.config import Settings
import numpy as np

In [2]:
# Path of the on-disk ChromaDB store
directory = "./chromadb_store"

# Create the store directory. exist_ok=True avoids the check-then-create race of
# "if not exists: makedirs"; the flag below only controls the first-creation message.
is_new_store = not os.path.exists(directory)
os.makedirs(directory, exist_ok=True)
if is_new_store:
    print(f"{directory} 디렉토리가 생성되었습니다.")

# Initialise the ChromaDB client backed by that directory
client = chromadb.PersistentClient(path=directory)

In [25]:
# Start from a clean slate: drop the 'vectors' collection if the client already has one
if any(existing.name == 'vectors' for existing in client.list_collections()):
    client.delete_collection(name='vectors')

In [26]:
# Create the "vectors" collection, or load it if it already exists
collection = client.get_or_create_collection(name="vectors")
# Last expression of the cell: displays the collections the client currently holds
client.list_collections()

[Collection(name=vectors)]

In [27]:
# Build sample data: 10 five-dimensional embeddings drawn from a standard normal.
# A fixed seed keeps the data identical across Restart & Run All.
rng = np.random.default_rng(42)
example_vectors = rng.standard_normal((10, 5))
example_documents = [f"sample document {i}" for i in range(len(example_vectors))]
example_ids = [f"vec_{i}" for i in range(len(example_vectors))]

# Insert the embeddings together with their documents and ids
collection.add(documents=example_documents, embeddings=example_vectors, ids=example_ids)

# No explicit save step here: with PersistentClient, Chroma is documented to write
# changes to disk automatically, so the older client.persist() call is not needed.

In [28]:
# Display a preview of the records stored in the collection to confirm the insert
collection.peek()

{'ids': ['vec_0',
  'vec_1',
  'vec_2',
  'vec_3',
  'vec_4',
  'vec_5',
  'vec_6',
  'vec_7',
  'vec_8',
  'vec_9'],
 'embeddings': array([[ 1.59536977, -0.61397947, -0.08804642,  0.72816981, -1.55269008],
        [ 0.18454303, -1.23921904, -1.20765065,  0.06255522, -1.43636805],
        [-0.85725213, -0.65443666, -0.25623348, -0.06123397,  1.17552041],
        [ 1.43500446, -1.44649637,  0.16351449,  0.71396747, -0.42306057],
        [-1.60752565,  1.35662846,  0.46060016,  0.6501992 , -0.96513175],
        [ 0.86307282, -0.52824002,  0.56035838, -1.72723227,  0.04647429],
        [-0.57505109,  0.55194304,  0.2247978 ,  0.55970962, -0.6725749 ],
        [ 0.95623585, -0.28018151, -1.3899158 ,  0.24777311, -0.60780641],
        [ 0.26057911,  0.76790535,  0.54963665, -0.67935278, -0.3289228 ],
        [ 0.14010454, -0.13143459, -1.38994113,  1.05679132,  1.00810657]]),
 'documents': ['sample document 0',
  'sample document 1',
  'sample document 2',
  'sample document 3',
  'sample d

In [30]:
# To run this cell in a fresh session, first re-open the persisted store:
#   directory = "./chromadb_store"
#   client = chromadb.PersistentClient(path=directory)
#   collection = client.get_or_create_collection(name="vectors")

# Query vector: five standard-normal draws as a plain Python list of floats
test_vec = np.random.randn(5).tolist()

# Vector search: ask the collection for the 5 best matches to the query vector
retrieved = collection.query(query_embeddings=[test_vec], n_results=5)

In [34]:
# Display the raw query result dict (ids, documents, distances, ...)
retrieved 

{'ids': [['vec_4', 'vec_6', 'vec_1', 'vec_8', 'vec_0']],
 'embeddings': None,
 'documents': [['sample document 4',
   'sample document 6',
   'sample document 1',
   'sample document 8',
   'sample document 0']],
 'uris': None,
 'data': None,
 'metadatas': [[None, None, None, None, None]],
 'distances': [[7.0268096492272525,
   9.538948616633581,
   13.643803731587097,
   14.545661968288563,
   19.30009059336754]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [41]:

# Print query results
def show_results(results: dict) -> None:
    """Print one ranked line per hit of the first query in a query-result dict.

    Args:
        results: Mapping with 'ids', 'documents' and 'distances' keys, each a
            list holding one inner list per query; only index 0 is read.

    The third value comes from results['distances'], so it is labelled as a
    distance: a smaller number means a closer match.
    """
    hits = zip(results['ids'][0], results['documents'][0], results['distances'][0])
    for rank, (id_, doc, distance) in enumerate(hits, start=1):
        print(f"{rank}. ID = {id_}, Document = {doc}, Distance = {distance}")

# Derive the count from the hits actually returned so the header stays correct
# if n_results changes or the collection holds fewer items than requested.
print(f"\nTop {len(retrieved['ids'][0])} Nearest Vectors:")
show_results(retrieved)


Top 5 Nearest Vectors:
1. ID = vec_4, Document = sample document 4, Similarity = 7.0268096492272525
2. ID = vec_6, Document = sample document 6, Similarity = 9.538948616633581
3. ID = vec_1, Document = sample document 1, Similarity = 13.643803731587097
4. ID = vec_8, Document = sample document 8, Similarity = 14.545661968288563
5. ID = vec_0, Document = sample document 0, Similarity = 19.30009059336754
