In [1]:
import os
import chromadb
from chromadb.config import Settings
import numpy as np

In [2]:
# Location of the on-disk Chroma store
directory = "./chromadb_store"

# Create the store directory on first run and report that it was created
store_exists = os.path.exists(directory)
if not store_exists:
    os.makedirs(directory)
    print(f"{directory} 디렉토리가 생성되었습니다.")

# Open a Chroma client backed by that directory
client = chromadb.PersistentClient(path=directory)

./chromadb_store 디렉토리가 생성되었습니다.


In [3]:
# Start from a clean slate: drop any 'vectors' collection left over from an earlier run.
# NOTE(review): this reads `.name` from each item returned by list_collections();
# confirm the installed chromadb version returns collection objects rather than bare names.
existing_names = [existing.name for existing in client.list_collections()]
if 'vectors' in existing_names:
    client.delete_collection(name='vectors')

In [4]:
# Create the 'vectors' collection, or load it if it already exists
collection = client.get_or_create_collection(name="vectors")
# Show the collections currently known to the client as the cell output
client.list_collections()

[Collection(name=vectors)]

In [5]:
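# Optional cleanup: uncomment to drop the 'vectors' collection and list what remains.
# (The recorded output below presumably comes from an earlier run with these lines enabled.)
# client.delete_collection(name='vectors')
# client.list_collections()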

[]

In [5]:
# Build a small demo dataset: 10 random 5-dimensional vectors with matching
# documents and ids. A fixed seed keeps the data identical across
# Restart & Run All.
rng = np.random.default_rng(42)
example_vectors = rng.standard_normal((10, 5))
example_documents = [f"sample document {i}" for i in range(len(example_vectors))]
example_ids = [f"vec_{i}" for i in range(len(example_vectors))]

# Insert the vectors. Embeddings are passed as plain Python lists, the same
# form the query cell uses for its query vector.
collection.add(
    documents=example_documents,
    embeddings=example_vectors.tolist(),
    ids=example_ids,
)

# No explicit save step is needed: PersistentClient writes to its `path`
# on its own, so there is no collection.save() / client.persist() call here.

In [6]:
# Preview the first stored records (ids, embeddings, documents) to confirm the insert
collection.peek()

{'ids': ['vec_0',
  'vec_1',
  'vec_2',
  'vec_3',
  'vec_4',
  'vec_5',
  'vec_6',
  'vec_7',
  'vec_8',
  'vec_9'],
 'embeddings': array([[-0.46351319,  1.98921805,  0.20853227,  0.06864815,  0.89203223],
        [-1.07706367, -1.67907934,  0.98689716, -0.58798128,  0.01545948],
        [ 0.79187339,  0.28377947,  1.1639184 ,  0.01118593,  0.76877525],
        [-0.75433257, -0.58893666, -1.24566677, -0.23376972,  0.67447166],
        [ 1.00057557, -1.51859678,  0.07239584, -0.66014882,  0.01162368],
        [-0.48514845,  1.71637719, -0.06271837, -0.82473447, -0.31113164],
        [-1.65832677, -1.0835497 ,  1.35703816, -0.36843077,  0.12034869],
        [-0.70881999,  0.71943071, -1.76356394,  1.90797574, -1.2249298 ],
        [-0.49555807, -0.30461109,  0.45878932, -0.89142374,  1.39348815],
        [-0.11716368,  2.05935133, -0.82076919,  1.03194164,  0.68822681]]),
 'documents': ['sample document 0',
  'sample document 1',
  'sample document 2',
  'sample document 3',
  'sample d

In [7]:
# To run this cell in a fresh session, reopen the persisted collection first:
# directory = "./chromadb_store"
# client = chromadb.PersistentClient(path=directory)
# collection = client.get_or_create_collection(name="vectors")

# Query vector: one random 5-dimensional point (the size must match the
# stored embeddings). Seeded so the query vector is the same on every run.
query_rng = np.random.default_rng(7)
test_vec = query_rng.standard_normal(5).tolist()

# Nearest-neighbour search: ask for the 5 closest stored vectors
retrieved = collection.query(
    query_embeddings=[test_vec],
    n_results=5,
)

In [8]:
# Display the raw query response
retrieved

{'ids': [['vec_3', 'vec_4', 'vec_2', 'vec_8', 'vec_5']],
 'embeddings': None,
 'documents': [['sample document 3',
   'sample document 4',
   'sample document 2',
   'sample document 8',
   'sample document 5']],
 'uris': None,
 'data': None,
 'metadatas': [[None, None, None, None, None]],
 'distances': [[2.7575162500289254,
   3.0943530743097485,
   4.520446871100574,
   5.677335991886583,
   6.797016031269405]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [9]:

# Print the query results
def show_results(results, query_index=0):
    """Print the ranked hits for one query of a Chroma ``query()`` result.

    Args:
        results: Mapping with 'ids', 'documents' and 'distances' keys, each
            holding one list of values per query.
        query_index: Which query's hits to print. Defaults to 0, the first
            (and, for a single-vector query, only) result list.

    The printed number is read from ``results['distances']``, so it is
    labelled as a distance (lower means closer), not as a similarity.
    """
    hits = zip(
        results['ids'][query_index],
        results['documents'][query_index],
        results['distances'][query_index],
    )
    # Ranks are 1-based for display
    for rank, (id_, doc, distance) in enumerate(hits, start=1):
        print(f"{rank}. ID = {id_}, Document = {doc}, Distance = {distance}")

# Heading (preceded by a blank line), then the ranked hits for the query above
header = "\nTop 5 Nearest Vectors:"
print(header)
show_results(retrieved)


Top 5 Nearest Vectors:
1. ID = vec_3, Document = sample document 3, Similarity = 2.7575162500289254
2. ID = vec_4, Document = sample document 4, Similarity = 3.0943530743097485
3. ID = vec_2, Document = sample document 2, Similarity = 4.520446871100574
4. ID = vec_8, Document = sample document 8, Similarity = 5.677335991886583
5. ID = vec_5, Document = sample document 5, Similarity = 6.797016031269405
