In [2]:
import chromadb

# Default on-disk location that open_db passes to chromadb.PersistentClient
CHROMA_PATH = '.chroma'
# Default name of the collection that open_db retrieves
COLLECTION_NAME = 'shrub_of_life'


def open_db(path = CHROMA_PATH, collection_name = COLLECTION_NAME):
    """Connect to a persistent ChromaDB store and fetch one of its collections.

    Args:
        path: Location handed to chromadb.PersistentClient (default CHROMA_PATH)
        collection_name: Name of the collection to look up (default COLLECTION_NAME)

    Returns:
        Whatever the client's get_collection call returns for collection_name
    """
    return chromadb.PersistentClient(path = path).get_collection(name = collection_name)


def find_closest(collection, seq_id, n = 10):
    """Look up a stored sequence's nearest neighbours, leaving out the sequence itself.

    Args:
        collection: ChromaDB collection
        seq_id: ID of the stored sequence to use as the query
        n: Upper bound on how many neighbours are returned (default 10)

    Returns:
        dict of three parallel lists keyed 'ids', 'distances' and 'metadatas',
        kept in the order the collection's query produced them

    Raises:
        ValueError: if the collection has no entry for seq_id
    """
    # The query vector is the embedding already stored under seq_id
    stored = collection.get(ids = [seq_id], include = ['embeddings'])
    if not stored['ids']:
        raise ValueError(f'Sequence ID not found: {seq_id}')

    # Request one extra hit because the query sequence is expected to match itself
    hits = collection.query(
        query_embeddings = [stored['embeddings'][0]],
        n_results = n + 1,
        include = ['distances', 'metadatas'],
    )

    # Walk the single result row, dropping the query's own entry
    neighbours = []
    for hit_id, hit_distance, hit_metadata in zip(hits['ids'][0], hits['distances'][0], hits['metadatas'][0]):
        if hit_id != seq_id:
            neighbours.append((hit_id, hit_distance, hit_metadata))

    # Trim in case the query's own entry was absent from the hits and nothing was dropped
    neighbours = neighbours[:n]

    return {
        key: [hit[column] for hit in neighbours]
        for column, key in enumerate(('ids', 'distances', 'metadatas'))
    }

In [3]:
# Open the database using open_db's default path and collection name
collection = open_db()
# Report which collection was opened and the total that collection.count() returns
print(f'Collection: {collection.name}')
print(f'Count: {collection.count():,} sequences')

Collection: shrub_of_life
Count: 4,776,770 sequences


In [4]:
# Example: use whichever entry peek() returns first as the query
sample = collection.peek(limit = 1)
sample_id = sample['ids'][0]
print(f'Query ID: {sample_id}')

# Retrieve up to 10 nearest neighbours of that entry
closest = find_closest(collection, sample_id, n = 10)

# Print one ranked line per neighbour, numbering from 1
print(f'\nClosest sequences:')
for i, (seq_id, dist, meta) in enumerate(
    zip(closest['ids'], closest['distances'], closest['metadatas']),
    start = 1,
):
    print(f'{i:2}. {seq_id}  distance={dist:.4f}  length={meta["length"]}')

Query ID: 6_W_c_171902

Closest sequences:
 1. 8_S_c_210524  distance=0.0906  length=22345
 2. 7_W_c_131384  distance=0.0995  length=84059
 3. 8_S_c_97458  distance=0.1018  length=56731
 4. 1_W_c_53711  distance=0.1031  length=129033
 5. 2_W_c_242438  distance=0.1039  length=91982
 6. 2_W_c_207239  distance=0.1052  length=28545
 7. 8_W_c_30708  distance=0.1059  length=25617
 8. 6_W_c_39738  distance=0.1065  length=13389
 9. 7_W_c_90075  distance=0.1066  length=123910
10. 1_W_c_215349  distance=0.1070  length=41100


In [5]:
# Query with a specific, hard-coded sequence ID and fetch up to 10 neighbours
sample_id = '1_S_c_43029'
closest = find_closest(collection, sample_id, n = 10)

# Print one ranked line per neighbour, numbering from 1
print(f'\nClosest sequences:')
for i, (seq_id, dist, meta) in enumerate(
    zip(closest['ids'], closest['distances'], closest['metadatas']),
    start = 1,
):
    print(f'{i:2}. {seq_id}  distance={dist:.4f}  length={meta["length"]}')


Closest sequences:
 1. 2_S_c_106004  distance=0.0199  length=237764
 2. 1_S_c_25667  distance=0.0320  length=241643
 3. 7_S_c_83643  distance=0.0339  length=245127
 4. 1_S_c_21893  distance=0.0442  length=153779
 5. 1_S_c_46999  distance=0.0444  length=219292
 6. 7_S_c_16973  distance=0.0449  length=242661
 7. 7_S_c_207986  distance=0.0452  length=235184
 8. 1_S_c_56398  distance=0.0477  length=139427
 9. 3_S_c_109240  distance=0.0492  length=237501
10. 1_S_c_57842  distance=0.0498  length=219743


In [6]:
# Query with a specific, hard-coded sequence ID and fetch up to 10 neighbours
sample_id = '1_S_c_33365'
closest = find_closest(collection, sample_id, n = 10)

# Print one ranked line per neighbour, numbering from 1
print(f'\nClosest sequences:')
for i, (seq_id, dist, meta) in enumerate(
    zip(closest['ids'], closest['distances'], closest['metadatas']),
    start = 1,
):
    print(f'{i:2}. {seq_id}  distance={dist:.4f}  length={meta["length"]}')


Closest sequences:
 1. 4_W_c_25332  distance=0.0135  length=592280
 2. 4_W_c_79878  distance=0.0141  length=398744
 3. 6_W_c_43469  distance=0.0145  length=294442
 4. 2_S_c_53619  distance=0.0147  length=1434939
 5. 2_S_c_23093  distance=0.0152  length=364243
 6. 7_W_c_57655  distance=0.0152  length=312365
 7. 7_W_c_133311  distance=0.0155  length=101141
 8. 4_W_c_64917  distance=0.0162  length=536389
 9. 2_W_c_38097  distance=0.0163  length=117154
10. 6_W_c_43542  distance=0.0164  length=124073
