In [2]:
import chromadb

CHROMA_PATH = '.chroma'
COLLECTION_NAME = 'shrub_of_life'


def open_db(path = CHROMA_PATH, collection_name = COLLECTION_NAME):
    """Return an existing collection from the on-disk ChromaDB store.

    Args:
        path: Directory handed to chromadb.PersistentClient (default CHROMA_PATH)
        collection_name: Name handed to get_collection (default COLLECTION_NAME)

    Returns:
        The collection object returned by client.get_collection
    """
    # get_collection only looks the collection up; nothing here creates it.
    return chromadb.PersistentClient(path = path).get_collection(name = collection_name)

def find_closest(collection, seq_id, n = 10):
    """Find up to N stored sequences closest to a given sequence ID.

    The embedding stored under ``seq_id`` is fetched and used as the query
    vector. The query sequence itself is dropped from the hits by comparing
    IDs (not by looking at the distance), so other entries that happen to
    have distance 0 are kept.

    Args:
        collection: ChromaDB collection (anything exposing compatible
            ``get`` and ``query`` methods works)
        seq_id: Sequence ID to query; must already exist in the collection
        n: Maximum number of closest sequences to return (default 10)

    Returns:
        dict with 'ids', 'distances', and 'metadatas' lists, in the order the
        query returned them. Each list holds at most ``n`` entries and may be
        shorter when the query yields fewer hits.

    Raises:
        ValueError: if ``n`` is negative, or if ``seq_id`` is not found.
    """
    # Reject a negative n up front: it would otherwise leak into n_results
    # (as n + 1) and into the final slice, where [:n] would trim from the end.
    if n < 0:
        raise ValueError(f'n must be non-negative, got {n}')

    # Get the embedding for the query sequence
    result = collection.get(ids = [seq_id], include = ['embeddings'])
    if not result['ids']:
        raise ValueError(f'Sequence ID not found: {seq_id}')

    query_embedding = result['embeddings'][0]

    # Ask for one extra hit so that n neighbours remain after the query
    # sequence is removed from the results.
    results = collection.query(
        query_embeddings = [query_embedding],
        n_results = n + 1,
        include = ['distances', 'metadatas']
    )

    # A single query embedding was sent, so only the first result row matters.
    hits = zip(results['ids'][0], results['distances'][0], results['metadatas'][0])

    # Drop the query itself, then cap at n in case it was not among the hits
    # and all n + 1 rows survived the filter.
    neighbours = [hit for hit in hits if hit[0] != seq_id][:n]

    return {
        'ids': [hit[0] for hit in neighbours],
        'distances': [hit[1] for hit in neighbours],
        'metadatas': [hit[2] for hit in neighbours]
    }

In [3]:
# Connect to the collection using the default path and name, then report
# which collection was opened and how many entries it holds.
collection = open_db()
print(
    f'Collection: {collection.name}',
    f'Count: {collection.count():,} sequences',
    sep = '\n',
)

Collection: shrub_of_life
Count: 4,776,770 sequences


In [4]:
# Demo query: take the single record returned by peek() and use its ID
# as the probe sequence.
sample = collection.peek(limit = 1)
sample_id = sample['ids'][0]
print(f'Query ID: {sample_id}')

# Look up its 10 nearest neighbours
closest = find_closest(collection, sample_id, n = 10)

# One line per neighbour, ranked starting from 1
print(f'\nClosest sequences:')
neighbours = zip(closest['ids'], closest['distances'], closest['metadatas'])
for i, (seq_id, dist, meta) in enumerate(neighbours, start = 1):
    print(f'{i:2}. {seq_id}  distance={dist:.4f}  length={meta["length"]}')

Query ID: SFE_1_S_c_10

Closest sequences:
 1. SFE_1_S_c_122567  distance=0.6251  length=5566
 2. SE_17_c_96112  distance=0.6299  length=6816
 3. SE_3_c_249659  distance=0.6394  length=6429
 4. SFE_2_S_c_151156  distance=0.6468  length=14979
 5. SE_19_c_168191  distance=0.6470  length=7566
 6. SE_17_c_542063  distance=0.6475  length=5917
 7. SE_4_c_146066  distance=0.6505  length=5900
 8. SFE_2_W_c_42953  distance=0.6525  length=6425
 9. SE_8_c_386671  distance=0.6539  length=6447
10. SE_1_c_111408  distance=0.6586  length=5371


In [5]:
# Find the 20 closest sequences to a specific, hard-coded sequence ID
sample_id = 'SFE_1_S_c_43029'
closest = find_closest(collection, sample_id, n = 20)

# Walk the three parallel result lists together, numbering rows from 1
print(f'\nClosest sequences:')
columns = (closest[key] for key in ('ids', 'distances', 'metadatas'))
for i, (seq_id, dist, meta) in enumerate(zip(*columns), start = 1):
    print(f'{i:2}. {seq_id}  distance={dist:.4f}  length={meta["length"]}')


Closest sequences:
 1. SFE_2_S_c_106004  distance=0.0254  length=237764
 2. SFE_7_S_c_178807  distance=0.1195  length=246514
 3. SFE_7_S_c_83643  distance=0.1204  length=245127
 4. SFE_1_S_c_25667  distance=0.1273  length=241643
 5. SFE_1_S_c_34111  distance=0.1335  length=258008
 6. SFE_7_S_c_90700  distance=0.1336  length=238638
 7. SFE_1_S_c_21893  distance=0.1352  length=153779
 8. SFE_1_S_c_56398  distance=0.1381  length=139427
 9. SFE_6_S_c_109514  distance=0.1425  length=120982
10. SFE_1_S_c_43555  distance=0.1436  length=186974
11. SFE_7_S_c_144437  distance=0.1439  length=212559
12. SFE_7_S_c_207986  distance=0.1443  length=235184
13. SFE_1_S_c_66072  distance=0.1459  length=175127
14. SFE_2_S_c_140229  distance=0.1460  length=220875
15. SFE_1_S_c_49670  distance=0.1462  length=219812
16. SFE_1_S_c_34088  distance=0.1487  length=247438
17. SFE_8_S_c_34055  distance=0.1493  length=269816
18. SFE_7_S_c_95149  distance=0.1507  length=197156
19. SFE_5_S_c_79247  distance=0.1510  

In [6]:
# Find the 20 closest sequences to a specific, hard-coded sequence ID
sample_id = 'SFE_1_S_c_16172'
closest = find_closest(collection, sample_id, n = 20)

# Print a ranked listing: ID, distance to the query, and the 'length' metadata
print(f'\nClosest sequences:')
ranked_hits = enumerate(
    zip(closest['ids'], closest['distances'], closest['metadatas']),
    start = 1,
)
for i, (seq_id, dist, meta) in ranked_hits:
    print(f'{i:2}. {seq_id}  distance={dist:.4f}  length={meta["length"]}')


Closest sequences:
 1. SFE_2_S_c_52475  distance=0.0271  length=745915
 2. SFE_2_S_c_124250  distance=0.0481  length=283844
 3. SFE_6_W_c_46827  distance=0.0571  length=2873557
 4. SFE_7_W_c_69780  distance=0.0577  length=449814
 5. SFE_3_W_c_14838  distance=0.0583  length=2864170
 6. SFE_3_S_c_59610  distance=0.0602  length=2859097
 7. SFE_6_W_c_110324  distance=0.0617  length=396645
 8. SFE_5_W_c_21742  distance=0.0628  length=2862326
 9. SFE_7_W_c_360  distance=0.0632  length=2873553
10. SFE_7_W_c_57677  distance=0.0650  length=3213163
11. SFE_4_W_c_44611  distance=0.0657  length=2864172
12. SFE_6_W_c_75802  distance=0.0675  length=392669
13. SFE_4_W_c_21550  distance=0.0678  length=3191413
14. SFE_6_W_c_75795  distance=0.0687  length=413591
15. SFE_5_W_c_21609  distance=0.0691  length=1880879
16. SFE_3_S_c_167077  distance=0.0696  length=491370
17. SFE_6_W_c_61689  distance=0.0730  length=3195658
18. SFE_6_S_c_176597  distance=0.0748  length=938779
19. SFE_6_W_c_17599  distance=0.

In [7]:
# Find the 500 closest sequences to a specific, hard-coded sequence ID
sample_id = 'SFE_4_S_c_142831'
closest = find_closest(collection, sample_id, n = 500)

# Print a ranked listing. The rank column is two characters wide, so ranks
# of 100 and above push the rest of the line one column to the right.
print(f'\nClosest sequences:')
hit_ids, hit_distances, hit_metadatas = closest['ids'], closest['distances'], closest['metadatas']
for i, (seq_id, dist, meta) in enumerate(zip(hit_ids, hit_distances, hit_metadatas), start = 1):
    print(f'{i:2}. {seq_id}  distance={dist:.4f}  length={meta["length"]}')


Closest sequences:
 1. SFE_4_S_c_144150  distance=0.1678  length=97512
 2. SFE_6_S_c_168074  distance=0.2560  length=22244
 3. SFE_4_S_c_96923  distance=0.3271  length=38509
 4. SFE_4_S_c_19556  distance=0.4072  length=49943
 5. SFE_6_S_c_16306  distance=0.4112  length=39365
 6. SFE_4_S_c_19700  distance=0.4115  length=56421
 7. SFE_4_S_c_16925  distance=0.4148  length=79744
 8. SFE_6_S_c_16307  distance=0.4179  length=66195
 9. SFE_4_S_c_20022  distance=0.4267  length=167199
10. SFE_4_S_c_19552  distance=0.4371  length=206142
11. SFE_6_S_c_106339  distance=0.4372  length=203765
12. SFE_6_S_c_49164  distance=0.4390  length=839843
13. SFE_6_S_c_86349  distance=0.4392  length=58443
14. SFE_4_S_c_19907  distance=0.4410  length=187264
15. SFE_6_S_c_98433  distance=0.4422  length=76608
16. SFE_5_S_c_33350  distance=0.4432  length=264419
17. SFE_5_S_c_93744  distance=0.4433  length=21686
18. SFE_4_S_c_98202  distance=0.4446  length=110429
19. SFE_6_S_c_39688  distance=0.4449  length=289877
