In [None]:
import os
import sys
from pathlib import Path

# Add project root to path 
project_root = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
sys.path.append(str(project_root))

print(f"Project root: {project_root}")
print(f"Working from: {Path.cwd()}")

Project root: /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025
Working from: /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025/notebooks


In [2]:
cfg_path = os.path.join(project_root, "configs", "chunking.yaml")

## Troubleshooting Neighborhood Similarity Service

In [3]:
from api.services.neighborhood_stats import get_collection, compute_neighborhood_embedding_stats
import numpy as np

# 1) load your collection
coll = get_collection(cfg_path, "dataset-aggregates-train")

# 2) grab the very first embedding from Chroma
data = coll.get(include=["embeddings"])
len(data)

2025-08-04 14:33:41,110 - src.helpers - INFO - Logging initialized for /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025/logs/duckdb_utils.log
2025-08-04 14:33:41,110 - src.helpers - INFO - Logging initialized for /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025/logs/duckdb_utils.log
2025-08-04 14:33:41,115 - src.helpers - INFO - Logging initialized for /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025/logs/neighborhood_stats.log
2025-08-04 14:33:41,115 - src.helpers - INFO - Logging initialized for /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025/logs/neighborhood_stats.log
2025-08-04 14:33:41,115 - src.helpers - INFO - Logging initialized for /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025/logs/neighborhood_stats.log


Executing _load_cfg...
Function _load_cfg took 0.0037 seconds to complete.
Executing _get_chroma_collection...


2025-08-04 14:33:41,380 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-08-04 14:33:41,380 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-08-04 14:33:41,380 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


Function _get_chroma_collection took 0.4445 seconds to complete.


7

In [4]:
emb0 = data["embeddings"][3]   # should be a list[float], length>0
id0  = data["ids"][3]
# 3) compute stats for just that one
stats0 = compute_neighborhood_embedding_stats(np.array(emb0), coll, k=3)
print(id0, stats0)

2025-08-04 14:33:41,672 - src.helpers - INFO - Computed neighborhood stats for 3 neighbors
2025-08-04 14:33:41,672 - src.helpers - INFO - Computed neighborhood stats for 3 neighbors
2025-08-04 14:33:41,672 - src.helpers - INFO - Computed neighborhood stats for 3 neighbors


Executing compute_neighborhood_embedding_stats...
Function compute_neighborhood_embedding_stats took 0.0072 seconds to complete.
ENSOARG00000012128 {'neighbor_similarity_mean': 1.0, 'neighbor_similarity_max': 1.0, 'neighbor_similarity_var': 0.0, 'neighbor_norm_mean': 0.999999945851791, 'neighbor_norm_max': 0.999999945851791, 'neighbor_norm_var': 0.0}


In [5]:
list(data.keys())

['ids', 'embeddings', 'documents', 'uris', 'included', 'data', 'metadatas']

In [6]:
len(data["ids"])

487

In [7]:
len(data["embeddings"])

487

In [8]:
type(data['embeddings'])

numpy.ndarray

## Troubleshooting "load_embeddings" service

In [9]:
import requests

url = "http://localhost:8000/load_embeddings"
payload = {
    "collection_name": "dataset-aggregates-train",
    "cfg_path": "configs/chunking.yaml",
    "include": ["embeddings"]
}

response = requests.post(url, json=payload)
# res = response.json()

In [10]:
print(response.status_code)

200


In [11]:
from src.models import LoadChromaDataResult
res = LoadChromaDataResult.model_validate(response.json())
res

LoadChromaDataResult(success=True, error=None, results={'ids': ['https://doi.org/10.11583/dtu.20555586', 'https://doi.org/10.5281/zenodo.8014150', 'ENSOARG00000003950', 'ENSOARG00000012128', 'ENSOARG00000012835', 'ENSOARG00000013782', 'ENSOARG00000013966', 'ENSOARG00000014129', 'IPR000264', 'IPR002172', 'IPR014760', 'IPR020857', 'IPR020858', 'IPR021177', 'IPR023415', 'ENSBTAG00000011038', 'ENSBTAG00000013718', 'ENSBTAG00000017121', 'ENSBTAG00000017131', 'ENSBTAG00000021275', 'ENSBTAG00000047833', 'NM_001078656', 'https://doi.org/10.15468/dl.354f8k', 'https://doi.org/10.15468/dl.nbku3v', 'https://doi.org/10.15468/dl.pdjqte', 'https://doi.org/10.15468/dl.uejpg6', '3.10.180.10', '3.20.20.120', '3.20.20.140', '3.40.50.1000', '3.90.226.10', '3.30.450.20', '3.30.450.40', '3.30.930.10', '3.40.30.10', '3.40.50.1820', '3.40.50.620', '3.40.630.30', '3.90.1200.10', 'PF00106', 'PF03372', '3.90.228.10', 'https://doi.org/10.6073/pasta/57108aaeede00e77cac110bc5366a92b', 'https://doi.org/10.25349/d9qw

In [13]:
len(res.results)

3

In [14]:
list(res.results.keys())

['ids', 'embeddings', 'included']

In [15]:
if not res.success:
    print(res.error)

In [16]:
len(res.results["embeddings"])

487

In [18]:
type(res.results["embeddings"])

numpy.ndarray