# In [None]:
import chromadb
import pandas as pd
import requests

# Persistent Chroma client shared by every function below via this
# module-level name. The storage path looks like a placeholder -- point it at
# the real on-disk location before running.
client = chromadb.PersistentClient(path="/path/chroma_storage_nomic")

def nomic_embed(
    texts,
    *,
    model="nomic-embed-text:latest",
    url="http://127.0.0.1:11434/api/embeddings",
    timeout=120,
):
    """Return one embedding per input text from an Ollama embeddings endpoint.

    The texts are sent one request at a time (one ``prompt`` per POST), and
    the results are returned in input order.

    Args:
        texts: A single string, or an iterable of strings. A bare string is
            treated as a one-element list.
        model: Model name sent in the request payload.
        url: Embeddings endpoint to POST to.
        timeout: Timeout in seconds handed to ``requests`` for each call, so
            an unresponsive server cannot block forever. Pass ``None`` to
            wait indefinitely.

    Returns:
        A list with one entry per input text; each entry is the value of the
        ``"embedding"`` field of the corresponding JSON response.

    Raises:
        ConnectionError: If the server answers with a non-200 status code.
        ValueError: If a response carries no ``"embedding"`` field.
        requests.exceptions.RequestException: On transport-level failures,
            including a timeout.
    """
    if isinstance(texts, str):
        texts = [texts]

    embeddings = []
    for text in texts:
        # ``json=`` serialises the payload and sets the JSON Content-Type
        # header, so no explicit headers are needed.
        response = requests.post(
            url,
            json={"model": model, "prompt": text},
            timeout=timeout,
        )
        if response.status_code != 200:
            raise ConnectionError(f"Failed to get embeddings: {response.status_code} {response.text}")

        payload = response.json()
        if "embedding" not in payload:
            raise ValueError(f"No embedding returned for text: {text}")
        embeddings.append(payload["embedding"])

    return embeddings

def recreate_collection(name):
    """Drop the collection called ``name`` (if possible) and create it anew.

    The delete is best effort: any exception it raises -- for instance
    because no such collection exists yet -- is swallowed, and creation is
    attempted regardless.

    Returns:
        The object returned by ``client.create_collection(name=name)``.
    """
    try:
        client.delete_collection(name)
        print(f"🗑️ Deleted existing collection: {name}")
    except Exception:
        # Nothing to delete (or the delete failed); fall through to creation.
        pass

    fresh_collection = client.create_collection(name=name)
    return fresh_collection

def _add_batch(collection, docs, ids, metas, embed):
    """Embed ``docs`` with ``embed`` and add the batch to ``collection``."""
    embeddings_batch = embed(docs)
    collection.add(documents=docs, ids=ids, metadatas=metas, embeddings=embeddings_batch)


def load_csv_to_chroma(csv_path, collection, batch_size=50, embed_fn=None):
    """Load every row of a CSV file into a Chroma collection.

    Each row becomes one document whose text is the row's non-missing cells
    rendered as ``"column: value"`` lines. Documents are embedded and added
    to ``collection`` in batches; the final partial batch is added exactly
    once.

    Args:
        csv_path: Path of the CSV file, read with ``pandas.read_csv``.
        collection: Object exposing ``add(documents=, ids=, metadatas=,
            embeddings=)``, such as a Chroma collection.
        batch_size: Number of rows accumulated before a batch is embedded
            and added.
        embed_fn: Optional callable mapping a list of texts to a list of
            embeddings. Defaults to the module's ``nomic_embed``.

    Returns:
        None.
    """
    # Resolved at call time so the default follows the module-level function.
    embed = nomic_embed if embed_fn is None else embed_fn

    # Metadata key -> CSV column it is copied from. A column that is absent
    # from the file yields "". NOTE: a missing cell in a present column is
    # stringified as-is (i.e. "nan"), matching the previous behaviour.
    metadata_columns = {
        "accession": "Run",
        "gene": "GENE",
        "database": "DATABASE",
        "resistance": "RESISTANCE",
        "collection_date": "Collection_Date",
        "country": "country",
        "continent": "continent",
        "isolation_source": "isolation_source",
        "host": "HOST",
        "organism": "Organism",
    }

    df = pd.read_csv(csv_path)

    docs, ids, metas = [], [], []

    for i, row in df.iterrows():
        # Falls back to "row<i>" only when the file has no "Run" column.
        accession = str(row.get("Run", f"row{i}"))

        # Full document text: every non-missing cell as "column: value".
        text = "\n".join(
            f"{col}: {val}" for col, val in row.items() if pd.notna(val)
        )

        docs.append(text.strip())
        # The row index suffix keeps ids unique when accessions repeat.
        ids.append(f"{accession}_{i}")
        metas.append(
            {key: str(row.get(column, "")) for key, column in metadata_columns.items()}
        )

        # Insert a full batch and start fresh lists for the next one.
        if len(docs) >= batch_size:
            _add_batch(collection, docs, ids, metas, embed)
            docs, ids, metas = [], [], []

    # Insert whatever is left over -- once.
    if docs:
        _add_batch(collection, docs, ids, metas, embed)

# === Recreate collections ===
# Each call deletes any existing collection of that name (best effort) and
# creates it again, so previously stored vectors are discarded. All four
# collections are recreated before any CSV is loaded.
resfinder_col = recreate_collection("resistancefinder")
vfdb_col = recreate_collection("virulencedb")
plasmid_col = recreate_collection("Plasmidfinder")
mge_col = recreate_collection("mge")

# === Load CSVs ===
# One CSV per collection, loaded with the default batch size. The "/path/..."
# locations look like placeholders -- replace them with the real file paths.
load_csv_to_chroma("/path/resfinder_combined.csv", resfinder_col)
load_csv_to_chroma("/path/vfdb_combined.csv", vfdb_col)
load_csv_to_chroma("/path/plasmidfinder_combined.csv", plasmid_col)
load_csv_to_chroma("/path/merged_mge.csv", mge_col)

# Reached only if every load above completed without raising.
print("✅ All collections rebuilt with Nomic embeddings (old vectors cleared)")