# 1) Set up environment, MinIO client, and Chroma client

In [25]:
import os, io, json
from dotenv import load_dotenv, find_dotenv
from typing import Dict, Any
import numpy as np
import boto3
from botocore.config import Config
from chromadb import PersistentClient
from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction
from typing import Dict, Any
from PIL import Image

# Locate the project's .env by walking up from the current working directory.
dotenv_path = find_dotenv(filename='.env', usecwd=True)
if not dotenv_path:
    raise FileNotFoundError("Could not find .env. Set its path manually.")
print(f"Loading environment variables from {dotenv_path}")
load_dotenv(dotenv_path)

# --- ENV ---
TRUSTED_BUCKET     = os.environ.get("TRUSTED_BUCKET", "trusted-zone")
CHROMA_PERSIST_DIR = os.environ.get("CHROMA_PERSIST_DIR", "exploitation_zone/chroma")

# --- MinIO S3 client ---
MINIO_USER     = os.environ.get("MINIO_USER")
MINIO_PASSWORD = os.environ.get("MINIO_PASSWORD")
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT")

# Fail fast: with any of these unset, boto3 would silently fall back to its
# default credential chain / the public AWS endpoint instead of MinIO.
_missing_env = [
    name
    for name, value in (
        ("MINIO_USER", MINIO_USER),
        ("MINIO_PASSWORD", MINIO_PASSWORD),
        ("MINIO_ENDPOINT", MINIO_ENDPOINT),
    )
    if not value
]
if _missing_env:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(_missing_env)
    )

# Paths and Buckets
# Alias of TRUSTED_BUCKET (kept for backward compatibility) so the two names
# can never point at different buckets.
TRUST_BUCKET        = TRUSTED_BUCKET
TRUST_IMAGES_PREFIX = "images"

session = boto3.session.Session(
    aws_access_key_id=MINIO_USER,
    aws_secret_access_key=MINIO_PASSWORD,
    region_name="us-east-1"
)
s3 = session.client(
    "s3",
    endpoint_url=MINIO_ENDPOINT,
    config=Config(signature_version="s3v4", s3={"addressing_style": "path"})
)

# Connect to the same local exploitation directory.
# os.path.join keeps an absolute CHROMA_PERSIST_DIR intact, whereas plain
# string concatenation with "../" would corrupt it.
CHROMA_PATH = os.path.normpath(os.path.join("..", CHROMA_PERSIST_DIR))
CHROMA = PersistentClient(path=CHROMA_PATH)

# Use CLIP for both text & images
ef_clip = OpenCLIPEmbeddingFunction()

# list all existing collections
existing_names = [col.name for col in CHROMA.list_collections()]
print("Available collections:", existing_names)

target_name = "trusted_zone_multimodal"

# connect to the collection for multi-modal data
multi_col = CHROMA.get_or_create_collection(
    name=target_name,
    embedding_function=ef_clip,
    metadata={"modality": "image+text", "model": "OpenCLIP", "source": "minio"}
)

print("✅ Multi-modal collection ready:", multi_col.name)


Loading environment variables from c:\Users\sindr\Documents\FIB\adsdb-multimodal-food-data-management\.env
Available collections: ['trusted_zone_multimodal', 'trusted_zone_images', 'trusted_zone_documents']
✅ Multi-modal collection ready: trusted_zone_multimodal


In [17]:
# Smoke-test the MinIO connection and the trusted bucket
try:
    # Ask the server which buckets it knows about
    response = s3.list_buckets()
    bucket_names = [b['Name'] for b in response['Buckets']]
    print("Available buckets:", bucket_names)

    # Report whether the configured bucket is among them
    bucket_status = (
        f"✅ Bucket '{TRUSTED_BUCKET}' exists"
        if TRUSTED_BUCKET in bucket_names
        else f"❌ Bucket '{TRUSTED_BUCKET}' does NOT exist"
    )
    print(bucket_status)

    # Probe the bucket itself by listing its contents; a failure here is
    # reported separately from a connection-level failure
    try:
        objects = s3.list_objects_v2(Bucket=TRUSTED_BUCKET)
        key_count = objects.get('KeyCount', 0)
        print(f"Objects in {TRUSTED_BUCKET}: {key_count}")
    except Exception as e:
        print(f"Cannot access bucket {TRUSTED_BUCKET}: {e}")

except Exception as e:
    print(f"MinIO error: {e}")

Available buckets: ['formatted-zone', 'landing-zone', 'trusted-zone']
✅ Bucket 'trusted-zone' exists
Objects in trusted-zone: 53


# 2) Helpers 


In [18]:
# --- Smallest and largest value of one group of distances ---
def summarize(label, arr):
    """Return ``(min(arr), max(arr))`` for a group of distances.

    Args:
        label: Name of the group, used only in the "nothing found" notice.
        arr: Sequence of comparable values (e.g. distances).

    Returns:
        A ``(smallest, largest)`` tuple. If ``arr`` is empty, a notice is
        printed and ``(None, None)`` is returned instead.
    """
    if arr:
        smallest, largest = min(arr), max(arr)
        return smallest, largest

    print(f"No {label} results found.")
    return None, None
# --- Print multi-modal search summary ---
def _format_distance(value):
    """Render a distance with three decimals, or ``n/a`` when it is ``None``."""
    return "n/a" if value is None else f"{value:.3f}"


def print_multi_summary(res: Dict[str, Any]):
    """Print the closest and farthest distance per modality for one query.

    Args:
        res: Query result indexed as ``res["metadatas"][0]`` and
            ``res["distances"][0]`` (i.e. the first query of the batch).
            Each metadata entry is inspected for a ``"type"`` key equal to
            ``"image"`` or ``"text"``.

    A modality with no hits is reported as ``n/a`` rather than raising:
    ``summarize`` returns ``(None, None)`` for an empty group, and ``None``
    cannot be formatted with ``:.3f``.
    """
    metas = res["metadatas"][0]
    dists = res["distances"][0]

    # `m or {}` tolerates entries whose metadata is None.
    image_dists = [d for m, d in zip(metas, dists) if (m or {}).get("type") == "image"]
    text_dists  = [d for m, d in zip(metas, dists) if (m or {}).get("type") == "text"]

    closest_img, farthest_img = summarize("image", image_dists)
    closest_txt, farthest_txt = summarize("text", text_dists)

    # --- Print the summary neatly ---
    print("🔍 Cross-Modal Query Summary")
    print(f"Closest image match has distance  {_format_distance(closest_img)}")
    print(f"Farthest image match has distance {_format_distance(farthest_img)}")
    print(f"Closest recipe match has distance {_format_distance(closest_txt)}")
    print(f"Farthest recipe match has distance {_format_distance(farthest_txt)}")


In [19]:
# Debug: enumerate every collection the Chroma client can see
print("Available collections:")
for col_name in CHROMA.list_collections():
    print("  -", col_name.name)

# Inspect the multi-modal collection: its size, then a small metadata sample
try:
    col = CHROMA.get_collection("trusted_zone_multimodal")
    count = col.count()
    print(f"\nCollection 'trusted_zone_multimodal' has {count} entries")

    if count > 0:
        # Two records are enough to eyeball the metadata layout
        sample = col.get(limit=2)
        sample_metadata_json = json.dumps(sample["metadatas"], indent=2)
        print(sample_metadata_json)

except Exception as e:
    print(f"Error accessing collection: {e}")

Available collections:
  - trusted_zone_multimodal
  - trusted_zone_images
  - trusted_zone_documents

Collection 'trusted_zone_multimodal' has 85 entries
[
  {
    "object_key": "images/image$adsdb-multimodal-food-data-management$2025-10-27T21-15-52Z$11623d02da5633ddae2ec9cd3e18bf13__00059477e2_0.jpg",
    "bucket": "trusted-zone",
    "type": "image"
  },
  {
    "type": "image",
    "bucket": "trusted-zone",
    "object_key": "images/image$adsdb-multimodal-food-data-management$2025-10-27T21-15-52Z$142496c88baebda2c924c53461d0f78e__00073a6b36_0.jpg"
  }
]


# 3) Text search


In [None]:
# Snapshot of the collection, used for the per-modality counts below.
all_data = multi_col.get(limit=10000)
metas = all_data["metadatas"]

# `m or {}` tolerates entries stored without metadata (None).
image_count = sum(1 for m in metas if (m or {}).get("type") == "image")
text_count  = sum(1 for m in metas if (m or {}).get("type") == "text")

print(f"Total items: {len(metas)} | Images: {image_count} | Texts: {text_count}")

query = "fettuccine alfredo pasta dish with creamy sauce"

# Rank the whole collection: derive n_results from the live item count instead
# of a hard-coded 85, which would silently truncate once the collection grows.
# max(1, ...) keeps n_results positive for an empty collection.
total_items = multi_col.count()

res = multi_col.query(
    query_texts=[query],
    n_results=max(1, total_items),
    include=["metadatas", "documents", "distances"]
)

print_multi_summary(res)



Total items: 85 | Images: 50 | Texts: 35


# 4) Image search


In [26]:
# Snapshot of the collection, used for the per-modality counts below.
all_data = multi_col.get(limit=10000)
metas = all_data["metadatas"]

# `m or {}` tolerates entries stored without metadata (None).
image_count = sum(1 for m in metas if (m or {}).get("type") == "image")
text_count  = sum(1 for m in metas if (m or {}).get("type") == "text")

print(f"Total items: {len(metas)} | Images: {image_count} | Texts: {text_count}")

# Load the query image as an RGB array; the context manager closes the file
# handle as soon as the pixels have been read.
with Image.open("calico-beans.jpg") as query_file:
    query = np.array(query_file.convert("RGB"))

# Rank the whole collection: derive n_results from the live item count instead
# of a hard-coded 85, which would silently truncate once the collection grows.
# max(1, ...) keeps n_results positive for an empty collection.
total_items = multi_col.count()

# NOTE(review): the recorded run of this cell failed inside query() with
# "Error creating hnsw segment reader: Nothing found on disk". Presumably the
# persisted index under CHROMA_PERSIST_DIR is missing or stale - confirm the
# path and re-run the ingestion before relying on this cell.
res = multi_col.query(
    query_images=[query],
    n_results=max(1, total_items),    # top-k results
    include=["documents", "metadatas", "distances"]
)
print_multi_summary(res)



Total items: 85 | Images: 50 | Texts: 35


InternalError: Error executing plan: Internal error: Error creating hnsw segment reader: Nothing found on disk