# 1) Setup env, MinIO client, Chroma client

In [2]:
import os, json, time, hashlib
from datetime import datetime, timezone
from dotenv import load_dotenv, find_dotenv
from typing import Iterable, Dict, Any, List, Optional
import numpy

dotenv_path = find_dotenv(filename='.env', usecwd=True)
if not dotenv_path:
    raise FileNotFoundError("Could not find .env. Set its path manually.")
print(f"Loading environment variables from {dotenv_path}")
load_dotenv(dotenv_path)

import boto3
from botocore.config import Config
import chromadb

# --- ENV ---
TRUSTED_BUCKET       = os.environ.get("TRUSTED_BUCKET", "trusted-zone")

CHROMA_PERSIST_DIR   = os.environ.get("CHROMA_PERSIST_DIR", "exploitation_zone/chroma")

# --- MinIO S3 client ---
MINIO_USER     = os.environ.get("MINIO_USER")
MINIO_PASSWORD = os.environ.get("MINIO_PASSWORD")
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT")

# Paths and Buckets
TRUST_BUCKET        = "trusted-zone"
TRUST_IMAGES_PREFIX = "images"

session = boto3.session.Session(
    aws_access_key_id=MINIO_USER,
    aws_secret_access_key=MINIO_PASSWORD,
    region_name="us-east-1"
)
s3 = session.client(
    "s3",
    endpoint_url=MINIO_ENDPOINT,
    config=Config(signature_version="s3v4", s3={"addressing_style": "path"})
)

from chromadb import PersistentClient
from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction

# Connect to the same local exploitation directory
CHROMA = chromadb.PersistentClient(path="../" + CHROMA_PERSIST_DIR)

# Use CLIP for both text & images
ef_clip = OpenCLIPEmbeddingFunction()

# list all existing collections
existing_names = [col.name for col in CHROMA.list_collections()]
print("Available collections:", existing_names)

target_name = "trusted_zone_multimodal"

# connect to the collection for multi-modal data
multi_col = CHROMA.get_or_create_collection(
    name="trusted_zone_multimodal",
    embedding_function=ef_clip,
    metadata={"modality": "image+text", "model": "OpenCLIP", "source": "minio"}
)

print("✅ Multi-modal collection ready:", multi_col.name)


Loading environment variables from c:\Users\sindr\Documents\FIB\adsdb-multimodal-food-data-management\.env
Available collections: ['trusted_zone_multimodal', 'trusted_zone_images', 'trusted_zone_documents']
✅ Multi-modal collection ready: trusted_zone_multimodal


In [3]:
# Test MinIO connection and bucket
try:
    # List all buckets
    response = s3.list_buckets()
    print("Available buckets:", [b['Name'] for b in response['Buckets']])
    
    # Check if trusted-zone bucket exists
    if TRUSTED_BUCKET in [b['Name'] for b in response['Buckets']]:
        print(f"✅ Bucket '{TRUSTED_BUCKET}' exists")
    else:
        print(f"❌ Bucket '{TRUSTED_BUCKET}' does NOT exist")
        
    # Try to list objects in the bucket
    try:
        objects = s3.list_objects_v2(Bucket=TRUSTED_BUCKET)
        print(f"Objects in {TRUSTED_BUCKET}: {objects.get('KeyCount', 0)}")
    except Exception as e:
        print(f"Cannot access bucket {TRUSTED_BUCKET}: {e}")
        
except Exception as e:
    print(f"MinIO error: {e}")

Available buckets: ['formatted-zone', 'landing-zone', 'trusted-zone']
✅ Bucket 'trusted-zone' exists
Objects in trusted-zone: 53


# 2) Retrieval helpers 


In [4]:
import torch
import open_clip
from PIL import Image

_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Match Chroma's default OpenCLIPEmbeddingFunction() config
_MODEL_NAME = "ViT-B-32"
_PRETRAINED = "laion2b_s34b_b79k"

_CLIP_MODEL, _CLIP_PREPROCESS, _ = open_clip.create_model_and_transforms(
    _MODEL_NAME,
    pretrained=_PRETRAINED,
    device=_DEVICE
)

@torch.no_grad()
def encode_image_to_vec(pil_img: Image.Image) -> list[float]:
    """
    Produce an OpenCLIP image embedding compatible with the vectors stored in
    the 'trusted_zone_images' Chroma collection.
    """
    img_tensor = _CLIP_PREPROCESS(pil_img).unsqueeze(0).to(_DEVICE)
    img_features = _CLIP_MODEL.encode_image(img_tensor)
    img_features = img_features / img_features.norm(dim=-1, keepdim=True)
    return img_features.squeeze(0).cpu().tolist()

import io
import matplotlib.pyplot as plt

def show_retrieved_images(local_query_path, hits):
    """Show the query image followed by the retrieved similar images."""
    # --- display the query image ---
    plt.figure(figsize=(15, 4))
    plt.subplot(1, len(hits) + 1, 1)
    query_img = Image.open(local_query_path).convert("RGB")
    plt.imshow(query_img)
    plt.axis("off")
    plt.title("Query")

    # --- display each retrieved image from MinIO ---
    for i, hit in enumerate(hits, start=2):
        bucket = hit["image_s3_bucket"]
        key = hit["image_s3_key"]
        score = hit["score"]

        obj = s3.get_object(Bucket=bucket, Key=key)
        img = Image.open(io.BytesIO(obj["Body"].read())).convert("RGB")

        plt.subplot(1, len(hits) + 1, i)
        plt.imshow(img)
        plt.axis("off")
        plt.title(f"{i-1}. Distance={score:.3f}", fontsize=8)

    plt.tight_layout()
    plt.show()

import textwrap

def show_text_results(result):
    print(f"🔎 Query: {result['query']}\n")
    for i, hit in enumerate(result["hits"], start=1):
        text_preview = textwrap.shorten(hit["text"], width=180, placeholder="…")
        print(f"{i}.  Distance={hit['score']:.3f}")
        print(text_preview)
        print("-" * 80)

# --- Compute summary statistics ---
def summarize(label, arr):
    if not arr:
        print(f"No {label} results found.")
        return None, None
    return min(arr), max(arr)

def print_multi_summary(res: Dict[str, Any]):
    metas = res["metadatas"][0]
    dists = res["distances"][0]

    image_dists = [d for m, d in zip(metas, dists) if m.get("type") == "image"]
    text_dists  = [d for m, d in zip(metas, dists) if m.get("type") == "text"]

    closest_img, farthest_img = summarize("image", image_dists)
    closest_txt, farthest_txt = summarize("text", text_dists)

    # --- Print the summary neatly ---
    print("🔍 Cross-Modal Query Summary")
    print(f"Closest image match has distance  {closest_img:.3f}")
    print(f"Farthest image match has distance {farthest_img:.3f}")
    print(f"Closest recipe match has distance {closest_txt:.3f}")
    print(f"Farthest recipe match has distance {farthest_txt:.3f}")


In [5]:
from typing import List, Dict, Any

import base64
import io
from PIL import Image


from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction, SentenceTransformerEmbeddingFunction

ef_clip = OpenCLIPEmbeddingFunction()

def get_multi_collection():
    return client.get_collection(
        name="trusted_zone_multimodal",
        embedding_function=ef_clip,
    )

def retrieve_query(query: str, k: int = 5) -> Dict[str, Any]:
    """
    Retrieve top-k similar text docs from Chroma for a given natural language query.
    Returns a dict with 'hits' = [{text, meta, score}, ...]
    """
    col = get_multi_collection()

    result = col.query(
        query_texts=[query],
        n_results=k,
        include=["documents", "metadatas", "distances"],
    )
    
    hits = []
    docs = result.get("documents", [[]])[0]
    metas = result.get("metadatas", [[]])[0]
    dists = result.get("distances", [[]])[0]

    for text_doc, meta, dist in zip(docs, metas, dists):
        hits.append({
            "text": text_doc,
            "meta": meta,
            "score": float(dist),
        })

    return {
        "query": query,
        "hits": hits,
    }

from PIL import Image


def load_top_images_as_base64(image_hits, max_images=3):
    """
    Takes the image_hits list from retrieve_images() (with bucket/key),
    downloads up to max_images from MinIO,
    converts to JPEG,
    returns a list of base64-encoded strings ready for Ollama llava.
    """
    b64_list = []

    for i, hit in enumerate(image_hits):
        if i >= max_images:
            break

        bucket = hit["image_s3_bucket"]
        key = hit["image_s3_key"]

        # download from MinIO
        obj = s3.get_object(Bucket=bucket, Key=key)
        img_bytes = obj["Body"].read()

        # normalize to JPEG bytes in memory
        pil_img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
        buf = io.BytesIO()
        pil_img.save(buf, format="JPEG", quality=90)
        jpeg_bytes = buf.getvalue()

        # base64 encode for Ollama
        b64_img = base64.b64encode(jpeg_bytes).decode("utf-8")
        b64_list.append({
            "bucket": bucket,
            "key": key,
            "b64": b64_img,
        })

    return b64_list


In [6]:
# Debug: Check what collections exist
print("Available collections:")
for col_name in CHROMA.list_collections():
    print(f"  - {col_name.name}")
    
# Check the specific collection
try:
    col = CHROMA.get_collection("trusted_zone_multimodal")
    count = col.count()
    print(f"\nCollection 'trusted_zone_multimodal' has {count} entries")
    
    if count > 0:
        # Get a sample
        sample = col.get(limit=2)
        print("Sample entry:")
        for i, doc in enumerate(sample['documents'][:2]):
            print(f"[{i+1}] {doc[:200]}...")
except Exception as e:
    print(f"Error accessing collection: {e}")

Available collections:
  - trusted_zone_multimodal
  - trusted_zone_images
  - trusted_zone_documents

Collection 'trusted_zone_multimodal' has 85 entries
Sample entry:
Error accessing collection: 'NoneType' object is not subscriptable


# 3) Text search


In [7]:
all_data = multi_col.get(limit=10000)
metas = all_data["metadatas"]

image_count = sum(1 for m in metas if m.get("type") == "image")
text_count  = sum(1 for m in metas if m.get("type") == "text")

print(f"Total items: {len(metas)} | Images: {image_count} | Texts: {text_count}")

print(multi_col._embedding_function)

query = "fettuccine alfredo pasta dish with creamy sauce"

res = multi_col.query(
    query_texts=[query],
    n_results=85,
    include=["metadatas", "documents", "distances"]
)

print_multi_summary(res)



Total items: 85 | Images: 0 | Texts: 0
<chromadb.utils.embedding_functions.open_clip_embedding_function.OpenCLIPEmbeddingFunction object at 0x000001C40E667D70>
No image results found.
No text results found.
🔍 Cross-Modal Query Summary


TypeError: unsupported format string passed to NoneType.__format__

# 4) Image search


In [None]:
import numpy as np
all_data = multi_col.get(limit=10000)
metas = all_data["metadatas"]

image_count = sum(1 for m in metas if m.get("type") == "image")
text_count  = sum(1 for m in metas if m.get("type") == "text")

print(f"Total items: {len(metas)} | Images: {image_count} | Texts: {text_count}")

print(multi_col._embedding_function)

query = np.array(Image.open("calico-beans.jpg").convert("RGB"))

res = multi_col.query(
    query_images=[query],
    n_results=85,    # top-k results
    include=["documents", "metadatas", "distances"]
)
print_multi_summary(res)



Total items: 135 | Images: 50 | Texts: 35
<chromadb.utils.embedding_functions.open_clip_embedding_function.OpenCLIPEmbeddingFunction object at 0x00000153E21092B0>
No text results found.
🔍 Cross-Modal Query Summary
Closest image match has distance  0.361
Farthest image match has distance 0.654


TypeError: unsupported format string passed to NoneType.__format__