In [1]:
import os
from dotenv import load_dotenv
from datasets import  load_dataset
from huggingface_hub import HfApi, hf_hub_url
import s3fs
from os.path import basename
import requests
import base64

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# TODO: refactor this notebook and clean up the code

In [None]:
# Load the variables declared in the local .env file into the process environment.
load_dotenv()

# Hugging Face Hub settings: access token, organisation, dataset name and revision.
HF_TOKEN = os.environ.get("HF_TOKEN")
HF_ORGA = os.environ.get("HF_ORGA")
HF_DATASET = os.environ.get("HF_DATASET")
HF_REV = os.environ.get("HF_REV")

# MinIO settings: endpoint, credentials and target bucket.
# Note that MINIO_BUCKET is read from the MINIO_LANDING_BUCKET variable.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT")
MINIO_USER = os.environ.get("MINIO_USER")
MINIO_PASSWORD = os.environ.get("MINIO_PASSWORD")
MINIO_BUCKET = os.environ.get("MINIO_LANDING_BUCKET")

In [3]:
# Hugging Face Hub client; later cells use it to list datasets and repository files.
api = HfApi()
# Ask the Hub which account HF_TOKEN belongs to.
# NOTE(review): `info` is not read by any cell visible here — presumably this is only
# a quick check that the token works; confirm before removing.
info = api.whoami(token=HF_TOKEN)

In [15]:
# S3-style filesystem handle for MinIO, built from the credentials and endpoint
# read from the environment.
minio_client_options = {"endpoint_url": MINIO_ENDPOINT}
fs = s3fs.S3FileSystem(key=MINIO_USER, secret=MINIO_PASSWORD, client_kwargs=minio_client_options)

In [21]:
# list_datasets() hands back a lazy iterable. `datasets` is looped over by more than one
# cell below, so materialise it once; otherwise the second loop would see an already
# exhausted iterator after a fresh "Restart & Run All".
datasets = list(api.list_datasets(author=HF_ORGA, token=HF_TOKEN))

In [5]:
import json

In [None]:
# Copy every non-hidden, non-.tar file of each dataset repo into the MinIO
# "temporal_landing" prefix, printing the content of .json files along the way.
SKIP = {".gitattributes", ".gitignore", ".gitkeep"}
DOWNLOAD_TIMEOUT_S = 60  # seconds; without a timeout a stalled connection hangs the cell forever

# Only send the bearer token when one is configured (avoids sending "Bearer None").
hf_auth_headers = {"authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}

for ds in datasets:
    ds_id = ds.id
    files = api.list_repo_files(repo_id=ds_id, repo_type="dataset", revision=HF_REV)
    for path in files:
        fname = basename(path)
        if fname.startswith(".") or fname.endswith(".tar") or fname in SKIP:
            continue
        url = hf_hub_url(repo_id=ds_id, filename=path, repo_type="dataset", revision=HF_REV)

        # The context manager guarantees the streamed connection is released,
        # including when raise_for_status() or the MinIO write fails.
        with requests.get(url, stream=True, headers=hf_auth_headers, timeout=DOWNLOAD_TIMEOUT_S) as r:
            r.raise_for_status()

            if fname.endswith(".json"):
                # NOTE: r.content loads the whole file into memory and the full document is
                # printed; this is only reasonable for small JSON files.
                content = r.content.decode("utf-8")
                try:
                    data = json.loads(content)
                    print(f"\n📄 Contenido de {fname} ({ds_id}): ")
                    print(json.dumps(data, indent=2, ensure_ascii=False))
                except json.JSONDecodeError:
                    print(f"\n⚠️ No se pudo parsear {fname} como JSON:")
                    print(content[:500])  # show only the first characters

            # TODO: flattening '/' into '__' can map files from different folders (or datasets)
            # to the same object name; decide on a collision-free naming scheme.
            with fs.open(f"{MINIO_BUCKET}/temporal_landing/{path.replace('/', '__')}", "wb") as f:
                for chunk in r.iter_content(1024 * 1024):
                    if chunk:
                        f.write(chunk)


📄 Contenido de det_ingrs.json (ADSDB-DYS/adsdb-multimodal-food-data-management):


In [7]:
#!pip install ijson requests pillow


Collecting ijson
  Downloading ijson-3.4.0-cp312-cp312-win_amd64.whl.metadata (22 kB)
Downloading ijson-3.4.0-cp312-cp312-win_amd64.whl (54 kB)
Installing collected packages: ijson
Successfully installed ijson-3.4.0


In [None]:
# TODO see the other files to enrich data

In [None]:
# TODO temporal landing zone
# --- Imports ---
import os, io, json, requests, ijson
from os.path import basename
from huggingface_hub import hf_hub_url

# --- Parameters ---
BASE_DIR_MINIO = f"{MINIO_BUCKET}/temporal_landing"  # MinIO prefix where everything is stored
SKIP = {".gitattributes", ".gitignore", ".gitkeep"}
TIMEOUT = 60
N_RECETAS = 50          # how many recipes to take from layer1 for the join
IMGS_POR_RECETA = 5     # maximum number of images uploaded per recipe

# Authorization header for Hugging Face requests; left empty when no token is configured.
headers = {}
if HF_TOKEN:
    headers["authorization"] = f"Bearer {HF_TOKEN}"

# --- Utils generales ---
def _get(url, headers, timeout=TIMEOUT):
    """Open a streamed GET request and return the response.

    Args:
        url: Address to fetch.
        headers: HTTP headers sent with the request.
        timeout: Timeout passed to ``requests.get``; defaults to ``TIMEOUT``.

    Returns:
        The open ``requests.Response``. The caller is responsible for closing it.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    r = requests.get(url, stream=True, headers=headers, timeout=timeout)
    try:
        r.raise_for_status()
    except requests.HTTPError:
        # The caller never receives the response on failure, so release the
        # streamed connection here instead of leaking it.
        r.close()
        raise
    return r

def save_to_minio_stream(url, dst_path):
    """Stream the content at ``url`` into the MinIO object ``dst_path``.

    The request is made through ``_get`` with the module-level ``headers`` and the
    body is written in 1 MiB chunks, so the file is never held in memory at once.

    Args:
        url: Address to download.
        dst_path: Destination path opened through ``fs`` in binary write mode.
    """
    r = _get(url, headers)
    try:
        with fs.open(dst_path, "wb") as f:
            for chunk in r.iter_content(1024 * 1024):
                if chunk:
                    f.write(chunk)
    finally:
        # Release the connection even when the write to MinIO fails.
        r.close()

def underscored_img_path(partition, image_id):
    """Build the object name for an image as ``<partition>_<image_id>``.

    ``image_id`` is used verbatim, so any extension it carries is kept
    (e.g. ``"train"`` and ``"6bdca6e490.jpg"`` give ``"train_6bdca6e490.jpg"``).

    Args:
        partition: Partition label placed before the underscore.
        image_id: Image identifier placed after the underscore.

    Returns:
        The combined name as a string.
    """
    return f"{partition}_{image_id}"

def pick_n_from_layer1_fileobj(fileobj, n):
    """Read the first ``n`` recipes that have an id from a layer1 JSON stream.

    The file is parsed incrementally with ``ijson`` as a top-level JSON array and
    reading stops as soon as ``n`` recipes have been collected.

    Args:
        fileobj: File-like object holding layer1.json (e.g. from ``fs.open``).
        n: Maximum number of recipes to return. A value <= 0 returns nothing
            and does not read from ``fileobj``.

    Returns:
        A tuple ``(recetas, id2partition)`` where ``recetas`` is a list of dicts
        with the keys ``id``, ``title``, ``ingredients`` (list of the ``text``
        values) and ``partition``, and ``id2partition`` maps each id to its
        partition.
    """
    recetas, id2partition = [], {}
    if n <= 0:
        # Nothing requested: avoid returning one recipe before the limit check runs.
        return recetas, id2partition

    for item in ijson.items(fileobj, 'item'):
        rid = item.get("id")
        if not rid:
            continue
        # `or []` also covers an explicit null "ingredients" value in the JSON.
        ingredients = [
            x.get("text")
            for x in (item.get("ingredients") or [])
            if isinstance(x, dict) and "text" in x
        ]
        rec = {
            "id": rid,
            "title": item.get("title"),
            "ingredients": ingredients,
            "partition": item.get("partition"),
        }
        recetas.append(rec)
        id2partition[rid] = rec["partition"]
        if len(recetas) >= n:
            break
    return recetas, id2partition

def map_images_for_ids_fileobj(fileobj, ids, max_imgs_per_id=IMGS_POR_RECETA):
    """Collect image entries for the requested recipe ids from a layer2 JSON stream.

    The file is parsed incrementally with ``ijson`` as a top-level JSON array.
    Reading stops once every requested id has been seen.

    Args:
        fileobj: File-like object holding layer2.json.
        ids: Recipe ids to look up.
        max_imgs_per_id: Maximum number of images kept per recipe. A value <= 0
            keeps none.

    Returns:
        ``{id: [{"id": image_id, "url": url}, ...]}`` with one entry per requested
        id; ids that are not found (or have no usable image) map to an empty list.
    """
    ids = list(ids)  # allow one-shot iterables: ids is walked twice below
    target = set(ids)
    result = {rid: [] for rid in ids}
    if not target:
        # Nothing to look up: do not scan the whole file for no matches.
        return result
    if max_imgs_per_id <= 0:
        # No images wanted: the per-recipe limit check would otherwise let one through.
        return result

    for item in ijson.items(fileobj, 'item'):
        rid = item.get("id")
        if rid in target:
            lst = []
            # `or []` also covers an explicit null "images" value in the JSON.
            for im in (item.get("images") or []):
                iid = im.get("id")
                u = im.get("url")
                if iid and u:
                    lst.append({"id": iid, "url": u})
                if len(lst) >= max_imgs_per_id:
                    break
            result[rid] = lst
            target.discard(rid)
            if not target:
                break
    return result

def upload_image_to_minio(url, dst_path):
    """Download an image and store it in MinIO at ``dst_path``.

    The request is sent without any extra headers, so the Hugging Face token is
    not forwarded to the image host. Failures are reported on stdout and turned
    into a ``False`` return value so that one bad URL does not stop a batch.

    Args:
        url: Image address.
        dst_path: Destination path opened through ``fs`` in binary write mode.

    Returns:
        True when the image was written, False on any error.
    """
    try:
        # The context manager releases the connection on success and on failure.
        with requests.get(url, stream=True, timeout=TIMEOUT) as r:
            r.raise_for_status()
            with fs.open(dst_path, "wb") as f:
                for chunk in r.iter_content(1024 * 1024):
                    if chunk:
                        f.write(chunk)
        return True
    except Exception as e:
        # Deliberately broad: this is a best-effort upload.
        print(f"⚠️ Error subiendo imagen desde {url} -> {dst_path}: {e}")
        return False

# --- Download EVERYTHING from the dataset(s) and detect layer1/layer2 ---
# Mapping kept per dataset: ds_id -> {'layer1': minio_path, 'layer2': minio_path}
layers_paths = {}

for ds in datasets:
    ds_id = ds.id
    print(f"\n📦 Dataset: {ds_id}")
    files = api.list_repo_files(repo_id=ds_id, repo_type="dataset", revision=HF_REV)

    # MinIO path of layer1/layer2 found in this dataset, plus whether the chosen
    # file lives in a 'recipe1M_layers' folder (those win over other matches).
    l1_minio, l1_preferred = None, False
    l2_minio, l2_preferred = None, False

    for path in files:
        fname = basename(path)
        if fname.startswith(".") or fname.endswith(".tar") or fname in SKIP:
            continue

        # 1) Build the HF URL and stream the file into MinIO ('/' replaced by '__')
        url = hf_hub_url(repo_id=ds_id, filename=path, repo_type="dataset", revision=HF_REV)
        try:
            r = _get(url, headers)
        except Exception as e:
            print(f"⚠️ No se pudo abrir {path}: {e}")
            continue

        minio_name = path.replace("/", "__")
        dst_path = f"{BASE_DIR_MINIO}/{minio_name}"
        try:
            with fs.open(dst_path, "wb") as f:
                for chunk in r.iter_content(1024 * 1024):
                    if chunk:
                        f.write(chunk)
        finally:
            # Release the connection even when the write to MinIO fails.
            r.close()

        # 2) Is it layer1/layer2? Remember its MinIO path.
        # A file inside a 'recipe1M_layers' directory replaces a match found elsewhere.
        # The directory is compared component-wise so it is also recognised when it sits
        # at the repository root (a "/recipe1m_layers/" substring test misses that case).
        normalized = path.lower()
        in_preferred_dir = "recipe1m_layers" in normalized.split("/")[:-1]
        layer_file = normalized.rsplit("/", 1)[-1]
        if layer_file == "layer1.json" and (l1_minio is None or (in_preferred_dir and not l1_preferred)):
            l1_minio, l1_preferred = dst_path, in_preferred_dir
        if layer_file == "layer2.json" and (l2_minio is None or (in_preferred_dir and not l2_preferred)):
            l2_minio, l2_preferred = dst_path, in_preferred_dir

    if l1_minio or l2_minio:
        layers_paths[ds_id] = {"layer1": l1_minio, "layer2": l2_minio}
        print(f"   ↳ layer1 en MinIO: {l1_minio}")
        print(f"   ↳ layer2 en MinIO: {l2_minio}")

# --- For every dataset that has both layer1 and layer2: join them and upload the IMAGES ---
for ds_id, paths in layers_paths.items():
    layer1_minio_path, layer2_minio_path = paths.get("layer1"), paths.get("layer2")

    # Both layer files are required for the join.
    if not (layer1_minio_path and layer2_minio_path):
        print(f"\n⏭️  Saltando {ds_id}: faltan layer1 o layer2 en MinIO.")
        continue

    print(f"\n🔗 Cruzando layer1 ↔ layer2 para {ds_id} (tomando {N_RECETAS} recetas)…")

    # 1) Take N recipes from layer1 (read from the copy in MinIO, not from HF)
    with fs.open(layer1_minio_path, "rb") as f1:
        recetas, id2partition = pick_n_from_layer1_fileobj(f1, N_RECETAS)

    # 2) Look up the images of those recipes in layer2 (also read from MinIO)
    ids = [receta["id"] for receta in recetas]
    with fs.open(layer2_minio_path, "rb") as f2:
        imgs_map = map_images_for_ids_fileobj(f2, ids, IMGS_POR_RECETA)

    # 3) Upload images ONLY for recipes that have at least one
    print("⬆️  Subiendo imágenes (solo recetas con imágenes)…")
    uploaded_count = 0
    for rec in recetas:
        rid = rec["id"]
        partition_label = rec.get("partition") or id2partition.get(rid) or "unknown"
        part = partition_label.lower()
        imgs = imgs_map.get(rid, [])
        if len(imgs) == 0:
            continue

        for im in imgs:
            image_id, image_url = im["id"], im["url"]
            object_name = underscored_img_path(partition=part, image_id=image_id)
            if upload_image_to_minio(image_url, f"{BASE_DIR_MINIO}/{object_name}"):
                uploaded_count += 1

    # Per-dataset summary
    print(f"✅ {ds_id}: Imágenes subidas: {uploaded_count}")
    print(f"📂 JSON usados desde MinIO:")
    for used_json in (layer1_minio_path, layer2_minio_path):
        print(f"   - {used_json}")
    print(f"📂 Imágenes guardadas en: {BASE_DIR_MINIO}/partition_<image_id>.jpg")



📦 Dataset: ADSDB-DYS/adsdb-multimodal-food-data-management
   ↳ layer1 en MinIO: landing-zone/temporal_landing/recipe1M_layers__layer1.json
   ↳ layer2 en MinIO: landing-zone/temporal_landing/recipe1M_layers__layer2.json

🔗 Cruzando layer1 ↔ layer2 para ADSDB-DYS/adsdb-multimodal-food-data-management (tomando 50 recetas)…
⬆️  Subiendo imágenes (solo recetas con imágenes)…
⚠️ Error subiendo imagen desde https://img-global.cpcdn.com/001_recipes/5806945844854784/0x0/photo.jpg -> landing-zone/temporal_landing/train_6bdca6e490.jpg: 400 Client Error: Bad Request for url: https://img-global.cpcdn.com/001_recipes/5806945844854784/0x0/photo.jpg
⚠️ Error subiendo imagen desde https://img-global.cpcdn.com/001_recipes/5205549177110528/0x0/photo.jpg -> landing-zone/temporal_landing/train_f480145da5.jpg: 400 Client Error: Bad Request for url: https://img-global.cpcdn.com/001_recipes/5205549177110528/0x0/photo.jpg
⚠️ Error subiendo imagen desde https://img-global.cpcdn.com/001_photo_reports/49045045

In [None]:
# TODO: justify design decisions such as the naming convention, the absence of a history timestamp, the folder structure, etc.