In [1]:
import base64
import logging
import os
from os.path import basename

import requests
import s3fs
from datasets import load_dataset
from dotenv import load_dotenv
from huggingface_hub import HfApi, hf_hub_url

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Load variables from a .env file (if any) before reading the settings below.
load_dotenv()

# Hugging Face credentials and dataset coordinates (each is None when unset).
HF_TOKEN = os.environ.get("HF_TOKEN")
HF_ORGA = os.environ.get("HF_ORGA")
HF_DATASET = os.environ.get("HF_DATASET")
HF_REV = os.environ.get("HF_REV")

# MinIO connection settings (each is None when unset).
MINIO_PASSWORD = os.environ.get("MINIO_PASSWORD")
MINIO_USER = os.environ.get("MINIO_USER")
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT")
MINIO_BUCKET = os.environ.get("MINIO_BUCKET")

In [3]:
# Hugging Face Hub client; the token is passed per call rather than stored on the client.
api = HfApi()
# Ask the Hub which account HF_TOKEN belongs to; the response is kept in `info`.
info = api.whoami(token=HF_TOKEN)

In [15]:
# Filesystem handle used by the cells below for every MinIO operation.
# Credentials come from the environment; endpoint_url points the client at
# MINIO_ENDPOINT.
fs = s3fs.S3FileSystem(
    key=MINIO_USER,
    secret=MINIO_PASSWORD,
    client_kwargs={"endpoint_url": MINIO_ENDPOINT}
)

In [5]:
import json

In [7]:
#!pip install ijson requests pillow


Collecting ijson
  Downloading ijson-3.4.0-cp312-cp312-win_amd64.whl.metadata (22 kB)
Downloading ijson-3.4.0-cp312-cp312-win_amd64.whl (54 kB)
Installing collected packages: ijson
Successfully installed ijson-3.4.0


In [None]:
# TODO persistent landing zone

# --- Copy temporal_landing -> persistent_landing, grouped by type, naming each object <type>$<filename>.<format> (source objects are left in place) ---
import os
from os.path import splitext

# Source and destination prefixes inside the MinIO bucket
SRC_BASE = f"{MINIO_BUCKET}/temporal_landing"
DST_BASE = f"{MINIO_BUCKET}/persistent_landing"
# One destination folder per category returned by classify_ext()
DIR_METADATA = f"{DST_BASE}/metadata"
DIR_IMAGES   = f"{DST_BASE}/images"
DIR_OTHERS   = f"{DST_BASE}/others"

# File extensions per category (compared case-insensitively)
META_EXTS = {
    ".json", ".jsonl", ".ndjson", ".csv", ".tsv",
    ".parquet", ".yaml", ".yml", ".txt",
}
IMG_EXTS = {
    ".jpg", ".jpeg", ".png", ".gif",
    ".bmp", ".webp", ".tiff", ".tif",
}

def classify_ext(path: str) -> str:
    """
    Return the landing category for ``path`` based on its file extension.

    The extension is lower-cased before lookup. Image extensions are checked
    first, then metadata extensions; anything else (including names with no
    extension) is reported as ``"others"``.
    """
    suffix = splitext(path)[1].lower()
    for category, known_exts in (("images", IMG_EXTS), ("metadata", META_EXTS)):
        if suffix in known_exts:
            return category
    return "others"

def build_dest_name(kind: str, base_name: str) -> str:
    """
    Build the destination object name ``<kind>$<stem>.<format>``.

    Example: kind ``'metadata'`` and base name
    ``'ADSDB-DYS__recipe1M_layers__layer1.json'`` give
    ``'metadata$ADSDB-DYS__recipe1M_layers__layer1.json'``.

    The format is the lower-cased extension without its leading dot; a name
    that has no extension gets the format ``'bin'``.
    """
    stem, suffix = splitext(base_name)
    if suffix:
        fmt = suffix[1:].lower()
    else:
        fmt = "bin"
    return "{}${}.{}".format(kind, stem, fmt)

def copy_minio(src_path: str, dst_path: str, chunk_mb: int = 4):
    """
    Copy one object to another key on the shared ``fs`` filesystem.

    An existing destination is never overwritten. The filesystem's own
    ``copy()`` is tried first; if it is unavailable or raises, the object is
    streamed through this process in chunks instead, and the reason for the
    fallback is logged as a warning instead of being discarded.

    Args:
        src_path: Key of the object to read.
        dst_path: Key of the object to create.
        chunk_mb: Chunk size in MiB for the streaming fallback. Must amount
            to at least one byte.

    Returns:
        ``"skipped"`` if ``dst_path`` already existed, otherwise ``"copied"``.

    Raises:
        ValueError: If ``chunk_mb`` does not amount to a positive byte count.
    """
    # A zero-byte read returns an empty chunk, which the streaming loop below
    # treats as end-of-file; the destination would then be created empty while
    # the call still reported "copied". Reject such sizes up front.
    chunk_size = int(chunk_mb * 1024 * 1024)
    if chunk_size <= 0:
        raise ValueError(f"chunk_mb must be positive, got {chunk_mb!r}")

    # Do not overwrite an object that is already present
    if fs.exists(dst_path):
        return "skipped"

    # Prefer the filesystem's own copy() so the data does not pass through
    # this process
    if hasattr(fs, "copy"):
        try:
            fs.copy(src_path, dst_path)
            return "copied"
        except Exception as exc:
            # Best-effort: keep going with the streaming copy, but leave a
            # trace of why the direct copy did not work.
            logging.getLogger(__name__).warning(
                "copy() failed for %s -> %s (%s); falling back to streaming",
                src_path, dst_path, exc,
            )

    # Streaming fallback: read and write fixed-size chunks until EOF
    with fs.open(src_path, "rb") as fr, fs.open(dst_path, "wb") as fw:
        while True:
            chunk = fr.read(chunk_size)
            if not chunk:
                break
            fw.write(chunk)
    return "copied"

# Recursive listing of the source: every key under temporal_landing
paths = fs.find(SRC_BASE)

# Destination folder per category; anything not listed goes to DIR_OTHERS
dest_dir_by_kind = {"metadata": DIR_METADATA, "images": DIR_IMAGES}

moved_meta = moved_img = moved_oth = 0
skipped = 0

for src in paths:
    # The listing normally holds files only; still skip directory-like keys
    # and ".keep" placeholders
    if src.endswith(("/", ".keep")):
        continue

    # Final path component of the key
    base_name = src.rpartition("/")[2]

    # TODO audio and images
    # Category decides both the destination folder and the name prefix
    kind = classify_ext(base_name)
    dst_dir = dest_dir_by_kind.get(kind, DIR_OTHERS)

    # New name following the <type>$<filename>.<format> convention
    dest_name = build_dest_name(kind, base_name)
    dst = f"{dst_dir}/{dest_name}"

    status = copy_minio(src, dst)
    if status == "skipped":
        skipped += 1
    elif kind == "metadata":
        moved_meta += 1
    elif kind == "images":
        moved_img += 1
    else:
        moved_oth += 1

print("✅ Reorganización completada.")
print(f"  → metadata: {moved_meta} archivos")
print(f"  → images:   {moved_img} archivos")
print(f"  → others:   {moved_oth} archivos")
print(f"  (saltados por existir en destino: {skipped})")
print(f"Destino base: {DST_BASE}")


✅ Reorganización completada.
  → metadata: 4 archivos
  → images:   17 archivos
  → others:   0 archivos
  (saltados por existir en destino: 0)
Destino base: landing-zone/persistent_landing
