In [7]:
import os
from dotenv import load_dotenv
import s3fs

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# MinIO connection settings; each is None when the variable is not set.
MINIO_PASSWORD = os.getenv("MINIO_PASSWORD")
MINIO_USER = os.getenv("MINIO_USER")
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT")
LANDING_BUCKET = os.getenv("MINIO_LANDING_BUCKET")
FORMATTED_BUCKET = os.getenv("MINIO_FORMATTED_BUCKET")

# Bucket names are interpolated into object paths later in the notebook; an
# unset variable would silently turn into the literal bucket name "None",
# so stop here with an explicit message instead.
_missing_buckets = [
    var_name
    for var_name, value in (
        ("MINIO_LANDING_BUCKET", LANDING_BUCKET),
        ("MINIO_FORMATTED_BUCKET", FORMATTED_BUCKET),
    )
    if not value
]
if _missing_buckets:
    raise RuntimeError(
        "Faltan variables de entorno requeridas: " + ", ".join(_missing_buckets)
    )


In [8]:
# Filesystem handle for MinIO, built from the credentials and endpoint read
# from the environment; the helpers defined further down use it for all I/O.
fs = s3fs.S3FileSystem(
    client_kwargs=dict(endpoint_url=MINIO_ENDPOINT),
    secret=MINIO_PASSWORD,
    key=MINIO_USER,
)

In [10]:
# --- Normalización SOLO IMÁGENES (a JPG) y copia tal cual de metadata/others ---
import io
from os.path import splitext
from PIL import Image


# Source and destination prefixes inside MinIO
SRC_BASE = str(LANDING_BUCKET) + "/persistent_landing"
DST_BASE = str(FORMATTED_BUCKET)
# One sub-prefix per category under the destination bucket
DST_METADATA, DST_IMAGES, DST_OTHERS = (
    f"{DST_BASE}/{category}" for category in ("metadata", "images", "others")
)

# Classification by file extension (lower-case, leading dot included)
IMG_EXTS = {
    ".jpg", ".jpeg", ".png", ".gif",
    ".bmp", ".webp", ".tiff", ".tif",
}
META_EXTS = {
    ".json", ".jsonl", ".ndjson",
    ".csv", ".tsv", ".parquet",
    ".yaml", ".yml", ".txt",
}

def classify_kind(fname: str) -> str:
    """
    Map a file name to its destination category.

    The extension is compared case-insensitively. Returns "images" for
    extensions in IMG_EXTS, "metadata" for extensions in META_EXTS, and
    "others" for everything else (including names without an extension).
    """
    suffix = splitext(fname)[1].lower()
    for label, known_exts in (("images", IMG_EXTS), ("metadata", META_EXTS)):
        if suffix in known_exts:
            return label
    return "others"

def parse_prefixed_name(base_name: str):
    """
    Split '<type>$<filename>.<ext>' into (type, filename_root, ext_lower).

    The extension keeps its leading dot and is lower-cased. Only the first
    '$' in the name (extension excluded) separates the type from the rest.
    When there is no '$', the type is None and the whole root is returned.
    """
    stem, suffix = splitext(base_name)
    suffix = suffix.lower()
    prefix, separator, remainder = stem.partition("$")
    if not separator:
        return None, stem, suffix
    return prefix, remainder, suffix

def copy_minio(src_path: str, dst_path: str, chunk_mb: int = 4):
    """
    Copy one object to another path on the same filesystem (`fs`).

    `fs.copy` is tried first. If it raises, the reason is printed and the
    object is streamed through this process in chunks instead.

    Args:
        src_path: Path of the object to read.
        dst_path: Path to create. Left untouched if it already exists.
        chunk_mb: Chunk size in MiB for the streaming fallback. Must be > 0.

    Returns:
        "skipped" if `dst_path` already existed, otherwise "copied".

    Raises:
        ValueError: If `chunk_mb` does not yield a positive chunk size.
    """
    chunk_size = int(chunk_mb * 1024 * 1024)
    if chunk_size <= 0:
        # read(0) returns b"" straight away, which would end the streaming
        # loop immediately and report an empty destination as "copied".
        raise ValueError(f"chunk_mb must be positive, got {chunk_mb!r}")

    if fs.exists(dst_path):
        return "skipped"

    if hasattr(fs, "copy"):
        try:
            fs.copy(src_path, dst_path)
            return "copied"
        except Exception as e:
            # Still best-effort (fall through to streaming), but say why the
            # first attempt failed instead of hiding it.
            print(f"⚠️ Error en copia directa {src_path} -> {dst_path}: {e}. Reintentando por streaming.")

    with fs.open(src_path, "rb") as fr, fs.open(dst_path, "wb") as fw:
        while True:
            chunk = fr.read(chunk_size)
            if not chunk:
                break
            fw.write(chunk)
    return "copied"

def convert_image_to_jpg(src_path: str, base_name: str):
    """
    Store an image under DST_IMAGES as 'images$<filename>.jpg'.

    - Sources that are already .jpg/.jpeg are copied as-is (no re-encoding).
    - Any other format is decoded with Pillow and re-encoded as JPEG; images
      carrying transparency are flattened onto a white background first.

    Args:
        src_path: Path of the source image on `fs`.
        base_name: File name of the source, optionally of the form
            '<type>$<filename>.<ext>'. Any '<type>$' prefix is dropped and
            replaced by 'images$' in the destination name.

    Returns:
        (status, dst_path) where status is "skipped" (destination already
        exists), "copied" (JPEG copied unchanged), "converted" (re-encoded)
        or "error" (conversion failed; the error is printed, not raised).
    """
    _, file_root, ext = parse_prefixed_name(base_name)
    dst_path = f"{DST_IMAGES}/images${file_root}.jpg"

    if fs.exists(dst_path):
        return "skipped", dst_path

    if ext in (".jpg", ".jpeg"):
        # Already JPEG: copy/rename only, never recompress.
        return copy_minio(src_path, dst_path), dst_path

    try:
        with fs.open(src_path, "rb") as fr:
            img_bytes = fr.read()

        # Context manager releases the decoder's resources once the pixel
        # data has been converted into a new, independent image.
        with Image.open(io.BytesIO(img_bytes)) as img:
            has_alpha = img.mode in ("RGBA", "LA") or (
                img.mode == "P" and "transparency" in img.info
            )
            if has_alpha:
                # JPEG has no alpha channel: composite over white.
                rgba = img.convert("RGBA")
                rgb = Image.new("RGB", rgba.size, (255, 255, 255))
                rgb.paste(rgba, mask=rgba.split()[-1])
            else:
                rgb = img.convert("RGB")

        # Encode fully in memory before touching the destination. If the
        # encoder fails, nothing is opened for writing, so no truncated
        # object can be left behind and later mistaken for a finished file
        # by the exists() check above.
        encoded = io.BytesIO()
        rgb.save(encoded, format="JPEG", quality=90, optimize=True, progressive=True)

        with fs.open(dst_path, "wb") as fw:
            fw.write(encoded.getvalue())

        return "converted", dst_path
    except Exception as e:
        print(f"⚠️ Error convirtiendo imagen {src_path} -> JPG: {e}")
        return "error", dst_path

# --------- Walk persistent_landing and process every entry ----------
paths = fs.find(SRC_BASE)

# Per-category tallies, reported in the summary printed at the end.
c_img_conv = c_img_copy = c_img_skip = c_img_err = 0
c_meta_copy = c_meta_skip = 0
c_oth_copy  = c_oth_skip  = 0

for src in paths:
    # Entries whose path ends with "/" are not processed.
    if src.endswith("/"):
        continue

    base_name = src.rsplit("/", 1)[-1]
    kind = classify_kind(base_name)

    if kind == "images":
        # Images are normalised to JPG; any status other than the three
        # known ones is counted as an error.
        status, dst = convert_image_to_jpg(src, base_name)
        if status == "skipped":
            c_img_skip += 1
        elif status == "copied":
            c_img_copy += 1
        elif status == "converted":
            c_img_conv += 1
        else:
            c_img_err += 1
        continue

    # Metadata and everything else are copied unchanged (same name and
    # extension) into their own sub-prefix of the formatted zone.
    dst = f"{DST_METADATA if kind == 'metadata' else DST_OTHERS}/{base_name}"
    status = copy_minio(src, dst)

    if kind == "metadata":
        if status == "skipped":
            c_meta_skip += 1
        else:
            c_meta_copy += 1
    elif status == "skipped":
        c_oth_skip += 1
    else:
        c_oth_copy += 1

print("✅ Formateo completado → formatted-zone (solo imágenes a JPG; metadata/others tal cual)")
print(f"  IMAGES   → JPG     : converted={c_img_conv}, copied_jpg={c_img_copy}, skipped={c_img_skip}, errors={c_img_err}")
print(f"  METADATA → copied  : copied={c_meta_copy}, skipped={c_meta_skip}")
print(f"  OTHERS   → copied  : copied={c_oth_copy}, skipped={c_oth_skip}")
print(f"Destino base: {DST_BASE}")


✅ Formateo completado → formatted-zone (solo imágenes a JPG; metadata/others tal cual)
  IMAGES   → JPG     : converted=0, copied_jpg=17, skipped=0, errors=0
  METADATA → copied  : copied=4, skipped=0
  OTHERS   → copied  : copied=1, skipped=0
Destino base: formatted-zone


In [None]:
# TODO: ask the teacher about unifying the metadata files
# TODO: move the formatting step into a separate notebook