In [16]:
import os
from dotenv import load_dotenv
import re
import time
import boto3
import mimetypes
from datetime import datetime, timezone
from urllib.parse import unquote
from pathlib import PurePosixPath
from botocore.config import Config
from botocore.exceptions import ClientError

In [17]:

# Pull settings from a local .env file (if any) before the environment is read below.
load_dotenv()

# --- Buckets and key prefixes -------------------------------------------------
# Source and destination are the same bucket; only the key prefix changes.
SRC_BUCKET = "landing-zone"
DEST_BUCKET = SRC_BUCKET
IMG_PREFIX = "persistent_landing/images"
DOC_PREFIX = "persistent_landing/documents"

# Dataset name embedded in every destination key and in the object metadata.
HF_DATASET = os.environ.get("HF_DATASET")

# --- Object-store connection settings (None when the variable is not set) -----
MINIO_USER = os.environ.get("MINIO_USER")
MINIO_PASSWORD = os.environ.get("MINIO_PASSWORD")
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT")

# When True the source object is deleted once it has been copied (i.e. a move).
DELETE_SOURCE_AFTER_COPY = True

# --- File-type classification -------------------------------------------------
IMAGE_MIME_PREFIXES = ("image/",)
IMAGE_EXTS = {
    "jpg",
    "jpeg",
    "png",
    "gif",
    "webp",
    "bmp",
    "tiff",
}
DOC_EXTS = {
    "json",
    "jsonl",
    "ndjson",
}

Cargamos las variables de entorno necesarias y definimos el bucket de origen y el bucket de destino. También indicamos si está activado el borrado tras copiar, para dejar limpia la temporal landing zone.

In [18]:
# Client options: SigV4 request signing and path-style addressing
# (bucket name in the URL path rather than in the host name).
_s3_client_config = Config(
    signature_version="s3v4",
    s3={"addressing_style": "path"},
)

# S3 client pointed at the endpoint and credentials read from the environment.
s3 = boto3.client(
    "s3",
    config=_s3_client_config,
    region_name="us-east-1",
    endpoint_url=MINIO_ENDPOINT,
    aws_access_key_id=MINIO_USER,
    aws_secret_access_key=MINIO_PASSWORD,
)

Cliente S3 (boto3) configurado para conectarse a MinIO.

In [None]:
def utc_ts() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH-MM-SSZ``.

    Hyphens are used in the time part instead of colons.
    """
    now_utc = datetime.now(tz=timezone.utc)
    return format(now_utc, "%Y-%m-%dT%H-%M-%SZ")


def guess_name_and_ext(key: str, head: dict) -> tuple[str, str]:
    """Derive a base file name and a normalised extension for an object.

    Args:
        key: Object key; only its last path component is used.
        head: Mapping that may carry a ``"ContentType"`` entry (the caller
            passes the ``head_object`` response).

    Returns:
        ``(base, ext)`` where ``base`` is the file name without its last
        suffix (``"file"`` if empty) and ``ext`` is lower-case, without a
        leading dot. When the key has no suffix, the extension is guessed
        from the Content-Type; if nothing can be determined, ``"bin"``.
    """
    path = PurePosixPath(key)
    base = path.stem or "file"
    ext = path.suffix.lower().lstrip(".")

    if not ext:
        # No suffix in the key: fall back to the MIME type, ignoring any
        # parameters such as "; charset=...".
        ctype = (head.get("ContentType") or "").split(";")[0].strip().lower()
        if ctype:
            ext = (mimetypes.guess_extension(ctype) or "").lstrip(".")

    # Collapse every JPEG alias to "jpg", whether it came from the key or
    # from the MIME guess, so the same file type always gets the same suffix.
    if ext in ("jpeg", "jpe"):
        ext = "jpg"
    return base, ext or "bin"

def is_image(head: dict, ext: str) -> bool:
    """Tell whether an object should be treated as an image.

    True when the lower-cased ``ContentType`` in ``head`` starts with one of
    ``IMAGE_MIME_PREFIXES`` or when ``ext`` is listed in ``IMAGE_EXTS``.
    """
    content_type = head.get("ContentType")
    content_type = content_type.lower() if content_type else ""
    if content_type.startswith(IMAGE_MIME_PREFIXES):
        return True
    return ext in IMAGE_EXTS

def is_document_json(head: dict, ext: str) -> bool:
    """Tell whether an object should be treated as a JSON document.

    True when ``ext`` is listed in ``DOC_EXTS`` or when the MIME type in
    ``head["ContentType"]`` (parameters stripped, lower-cased) is exactly
    ``application/json``.
    """
    raw_type = head.get("ContentType") or ""
    mime, _, _ = raw_type.partition(";")
    mime = mime.strip().lower()
    if ext in DOC_EXTS:
        return True
    return mime == "application/json"

def sanitize_filename(s: str) -> str:
    """Replace each run of characters other than word characters, ``-`` and
    ``.`` with a single underscore."""
    unsafe_run = re.compile(r"[^\w\-.]+")
    return unsafe_run.sub("_", s)

def make_target_key(obj_type: str, dataset: str, ts: str, filename: str, ext: str, prefix: str) -> str:
    """Build the destination key for an object.

    The result has the shape ``<prefix>/<obj_type>$<dataset>$<ts>$<filename>.<ext>``.
    ``filename`` and ``dataset`` are passed through ``sanitize_filename`` first,
    so neither can contain the ``$`` separator; the other parts are used as given.
    """
    safe_name = sanitize_filename(filename)
    safe_dataset = sanitize_filename(dataset)
    object_stem = f"{obj_type}${safe_dataset}${ts}${safe_name}"
    return f"{prefix}/{object_stem}.{ext}"

def copy_object(src_bucket: str, src_key: str, dst_bucket: str, dst_key: str, metadata: dict | None = None, content_type: str | None = None):
    """Server-side copy of one object using the module-level ``s3`` client.

    Args:
        src_bucket: Bucket holding the source object.
        src_key: Key of the source object.
        dst_bucket: Bucket to copy into.
        dst_key: Key to create in ``dst_bucket``.
        metadata: Optional user metadata to store on the copy.
        content_type: Optional Content-Type to store on the copy.

    Raises:
        botocore.exceptions.ClientError: propagated from ``s3.copy_object``.

    Note:
        When ``metadata`` and/or ``content_type`` is given, the copy is made
        with ``MetadataDirective="REPLACE"``, so only the values passed here
        are stored on the destination. When neither is given, no directive is
        sent and the service keeps the source object's metadata.
    """
    extra = {}
    if metadata:
        extra["Metadata"] = metadata
    if content_type:
        extra["ContentType"] = content_type
    if extra:
        # Only ask for replacement when there is something to replace with;
        # an unconditional REPLACE would silently wipe the source's user
        # metadata and Content-Type on a plain copy.
        extra["MetadataDirective"] = "REPLACE"

    s3.copy_object(
        CopySource={"Bucket": src_bucket, "Key": src_key},
        Bucket=dst_bucket,
        Key=dst_key,
        **extra,
    )

def move_or_copy(src_bucket: str, src_key: str, dst_bucket: str, dst_key: str, **kwargs):
    """Copy an object and, if ``DELETE_SOURCE_AFTER_COPY`` is set, delete the source.

    Args:
        src_bucket: Bucket holding the source object.
        src_key: Key of the source object.
        dst_bucket: Bucket to copy into.
        dst_key: Key to create in ``dst_bucket``.
        **kwargs: Forwarded unchanged to ``copy_object``.

    Raises:
        Whatever ``copy_object`` raises; in that case nothing is deleted.
        A failed delete is only reported with a warning (best effort).
    """
    copy_object(src_bucket, src_key, dst_bucket, dst_key, **kwargs)
    if not DELETE_SOURCE_AFTER_COPY:
        return

    if (src_bucket, src_key) == (dst_bucket, dst_key):
        # Source and destination are the very same object: deleting the
        # "source" would remove the only copy.
        print(f"[WARN] source and destination are identical, not deleting {src_key}")
        return

    try:
        s3.delete_object(Bucket=src_bucket, Key=src_key)
    except ClientError as e:
        print(f"[WARN] failed to delete from origin {src_key}: {e}")


Funciones auxiliares para generar el timestamp de ingestión, detectar formatos, etc.

In [None]:
# Walk every object in the source bucket, classify it as image / JSON document,
# and move it under the matching persistent prefix with a normalised name.
paginator = s3.get_paginator("list_objects_v2")
ing_ts = utc_ts()  # one ingestion timestamp shared by every object of this run

pages = paginator.paginate(Bucket=SRC_BUCKET)
total = moved_img = moved_doc = skipped = 0

# The listing is not restricted to a prefix. When source and destination are
# the same bucket it therefore also returns objects that a previous run already
# placed under the persistent prefixes; those must not be moved (and renamed)
# a second time.
already_persisted = (
    (f"{IMG_PREFIX}/", f"{DOC_PREFIX}/") if SRC_BUCKET == DEST_BUCKET else ()
)

for page in pages:
    for obj in page.get("Contents", []):
        key = obj["Key"]
        total += 1

        # Folder placeholders and dot-keys are never ingested.
        if key.endswith("/") or key.startswith("."):
            skipped += 1
            continue

        if already_persisted and key.startswith(already_persisted):
            skipped += 1
            continue

        try:
            head = s3.head_object(Bucket=SRC_BUCKET, Key=key)
        except ClientError as e:
            print(f"[WARN] head_object failed in {key}: {e}")
            skipped += 1
            continue

        base, ext = guess_name_and_ext(key, head)

        # Pick the destination family; everything else about the move is shared.
        if is_image(head, ext):
            kind, tag, prefix = "images", "IMG", IMG_PREFIX
            content_type = head.get("ContentType")
        elif is_document_json(head, ext):
            kind, tag, prefix = "documents", "DOC", DOC_PREFIX
            content_type = head.get("ContentType") or "application/json"
        else:
            skipped += 1
            print(f"[SKIP] {key} (ctype={head.get('ContentType')}, ext=.{ext})")
            continue

        dst_key = make_target_key(kind, HF_DATASET, ing_ts, base, ext, prefix=prefix)
        try:
            move_or_copy(
                SRC_BUCKET, key, DEST_BUCKET, dst_key,
                metadata={
                    "src-bucket": SRC_BUCKET,
                    "src-key": key,
                    "dataset": HF_DATASET,
                    "ingestion-ts": ing_ts,
                },
                content_type=content_type,
            )
        except ClientError as e:
            # One failing object must not abort the whole run; report it and
            # count it as skipped, the same way head_object failures are handled.
            print(f"[WARN] copy failed for {key}: {e}")
            skipped += 1
            continue

        if kind == "images":
            moved_img += 1
        else:
            moved_doc += 1
        print(f"[{tag}] {key} -> s3://{DEST_BUCKET}/{dst_key}")

print(f"\n[STATS] total={total}  images={moved_img}  documents={moved_doc}  skipped={skipped}")

[IMG] temporal_landing/19214085d36fb535c7dbf24d178dea1b__000095fc1d_0.jpg -> s3://landing-zone/persistent_landing/images/images$adsdb-multimodal-food-data-management$2025-10-11T10-52-58Z$19214085d36fb535c7dbf24d178dea1b__000095fc1d_0.jpg
[IMG] temporal_landing/1a426726a1e467b8e6360e980b03f611__00003a70b1_1.jpg -> s3://landing-zone/persistent_landing/images/images$adsdb-multimodal-food-data-management$2025-10-11T10-52-58Z$1a426726a1e467b8e6360e980b03f611__00003a70b1_1.jpg
[IMG] temporal_landing/1b1a47ccbb9d3b879c21952c03111b34__00003a70b1_2.jpg -> s3://landing-zone/persistent_landing/images/images$adsdb-multimodal-food-data-management$2025-10-11T10-52-58Z$1b1a47ccbb9d3b879c21952c03111b34__00003a70b1_2.jpg
[DOC] temporal_landing/41daeff176df5471__layer2.json -> s3://landing-zone/persistent_landing/documents/documents$adsdb-multimodal-food-data-management$2025-10-11T10-52-58Z$41daeff176df5471__layer2.json
[IMG] temporal_landing/4786052a58187b0a9c875fde7cf940c9__00010c7867_0.jpg -> s3://la

TODO: explicar el código y justificar las decisiones tomadas.

In [None]:
# TODO: in the future, also handle audio and video files