In [1]:
import os
from dotenv import load_dotenv
from datasets import  load_dataset
from huggingface_hub import HfApi, hf_hub_url
import s3fs
from os.path import basename
import requests
import base64

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load variables from a local .env file (python-dotenv) before reading them.
load_dotenv()

# Hugging Face Hub settings. Every value below is None when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")      # access token sent with the Hub requests
HF_ORGA = os.environ.get("HF_ORGA")        # author (organisation) whose datasets are listed
HF_DATASET = os.environ.get("HF_DATASET")  # not read by the cells visible in this notebook
HF_REV = os.environ.get("HF_REV")          # revision passed to the Hub file listing / URL calls

# MinIO (S3-compatible) settings.
MINIO_PASSWORD = os.environ.get("MINIO_PASSWORD")  # used as the S3 secret
MINIO_USER = os.environ.get("MINIO_USER")          # used as the S3 key
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT")  # endpoint URL handed to the S3 client
MINIO_BUCKET = os.environ.get("MINIO_BUCKET")      # bucket that receives the copied files

In [3]:
# Hub client reused by the cells below to list datasets and repository files.
api = HfApi()
# Look up the account behind HF_TOKEN. The result is stored in `info` but is not
# read by the cells visible here — presumably an early credentials check.
info = api.whoami(token=HF_TOKEN)

In [4]:
# S3-compatible filesystem handle pointed at the MinIO endpoint; the download
# cell below writes through it with fs.open(...).
fs = s3fs.S3FileSystem(
    key=MINIO_USER,
    secret=MINIO_PASSWORD,
    client_kwargs={"endpoint_url": MINIO_ENDPOINT}
)

In [5]:
# All datasets published by HF_ORGA. list_datasets can return a lazy, single-use
# iterable, so it is materialised into a list: the download cell can then be
# re-run without re-running this cell and still see every dataset.
datasets = list(api.list_datasets(author=HF_ORGA, token=HF_TOKEN))

In [6]:
# Repository bookkeeping files that are never copied to the landing zone.
SKIP = {".gitattributes", ".gitignore", ".gitkeep"}

# Stream every non-hidden file of every listed dataset from the Hugging Face Hub
# into the "temporal_landing" prefix of the MinIO bucket.
for ds in datasets:
    ds_id = ds.id
    # Pass the token explicitly, like the other Hub calls in this notebook, so the
    # listing uses the same credentials as the download request below.
    files = api.list_repo_files(
        repo_id=ds_id, repo_type="dataset", revision=HF_REV, token=HF_TOKEN
    )
    for path in files:
        fname = basename(path)
        # Skip dot-files (and anything explicitly listed in SKIP).
        if fname.startswith(".") or fname in SKIP:
            continue
        url = hf_hub_url(repo_id=ds_id, filename=path, repo_type="dataset", revision=HF_REV)
        # TODO many files in folders can result in the same key, see and ask the best way to do it.
        # NOTE: the key is built from the in-repo path only (no dataset id), so files
        # with the same path in different datasets also map to the same object.
        target_key = f"{MINIO_BUCKET}/temporal_landing/{path.replace('/', '__')}"
        # The response is used as a context manager so the streamed connection is
        # released even if raise_for_status() or the upload fails.
        with requests.get(url, stream=True, headers={"authorization":f"Bearer {HF_TOKEN}"}) as r:
            r.raise_for_status()
            with fs.open(target_key, "wb") as f:
                # Copy in 1 MiB chunks to keep memory use bounded.
                for chunk in r.iter_content(1024*1024):
                    if chunk:
                        f.write(chunk)

In [7]:
# TODO: delete .keep files?
# TODO: decide whether metadata needs to be added in every zone
# TODO: extract from the temporal landing zone to the persistent landing zone (naming convention & organization)
# TODO: decide whether to store the data by source, by domain, etc.
# TODO: naming convention: <source>$<dataset>$<timestamp>.<format>, where <timestamp> is dd-mm-yyyy
# TODO: this should be done in a separate notebook