In [1]:
import os

from dotenv import load_dotenv

# Pull variables from a local .env file into os.environ before anything reads them.
load_dotenv() # loads HF_TOKEN; token should have write permissions

# Enable hf_transfer.
# NOTE: huggingface_hub evaluates this flag when the package is imported, so it
# has to be set *before* the huggingface_hub import below; setting it afterwards
# has no effect on a fresh kernel.
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = "1"

from huggingface_hub import HfApi, login  # noqa: E402  (must follow the env setup above)

# Local directory holding the image folders / archives that get uploaded.
DATA_PATH = 'data'

In [2]:
import zipfile
from tqdm import tqdm
from pathlib import Path

def zip_image_folders(root='data', overwrite=False, image_exts=('.jpg', '.jpeg', '.png', '.webp')):
    """Zip every image folder found directly under ``root``.

    For each non-hidden direct subdirectory of ``root`` that contains at least
    one image (searched recursively), an archive named ``<folder name>.zip`` is
    written next to the folder. Paths inside the archive start with the folder
    name, so extracting it recreates the folder.

    Args:
        root: Directory whose direct subdirectories are scanned.
        overwrite: When False, a folder whose archive already exists is skipped
            and a message is printed; when True, the archive is rebuilt.
        image_exts: Lower-case file suffixes (with leading dot) treated as
            images. The file's suffix is lower-cased before comparison.

    Returns:
        None. Archives are written to disk and progress is printed.
    """
    root = Path(root)

    # Iterate only over directories (excluding hidden ones); sorted so that the
    # processing order does not depend on filesystem enumeration order.
    folders = sorted(f for f in root.iterdir() if f.is_dir() and not f.name.startswith('.'))

    for folder in folders:
        # Recursively find all image files in the folder (sorted for a
        # reproducible archive layout).
        files = sorted(f for f in folder.rglob('*') if f.is_file() and f.suffix.lower() in image_exts)
        if not files:
            continue  # Skip empty or non-image folders

        # Append ".zip" to the *whole* folder name. with_suffix() would replace
        # whatever follows the last dot, so "set.v1" and "set.v2" would both
        # map to "set.zip" and the second folder would be silently skipped.
        zip_path = folder.with_name(folder.name + '.zip')
        if not overwrite and zip_path.exists():
            print(f"Already exists: {zip_path.name}")
            continue

        print(f"\nZipping {folder.name} → {zip_path.name}")

        # Build the archive under a temporary name and rename it only once it
        # is complete. Otherwise an interrupted run leaves a truncated .zip
        # that later runs would treat as "Already exists".
        part_path = zip_path.with_name(zip_path.name + '.part')
        try:
            with zipfile.ZipFile(part_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
                for file in tqdm(files, desc=f"Zipping {folder.name}"):
                    arcname = Path(folder.name) / file.relative_to(folder)
                    zipf.write(file, arcname=arcname)
            part_path.replace(zip_path)
        except BaseException:
            # Also covers KeyboardInterrupt: never leave a partial file behind.
            if part_path.exists():
                part_path.unlink()
            raise

# Usage
# Zip each image folder under DATA_PATH; archives that already exist are kept as-is.
zip_image_folders(root=DATA_PATH, overwrite=False)

Already exists: images_7k.zip
Already exists: images_OZ_geo_5500.zip


In [3]:
from huggingface_hub import HfApi

# Raw image files are shipped inside the per-folder .zip archives, so they are
# excluded from the upload. Keep this list in sync with the default
# `image_exts` of zip_image_folders: previously only .jpg/.webp were ignored,
# which let raw .jpeg/.png files be uploaded alongside the archives.
RAW_IMAGE_EXTS = ('.jpg', '.jpeg', '.png', '.webp')

api = HfApi()
api.upload_folder(
    folder_path=DATA_PATH,  # Path to the local directory
    repo_id="INDEEPA/clip-siamese",
    repo_type="dataset",
    ignore_patterns=[f"**/*{ext}" for ext in RAW_IMAGE_EXTS],
)

CommitInfo(commit_url='https://huggingface.co/datasets/INDEEPA/clip-siamese/commit/c86827dece8e2403c556067ae448bae4e83a042a', commit_message='Upload folder using huggingface_hub', commit_description='', oid='c86827dece8e2403c556067ae448bae4e83a042a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/INDEEPA/clip-siamese', endpoint='https://huggingface.co', repo_type='dataset', repo_id='INDEEPA/clip-siamese'), pr_revision=None, pr_num=None)

In [None]:
# # Delete a single folder (and everything under it) from the repo using HfFileSystem
# from huggingface_hub import HfFileSystem
# fs = HfFileSystem()

# # Recursively delete the folder at this repo path
# fs.rm("datasets/INDEEPA/clip-siamese/tables_OZ_geo_5500", recursive=True)

In [None]:
# # DELETE ALL FILES IN THE REPO COMPLETELY
# # RECOVERABLE BY REVERTING TO AN EARLIER COMMIT, BUT THINK TWICE!

# from huggingface_hub import HfFileSystem

# # Initialize the HfFileSystem
# fs = HfFileSystem()

# # Define the repository path
# repo_path = "datasets/INDEEPA/clip-siamese"

# # Delete all files and folders in the repository
# fs.rm(f"{repo_path}/", recursive=True)