# MinIO Utilities

Utility functions for managing MinIO buckets and objects.

## Prerequisites

- MinIO server running (locally: `task up` or `docker-compose up`)
- Port-forwarding enabled for local kind cluster (`task port-forward`)
- boto3 installed (`pip install boto3` or `uv pip install boto3`)

## Setup and Configuration

In [1]:
import json
import os
from datetime import datetime
from typing import Any, Dict, List, Optional

import boto3
from botocore.exceptions import ClientError

In [2]:
# MinIO connection settings; each value can be overridden via the environment
# and otherwise falls back to the local development default.
MINIO_ENDPOINT = os.environ.get('MINIO_ENDPOINT_URL', 'http://localhost:19000')
MINIO_ACCESS_KEY = os.environ.get('MINIO_ACCESS_KEY_ID', 'minioadmin')
MINIO_SECRET_KEY = os.environ.get('MINIO_SECRET_ACCESS_KEY', 'minioadmin')

# Show which endpoint and access key are in effect (the secret key is not printed).
print(f"MinIO Endpoint: {MINIO_ENDPOINT}")
print(f"Access Key: {MINIO_ACCESS_KEY}")

MinIO Endpoint: http://localhost:19000
Access Key: minioadmin


In [3]:
# Both the low-level client and the resource API talk to the same MinIO
# endpoint with the same credentials, so the settings are declared once.
_minio_connection = {
    'endpoint_url': MINIO_ENDPOINT,
    'aws_access_key_id': MINIO_ACCESS_KEY,
    'aws_secret_access_key': MINIO_SECRET_KEY,
}

s3_client = boto3.client('s3', **_minio_connection)
s3_resource = boto3.resource('s3', **_minio_connection)

print("✓ MinIO client initialized")

✓ MinIO client initialized


## Bucket Management Functions

In [4]:
def list_buckets() -> List[Dict[str, Any]]:
    """
    List all buckets in MinIO and print a one-line summary of each.

    Returns:
        The bucket dictionaries from the ``list_buckets`` response (this
        function reads their 'Name' and 'CreationDate' entries), or an
        empty list if the request fails with a ClientError.
    """
    try:
        response = s3_client.list_buckets()
    except ClientError as e:
        print(f"Error listing buckets: {e}")
        return []

    buckets = response.get('Buckets', [])
    print(f"Found {len(buckets)} bucket(s):\n")

    for bucket in buckets:
        name = bucket['Name']
        created = bucket['CreationDate'].strftime('%Y-%m-%d %H:%M:%S')
        print(f"  • {name:30s} (created: {created})")

    return buckets

In [None]:
def create_bucket(bucket_name: str, region: Optional[str] = None) -> bool:
    """
    Create a bucket in MinIO, optionally with a location constraint.

    Args:
        bucket_name: Name of the bucket to create
        region: Region to send as the LocationConstraint; omitted from the
            request when empty or None

    Returns:
        True if the bucket was created, False if it already exists or the
        request failed with a ClientError
    """
    request = {'Bucket': bucket_name}
    if region:
        request['CreateBucketConfiguration'] = {'LocationConstraint': region}

    try:
        s3_client.create_bucket(**request)
    except ClientError as err:
        code = err.response['Error']['Code']
        if code == 'BucketAlreadyOwnedByYou':
            print(f"⚠ Bucket '{bucket_name}' already exists and is owned by you")
        elif code == 'BucketAlreadyExists':
            print(f"⚠ Bucket '{bucket_name}' already exists")
        else:
            print(f"Error creating bucket: {err}")
        return False

    print(f"✓ Bucket '{bucket_name}' created successfully")
    return True

In [5]:
def delete_bucket(bucket_name: str, force: bool = False) -> bool:
    """
    Remove a bucket from MinIO, optionally emptying it first.

    Args:
        bucket_name: Name of the bucket to delete
        force: When True, all objects and object versions are deleted before
            the bucket itself. When False, the bucket must already be empty.

    Returns:
        True if the bucket was deleted, False if a ClientError occurred
    """
    try:
        target = s3_resource.Bucket(bucket_name)

        if force:
            print(f"⚠ Force deleting bucket '{bucket_name}' and all its contents...")
            # Current objects first, then any remaining versions.
            target.objects.all().delete()
            target.object_versions.all().delete()
            print("  ✓ All objects deleted")

        s3_client.delete_bucket(Bucket=bucket_name)
    except ClientError as err:
        code = err.response['Error']['Code']
        # Friendly messages for the expected failures; anything else is shown raw.
        friendly = {
            'NoSuchBucket': f"⚠ Bucket '{bucket_name}' does not exist",
            'BucketNotEmpty': f"⚠ Bucket '{bucket_name}' is not empty. Use force=True to delete contents.",
        }
        print(friendly.get(code, f"Error deleting bucket: {err}"))
        return False

    print(f"✓ Bucket '{bucket_name}' deleted successfully")
    return True

In [None]:
def bucket_exists(bucket_name: str) -> bool:
    """
    Check if a bucket exists.

    Args:
        bucket_name: Name of the bucket to check

    Returns:
        True if the HEAD request on the bucket succeeds, False otherwise.
        A "not found" answer is returned silently; any other ClientError
        (for example access denied) is printed before returning False, so
        a failed check is not mistaken for a missing bucket.
    """
    try:
        s3_client.head_bucket(Bucket=bucket_name)
        return True
    except ClientError as e:
        error_code = e.response.get('Error', {}).get('Code', '')
        # A HEAD response has no body, so a missing bucket usually surfaces
        # as the bare HTTP status '404' rather than 'NoSuchBucket'.
        if error_code not in ('404', 'NoSuchBucket', 'NotFound'):
            print(f"Error checking bucket '{bucket_name}': {e}")
        return False

In [9]:
def get_bucket_size(bucket_name: str) -> Dict[str, Any]:
    """
    Get the total size and object count of a bucket.

    Iterates over every object in the bucket and sums the sizes, then
    prints a short summary.

    Args:
        bucket_name: Name of the bucket

    Returns:
        Dictionary with 'bucket_name', 'object_count', 'total_size_bytes',
        'total_size_mb' and 'total_size_gb' (the last two rounded to two
        decimals), or an empty dictionary if a ClientError occurs.
    """
    try:
        bucket = s3_resource.Bucket(bucket_name)
        total_size = 0
        object_count = 0

        for obj in bucket.objects.all():
            total_size += obj.size
            object_count += 1
    except ClientError as e:
        print(f"Error getting bucket size: {e}")
        return {}

    result = {
        'bucket_name': bucket_name,
        'object_count': object_count,
        'total_size_bytes': total_size,
        'total_size_mb': round(total_size / (1024 * 1024), 2),
        'total_size_gb': round(total_size / (1024 * 1024 * 1024), 2),
    }

    print(f"Bucket: {bucket_name}")
    print(f"  Objects: {object_count}")
    print(f"  Size: {result['total_size_mb']} MB ({result['total_size_gb']} GB)")

    return result

## Object Management Functions

In [None]:
def list_objects(bucket_name: str, prefix: str = '', max_keys: int = 1000) -> List[Dict[str, Any]]:
    """
    List objects in a bucket.

    Prints a preview of the first 20 objects found and returns all of them.

    Args:
        bucket_name: Name of the bucket
        prefix: Filter objects by prefix (folder path)
        max_keys: Maximum number of objects to return. Values above 1000
            are honoured by following the listing's continuation tokens.

    Returns:
        List of object dictionaries (this function reads their 'Key',
        'Size' and 'LastModified' entries), or an empty list if nothing
        matches or a ClientError occurs.
    """
    preview_limit = 20

    try:
        # A single list_objects_v2 call is capped at 1000 keys, so page
        # through the results and let the paginator stop at max_keys.
        paginator = s3_client.get_paginator('list_objects_v2')
        pages = paginator.paginate(
            Bucket=bucket_name,
            Prefix=prefix,
            PaginationConfig={'MaxItems': max_keys},
        )

        objects = []
        for page in pages:
            objects.extend(page.get('Contents', []))
    except ClientError as e:
        print(f"Error listing objects: {e}")
        return []

    if not objects:
        print(f"No objects found in '{bucket_name}' with prefix '{prefix}'")
        return []

    print(f"Found {len(objects)} object(s) in '{bucket_name}' (prefix: '{prefix}'):\n")

    for obj in objects[:preview_limit]:
        key = obj['Key']
        size_mb = obj['Size'] / (1024 * 1024)
        modified = obj['LastModified'].strftime('%Y-%m-%d %H:%M:%S')
        print(f"  • {key:50s} {size_mb:8.2f} MB  {modified}")

    if len(objects) > preview_limit:
        print(f"\n  ... and {len(objects) - preview_limit} more objects")

    return objects

In [None]:
def delete_object(bucket_name: str, object_key: str) -> bool:
    """
    Remove one object from a bucket.

    Args:
        bucket_name: Name of the bucket
        object_key: Key (path) of the object to delete

    Returns:
        True if the delete request succeeded, False if it raised a ClientError
    """
    try:
        s3_client.delete_object(Bucket=bucket_name, Key=object_key)
    except ClientError as err:
        print(f"Error deleting object: {err}")
        return False
    else:
        print(f"✓ Deleted: {object_key}")
        return True

In [None]:
def delete_objects_by_prefix(bucket_name: str, prefix: str, dry_run: bool = True) -> int:
    """
    Delete every object whose key starts with the given prefix (folder).

    Args:
        bucket_name: Name of the bucket
        prefix: Prefix (folder path) to delete
        dry_run: If True, only preview the matching objects without deleting

    Returns:
        Number of objects deleted (or that would be deleted in dry_run mode);
        0 when nothing matches or a ClientError occurs
    """
    try:
        matches = list(s3_resource.Bucket(bucket_name).objects.filter(Prefix=prefix))
        total = len(matches)

        if total == 0:
            print(f"No objects found with prefix '{prefix}' in bucket '{bucket_name}'")
            return 0

        if not dry_run:
            print(f"⚠ Deleting {total} object(s) with prefix '{prefix}'...")
            for match in matches:
                match.delete()
            print(f"✓ Deleted {total} object(s)")
            return total

        # Dry run: preview at most the first 20 keys.
        print(f"DRY RUN: Would delete {total} object(s) with prefix '{prefix}':\n")
        for match in matches[:20]:
            print(f"  • {match.key}")
        if total > 20:
            print(f"  ... and {total - 20} more objects")
        print("\nSet dry_run=False to actually delete these objects")
        return total

    except ClientError as err:
        print(f"Error deleting objects: {err}")
        return 0

In [None]:
def delete_all_objects(bucket_name: str, dry_run: bool = True) -> int:
    """
    Delete every object in a bucket.

    Args:
        bucket_name: Name of the bucket
        dry_run: If True, only list objects without deleting

    Returns:
        Number of objects deleted (or would be deleted in dry_run mode)
    """
    # Delegates to the prefix-based delete with an empty prefix.
    match_everything = ''
    return delete_objects_by_prefix(bucket_name, match_everything, dry_run=dry_run)

In [None]:
def copy_object(source_bucket: str, source_key: str, dest_bucket: str, dest_key: str) -> bool:
    """
    Copy a single object to another key and/or bucket.

    Args:
        source_bucket: Source bucket name
        source_key: Source object key
        dest_bucket: Destination bucket name
        dest_key: Destination object key

    Returns:
        True if the copy request succeeded, False if it raised a ClientError
    """
    try:
        s3_client.copy_object(
            CopySource={'Bucket': source_bucket, 'Key': source_key},
            Bucket=dest_bucket,
            Key=dest_key,
        )
    except ClientError as err:
        print(f"Error copying object: {err}")
        return False

    print(f"✓ Copied: {source_bucket}/{source_key} → {dest_bucket}/{dest_key}")
    return True

In [None]:
def download_object(bucket_name: str, object_key: str, local_path: str) -> bool:
    """
    Download an object to a local file.

    Args:
        bucket_name: Name of the bucket
        object_key: Key of the object to download
        local_path: Local file path to save to

    Returns:
        True if successful, False if the request fails with a ClientError
        or the local file cannot be written
    """
    try:
        s3_client.download_file(bucket_name, object_key, local_path)
        print(f"✓ Downloaded: {bucket_name}/{object_key} → {local_path}")
        return True
    except ClientError as e:
        print(f"Error downloading object: {e}")
        return False
    except OSError as e:
        # Local filesystem problems (e.g. the target directory is missing)
        # are reported like any other failure, mirroring upload_file's
        # handling of a missing local file.
        print(f"Error writing to '{local_path}': {e}")
        return False

In [None]:
def upload_file(local_path: str, bucket_name: str, object_key: str) -> bool:
    """
    Upload a local file to MinIO.

    Args:
        local_path: Path to local file
        bucket_name: Name of the destination bucket
        object_key: Key (path) for the object in the bucket

    Returns:
        True if successful, False if the upload fails or the local file
        does not exist
    """
    # Imported locally so this function does not depend on extra
    # module-level imports.
    from boto3.exceptions import S3UploadFailedError

    try:
        s3_client.upload_file(local_path, bucket_name, object_key)
        print(f"✓ Uploaded: {local_path} → {bucket_name}/{object_key}")
        return True
    except (ClientError, S3UploadFailedError) as e:
        # boto3's managed upload reports failed transfers as
        # S3UploadFailedError, which is not a ClientError subclass.
        print(f"Error uploading file: {e}")
        return False
    except FileNotFoundError:
        print(f"Error: File not found: {local_path}")
        return False

## Utility Functions

In [None]:
def get_all_bucket_stats() -> List[Dict[str, Any]]:
    """
    Get size and object count statistics for all buckets.

    Lists the buckets, then collects the result of get_bucket_size for each
    one in listing order.

    Returns:
        List of bucket statistics, one entry per bucket. A bucket whose
        statistics could not be read contributes an empty dictionary.
    """
    buckets = list_buckets()
    stats = []

    print("\n" + "=" * 80)
    print("Bucket Statistics")
    print("=" * 80 + "\n")

    for bucket in buckets:
        stats.append(get_bucket_size(bucket['Name']))
        print()  # blank line between per-bucket summaries

    return stats

In [7]:
def cleanup_empty_buckets(dry_run: bool = True) -> int:
    """
    Delete all empty buckets.

    A bucket counts as empty only when its statistics were read
    successfully and report zero objects.

    Args:
        dry_run: If True, only list empty buckets without deleting

    Returns:
        In dry_run mode, the number of empty buckets that would be deleted;
        otherwise the number of buckets that were actually deleted.
    """
    buckets = list_buckets()
    empty_buckets = []

    for bucket in buckets:
        bucket_name = bucket['Name']
        stat = get_bucket_size(bucket_name)
        # get_bucket_size returns {} when it fails; a bucket that could not
        # be inspected must not be treated as empty.
        if stat.get('object_count') == 0:
            empty_buckets.append(bucket_name)

    if not empty_buckets:
        print("No empty buckets found")
        return 0

    count = len(empty_buckets)

    if dry_run:
        print(f"\nDRY RUN: Would delete {count} empty bucket(s):\n")
        for bucket_name in empty_buckets:
            print(f"  • {bucket_name}")
        print("\nSet dry_run=False to actually delete these buckets")
        return count

    print(f"\n⚠ Deleting {count} empty bucket(s)...\n")
    deleted = 0
    for bucket_name in empty_buckets:
        # delete_bucket reports its own errors and returns False on failure.
        if delete_bucket(bucket_name, force=False):
            deleted += 1
    print(f"\n✓ Deleted {deleted} empty bucket(s)")
    if deleted < count:
        print(f"⚠ {count - deleted} bucket(s) could not be deleted")

    return deleted

## Examples and Usage

### Example 1: List all buckets

In [None]:
# Prints a one-line summary per bucket and keeps the returned list for later cells.
buckets = list_buckets()

### Example 2: Get statistics for all buckets

In [None]:
# Prints per-bucket object counts and sizes; this iterates every object in every bucket.
stats = get_all_bucket_stats()

### Example 3: List objects in a specific bucket

In [None]:
# Replace 'your-bucket-name' with an actual bucket name
# objects = list_objects('your-bucket-name')

# Or with a prefix to filter by folder
# objects = list_objects('your-bucket-name', prefix='data/raw/')

### Example 4: Create a new bucket

In [None]:
# Uncomment to create a test bucket
# create_bucket('test-bucket')

### Example 5: Delete objects by prefix (DRY RUN first)

In [None]:
# First do a dry run to see what would be deleted
# delete_objects_by_prefix('your-bucket-name', 'old-data/', dry_run=True)

# Then actually delete (uncomment and run after reviewing)
# delete_objects_by_prefix('your-bucket-name', 'old-data/', dry_run=False)

### Example 6: Delete a bucket (with force option)

In [None]:
# Delete empty bucket
# delete_bucket('test-bucket')

# Force delete bucket with all contents (DANGEROUS!)
# delete_bucket('test-bucket', force=True)

### Example 7: Clean up empty buckets

In [None]:
# First do a dry run: lists the empty buckets without deleting anything
cleanup_empty_buckets(dry_run=True)

# Then actually delete (uncomment after reviewing the dry-run output)
# cleanup_empty_buckets(dry_run=False)

Found 2 bucket(s):

  • dlt-data                       (created: 2025-12-16 06:57:13)
  • lakefs-data                    (created: 2025-12-17 23:54:38)
Bucket: dlt-data
  Objects: 0
  Size: 0.0 MB (0.0 GB)
Bucket: lakefs-data
  Objects: 0
  Size: 0.0 MB (0.0 GB)

DRY RUN: Would delete 2 empty bucket(s):

  • dlt-data
  • lakefs-data

Set dry_run=False to actually delete these buckets


2

### Example 8: Upload and download files

In [None]:
# Upload a file
# upload_file('/path/to/local/file.txt', 'your-bucket-name', 'data/file.txt')

# Download a file
# download_object('your-bucket-name', 'data/file.txt', '/path/to/download/file.txt')

### Example 9: Copy objects between buckets

In [None]:
# Copy an object
# copy_object('source-bucket', 'data/file.txt', 'dest-bucket', 'backup/file.txt')

## Safety Reminders

⚠️ **IMPORTANT SAFETY NOTES:**

1. **Always use `dry_run=True` first** when deleting objects or buckets in bulk
2. **Deletions are irreversible** - make sure you have backups if needed
3. **Check dependencies** - deleting buckets used by LakeFS or other services will break them
4. **Use prefixes carefully** - deleting by prefix can remove entire folder structures
5. **Test in development first** - try destructive operations on dev/test buckets before running them against buckets you care about

### Common LakeFS-related buckets (DO NOT DELETE unless you know what you're doing):
- Buckets with names containing `lakefs`, `iceberg`, `warehouse`
- Any bucket referenced in your LakeFS configuration
- Any bucket containing Iceberg table metadata