# Storage Admin: LakeFS & MinIO Management

This notebook provides utilities for viewing and managing LakeFS and MinIO storage.

**Features:**
- View LakeFS repositories, branches, and objects
- View MinIO buckets and objects
- Delete LakeFS branches and repositories
- Delete MinIO buckets
- Complete reset of all storage

## Setup & Configuration

In [1]:
import os
import json
from datetime import datetime
from typing import Optional

import requests
import boto3
from botocore.client import Config
from botocore.exceptions import ClientError
import pandas as pd
from IPython.display import display, HTML, Markdown

# Suppress SSL warnings for local development
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [2]:
# Configuration - Update these values for your environment
# You can also set these as environment variables

# MinIO Configuration
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT_URL", "http://localhost:19000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY_ID", "minioadmin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_ACCESS_KEY", "minioadmin")

# LakeFS Configuration
LAKEFS_ENDPOINT = os.environ.get("LAKEFS_ENDPOINT_URL", "http://localhost:8000")
LAKEFS_ACCESS_KEY = os.environ.get("LAKEFS_ACCESS_KEY_ID", "AKIAJWAE4BUBMLQESYDQ")
LAKEFS_SECRET_KEY = os.environ.get("LAKEFS_SECRET_ACCESS_KEY", "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")

print(f"MinIO Endpoint: {MINIO_ENDPOINT}")
print(f"LakeFS Endpoint: {LAKEFS_ENDPOINT}")

MinIO Endpoint: http://localhost:19000
LakeFS Endpoint: http://localhost:8000


In [3]:
# Initialize clients

def get_minio_client():
    """Create MinIO S3 client."""
    return boto3.client(
        's3',
        endpoint_url=MINIO_ENDPOINT,
        aws_access_key_id=MINIO_ACCESS_KEY,
        aws_secret_access_key=MINIO_SECRET_KEY,
        config=Config(signature_version='s3v4'),
        region_name='us-east-1'
    )

def get_lakefs_session():
    """Create LakeFS API session with auth."""
    session = requests.Session()
    session.auth = (LAKEFS_ACCESS_KEY, LAKEFS_SECRET_KEY)
    return session

# Test connections
try:
    minio = get_minio_client()
    minio.list_buckets()
    print("✅ MinIO connection successful")
except Exception as e:
    print(f"❌ MinIO connection failed: {e}")

try:
    lakefs = get_lakefs_session()
    resp = lakefs.get(f"{LAKEFS_ENDPOINT}/api/v1/user")
    if resp.status_code == 200:
        print(f"✅ LakeFS connection successful (user: {resp.json().get('id', 'unknown')})")
    else:
        print(f"❌ LakeFS connection failed: {resp.status_code} - {resp.text}")
except Exception as e:
    print(f"❌ LakeFS connection failed: {e}")

✅ MinIO connection successful
✅ LakeFS connection successful (user: unknown)


---
## MinIO Operations

### View MinIO Buckets

In [4]:
def list_minio_buckets():
    """List all MinIO buckets with their creation dates."""
    minio = get_minio_client()
    buckets = minio.list_buckets()
    
    data = []
    for bucket in buckets['Buckets']:
        # Count objects in bucket
        try:
            objects = minio.list_objects_v2(Bucket=bucket['Name'])
            obj_count = objects.get('KeyCount', 0)
        except:
            obj_count = 'N/A'
        
        data.append({
            'Bucket': bucket['Name'],
            'Created': bucket['CreationDate'].strftime('%Y-%m-%d %H:%M:%S'),
            'Objects': obj_count
        })
    
    if data:
        df = pd.DataFrame(data)
        display(df)
    else:
        print("No buckets found")
    
    return [b['Bucket'] for b in data]

minio_buckets = list_minio_buckets()

Unnamed: 0,Bucket,Created,Objects
0,dlt-data,2026-01-26 13:27:36,6
1,lakefs-data,2026-01-26 12:06:00,0
2,lakefs-repo,2026-01-26 12:02:34,94


### View Objects in MinIO Bucket

In [5]:
def list_minio_objects(bucket_name: str, prefix: str = "", max_keys: int = 100):
    """List objects in a MinIO bucket."""
    minio = get_minio_client()
    
    try:
        paginator = minio.get_paginator('list_objects_v2')
        data = []
        
        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix, PaginationConfig={'MaxItems': max_keys}):
            for obj in page.get('Contents', []):
                data.append({
                    'Key': obj['Key'],
                    'Size': f"{obj['Size']:,} bytes",
                    'LastModified': obj['LastModified'].strftime('%Y-%m-%d %H:%M:%S'),
                })
        
        if data:
            df = pd.DataFrame(data)
            print(f"Objects in s3://{bucket_name}/{prefix} (showing up to {max_keys}):")
            display(df)
        else:
            print(f"No objects found in s3://{bucket_name}/{prefix}")
            
    except ClientError as e:
        print(f"Error: {e}")

# Example: List objects in dlt-data bucket
if 'dlt-data' in minio_buckets:
    list_minio_objects('dlt-data')
else:
    print("dlt-data bucket not found. Available buckets:", minio_buckets)

Objects in s3://dlt-data/ (showing up to 100):


Unnamed: 0,Key,Size,LastModified
0,kronodroid_raw/_dlt_loads/kaggle_kronodroid__1...,194 bytes,2026-01-26 13:28:23
1,kronodroid_raw/_dlt_pipeline_state/kronodroid_...,523 bytes,2026-01-26 13:28:23
2,kronodroid_raw/_dlt_version/kaggle_kronodroid_...,"58,711 bytes",2026-01-26 13:28:22
3,kronodroid_raw/init,41 bytes,2026-01-26 13:28:22
4,kronodroid_raw/kronodroid_2021_emu_v1/17694340...,"18,424,235 bytes",2026-01-26 13:28:22
5,kronodroid_raw/kronodroid_2021_real_v1/1769434...,"22,869,520 bytes",2026-01-26 13:28:23


### Delete MinIO Bucket

In [6]:
def delete_minio_bucket(bucket_name: str, force: bool = False):
    """Delete a MinIO bucket. If force=True, delete all objects first."""
    minio = get_minio_client()
    
    try:
        if force:
            # Delete all objects first
            print(f"Deleting all objects in {bucket_name}...")
            paginator = minio.get_paginator('list_objects_v2')
            
            for page in paginator.paginate(Bucket=bucket_name):
                objects = page.get('Contents', [])
                if objects:
                    delete_keys = [{'Key': obj['Key']} for obj in objects]
                    minio.delete_objects(
                        Bucket=bucket_name,
                        Delete={'Objects': delete_keys}
                    )
                    print(f"  Deleted {len(delete_keys)} objects")
            
            # Also delete any versions if versioning is enabled
            try:
                version_paginator = minio.get_paginator('list_object_versions')
                for page in version_paginator.paginate(Bucket=bucket_name):
                    versions = page.get('Versions', []) + page.get('DeleteMarkers', [])
                    if versions:
                        delete_keys = [{'Key': v['Key'], 'VersionId': v['VersionId']} for v in versions]
                        minio.delete_objects(
                            Bucket=bucket_name,
                            Delete={'Objects': delete_keys}
                        )
            except:
                pass  # Versioning might not be enabled
        
        # Delete the bucket
        minio.delete_bucket(Bucket=bucket_name)
        print(f"✅ Bucket '{bucket_name}' deleted successfully")
        return True
        
    except ClientError as e:
        error_code = e.response.get('Error', {}).get('Code', '')
        if error_code == 'BucketNotEmpty':
            print(f"❌ Bucket '{bucket_name}' is not empty. Use force=True to delete all objects first.")
        else:
            print(f"❌ Error deleting bucket: {e}")
        return False

# Uncomment to delete a bucket:
# delete_minio_bucket('bucket-name', force=True)

---
## LakeFS Operations

### View LakeFS Repositories

In [7]:
def list_lakefs_repositories():
    """List all LakeFS repositories."""
    lakefs = get_lakefs_session()
    
    resp = lakefs.get(f"{LAKEFS_ENDPOINT}/api/v1/repositories")
    if resp.status_code != 200:
        print(f"Error: {resp.status_code} - {resp.text}")
        return []
    
    repos = resp.json().get('results', [])
    
    data = []
    for repo in repos:
        created = datetime.fromtimestamp(repo['creation_date']).strftime('%Y-%m-%d %H:%M:%S')
        data.append({
            'Repository': repo['id'],
            'Default Branch': repo['default_branch'],
            'Storage Namespace': repo['storage_namespace'],
            'Created': created,
            'Read Only': repo.get('read_only', False)
        })
    
    if data:
        df = pd.DataFrame(data)
        display(df)
    else:
        print("No repositories found")
    
    return [r['Repository'] for r in data]

lakefs_repos = list_lakefs_repositories()

Unnamed: 0,Repository,Default Branch,Storage Namespace,Created,Read Only
0,kronodroid,main,s3://lakefs-repo/kronodroid,2026-01-26 04:02:34,False


### View LakeFS Branches

In [8]:
def list_lakefs_branches(repository: str):
    """List all branches in a LakeFS repository."""
    lakefs = get_lakefs_session()
    
    resp = lakefs.get(f"{LAKEFS_ENDPOINT}/api/v1/repositories/{repository}/branches")
    if resp.status_code != 200:
        print(f"Error: {resp.status_code} - {resp.text}")
        return []
    
    branches = resp.json().get('results', [])
    
    data = []
    for branch in branches:
        data.append({
            'Branch': branch['id'],
            'Commit ID': branch['commit_id'][:12] + '...' if len(branch['commit_id']) > 12 else branch['commit_id']
        })
    
    if data:
        print(f"Branches in repository '{repository}':")
        df = pd.DataFrame(data)
        display(df)
    else:
        print(f"No branches found in repository '{repository}'")
    
    return [b['Branch'] for b in data]

# List branches for each repository
all_branches = {}
for repo in lakefs_repos:
    all_branches[repo] = list_lakefs_branches(repo)

Branches in repository 'kronodroid':


Unnamed: 0,Branch,Commit ID
0,main,1eec53f42f84...
1,spark-24ff1a46-eac7-40d5-aa04-6757e9c0c73b,1eec53f42f84...
2,spark-3896c6f9-b209-4516-8d2b-3abd7a849d6e,1eec53f42f84...
3,spark-6e65fade-b049-47b3-a845-a4d66fac6c13,1eec53f42f84...
4,spark-74068576-b37a-400b-84e9-c951e9118545,1eec53f42f84...
5,spark-c2b69708-fa42-4a39-ac01-d27a189d4a12,1eec53f42f84...
6,spark-c6af535d-a2e3-4298-82ec-4a8f2cdf7905,1eec53f42f84...
7,spark-ccac8f19-475b-4d05-9ec4-e25583530396,1eec53f42f84...


### View Objects in LakeFS Branch

In [9]:
def list_lakefs_objects(repository: str, ref: str = "main", prefix: str = "", max_items: int = 100):
    """List objects in a LakeFS repository branch."""
    lakefs = get_lakefs_session()
    
    params = {'amount': max_items}
    if prefix:
        params['prefix'] = prefix
    
    resp = lakefs.get(
        f"{LAKEFS_ENDPOINT}/api/v1/repositories/{repository}/refs/{ref}/objects/ls",
        params=params
    )
    
    if resp.status_code != 200:
        print(f"Error: {resp.status_code} - {resp.text}")
        return
    
    objects = resp.json().get('results', [])
    
    data = []
    for obj in objects:
        if obj['path_type'] == 'object':
            mtime = datetime.fromtimestamp(obj['mtime']).strftime('%Y-%m-%d %H:%M:%S')
            data.append({
                'Path': obj['path'],
                'Size': f"{obj['size_bytes']:,} bytes",
                'Modified': mtime,
                'Checksum': obj['checksum'][:12] + '...'
            })
        else:
            data.append({
                'Path': obj['path'] + '/',
                'Size': 'directory',
                'Modified': '',
                'Checksum': ''
            })
    
    if data:
        print(f"Objects in lakefs://{repository}/{ref}/{prefix} (showing up to {max_items}):")
        df = pd.DataFrame(data)
        display(df)
    else:
        print(f"No objects found in lakefs://{repository}/{ref}/{prefix}")

# Example: List objects in kronodroid repository
if 'kronodroid' in lakefs_repos:
    list_lakefs_objects('kronodroid', 'main')

Objects in lakefs://kronodroid/main/ (showing up to 100):


Unnamed: 0,Path,Size,Modified,Checksum
0,kronodroid_raw/_dlt_loads/kaggle_kronodroid__1...,195 bytes,2026-01-26 04:07:37,055def2ce10e...
1,kronodroid_raw/_dlt_pipeline_state/kronodroid_...,534 bytes,2026-01-26 04:07:37,5fa60a998723...
2,kronodroid_raw/_dlt_version/kaggle_kronodroid_...,"58,711 bytes",2026-01-26 04:07:36,42b255d439f7...
3,kronodroid_raw/init,41 bytes,2026-01-26 04:07:36,153adcea9fc4...
4,kronodroid_raw/kronodroid_2021_emu_v1/17694292...,"18,425,057 bytes",2026-01-26 04:07:37,9d8b0d455797...
5,kronodroid_raw/kronodroid_2021_real_v1/1769429...,"22,871,535 bytes",2026-01-26 04:07:37,f82b168460f8...


### View LakeFS Commits

In [10]:
def list_lakefs_commits(repository: str, ref: str = "main", max_items: int = 20):
    """List commits in a LakeFS branch."""
    lakefs = get_lakefs_session()
    
    resp = lakefs.get(
        f"{LAKEFS_ENDPOINT}/api/v1/repositories/{repository}/refs/{ref}/commits",
        params={'amount': max_items}
    )
    
    if resp.status_code != 200:
        print(f"Error: {resp.status_code} - {resp.text}")
        return
    
    commits = resp.json().get('results', [])
    
    data = []
    for commit in commits:
        created = datetime.fromtimestamp(commit['creation_date']).strftime('%Y-%m-%d %H:%M:%S')
        data.append({
            'Commit ID': commit['id'][:12] + '...',
            'Message': commit['message'][:50] + '...' if len(commit['message']) > 50 else commit['message'],
            'Committer': commit['committer'],
            'Created': created
        })
    
    if data:
        print(f"Commits in {repository}/{ref} (showing up to {max_items}):")
        df = pd.DataFrame(data)
        display(df)
    else:
        print(f"No commits found")

# Example: List commits
if 'kronodroid' in lakefs_repos:
    list_lakefs_commits('kronodroid', 'main')

Commits in kronodroid/main (showing up to 20):


Unnamed: 0,Commit ID,Message,Committer,Created
0,1eec53f42f84...,Repository created,,2026-01-26 04:02:34


### Delete LakeFS Branch

In [11]:
def delete_lakefs_branch(repository: str, branch: str):
    """Delete a branch from a LakeFS repository."""
    lakefs = get_lakefs_session()
    
    # Safety check - don't delete main/master by accident
    if branch in ['main', 'master']:
        print(f"⚠️ Warning: You are about to delete the '{branch}' branch!")
        confirm = input(f"Type '{branch}' to confirm: ")
        if confirm != branch:
            print("Deletion cancelled")
            return False
    
    resp = lakefs.delete(f"{LAKEFS_ENDPOINT}/api/v1/repositories/{repository}/branches/{branch}")
    
    if resp.status_code == 204:
        print(f"✅ Branch '{branch}' deleted from repository '{repository}'")
        return True
    else:
        print(f"❌ Error deleting branch: {resp.status_code} - {resp.text}")
        return False

# Uncomment to delete a branch:
# delete_lakefs_branch('kronodroid', 'branch-to-delete')

### Delete LakeFS Repository

In [12]:
def delete_lakefs_repository(repository: str, confirm: bool = False):
    """Delete an entire LakeFS repository."""
    lakefs = get_lakefs_session()
    
    if not confirm:
        print(f"⚠️ Warning: This will permanently delete repository '{repository}' and ALL its data!")
        user_confirm = input(f"Type the repository name '{repository}' to confirm: ")
        if user_confirm != repository:
            print("Deletion cancelled")
            return False
    
    resp = lakefs.delete(f"{LAKEFS_ENDPOINT}/api/v1/repositories/{repository}")
    
    if resp.status_code == 204:
        print(f"✅ Repository '{repository}' deleted successfully")
        return True
    else:
        print(f"❌ Error deleting repository: {resp.status_code} - {resp.text}")
        return False

# Uncomment to delete a repository:
# delete_lakefs_repository('repository-name')

---
## Complete Reset Operations

**⚠️ DANGER ZONE: These operations will permanently delete all data!**

In [13]:
def reset_all_minio(confirm: bool = False):
    """Delete ALL MinIO buckets and their contents."""
    minio = get_minio_client()
    
    buckets = minio.list_buckets()['Buckets']
    bucket_names = [b['Name'] for b in buckets]
    
    if not bucket_names:
        print("No buckets to delete")
        return True
    
    print(f"⚠️ WARNING: This will delete {len(bucket_names)} bucket(s):")
    for name in bucket_names:
        print(f"  - {name}")
    
    if not confirm:
        user_confirm = input("Type 'DELETE ALL MINIO' to confirm: ")
        if user_confirm != 'DELETE ALL MINIO':
            print("Reset cancelled")
            return False
    
    success = True
    for bucket_name in bucket_names:
        print(f"\nDeleting bucket: {bucket_name}")
        if not delete_minio_bucket(bucket_name, force=True):
            success = False
    
    if success:
        print("\n✅ All MinIO buckets deleted successfully")
    else:
        print("\n⚠️ Some buckets could not be deleted")
    
    return success

# Uncomment to reset MinIO:
# reset_all_minio()

In [14]:
def reset_all_lakefs(confirm: bool = False):
    """Delete ALL LakeFS repositories."""
    lakefs = get_lakefs_session()
    
    resp = lakefs.get(f"{LAKEFS_ENDPOINT}/api/v1/repositories")
    if resp.status_code != 200:
        print(f"Error listing repositories: {resp.text}")
        return False
    
    repos = resp.json().get('results', [])
    repo_names = [r['id'] for r in repos]
    
    if not repo_names:
        print("No repositories to delete")
        return True
    
    print(f"⚠️ WARNING: This will delete {len(repo_names)} repository(ies):")
    for name in repo_names:
        print(f"  - {name}")
    
    if not confirm:
        user_confirm = input("Type 'DELETE ALL LAKEFS' to confirm: ")
        if user_confirm != 'DELETE ALL LAKEFS':
            print("Reset cancelled")
            return False
    
    success = True
    for repo_name in repo_names:
        print(f"\nDeleting repository: {repo_name}")
        if not delete_lakefs_repository(repo_name, confirm=True):
            success = False
    
    if success:
        print("\n✅ All LakeFS repositories deleted successfully")
    else:
        print("\n⚠️ Some repositories could not be deleted")
    
    return success

# Uncomment to reset LakeFS:
# reset_all_lakefs()

In [None]:
def complete_storage_reset(confirm: bool = False):
    """Completely reset ALL storage (MinIO AND LakeFS)."""
    print("="*60)
    print("⚠️  COMPLETE STORAGE RESET  ⚠️")
    print("="*60)
    print("\nThis will PERMANENTLY DELETE:")
    print("  - All MinIO buckets and objects")
    print("  - All LakeFS repositories, branches, and data")
    print("\nThis action CANNOT be undone!\n")
    
    if not confirm:
        user_confirm = input("Type 'RESET EVERYTHING' to confirm: ")
        if user_confirm != 'RESET EVERYTHING':
            print("\nReset cancelled")
            return False
    
    print("\n" + "="*60)
    print("Resetting MinIO...")
    print("="*60)
    minio_success = reset_all_minio(confirm=True)
    
    print("\n" + "="*60)
    print("Resetting LakeFS...")
    print("="*60)
    lakefs_success = reset_all_lakefs(confirm=True)
    
    print("\n" + "="*60)
    if minio_success and lakefs_success:
        print("✅ COMPLETE STORAGE RESET SUCCESSFUL")
    else:
        print("⚠️ STORAGE RESET COMPLETED WITH ERRORS")
    print("="*60)
    
    return minio_success and lakefs_success

# Uncomment to perform complete reset:
complete_storage_reset(True)

⚠️  COMPLETE STORAGE RESET  ⚠️

This will PERMANENTLY DELETE:
  - All MinIO buckets and objects
  - All LakeFS repositories, branches, and data

This action CANNOT be undone!



KeyboardInterrupt: Interrupted by user

---
## Quick Actions

Uncomment and run the cells below to perform common operations.

In [16]:
# Quick view of all storage
print("=" * 60)
print("MinIO Buckets")
print("=" * 60)
list_minio_buckets()

print("\n" + "=" * 60)
print("LakeFS Repositories")
print("=" * 60)
for repo in list_lakefs_repositories():
    print(f"\n--- Branches in {repo} ---")
    list_lakefs_branches(repo)

MinIO Buckets
No buckets found

LakeFS Repositories
No repositories found


In [17]:
# Delete a specific branch (uncomment and modify)
# delete_lakefs_branch('kronodroid', 'spark-test-branch')

In [18]:
# Delete a specific bucket (uncomment and modify)
# delete_minio_bucket('test-bucket', force=True)

In [19]:
# DANGER: Complete reset (uncomment to use)
# complete_storage_reset()

---
## Recreate Default Resources

After a reset, use these to recreate the default buckets and repositories.

In [20]:
def create_minio_bucket(bucket_name: str):
    """Create a MinIO bucket."""
    minio = get_minio_client()
    try:
        minio.create_bucket(Bucket=bucket_name)
        print(f"✅ Created bucket: {bucket_name}")
        return True
    except ClientError as e:
        error_code = e.response.get('Error', {}).get('Code', '')
        if error_code == 'BucketAlreadyOwnedByYou':
            print(f"ℹ️ Bucket already exists: {bucket_name}")
            return True
        print(f"❌ Error creating bucket: {e}")
        return False

def create_lakefs_repository(repository: str, storage_namespace: str, default_branch: str = "main"):
    """Create a LakeFS repository."""
    lakefs = get_lakefs_session()
    
    resp = lakefs.post(
        f"{LAKEFS_ENDPOINT}/api/v1/repositories",
        json={
            "name": repository,
            "storage_namespace": storage_namespace,
            "default_branch": default_branch
        }
    )
    
    if resp.status_code == 201:
        print(f"✅ Created repository: {repository}")
        return True
    elif resp.status_code == 409:
        print(f"ℹ️ Repository already exists: {repository}")
        return True
    else:
        print(f"❌ Error creating repository: {resp.status_code} - {resp.text}")
        return False

In [21]:
def recreate_default_resources():
    """Recreate the default MinIO buckets and LakeFS repository."""
    print("Creating default MinIO buckets...")
    create_minio_bucket('dlt-data')
    create_minio_bucket('lakefs-data')
    
    print("\nCreating default LakeFS repository...")
    create_lakefs_repository(
        repository='kronodroid',
        storage_namespace='s3://lakefs-data/kronodroid',
        default_branch='main'
    )
    
    print("\n✅ Default resources created")

# Uncomment to recreate defaults:
# recreate_default_resources()