# View lakeFS-tracked data (MinIO-backed)

This notebook lists objects tracked by lakeFS for a given `repo` + `ref` (branch/commit), and optionally previews files by reading through the lakeFS S3 gateway.

## Prereqs
- lakeFS reachable at `LAKEFS_ENDPOINT_URL` (default: `http://localhost:8000`)
- Credentials set: `LAKEFS_ACCESS_KEY_ID`, `LAKEFS_SECRET_ACCESS_KEY`
- Optional for previews: `s3fs`, `pyarrow`, `pandas` (included in this repo's `pyproject.toml`)


In [1]:
import os
import sys
from pathlib import Path

import pandas as pd

# Make the repository root importable. When the kernel was started inside the
# `notebooks` directory the root is one level up; otherwise the working
# directory itself is used.
REPO_ROOT = Path.cwd()
if REPO_ROOT.name == "notebooks":
    REPO_ROOT = REPO_ROOT.parent
# Prepend only when absent so re-running the cell does not stack duplicates.
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

from core.dfp_core.lakefs_client_utils import LakeFSConfig, get_lakefs_client

# Resolve the lakeFS connection settings from the environment.
cfg = LakeFSConfig.from_env()

# Echo the resolved settings. The two credentials are reduced to booleans so
# only "is it set?" ends up in the saved notebook output, never the value.
display(
    dict(
        LAKEFS_ENDPOINT_URL=cfg.endpoint,
        LAKEFS_REPOSITORY=cfg.repository,
        LAKEFS_BRANCH=cfg.branch,
        LAKEFS_ACCESS_KEY_ID_set=bool(cfg.access_key_id),
        LAKEFS_SECRET_ACCESS_KEY_set=bool(cfg.secret_access_key),
    )
)

# Build the API client; a missing module is re-raised with install instructions
# while keeping the original exception chained as the cause.
try:
    client = get_lakefs_client(cfg)
except ModuleNotFoundError as missing_module:
    raise RuntimeError(
        "Missing dependency `lakefs-client`. Install the repo deps (recommended: `uv sync`) or `uv pip install lakefs-client`."
    ) from missing_module


{'LAKEFS_ENDPOINT_URL': 'http://localhost:8000',
 'LAKEFS_REPOSITORY': 'kronodroid',
 'LAKEFS_BRANCH': 'main',
 'LAKEFS_ACCESS_KEY_ID_set': True,
 'LAKEFS_SECRET_ACCESS_KEY_set': True}

In [2]:
from lakefs_client.api import branches_api, objects_api, repositories_api

# One API wrapper per resource family, all built on the same client.
repos = repositories_api.RepositoriesApi(client)
branches = branches_api.BranchesApi(client)
objects = objects_api.ObjectsApi(client)

# NOTE(review): the saved run of this cell ended in ApiTypeError (a str came back
# where a Repository was expected) — presumably the client is not pointed at the
# lakeFS API base path; confirm the host configured by get_lakefs_client.
# Repository details (includes storage_namespace which points to the underlying object store).
repo = repos.get_repository(cfg.repository)
display(
    {
        "repository": repo.id,
        "storage_namespace": repo.storage_namespace,
        "default_branch": repo.default_branch,
    }
)

# Branches: one row per branch with the commit it currently points at.
branch_list = branches.list_branches(cfg.repository)
display(
    pd.DataFrame(
        [dict(id=branch.id, commit_id=branch.commit_id) for branch in branch_list.results]
    )
)


ApiTypeError: Invalid type for variable 'received_data'. Required value type is Repository and passed type was str at ['received_data']

In [None]:
def list_objects_df(
    repository: str,
    ref: str,
    prefix: str = "",
    amount: int = 200,
    after: str | None = None,
) -> pd.DataFrame:
    """List lakeFS-tracked objects under a prefix."""
    resp = objects.list_objects(repository, ref, prefix=prefix, amount=amount, after=after)
    rows: list[dict] = []
    for r in resp.results:
        rows.append(
            {
                "path": getattr(r, "path", None),
                "path_type": getattr(r, "path_type", None),
                "size_bytes": getattr(r, "size_bytes", None),
                "mtime": getattr(r, "mtime", None),
                "checksum": getattr(r, "checksum", None),
            }
        )
    return pd.DataFrame(rows)


# Narrow the listing by setting LAKEFS_PREFIX (e.g. "iceberg/" or "datasets/");
# the empty default applies no prefix filter.
PREFIX = os.environ.get("LAKEFS_PREFIX", "")

# Request up to 200 entries from the configured repository and branch.
df = list_objects_df(
    repository=cfg.repository,
    ref=cfg.branch,
    prefix=PREFIX,
    amount=200,
)
print(f"Listed {len(df)} entries under prefix='{PREFIX}' in {cfg.repository}@{cfg.branch}")
display(df)


## Preview a file through the lakeFS S3 gateway

lakeFS exposes an S3-compatible API; in this repo it's typically used via `s3a://<repo>/<branch>/...` (Spark) or `s3://<repo>/<branch>/...` (Python).

Pick a `path` from the table above, set `OBJECT_PATH`, and run the cell below.

In [None]:
from urllib.parse import urlparse

import s3fs


def _lakefs_storage_options(cfg: LakeFSConfig) -> dict:
    """Build the option dict used to reach lakeFS through its S3 gateway.

    The result is used both as keyword arguments for `s3fs.S3FileSystem` and
    as the `storage_options` argument of the pandas readers.

    Args:
        cfg: Connection settings; `endpoint`, `access_key_id` and
            `secret_access_key` are read.

    Returns:
        A dict with `key`, `secret`, `client_kwargs` (the endpoint URL),
        `config_kwargs` (path-style S3 addressing) and `use_ssl`.
    """
    gateway_url = cfg.endpoint
    # TLS is enabled only when the endpoint URL uses the https scheme.
    is_https = urlparse(gateway_url).scheme == "https"

    options = dict(key=cfg.access_key_id, secret=cfg.secret_access_key)
    options["client_kwargs"] = dict(endpoint_url=gateway_url)
    # Path-style addressing keeps the repository in the URL path rather than the hostname.
    options["config_kwargs"] = dict(s3=dict(addressing_style="path"))
    options["use_ssl"] = is_https
    return options


# Object to preview, given as a path *within* the ref. A value such as
# "main/iceberg/..." is WRONG: the branch is prepended below from cfg.branch.
OBJECT_PATH = os.getenv("LAKEFS_OBJECT_PATH", "")
if not OBJECT_PATH:
    raise ValueError(
        "Set LAKEFS_OBJECT_PATH to a 'path' value from the objects listing (do not include '<branch>/')."
    )

# Number of rows shown and, where the reader accepts a limit, also read.
PREVIEW_ROWS = 50

# Build the gateway options once; the filesystem handle and every pandas
# reader below share the same settings.
storage_options = _lakefs_storage_options(cfg)
s3 = s3fs.S3FileSystem(**storage_options)

# Through the gateway the repository takes the bucket position and the branch
# is the first path segment.
object_key = f"{cfg.repository}/{cfg.branch}/{OBJECT_PATH}"
s3_url = f"s3://{object_key}"
display({"s3_url": s3_url})

# Basic preview: CSV/JSON via pandas; Parquet via pandas+pyarrow.
if OBJECT_PATH.endswith(".csv"):
    # nrows asks pandas for the first PREVIEW_ROWS rows only instead of the whole file.
    preview = pd.read_csv(s3_url, nrows=PREVIEW_ROWS, storage_options=storage_options)
    display(preview.head(PREVIEW_ROWS))
elif OBJECT_PATH.endswith(".json"):
    # Read as JSON Lines (one record per line); nrows is only accepted together with lines=True.
    preview = pd.read_json(s3_url, lines=True, nrows=PREVIEW_ROWS, storage_options=storage_options)
    display(preview.head(PREVIEW_ROWS))
elif OBJECT_PATH.endswith(".parquet"):
    # No row limit is passed here, so the file is loaded before being trimmed for display.
    preview = pd.read_parquet(s3_url, storage_options=storage_options)
    display(preview.head(PREVIEW_ROWS))
else:
    # Fallback: show first bytes for unknown formats (avoid huge downloads).
    with s3.open(object_key, "rb") as f:
        head = f.read(4096)
    print(head)
