# Iceberg Data Explorer

This notebook demonstrates how to read Iceberg tables stored in LakeFS using PySpark.

**Prerequisites:**
- Port-forwards running: `task port-forward`
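- LakeFS credentials (`LAKEFS_ACCESS_KEY_ID`, `LAKEFS_SECRET_ACCESS_KEY`) in a `.env` file located in the parent directory of the notebook's working directory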

In [1]:
import os
from pathlib import Path

def load_env_file(path):
    """Load ``KEY=VALUE`` pairs from a dotenv-style file into ``os.environ``.

    Blank lines, lines starting with ``#`` and lines without ``=`` are skipped.
    An optional leading ``export `` is dropped, whitespace around the key and
    the value is removed, and one pair of matching surrounding quotes
    (``"..."`` or ``'...'``) is stripped from the value. Existing environment
    variables with the same name are overwritten.

    Args:
        path: Location of the file (``str`` or ``pathlib.Path``).

    Returns:
        The number of variables that were set.
    """
    loaded = 0
    with open(path, encoding="utf-8") as env_file:
        for raw_line in env_file:
            entry = raw_line.strip()
            if not entry or entry.startswith("#") or "=" not in entry:
                continue
            # Accept shell-style `export KEY=value` lines as well.
            if entry.startswith("export "):
                entry = entry[len("export "):]
            key, value = entry.split("=", 1)
            key = key.strip()
            value = value.strip()
            # `KEY="value"` should yield `value`, not the quoted literal;
            # quoted credentials would otherwise be sent with the quotes.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]
            if not key:
                # An empty variable name cannot be stored in the environment.
                continue
            os.environ[key] = value
            loaded += 1
    return loaded


# Load environment variables from the .env file one level above the working directory
env_path = Path.cwd().parent / ".env"
if env_path.exists():
    load_env_file(env_path)
    print(f"Loaded environment from {env_path}")
else:
    print(f"Warning: {env_path} not found")

Loaded environment from /Users/benjaminbrown/Documents/GitHub/mlops/.env


In [None]:
# Connection settings: each value comes from the environment, with a local default
LAKEFS_ENDPOINT = os.environ.get("LAKEFS_ENDPOINT_URL", "http://localhost:8000")
LAKEFS_ACCESS_KEY = os.environ.get("LAKEFS_ACCESS_KEY_ID", "")
LAKEFS_SECRET_KEY = os.environ.get("LAKEFS_SECRET_ACCESS_KEY", "")
LAKEFS_REPOSITORY = os.environ.get("LAKEFS_REPOSITORY", "kronodroid")
# Use 'dev' branch where Iceberg tables exist
LAKEFS_BRANCH = os.environ.get("LAKEFS_BRANCH", "dev")

# Echo the effective settings so the reader can see what the session will use.
print(
    f"LakeFS Endpoint: {LAKEFS_ENDPOINT}",
    f"Repository: {LAKEFS_REPOSITORY}",
    f"Branch: {LAKEFS_BRANCH}",
    sep="\n",
)
# Only a short prefix of the access key is shown; the secret key is never printed.
if LAKEFS_ACCESS_KEY:
    print(f"Access Key: {LAKEFS_ACCESS_KEY[:8]}...")
else:
    print("Access Key: NOT SET")

In [3]:
from pyspark.sql import SparkSession
import pyspark

# Work out the installed Spark line ("major.minor"); the Iceberg runtime
# artifact name below embeds the Spark and Scala versions it targets.
spark_version = pyspark.__version__
spark_major_minor = ".".join(spark_version.split(".")[:2])
print(f"PySpark version: {spark_version}")

# (version prefix, (iceberg runtime, iceberg aws, hadoop aws, aws sdk)).
# Entries are tried in order and the first matching prefix wins.
_PACKAGE_MATRIX = (
    (
        "4.",
        (
            "org.apache.iceberg:iceberg-spark-runtime-4.0_2.13:1.10.1",
            "org.apache.iceberg:iceberg-aws:1.10.1",
            "org.apache.hadoop:hadoop-aws:3.4.1",
            "software.amazon.awssdk:bundle:2.29.51",
        ),
    ),
    (
        "3.5",
        (
            "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.2",
            "org.apache.iceberg:iceberg-aws:1.5.2",
            "org.apache.hadoop:hadoop-aws:3.3.4",
            "com.amazonaws:aws-java-sdk-bundle:1.12.262",
        ),
    ),
)

for _prefix, _coordinates in _PACKAGE_MATRIX:
    if spark_major_minor.startswith(_prefix):
        iceberg_runtime, iceberg_aws, hadoop_aws, aws_sdk = _coordinates
        break
else:
    # No known-good package set for this Spark line.
    raise RuntimeError(f"Unsupported Spark version: {spark_version}")

# Comma-separated Maven coordinates, the form expected by spark.jars.packages.
packages = ",".join((iceberg_runtime, iceberg_aws, hadoop_aws, aws_sdk))
print(f"Using packages: {packages}")

PySpark version: 4.0.1
Using packages: org.apache.iceberg:iceberg-spark-runtime-4.0_2.13:1.10.1,org.apache.iceberg:iceberg-aws:1.10.1,org.apache.hadoop:hadoop-aws:3.4.1,software.amazon.awssdk:bundle:2.29.51


In [None]:
# Spark session wired for Iceberg on top of the LakeFS S3 gateway.
# A Hadoop catalog is used (metadata is read directly through S3A) since the
# REST catalog requires LakeFS Enterprise.
warehouse_path = "s3a://{}/{}/iceberg".format(LAKEFS_REPOSITORY, LAKEFS_BRANCH)

print(f"Warehouse path: {warehouse_path}")

# Per-bucket S3A settings are keyed by the bucket name, which here is the LakeFS repository.
_bucket_prefix = f"spark.hadoop.fs.s3a.bucket.{LAKEFS_REPOSITORY}"

_spark_conf = {
    "spark.jars.packages": packages,
    # Iceberg extensions
    "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    # LakeFS Iceberg Hadoop catalog, registered under the name "lakefs"
    "spark.sql.catalog.lakefs": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.lakefs.type": "hadoop",
    "spark.sql.catalog.lakefs.warehouse": warehouse_path,
    # S3A filesystem for the LakeFS S3 gateway
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.connection.ssl.enabled": "false",
    # Endpoint and credentials scoped to the LakeFS repository bucket
    f"{_bucket_prefix}.endpoint": LAKEFS_ENDPOINT,
    f"{_bucket_prefix}.access.key": LAKEFS_ACCESS_KEY,
    f"{_bucket_prefix}.secret.key": LAKEFS_SECRET_KEY,
}

_builder = SparkSession.builder.appName("IcebergDataExplorer").master("local[*]")
for _conf_key, _conf_value in _spark_conf.items():
    _builder = _builder.config(_conf_key, _conf_value)

# getOrCreate() hands back an already-running session when one exists.
spark = _builder.getOrCreate()

print(f"Spark session created: {spark.version}")

## List Available Namespaces and Tables

In [5]:
# Enumerate the namespaces (databases) exposed by the "lakefs" catalog
print("Available namespaces in lakefs catalog:")
print("=" * 40)
namespaces = spark.sql("SHOW NAMESPACES IN lakefs").collect()

if namespaces:
    for ns in namespaces:
        # The namespace name is read from the first column of each row.
        print(f"  - {ns[0]}")
else:
    print("  (no namespaces found)")

Available namespaces in lakefs catalog:


26/01/29 18:25:42 WARN ErrorHandlers: Unable to parse error response
java.lang.IllegalArgumentException: Cannot parse missing string: error
	at org.apache.iceberg.relocated.com.google.common.base.Preconditions.checkArgument(Preconditions.java:217)
	at org.apache.iceberg.util.JsonUtil.getString(JsonUtil.java:171)
	at org.apache.iceberg.rest.responses.OAuthErrorResponseParser.fromJson(OAuthErrorResponseParser.java:47)
	at org.apache.iceberg.rest.responses.OAuthErrorResponseParser.lambda$fromJson$0(OAuthErrorResponseParser.java:39)
	at org.apache.iceberg.util.JsonUtil.parse(JsonUtil.java:104)
	at org.apache.iceberg.rest.responses.OAuthErrorResponseParser.fromJson(OAuthErrorResponseParser.java:39)
	at org.apache.iceberg.rest.ErrorHandlers$OAuthErrorHandler.parseResponse(ErrorHandlers.java:260)
	at org.apache.iceberg.rest.HTTPClient.throwFailure(HTTPClient.java:210)
	at org.apache.iceberg.rest.HTTPClient.execute(HTTPClient.java:336)
	at org.apache.iceberg.rest.HTTPClient.execute(HTTPClient.

Py4JJavaError: An error occurred while calling o57.sql.
: org.apache.iceberg.exceptions.RESTException: Unable to process: {"message":"invalid API endpoint"}

	at org.apache.iceberg.rest.ErrorHandlers$OAuthErrorHandler.accept(ErrorHandlers.java:283)
	at org.apache.iceberg.rest.ErrorHandlers$OAuthErrorHandler.accept(ErrorHandlers.java:254)
	at org.apache.iceberg.rest.HTTPClient.throwFailure(HTTPClient.java:240)
	at org.apache.iceberg.rest.HTTPClient.execute(HTTPClient.java:336)
	at org.apache.iceberg.rest.HTTPClient.execute(HTTPClient.java:297)
	at org.apache.iceberg.rest.BaseHTTPClient.postForm(BaseHTTPClient.java:136)
	at org.apache.iceberg.rest.auth.OAuth2Util.fetchToken(OAuth2Util.java:291)
	at org.apache.iceberg.rest.auth.OAuth2Manager.initSession(OAuth2Manager.java:87)
	at org.apache.iceberg.rest.auth.OAuth2Manager.initSession(OAuth2Manager.java:40)
	at org.apache.iceberg.rest.RESTSessionCatalog.initialize(RESTSessionCatalog.java:204)
	at org.apache.iceberg.rest.RESTCatalog.initialize(RESTCatalog.java:82)
	at org.apache.iceberg.CatalogUtil.loadCatalog(CatalogUtil.java:280)
	at org.apache.iceberg.CatalogUtil.buildIcebergCatalog(CatalogUtil.java:337)
	at org.apache.iceberg.spark.SparkCatalog.buildIcebergCatalog(SparkCatalog.java:155)
	at org.apache.iceberg.spark.SparkCatalog.initialize(SparkCatalog.java:753)
	at org.apache.spark.sql.connector.catalog.Catalogs$.load(Catalogs.scala:65)
	at org.apache.spark.sql.connector.catalog.CatalogManager.$anonfun$catalog$1(CatalogManager.scala:56)
	at scala.collection.mutable.HashMap.getOrElseUpdate(HashMap.scala:469)
	at org.apache.spark.sql.connector.catalog.CatalogManager.catalog(CatalogManager.scala:56)
	at org.apache.spark.sql.connector.catalog.LookupCatalog$CatalogAndNamespace$.unapply(LookupCatalog.scala:86)
	at org.apache.spark.sql.catalyst.analysis.ResolveCatalogs$$anonfun$apply$1.applyOrElse(ResolveCatalogs.scala:92)
	at org.apache.spark.sql.catalyst.analysis.ResolveCatalogs$$anonfun$apply$1.applyOrElse(ResolveCatalogs.scala:38)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$2(AnalysisHelper.scala:200)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:86)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$1(AnalysisHelper.scala:200)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:416)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning(AnalysisHelper.scala:198)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning$(AnalysisHelper.scala:194)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsDownWithPruning(LogicalPlan.scala:37)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$4(AnalysisHelper.scala:205)
	at org.apache.spark.sql.catalyst.trees.UnaryLike.mapChildren(TreeNode.scala:1231)
	at org.apache.spark.sql.catalyst.trees.UnaryLike.mapChildren$(TreeNode.scala:1230)
	at org.apache.spark.sql.catalyst.plans.logical.ShowNamespaces.mapChildren(v2Commands.scala:662)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$1(AnalysisHelper.scala:205)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:416)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning(AnalysisHelper.scala:198)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning$(AnalysisHelper.scala:194)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsDownWithPruning(LogicalPlan.scala:37)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDown(AnalysisHelper.scala:190)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDown$(AnalysisHelper.scala:189)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsDown(LogicalPlan.scala:37)
	at org.apache.spark.sql.catalyst.analysis.ResolveCatalogs.apply(ResolveCatalogs.scala:38)
	at org.apache.spark.sql.catalyst.analysis.ResolveCatalogs.apply(ResolveCatalogs.scala:35)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$2(RuleExecutor.scala:242)
	at scala.collection.LinearSeqOps.foldLeft(LinearSeq.scala:183)
	at scala.collection.LinearSeqOps.foldLeft$(LinearSeq.scala:179)
	at scala.collection.immutable.List.foldLeft(List.scala:79)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1(RuleExecutor.scala:239)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1$adapted(RuleExecutor.scala:231)
	at scala.collection.immutable.List.foreach(List.scala:334)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:231)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.org$apache$spark$sql$catalyst$analysis$Analyzer$$executeSameContext(Analyzer.scala:340)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$execute$1(Analyzer.scala:336)
	at org.apache.spark.sql.catalyst.analysis.AnalysisContext$.withNewAnalysisContext(Analyzer.scala:234)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:336)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:299)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$executeAndTrack$1(RuleExecutor.scala:201)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:89)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.executeAndTrack(RuleExecutor.scala:201)
	at org.apache.spark.sql.catalyst.analysis.resolver.HybridAnalyzer.resolveInFixedPoint(HybridAnalyzer.scala:190)
	at org.apache.spark.sql.catalyst.analysis.resolver.HybridAnalyzer.$anonfun$apply$1(HybridAnalyzer.scala:76)
	at org.apache.spark.sql.catalyst.analysis.resolver.HybridAnalyzer.withTrackedAnalyzerBridgeState(HybridAnalyzer.scala:111)
	at org.apache.spark.sql.catalyst.analysis.resolver.HybridAnalyzer.apply(HybridAnalyzer.scala:71)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$executeAndCheck$1(Analyzer.scala:330)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:423)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:330)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$lazyAnalyzed$2(QueryExecution.scala:110)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:148)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$2(QueryExecution.scala:278)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:654)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:278)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:804)
	at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:277)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$lazyAnalyzed$1(QueryExecution.scala:110)
	at scala.util.Try$.apply(Try.scala:217)
	at org.apache.spark.util.Utils$.doTryWithCallerStacktrace(Utils.scala:1378)
	at org.apache.spark.util.Utils$.getTryWithCallerStacktrace(Utils.scala:1439)
	at org.apache.spark.util.LazyTry.get(LazyTry.scala:58)
	at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:121)
	at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:80)
	at org.apache.spark.sql.classic.Dataset$.$anonfun$ofRows$5(Dataset.scala:139)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:804)
	at org.apache.spark.sql.classic.Dataset$.ofRows(Dataset.scala:136)
	at org.apache.spark.sql.classic.SparkSession.$anonfun$sql$1(SparkSession.scala:462)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:804)
	at org.apache.spark.sql.classic.SparkSession.sql(SparkSession.scala:449)
	at org.apache.spark.sql.classic.SparkSession.sql(SparkSession.scala:467)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:184)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:108)
	at java.base/java.lang.Thread.run(Thread.java:840)
	Suppressed: org.apache.spark.util.Utils$OriginalTryStackTraceException: Full stacktrace of original doTryWithCallerStacktrace caller
		at org.apache.iceberg.rest.ErrorHandlers$OAuthErrorHandler.accept(ErrorHandlers.java:283)
		at org.apache.iceberg.rest.ErrorHandlers$OAuthErrorHandler.accept(ErrorHandlers.java:254)
		at org.apache.iceberg.rest.HTTPClient.throwFailure(HTTPClient.java:240)
		at org.apache.iceberg.rest.HTTPClient.execute(HTTPClient.java:336)
		at org.apache.iceberg.rest.HTTPClient.execute(HTTPClient.java:297)
		at org.apache.iceberg.rest.BaseHTTPClient.postForm(BaseHTTPClient.java:136)
		at org.apache.iceberg.rest.auth.OAuth2Util.fetchToken(OAuth2Util.java:291)
		at org.apache.iceberg.rest.auth.OAuth2Manager.initSession(OAuth2Manager.java:87)
		at org.apache.iceberg.rest.auth.OAuth2Manager.initSession(OAuth2Manager.java:40)
		at org.apache.iceberg.rest.RESTSessionCatalog.initialize(RESTSessionCatalog.java:204)
		at org.apache.iceberg.rest.RESTCatalog.initialize(RESTCatalog.java:82)
		at org.apache.iceberg.CatalogUtil.loadCatalog(CatalogUtil.java:280)
		at org.apache.iceberg.CatalogUtil.buildIcebergCatalog(CatalogUtil.java:337)
		at org.apache.iceberg.spark.SparkCatalog.buildIcebergCatalog(SparkCatalog.java:155)
		at org.apache.iceberg.spark.SparkCatalog.initialize(SparkCatalog.java:753)
		at org.apache.spark.sql.connector.catalog.Catalogs$.load(Catalogs.scala:65)
		at org.apache.spark.sql.connector.catalog.CatalogManager.$anonfun$catalog$1(CatalogManager.scala:56)
		at scala.collection.mutable.HashMap.getOrElseUpdate(HashMap.scala:469)
		at org.apache.spark.sql.connector.catalog.CatalogManager.catalog(CatalogManager.scala:56)
		at org.apache.spark.sql.connector.catalog.LookupCatalog$CatalogAndNamespace$.unapply(LookupCatalog.scala:86)
		at org.apache.spark.sql.catalyst.analysis.ResolveCatalogs$$anonfun$apply$1.applyOrElse(ResolveCatalogs.scala:92)
		at org.apache.spark.sql.catalyst.analysis.ResolveCatalogs$$anonfun$apply$1.applyOrElse(ResolveCatalogs.scala:38)
		at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$2(AnalysisHelper.scala:200)
		at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:86)
		at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$1(AnalysisHelper.scala:200)
		at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:416)
		at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning(AnalysisHelper.scala:198)
		at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning$(AnalysisHelper.scala:194)
		at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsDownWithPruning(LogicalPlan.scala:37)
		at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$4(AnalysisHelper.scala:205)
		at org.apache.spark.sql.catalyst.trees.UnaryLike.mapChildren(TreeNode.scala:1231)
		at org.apache.spark.sql.catalyst.trees.UnaryLike.mapChildren$(TreeNode.scala:1230)
		at org.apache.spark.sql.catalyst.plans.logical.ShowNamespaces.mapChildren(v2Commands.scala:662)
		at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$1(AnalysisHelper.scala:205)
		at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:416)
		at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning(AnalysisHelper.scala:198)
		at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning$(AnalysisHelper.scala:194)
		at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsDownWithPruning(LogicalPlan.scala:37)
		at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDown(AnalysisHelper.scala:190)
		at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDown$(AnalysisHelper.scala:189)
		at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsDown(LogicalPlan.scala:37)
		at org.apache.spark.sql.catalyst.analysis.ResolveCatalogs.apply(ResolveCatalogs.scala:38)
		at org.apache.spark.sql.catalyst.analysis.ResolveCatalogs.apply(ResolveCatalogs.scala:35)
		at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$2(RuleExecutor.scala:242)
		at scala.collection.LinearSeqOps.foldLeft(LinearSeq.scala:183)
		at scala.collection.LinearSeqOps.foldLeft$(LinearSeq.scala:179)
		at scala.collection.immutable.List.foldLeft(List.scala:79)
		at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1(RuleExecutor.scala:239)
		at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1$adapted(RuleExecutor.scala:231)
		at scala.collection.immutable.List.foreach(List.scala:334)
		at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:231)
		at org.apache.spark.sql.catalyst.analysis.Analyzer.org$apache$spark$sql$catalyst$analysis$Analyzer$$executeSameContext(Analyzer.scala:340)
		at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$execute$1(Analyzer.scala:336)
		at org.apache.spark.sql.catalyst.analysis.AnalysisContext$.withNewAnalysisContext(Analyzer.scala:234)
		at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:336)
		at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:299)
		at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$executeAndTrack$1(RuleExecutor.scala:201)
		at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:89)
		at org.apache.spark.sql.catalyst.rules.RuleExecutor.executeAndTrack(RuleExecutor.scala:201)
		at org.apache.spark.sql.catalyst.analysis.resolver.HybridAnalyzer.resolveInFixedPoint(HybridAnalyzer.scala:190)
		at org.apache.spark.sql.catalyst.analysis.resolver.HybridAnalyzer.$anonfun$apply$1(HybridAnalyzer.scala:76)
		at org.apache.spark.sql.catalyst.analysis.resolver.HybridAnalyzer.withTrackedAnalyzerBridgeState(HybridAnalyzer.scala:111)
		at org.apache.spark.sql.catalyst.analysis.resolver.HybridAnalyzer.apply(HybridAnalyzer.scala:71)
		at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$executeAndCheck$1(Analyzer.scala:330)
		at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:423)
		at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:330)
		at org.apache.spark.sql.execution.QueryExecution.$anonfun$lazyAnalyzed$2(QueryExecution.scala:110)
		at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:148)
		at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$2(QueryExecution.scala:278)
		at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:654)
		at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:278)
		at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:804)
		at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:277)
		at org.apache.spark.sql.execution.QueryExecution.$anonfun$lazyAnalyzed$1(QueryExecution.scala:110)
		at scala.util.Try$.apply(Try.scala:217)
		at org.apache.spark.util.Utils$.doTryWithCallerStacktrace(Utils.scala:1378)
		at org.apache.spark.util.LazyTry.tryT$lzycompute(LazyTry.scala:46)
		at org.apache.spark.util.LazyTry.tryT(LazyTry.scala:46)
		... 22 more


In [6]:
# Walk every namespace and collect (namespace, table, fully-qualified name) triples
print("\nTables by namespace:")
print("=" * 40)

all_tables = []
for ns in namespaces:
    ns_name = ns[0]
    print(f"\n{ns_name}:")
    tables = spark.sql(f"SHOW TABLES IN lakefs.`{ns_name}`").collect()
    if not tables:
        print("  (no tables)")
        continue
    for table in tables:
        table_name = table["tableName"]
        # Back-quote each identifier part in the fully qualified name.
        full_name = f"lakefs.`{ns_name}`.`{table_name}`"
        all_tables.append((ns_name, table_name, full_name))
        print(f"  - {table_name}")

print(f"\nTotal tables found: {len(all_tables)}")


Tables by namespace:


NameError: name 'namespaces' is not defined

## Explore Table Schema and Sample Data

In [7]:
def explore_table(table_full_name: str, sample_rows: int = 10):
    """Display schema, row count, sample rows and numeric statistics for an Iceberg table.

    Uses the notebook-level ``spark`` session.

    Args:
        table_full_name: Table identifier passed to ``spark.table``.
        sample_rows: Number of rows printed in the sample section.

    Returns:
        The DataFrame for the table, so callers can keep working with it.
    """
    # Imported here so the function is self-contained within this cell.
    from pyspark.sql.types import NumericType

    print(f"\n{'='*60}")
    print(f"Table: {table_full_name}")
    print("=" * 60)

    # Read table
    df = spark.table(table_full_name)

    # Show schema
    print("\nSchema:")
    df.printSchema()

    # Row count
    count = df.count()
    print(f"Total rows: {count:,}")

    # Show sample rows
    print(f"\nSample ({sample_rows} rows):")
    df.show(sample_rows, truncate=50)

    # Basic statistics for numeric columns. Checking the type hierarchy (rather
    # than comparing the type's string form) also picks up Decimal/Short/Byte
    # columns and does not depend on how a DataType is rendered as text.
    numeric_cols = [
        field.name for field in df.schema.fields
        if isinstance(field.dataType, NumericType)
    ]
    if numeric_cols:
        print(f"\nNumeric column statistics ({len(numeric_cols)} columns):")
        df.select(numeric_cols[:5]).describe().show()  # Limit to first 5 numeric columns

    return df

In [None]:
# Run explore_table over every discovered table, keeping the resulting
# DataFrames keyed by "<namespace>.<table>". A failure on one table is
# reported and does not stop the loop.
dataframes = {}

for ns_name, table_name, full_name in all_tables:
    try:
        df = explore_table(full_name)
    except Exception as exc:
        print(f"\nError reading {full_name}: {exc}")
    else:
        dataframes[f"{ns_name}.{table_name}"] = df

## Query a Specific Table

You can also query tables directly using SQL or the DataFrame API.

In [None]:
# Example: load a single table by its fully qualified name.
# Adjust TABLE_NAME to point at the table you want.
#
# Identifier format: lakefs.<namespace>.<table_name>
# Common namespaces in this project: kronodroid
TABLE_NAME = "lakefs.kronodroid.fct_training_dataset"  # Change this to your table

try:
    df = spark.table(TABLE_NAME)
    print(f"Loaded table: {TABLE_NAME}")
    print(f"Columns: {df.columns}")
    print(f"Row count: {df.count():,}")
    df.show(5)
except Exception as exc:
    # On failure, list what discovery found so the name can be corrected.
    print(f"Could not load table {TABLE_NAME}: {exc}")
    print("\nAvailable tables:")
    for discovered in all_tables:
        print(f"  - {discovered[2]}")

In [None]:
# Example: run a plain SQL statement against the catalog
SQL_QUERY = """
SELECT *
FROM lakefs.kronodroid.fct_training_dataset
LIMIT 20
"""

try:
    result = spark.sql(SQL_QUERY)
    # Cell values longer than 30 characters are shortened in the printout.
    result.show(truncate=30)
except Exception as exc:
    print(f"Query failed: {exc}")

## View Iceberg Table Metadata

Iceberg provides metadata tables for each table, including `history`, `snapshots`, `files`, and `partitions`; the cell below queries the first three.

In [None]:
def show_iceberg_metadata(table_full_name: str):
    """Print the history, snapshots and data-file metadata tables of a table.

    Each section is queried independently through the notebook-level ``spark``
    session; if one section fails, the error is printed and the remaining
    sections are still attempted.

    Args:
        table_full_name: Table identifier; the metadata table name is appended
            to it with a dot (e.g. ``<table>.history``).
    """
    print(f"\nIceberg metadata for: {table_full_name}")
    print("=" * 60)

    # (label used in the error message, heading, query, keyword options for show())
    sections = (
        (
            "history",
            "\n-- History (recent snapshots) --",
            f"SELECT * FROM {table_full_name}.history",
            {"truncate": False},
        ),
        (
            "snapshots",
            "\n-- Snapshots --",
            f"SELECT snapshot_id, committed_at, operation FROM {table_full_name}.snapshots",
            {},
        ),
        (
            "files",
            "\n-- Data files --",
            f"SELECT file_path, file_format, record_count, file_size_in_bytes FROM {table_full_name}.files",
            {"truncate": 50},
        ),
    )

    for label, heading, query, show_options in sections:
        try:
            print(heading)
            # At most five rows are shown per section.
            spark.sql(query).show(5, **show_options)
        except Exception as exc:
            print(f"Could not get {label}: {exc}")

# Inspect only the first discovered table; nothing runs when discovery found none
if len(all_tables) > 0:
    first_full_name = all_tables[0][2]
    show_iceberg_metadata(first_full_name)

## Time Travel Query

Iceberg supports querying historical snapshots.

In [None]:
# Example: read a table as of one of its snapshots.
# The snapshot list is fetched first (newest first), then the newest one is queried.

if all_tables:
    table_name = all_tables[0][2]
    print(f"Snapshots for {table_name}:")

    try:
        snapshots_df = spark.sql(
            f"SELECT snapshot_id, committed_at FROM {table_name}.snapshots ORDER BY committed_at DESC"
        )
        snapshots = snapshots_df.collect()

        # List at most the five most recent snapshots.
        for snap in snapshots[:5]:
            print(f"  Snapshot {snap['snapshot_id']} at {snap['committed_at']}")

        # Time-travel read against the first (most recent) snapshot, when there is one.
        if len(snapshots) > 0:
            snapshot_id = snapshots[0]["snapshot_id"]
            print(f"\nQuerying at snapshot {snapshot_id}:")
            spark.sql(f"SELECT * FROM {table_name} VERSION AS OF {snapshot_id} LIMIT 5").show()
    except Exception as exc:
        print(f"Time travel query failed: {exc}")

## Cleanup

In [None]:
# Stop the Spark session held in `spark`. Run this last: the cells above all
# go through that session.
spark.stop()
print("Spark session stopped.")