In [1]:
import pyspark
from pyspark.sql import SparkSession
import os

## DEFINE SENSITIVE VARIABLES
# Every connection setting and credential is read from the environment so that
# nothing sensitive is written into the notebook. All required variables are
# checked up front and reported together, instead of failing one at a time on
# the first bare lookup.
_REQUIRED_ENV_VARS = (
    "NESSIE_URI",
    "WAREHOUSE",
    "AWS_ACCESS_KEY_ID",
    "AWS_SECRET_ACCESS_KEY",
    "AWS_S3_ENDPOINT",
)
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if name not in os.environ]
if _missing_env_vars:
    # KeyError is what a direct os.environ[...] lookup raises, so the exception
    # type is unchanged; only the message is more helpful.
    raise KeyError(
        "Missing required environment variable(s): " + ", ".join(_missing_env_vars)
    )

NESSIE_URI      = os.environ["NESSIE_URI"]      # e.g. http://nessie:19120/api/v1
WAREHOUSE       = os.environ["WAREHOUSE"]       # e.g. s3a://lakehouse/
AWS_ACCESS_KEY_ID  = os.environ["AWS_ACCESS_KEY_ID"]  # from .env
AWS_SECRET_ACCESS_KEY  = os.environ["AWS_SECRET_ACCESS_KEY"]
AWS_S3_ENDPOINT = os.environ["AWS_S3_ENDPOINT"] # e.g. http://minio:9000

# Spark configuration for an Iceberg catalog named "nessie": the catalog is
# backed by Nessie, table files go through Iceberg's S3FileIO, and Hadoop S3A is
# configured separately for direct s3a:// reads.
conf = (
    pyspark.SparkConf()
        .setAppName('app_name')
        # Comma-separated Maven coordinates resolved when the session starts.
        .set(
            'spark.jars.packages',
            'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.4.2,'
            'org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.83.1,'
            'software.amazon.awssdk:bundle:2.17.178,'
            'software.amazon.awssdk:url-connection-client:2.17.178,'
            'org.slf4j:slf4j-simple:2.0.7,'
            'org.apache.hadoop:hadoop-aws:3.3.4,'
            'com.amazonaws:aws-java-sdk-bundle:1.12.365'
        )
        .set(
            'spark.sql.extensions',
            'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,'
            'org.projectnessie.spark.extensions.NessieSparkSessionExtensions'
        )

        # Nessie catalog
        .set('spark.sql.catalog.nessie',            'org.apache.iceberg.spark.SparkCatalog')
        .set('spark.sql.catalog.nessie.uri',        NESSIE_URI)
        .set('spark.sql.catalog.nessie.ref',        'main')
        .set('spark.sql.catalog.nessie.authentication.type', 'NONE')
        .set('spark.sql.catalog.nessie.catalog-impl','org.apache.iceberg.nessie.NessieCatalog')

        # Iceberg S3FileIO (AWS SDK) → must supply keys + region.
        # The property names have to be the ones Iceberg defines:
        # `s3.access-key-id`, `s3.secret-access-key` and `client.region`.
        # Other spellings (`s3.access-key`, `s3.secret-key`, `s3.region`) are not
        # read, which leaves the SDK to fall back on whatever it finds in the
        # process environment and makes the region default below ineffective.
        .set('spark.sql.catalog.nessie.s3.endpoint',          AWS_S3_ENDPOINT)
        .set('spark.sql.catalog.nessie.s3.path-style-access', 'true')
        .set('spark.sql.catalog.nessie.s3.access-key-id',     AWS_ACCESS_KEY_ID)
        .set('spark.sql.catalog.nessie.s3.secret-access-key', AWS_SECRET_ACCESS_KEY)
        .set('spark.sql.catalog.nessie.client.region',        os.environ.get("AWS_REGION", "us-east-1"))

        # Warehouse & IO impl
        .set('spark.sql.catalog.nessie.warehouse', WAREHOUSE)
        .set('spark.sql.catalog.nessie.io-impl',   'org.apache.iceberg.aws.s3.S3FileIO')

        # Hadoop S3A (for any s3a:// URI read outside the Iceberg catalog)
        .set('spark.hadoop.fs.s3a.endpoint',               AWS_S3_ENDPOINT)
        .set('spark.hadoop.fs.s3a.path.style.access',      'true')
        .set('spark.hadoop.fs.s3a.connection.ssl.enabled', 'false')
        .set('spark.hadoop.fs.s3a.access.key',             AWS_ACCESS_KEY_ID)
        .set('spark.hadoop.fs.s3a.secret.key',             AWS_SECRET_ACCESS_KEY)
)

## Start Spark Session
# Build the session from the configuration assembled above.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)
print("Spark Running")


## Create a table, insert some data, then query the data back.
## The statements run in this order and each result is rendered with .show().
smoke_test_statements = (
    "CREATE OR REPLACE TABLE nessie.names (name STRING) USING iceberg;",
    "INSERT INTO nessie.names VALUES ('Alex Merced'), ('Dipankar Mazumdar'), ('Jason Hughes')",
    "SELECT * FROM nessie.names;",
)
for statement in smoke_test_statements:
    spark.sql(statement).show()

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12 added as a dependency
software.amazon.awssdk#bundle added as a dependency
software.amazon.awssdk#url-connection-client added as a dependency
org.slf4j#slf4j-simple added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-5a83bb84-b7c0-4387-8455-3b4b7666d5fc;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.4.2 in central
	found org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12;0.83.1 in central
	found software.amazon.awssdk#bundle;2.17.178 in central
	found software.amazon.eventstream#eventstream;1.0.1 in central
	found software.amazon.awssdk#url-connection-clien

Spark Running


SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.


++
||
++
++



                                                                                

++
||
++
++

+-----------------+
|             name|
+-----------------+
|      Alex Merced|
|Dipankar Mazumdar|
|     Jason Hughes|
+-----------------+



In [2]:
# List what the nessie catalog currently contains.
nessie_tables = spark.sql("SHOW TABLES FROM nessie;")
nessie_tables.show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|         |    names|      false|
+---------+---------+-----------+



In [3]:
# Load the flights Parquet file from its s3a:// location and preview
# the first five rows.
flights_path = "s3a://seed/flights-1m.parquet"
df = spark.read.parquet(flights_path)
df.show(n=5)

25/04/18 07:23:54 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


+----------+---------+---------+--------+--------+---------+---------+
|   FL_DATE|DEP_DELAY|ARR_DELAY|AIR_TIME|DISTANCE| DEP_TIME| ARR_TIME|
+----------+---------+---------+--------+--------+---------+---------+
|2006-01-01|        5|       19|     350|    2475| 9.083333|12.483334|
|2006-01-02|      167|      216|     343|    2475|11.783334|15.766666|
|2006-01-03|       -7|       -2|     344|    2475| 8.883333|12.133333|
|2006-01-04|       -5|      -13|     331|    2475| 8.916667|    11.95|
|2006-01-05|       -3|      -17|     321|    2475|     8.95|11.883333|
+----------+---------+---------+--------+--------+---------+---------+
only showing top 5 rows

