In [1]:
# Point findspark at the Spark installation under /opt/spark/ so that the
# pyspark imports in the following cells resolve against that installation.
import findspark
findspark.init("/opt/spark/")

In [2]:
from pyspark.sql import SparkSession, functions as F

In [3]:
import os

# Base URL of the Nessie REST API (v1) used by the Iceberg catalog
url = "http://nessie:19120/api/v1"
# Warehouse location where the Nessie-managed tables are stored
full_path_to_warehouse = 's3a://warehouse'
# The ref or context that Nessie will operate on (if different from the default branch).
# Can be the name of a Nessie branch or a tag name.
ref = "main"
# Nessie authentication type (NONE, BEARER, OAUTH2 or AWS)
auth_type = "NONE"

# Endpoint of the S3-compatible object store that backs the s3a:// warehouse
s3_endpoint = "http://minio:9000"

# Object-store credentials. Prefer values supplied through the environment so
# that real secrets never have to be committed with the notebook; fall back to
# the local demo defaults when the variables are unset or empty.
accessKeyId = os.environ.get("AWS_ACCESS_KEY_ID") or 'minioadmin'
secretAccessKey = os.environ.get("AWS_SECRET_ACCESS_KEY") or 'minioadmin'

In [4]:
# Find compatible versions here: https://projectnessie.org/tools/iceberg/spark/
# Maven coordinates resolved by Ivy when the session starts.
# NOTE(review): hadoop-aws normally has to match the Hadoop version bundled
# with the Spark distribution - confirm that 3.3.0 is the intended version.
# NOTE(review): delta-core is downloaded, but no Delta extension or catalog is
# configured in this cell - confirm whether it is still needed.
spark_packages = ",".join([
    "org.apache.hadoop:hadoop-aws:3.3.0",
    "io.delta:delta-core_2.12:2.4.0",
    "org.apache.iceberg:iceberg-spark-runtime-3.4_2.12:1.3.0",
    "org.projectnessie.nessie-integrations:nessie-spark-extensions-3.4_2.12:0.75.0",
])

# SQL extensions for Iceberg and Nessie, registered as one comma-separated list.
sql_extensions = ",".join([
    "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    "org.projectnessie.spark.extensions.NessieSparkSessionExtensions",
])

# Every session option in one place, applied in insertion order below.
spark_conf = {
    "spark.driver.memory": "2g",
    "spark.jars.packages": spark_packages,
    # s3a filesystem settings for the object store
    "spark.hadoop.fs.s3a.access.key": accessKeyId,
    "spark.hadoop.fs.s3a.secret.key": secretAccessKey,
    "spark.hadoop.fs.s3a.path.style.access": True,
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.endpoint": s3_endpoint,
    "spark.sql.extensions": sql_extensions,
    # Catalog named "nessie": an Iceberg SparkCatalog backed by NessieCatalog
    "spark.sql.catalog.nessie.uri": url,
    "spark.sql.catalog.nessie.ref": ref,
    "spark.sql.catalog.nessie.authentication.type": auth_type,
    "spark.sql.catalog.nessie.catalog-impl": "org.apache.iceberg.nessie.NessieCatalog",
    "spark.sql.catalog.nessie.warehouse": full_path_to_warehouse,
    "spark.sql.catalog.nessie": "org.apache.iceberg.spark.SparkCatalog",
}

builder = SparkSession.builder.master("local[2]").appName("Spark Nessie Iceberg Demo")
for conf_key, conf_value in spark_conf.items():
    builder = builder.config(conf_key, conf_value)

spark = builder.getOrCreate()

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
io.delta#delta-core_2.12 added as a dependency
org.apache.iceberg#iceberg-spark-runtime-3.4_2.12 added as a dependency
org.projectnessie.nessie-integrations#nessie-spark-extensions-3.4_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c7c2c9a8-e626-42ba-b6f7-793140e86de8;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.0 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.563 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.apache.iceberg#iceberg-spark-runtime-3.4_2.12;1.3.0 in central
	found org.projectnessie.nessie-integrations#nessie-spark-extensions-3.4_2.12;0.75.0 in central
:: resolution repor

In [5]:
# Display the running Spark version; it should line up with the "3.4" suffix of
# the Iceberg and Nessie runtime packages requested when the session was built.
spark.version

'3.4.1'

In [17]:
from pyspark import SparkFiles

sc = spark.sparkContext
github_url = "https://raw.githubusercontent.com/erkansirin78/datasets/master/Churn_Modelling.csv"

# Fetch the CSV through Spark's file distribution mechanism. Re-running this
# cell in the same session only logs a "has been added already" warning.
sc.addFile(github_url)

# Derive the file name from the URL instead of repeating it as a second
# literal, so that changing `github_url` cannot leave a stale name behind.
csv_name = github_url.rsplit("/", 1)[-1]

# SparkFiles.get returns a plain local path; the explicit file:// scheme makes
# the read independent of whatever default filesystem the session is using.
df = spark.read.csv("file://" + SparkFiles.get(csv_name), header=True, inferSchema=True)
df.show(3)

23/12/26 06:34:58 WARN SparkContext: The path https://raw.githubusercontent.com/erkansirin78/datasets/master/Churn_Modelling.csv has been added already. Overwriting of added paths is not supported in the current version.


+---------+----------+--------+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|RowNumber|CustomerId| Surname|CreditScore|Geography|Gender|Age|Tenure| Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+---------+----------+--------+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|        1|  15634602|Hargrave|        619|   France|Female| 42|     2|     0.0|            1|        1|             1|      101348.88|     1|
|        2|  15647311|    Hill|        608|    Spain|Female| 41|     1|83807.86|            1|        0|             1|      112542.58|     0|
|        3|  15619304|    Onio|        502|   France|Female| 42|     8|159660.8|            3|        1|             0|      113931.57|     1|
+---------+----------+--------+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+

# Create a namespace in the Nessie catalog

In [18]:
# Create the `demo` namespace inside the `nessie` catalog; IF NOT EXISTS keeps
# the cell safe to re-run.
spark.sql("CREATE NAMESPACE IF NOT EXISTS nessie.demo;")

DataFrame[]

# Create a table in the Nessie namespace

In [25]:
# Remove any table left over from a previous run so that the create step that
# follows does not fail because the table already exists.
spark.sql("DROP TABLE IF EXISTS nessie.demo.churn;")

DataFrame[]

In [26]:
# Create the table from an empty DataFrame that reuses the schema of `df`, so
# the table is defined here without writing any rows.
spark.createDataFrame([], df.schema).writeTo("nessie.demo.churn").create()
# Alternative kept for reference: create the table directly from `df` instead
# of from an empty frame.
# df.writeTo("nessie.demo.churn").create()


# Write the DataFrame to the Iceberg table

In [27]:
# Write the contents of `df` to the Iceberg table in "overwrite" mode.
(
    df.write
    .format("iceberg")
    .mode("overwrite")
    .save("nessie.demo.churn")
)

# Read table

In [28]:
# Read the table back through the `nessie` catalog as a DataFrame.
df_from_iceberg = spark.table("nessie.demo.churn")

In [29]:
# Print the leading rows of the table (show() defaults to 20 rows) to check
# that the data written above can be read back.
df_from_iceberg.show()

+---------+----------+---------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|RowNumber|CustomerId|  Surname|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+---------+----------+---------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|        1|  15634602| Hargrave|        619|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|
|        2|  15647311|     Hill|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|
|        3|  15619304|     Onio|        502|   France|Female| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|
|        4|  15701354|     Boni|        699|   France|Female| 39|     1|      0.0|            2|        0|             0|       93