In [None]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import to_json, struct, col, expr, row_number, from_json, get_json_object, explode, when
from pyspark.sql.types import StructType, StringType, IntegerType, MapType, StructField
from pyspark.sql.window import Window

# Create the Spark session, configured to run against Kubernetes in client mode.
#
# The S3A credentials are read from the driver's environment instead of being
# committed in the notebook. Both variables must be set before this cell runs;
# a missing one raises KeyError naming the variable.
# NOTE(review): the key pair that was previously hard-coded here should be
# treated as leaked and rotated.
s3a_access_key = os.environ["AWS_ACCESS_KEY_ID"]
s3a_secret_key = os.environ["AWS_SECRET_ACCESS_KEY"]

spark = (
    SparkSession.builder
    .appName("JupyterSparkApp")
    .master("k8s://https://192.168.1.150:6443")
    .config("spark.submit.deployMode", "client")
    .config("spark.driver.host", "spark-driver-headless.default.svc.cluster.local")
    .config("spark.driver.port", "7077")
    .config("spark.driver.bindAddress", "0.0.0.0")
    .config("spark.executor.instances", "2")
    .config("spark.kubernetes.container.image", "docker.io/bitnami/spark:3.5.6")
    .config("spark.kubernetes.executor.deleteOnTermination", "true")
    # Iceberg runtime, Nessie extensions, S3A (hadoop-aws + AWS SDK) and the Kafka source.
    .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.2,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.103.3,org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262,org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions")
    # Catalog named "nessie": an Iceberg SparkCatalog backed by NessieCatalog on ref "main".
    .config("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.nessie.uri", "http://nessie.nessie-ns.svc.cluster.local:19120/api/v1")
    .config("spark.sql.catalog.nessie.ref", "main")
    .config("spark.sql.catalog.nessie.authentication.type", "NONE")
    .config("spark.sql.catalog.nessie.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog")
    .config("spark.sql.catalog.nessie.warehouse", "s3a://synthetic")
    # S3A access to the MinIO endpoint; credentials come from the environment (see above).
    .config("spark.hadoop.fs.s3a.access.key", s3a_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", s3a_secret_key)
    .config("spark.hadoop.fs.s3a.endpoint", "http://myminio-hl.minio-tenant.svc.cluster.local:9000")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .getOrCreate()
)

In [None]:
# Reset step: remove the bronze table if it exists.
# NOTE(review): the streaming checkpoint under
# s3a://synthetic/checkpoints/orders_bronze is not cleared here — confirm whether
# a dropped table is expected to be refilled from the earliest offsets.
drop_bronze_sql = "DROP TABLE IF EXISTS nessie.orders_bronze"
spark.sql(drop_bronze_sql)

In [None]:
# Bronze layer: open a streaming read on the Kafka topic, starting from the
# earliest available offsets.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "my-cluster-kafka-bootstrap.kafka.svc.cluster.local:9092")
    .option("subscribe", "mongo.synthetic.orders")
    .option("startingOffsets", "earliest")
    .load()
)

# Project the columns written to the bronze table: key and value are cast to
# string, the remaining columns are passed through unchanged.
passthrough_columns = ["topic", "partition", "offset", "timestamp", "timestampType"]
df_iceberg = df.select(
    col("key").cast("string"),
    col("value").cast("string"),
    *passthrough_columns,
)

# DDL for the bronze table: one column per field selected into df_iceberg,
# stored as an Iceberg v2 table with Parquet data files, partitioned by the day
# of `timestamp`. IF NOT EXISTS makes re-running this statement a no-op once the
# table is present.
create_bronze_ddl = """
CREATE TABLE IF NOT EXISTS nessie.orders_bronze (
    key STRING,
    value STRING,
    topic STRING,
    partition INT,
    offset LONG,
    timestamp TIMESTAMP,
    timestampType INT
)
USING iceberg
PARTITIONED BY (days(timestamp))
LOCATION 's3a://synthetic/orders_bronze'
TBLPROPERTIES (
    'format-version'='2',
    'write.format.default'='parquet'
)
"""
spark.sql(create_bronze_ddl)

# Start the streaming write: append every micro-batch of df_iceberg to the
# Iceberg bronze table, tracking progress in the S3A checkpoint location.
query = (
    df_iceberg.writeStream
    .format("iceberg")
    .outputMode("append")
    .option("checkpointLocation", "s3a://synthetic/checkpoints/orders_bronze")
    .toTable("nessie.orders_bronze")
)

# awaitTermination() blocks this cell until the query ends or fails, or the
# kernel is interrupted. Interrupting only unblocks the Python side, so stop the
# query explicitly on the way out; otherwise it keeps running in the background
# and still holds the checkpoint when this cell is re-run. Any exception
# (including KeyboardInterrupt) still propagates after the stop.
try:
    query.awaitTermination()
finally:
    query.stop()

In [None]:
# Preview the bronze table: print up to 100 rows.
# Note: this rebinds `query`, which the streaming cell used for its
# StreamingQuery handle, to a DataFrame.
bronze_preview_sql = """
       SELECT * FROM nessie.orders_bronze
    """
query = spark.sql(bronze_preview_sql)
query.show(n=100)