In [4]:
from pyspark.sql import SparkSession

# Maven coordinates downloaded when the Spark session starts.
# Only the artifacts this notebook actually uses are listed:
#   * spark-sql-kafka-0-10 -> the Structured Streaming "kafka" source
#     (it brings its own compatible kafka-clients as a transitive dependency)
#   * spark-avro           -> from_avro()
#   * delta-spark          -> Delta Lake extension/catalog configured below
# All artifacts are the Scala 2.12 builds; mixing in `_2.13` artifacts (or the
# Kafka broker jar) puts binary-incompatible classes on the classpath.
sparkPackages = [
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1",
    "org.apache.spark:spark-avro_2.12:3.5.1",
    "io.delta:delta-spark_2.12:3.2.0",
]

# NOTE: getOrCreate() returns an already-running session if there is one, in
# which case the package list above is presumably not re-applied -- restart the
# kernel after editing it.
spark = (
    SparkSession.builder.appName("ingest-kafka-data")
    .config("spark.jars.packages", ",".join(sparkPackages))
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

In [6]:
# --- Sink -------------------------------------------------------------------
# Catalog name of the table the stream is written to.
target_table = "default.events_delta"

# --- Source -----------------------------------------------------------------
# Kafka broker address and the topic the events are read from.
kafka_topic = "sv-uploads-default-topic"
kafka_bootstrap_server = "kafka:9092"

# Avro writer schema (JSON form) used to decode the Kafka message value.
# An Event record has: id, name, props (map of string -> string, default {}),
# serverTimestamp and clientTimestamp (both long).
json_avro_schema = """{
  "namespace": "com.tfgco.eventsgateway",
  "type": "record",
  "name": "Event",
  "fields": [
    {
      "name": "id",
      "type": "string"
    },
    {
      "name": "name",
      "type": "string"
    },
    {
      "name": "props",
      "default": {},
      "type": {
        "type": "map",
        "values": "string"
      }
    },
    {
      "name": "serverTimestamp",
      "type": "long"
    },
    {
      "name": "clientTimestamp",
      "type": "long"
    }
  ]
}"""

In [7]:
# Streaming DataFrame over the Kafka topic, starting from the earliest
# available offsets and including the record headers.
source_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_bootstrap_server)
    .option("subscribe", kafka_topic)
    .option("startingOffsets", "earliest")
    # The Kafka consumer property is `group.id` (with a dot); the previous
    # `kafka.group_id` key is not a Kafka setting, so the group id was never
    # applied. A fixed group id means other consumers using the same id can
    # interfere with this query -- keep it unique to this notebook.
    .option("kafka.group.id", "stream-spark-kafka")
    .option("includeHeaders", True)
    .load()
)

In [8]:
from pyspark.sql.functions import date_format, to_date, from_unixtime, col
from pyspark.sql.avro.functions import from_avro

# Decode the Avro-encoded Kafka `value` and flatten it into the output columns:
# id, name, props, clienttimestamp, servertimestamp, year, month, day.
df = (
    source_df
    # 1) Decode the payload into a single struct column.
    .select(from_avro("value", json_avro_schema).alias("event"))
    # 2) Pull the event fields out of the struct and derive the calendar date
    #    from the client timestamp (divided by 1000 before from_unixtime).
    .select(
        col("event.id").alias("id"),
        col("event.name").alias("name"),
        col("event.props").alias("props"),
        col("event.clientTimestamp").alias("clienttimestamp"),
        col("event.serverTimestamp").alias("servertimestamp"),
        to_date(from_unixtime(col("event.clientTimestamp") / 1000)).alias("date"),
    )
    # 3) Replace the helper `date` column with its year / month / day parts.
    .select(
        "id",
        "name",
        "props",
        "clienttimestamp",
        "servertimestamp",
        date_format("date", "yyyy").alias("year"),
        date_format("date", "MM").alias("month"),
        date_format("date", "dd").alias("day"),
    )
)

In [11]:
if not spark.catalog.tableExists(target_table):
    spark.catalog.createTable(tableName=target_table, schema=df.schema, souce='delta', path=f"/tmp/{target_table}")

In [12]:
stream = (
    df
    .writeStream
    .outputMode("append")
    .format("parquet")
    .option("checkpointLocation", "/tmp/")
    .toTable(target_table)
)

In [16]:
# Show the current status of the streaming query (message, isDataAvailable,
# isTriggerActive).
stream.status

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [17]:
# Batch-read the sink table and print its current contents.
preview_query = f"select * from {target_table}"
spark.sql(preview_query).show()

+--------------------+----------+--------------------+---------------+---------------+----+-----+---+
|                  id|      name|               props|clienttimestamp|servertimestamp|year|month|day|
+--------------------+----------+--------------------+---------------+---------------+----+-----+---+
|479958b3-157a-4dd...|test-event|{some-prop -> som...|  1718381790997|  1718381791007|2024|   06| 14|
|cc4871a6-da30-455...|test-event|{some-prop -> som...|  1718381794469|  1718381794473|2024|   06| 14|
|cf7dff53-34e0-45d...|test-event|{some-prop -> som...|  1718381797699|  1718381797703|2024|   06| 14|
+--------------------+----------+--------------------+---------------+---------------+----+-----+---+



In [None]:
# Stop the streaming query held in `stream`.
stream.stop()