In [None]:
from confluent_kafka.schema_registry import SchemaRegistryClient
from libs.configuration import configure
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.avro import functions as AF

from shared.spark_config import create_spark_config

# Project settings object; later cells read Kafka, schema-registry and storage
# attributes from it (e.g. env.KAFKA_BOOTSTRAP_SERVERS).
env = configure()
# Spark configuration handed to SparkSession.builder.config(conf=...) below.
# NOTE(review): the string argument presumably identifies this processor/app — confirm in shared.spark_config.
conf = create_spark_config("M2_Processors.flight_signal.raw")

In [None]:
# Connect to the schema registry and fetch the latest registered version of the
# value schema for the raw flight-signal topic.
registry_settings = {"url": env.KAFKA_SCHEMA_REGISTRY_URL}
sr_client = SchemaRegistryClient(registry_settings)
value_subject = f"{env.KAFKA_TOPIC_RAW_FLIGHT_SIGNAL}-value"
in_schema = sr_client.get_latest_version(value_subject)

# Cell output: the schema definition string that from_avro uses further down.
in_schema.schema.schema_str

In [None]:
# Create the Spark session (or reuse an existing one) from the prepared config.
session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()

In [None]:
# Streaming source: subscribe to the raw flight-signal topic on the configured
# Kafka brokers. Nothing is consumed until a writeStream query is started.
kafka_source_options = {
    "kafka.bootstrap.servers": env.KAFKA_BOOTSTRAP_SERVERS,
    "subscribe": env.KAFKA_TOPIC_RAW_FLIGHT_SIGNAL,
}
df = spark.readStream.format("kafka").options(**kafka_source_options).load()

In [None]:
# Keep only the Kafka `value` column, dropping its first 5 bytes (SQL substring
# is 1-based, so position 6 is the sixth byte).
# NOTE(review): presumably this skips the Confluent wire-format header
# (magic byte + 4-byte schema id) — confirm against the producer.
avro_payload = df.selectExpr("substring(value, 6) as value")

# Decode the remaining bytes with the registry schema fetched earlier, then
# flatten the resulting struct into top-level columns.
avro_schema_json = in_schema.schema.schema_str
decoded_record = AF.from_avro("value", avro_schema_json).alias("d")
df = avro_payload.select(decoded_record).select("d.*")

In [None]:
# Add a `created_ts` column populated with Spark's current timestamp.
created_at = F.current_timestamp()
df = df.withColumn("created_ts", created_at)

In [None]:
# Checkpoint directory for this query, in the `warehouse` container of the
# configured Azure storage account, keyed by the source topic name.
checkpoint_path = f"abfss://warehouse@{env.DATASTORAGE_AZURE_ACCOUNTNAME}.dfs.core.windows.net/_checkpoints/{env.KAFKA_TOPIC_RAW_FLIGHT_SIGNAL}"

# Append-only Iceberg sink, triggered every 10 seconds.
iceberg_writer = (
    df.writeStream.format("iceberg")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_path)
    .trigger(processingTime="10 seconds")
)

# toTable() starts the streaming query; the returned handle is stopped in the
# final cell.
write_stream = iceberg_writer.toTable("dev.raw.flight_signals")

In [None]:
# Start a second query on the same DataFrame that writes to the console sink.
console_sink = df.writeStream.format("console")
console_write_stream = console_sink.start()

In [None]:
# Shut down both streaming queries started above: first the Iceberg writer,
# then the console writer.
write_stream.stop()
console_write_stream.stop()