In [None]:
import logging

from confluent_kafka.schema_registry import SchemaRegistryClient
from libs.configuration import configure
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.avro import functions as AF

from shared.spark_config import create_spark_config

# Name passed to SparkConf.setAppName() below.
__MODULE = "M2_Processors.airline.raw"

# Logger used by the notebook cells.
logger = logging.getLogger("notebook")

# Settings object returned by the project's configure() helper; later cells
# read the Kafka and schema-registry endpoints from its attributes.
env = configure()

# Project-provided Spark configuration, tagged with this notebook's app name.
conf = (
    create_spark_config()
    .setAppName(__MODULE)
)

In [None]:
# Build the session from `conf`; getOrCreate() returns the already-active
# session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [None]:
# Client for the schema registry configured in the environment settings.
registry_settings = {"url": env.KAFKA_SCHEMA_REGISTRY_URL}
sr_client = SchemaRegistryClient(registry_settings)

# Look up the newest schema registered under the "<topic>-value" subject of the
# raw airline topic.
value_subject = f"{env.KAFKA_TOPIC_RAW_AIRLINE}-value"
in_schema = sr_client.get_latest_version(value_subject)

# Display the registered schema as the cell output.
in_schema

In [None]:
# Streaming source: subscribe to the raw airline topic and replay it from the
# earliest available offset. The rows use Spark's Kafka source layout; the
# binary `value` column is decoded with from_avro in a later cell.
# Per-trigger rate limits (minOffsetsPerTrigger / maxOffsetsPerTrigger) are
# left at their defaults.
df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", env.KAFKA_BOOTSTRAP_SERVERS)
    .option("subscribe", env.KAFKA_TOPIC_RAW_AIRLINE)
    .option("startingOffsets", "earliest")
    .load()
)

# show() is a batch-only action: on a streaming DataFrame it raises
# AnalysisException ("Queries with streaming sources must be executed with
# writeStream.start()"). Inspect the schema instead, which needs no query.
df.printSchema()

In [None]:
# Forward the Kafka source rows, as read, to the T1 airline topic.
# NOTE(review): no checkpointLocation is set here. Spark's Kafka sink needs one,
# either per query or via spark.sql.streaming.checkpointLocation — confirm that
# create_spark_config() provides it.
# NOTE(review): trigger(once=True) is deprecated in recent Spark releases in
# favour of trigger(availableNow=True) — consider switching if supported.
kafka_sink_options = {
    "kafka.bootstrap.servers": env.KAFKA_BOOTSTRAP_SERVERS,
    "topic": env.KAFKA_TOPIC_T1_AIRLINE,
}
kafka_write_stream = (
    df.writeStream.format("kafka")
    .options(**kafka_sink_options)
    .trigger(once=True)
    .start()
)

In [None]:
# Decode the Kafka `value` column with the latest schema fetched from the
# schema registry, then promote the fields of the decoded struct to top-level
# columns.
# NOTE(review): from_avro expects plain Avro bytes. If the producer uses a
# Confluent serializer, every value starts with a 5-byte header (magic byte +
# schema id) that has to be dropped first, e.g. F.expr("substring(value, 6)")
# — confirm against the producer.
df = df.select(AF.from_avro("value", in_schema.schema.schema_str).alias("unflattened"))
df = df.select("unflattened.*")

# `df` is still a streaming DataFrame, so show() would raise AnalysisException;
# print the decoded schema instead.
df.printSchema()

In [None]:
# Append the current `df` stream to the Iceberg table dev.raw.airlines with a
# single one-shot trigger and the Iceberg fanout writer enabled.
# NOTE(review): no checkpointLocation is set on this query — confirm that it is
# supplied through the Spark configuration.
iceberg_writer = (
    df.writeStream.format("iceberg")
    .outputMode("append")
    .option("fanout-enabled", "true")
    .trigger(once=True)
)

# toTable() starts the streaming query and returns its handle.
iceberg_write_stream = iceberg_writer.toTable("dev.raw.airlines")