In [None]:
from confluent_kafka.schema_registry import SchemaRegistryClient
from libs.configuration import configure
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.avro import functions as AF

from shared.spark_config import create_spark_config

# Load the runtime settings object; later cells read the Kafka bootstrap servers,
# the schema-registry URL and the raw aircraft topic name from `env`.
env = configure()
# Build the Spark configuration that is handed to SparkSession.builder.
# NOTE(review): the string argument looks like a job/application identifier —
# confirm against shared.spark_config.create_spark_config.
conf = create_spark_config("M2_Processors.aircraft_model.raw")

In [None]:
# Configure the builder with `conf`, then obtain the session
# (getOrCreate returns the already-active session when there is one).
session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()

In [None]:
# Connect to the schema registry configured in `env`.
registry_settings = {"url": env.KAFKA_SCHEMA_REGISTRY_URL}
sr_client = SchemaRegistryClient(registry_settings)

# Look up the latest registered version under the "<topic>-value" subject.
value_subject = f"{env.KAFKA_TOPIC_RAW_AIRCRAFT}-value"
in_schema = sr_client.get_latest_version(value_subject)

# Show the schema definition string as the cell output.
in_schema.schema.schema_str

In [None]:
# Batch read (spark.read, not readStream) of the raw aircraft topic,
# starting from the earliest available offset.
kafka_options = {
    "kafka.bootstrap.servers": env.KAFKA_BOOTSTRAP_SERVERS,
    "subscribe": env.KAFKA_TOPIC_RAW_AIRCRAFT,
    "startingOffsets": "earliest",
}
df = spark.read.format("kafka").options(**kafka_options).load()

In [None]:
# Decode the Kafka message values into top-level columns.
#
# SQL substring() is 1-based, so starting at position 6 drops the first 5 bytes
# of every value before Avro decoding.
# NOTE(review): presumably those 5 bytes are the Confluent wire-format header
# (magic byte + 4-byte schema id) — confirm against the producer.
# NOTE(review): every record is decoded with the latest registered schema
# (`in_schema`), regardless of the schema version it was written with —
# confirm this is acceptable when the schema evolves.
avro_schema_json = in_schema.schema.schema_str
df = (
    df.selectExpr("substring(value, 6) as value")
    .select(AF.from_avro("value", avro_schema_json).alias("d"))
    .select("d.*")  # flatten the decoded struct into one column per field
)

# Preview the decoded rows.
df.show()

In [None]:
# Add (or replace) a `created_ts` column holding Spark's current timestamp.
# NOTE(review): current_timestamp() is evaluated when a query runs, so the values
# previewed here will presumably differ from those produced by a later action
# on `df` — confirm that is intended.
df = df.withColumn("created_ts", F.current_timestamp())
# Preview the rows with the new column.
df.show()

In [None]:
# Append the decoded rows to the target table.
# NOTE(review): the table identifier (including the `dev` prefix) is hard-coded
# rather than taken from `env` — confirm this is intended for every environment.
# NOTE(review): the source is read from the earliest offset, so re-running the
# notebook presumably appends the same records again — confirm that
# de-duplication is handled elsewhere.
target_table = "dev.raw.aircraft_models"
df.writeTo(target_table).append()