In [1]:
from confluent_kafka.schema_registry import SchemaRegistryClient
from libs.configuration import configure
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.avro import functions as AF

from shared.spark_config import create_spark_config

# Load the environment settings object; later cells read the Kafka and
# Schema Registry endpoints from its attributes.
# NOTE(review): the saved output of this cell contains storage and database
# credentials in plain text -- clear the output before sharing or committing.
env = configure()

# Build the Spark configuration for this job. The identifier previously said
# "airline", but this notebook reads the raw *aircraft* topic and appends to
# dev.raw.aircrafts, so the label is aligned with what the job actually does.
# (Presumably this string is the Spark application name -- verify against
# shared.spark_config.create_spark_config.)
conf = create_spark_config("M2_Processors.aircraft.raw")

{'DATASTORAGE_AZURE_ACCESSKEY': 'dYj3PCGRyYHYR9pRo2xPa9BQ7t/sUonACJ29wcPdpa+1IC70rMbZmpfpHJJWJtk7OeASL+eQDIWq+AStGg3CCA==',
 'DATASTORAGE_AZURE_ACCOUNTNAME': 'icebergpe6t5cug7x9f45p8',
 'KAFKA_BOOTSTRAP_SERVERS': 'localhost:9092',
 'KAFKA_SCHEMA_REGISTRY_URL': 'http://localhost:8081',
 'KAFKA_SESSION_TIMEOUT_MS': '45000',
 'KAFKA_TOPIC_RAW_AIRCRAFT': 'raw.aircraft',
 'KAFKA_TOPIC_RAW_AIRLINE': 'raw.airline',
 'KAFKA_TOPIC_RAW_AIRPORT': 'raw.airport',
 'KAFKA_TOPIC_RAW_FLIGHT': 'raw.flight',
 'KAFKA_TOPIC_T1_AIRCRAFT': 't1.aircraft',
 'KAFKA_TOPIC_T1_AIRLINE': 't1.airline',
 'KAFKA_TOPIC_T1_AIRPORT': 't1.airport',
 'KAFKA_TOPIC_T1_FLIGHT': 't1.flight',
 'KAFKA_TOPIC_T2_AIRCRAFT': 't2.aircraft',
 'KAFKA_TOPIC_T2_AIRLINE': 't2.airline',
 'KAFKA_TOPIC_T2_AIRPORT': 't2.airport',
 'KAFKA_TOPIC_T2_FLIGHT': 't2.flight',
 'POSTGRE_JDBC_URL': 'jdbc:postgresql://localhost:5432/iceberg-catalog',
 'POSTGRE_PASSWORD': 'grebeci',
 'POSTGRE_USER': 'iceberg',
 'SPARK_MASTER_URL': 'local[1]',
 'file': '

In [2]:
# Obtain the Spark session for this notebook, configured with `conf`.
# getOrCreate() hands back an already-running session when one exists.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [3]:
# Connect to the Schema Registry and look up the newest registered schema for
# the raw aircraft topic's value subject ("<topic>-value").
sr_client = SchemaRegistryClient(dict(url=env.KAFKA_SCHEMA_REGISTRY_URL))
in_schema = sr_client.get_latest_version(
    "{}-value".format(env.KAFKA_TOPIC_RAW_AIRCRAFT)
)

# Display the schema definition string that the decode step consumes.
in_schema.schema.schema_str

'{"type":"record","name":"RawAircraft","namespace":"neot.de.rbearthgaze","doc":"Typical Aircraft message after crawling from https://doc8643.com.","fields":[{"name":"icao","type":"string","doc":"ICAO 4 letters"},{"name":"classification","type":"string","doc":""},{"name":"category","type":"string","doc":""},{"name":"manufacturers","type":{"type":"array","items":"string"},"doc":""},{"name":"wing_span","type":["string","null"],"doc":"float: Wing Span (m)"},{"name":"length","type":["string","null"],"doc":"float: Length (m)"},{"name":"height","type":["string","null"],"doc":"float: Height (m)"},{"name":"mtow","type":["string","null"],"doc":"float: MTOW (t)"},{"name":"fuel_capacity","type":["string","null"],"doc":"int: Fuel Capacity (ltr)"},{"name":"maximum_range","type":["string","null"],"doc":"int: Maximum Range (Nm)"},{"name":"persons_on_board","type":["string","null"],"doc":"int: Persons On Board"},{"name":"take_off_distance","type":["string","null"],"doc":"int: Take Off Distance (m)"},{"

In [4]:
# Batch-read (spark.read, not readStream) the raw aircraft topic from Kafka,
# starting at the earliest available offsets.
df = (
    spark.read.format("kafka")
    .options(
        **{
            "kafka.bootstrap.servers": env.KAFKA_BOOTSTRAP_SERVERS,
            "subscribe": env.KAFKA_TOPIC_RAW_AIRCRAFT,
            "startingOffsets": "earliest",
            # Currently disabled options, kept for reference:
            # "minOffsetsPerTrigger": 20,
            # "maxOffsetsPerTrigger": 20,
            # "fetchOffset.numRetries": 0,
            # "endingOffsets": "latest",
        }
    )
    .load()
)

In [5]:
# Decode the Kafka message payload into typed columns.
#   1. Keep the payload from position 6 onward -- presumably this drops the
#      5-byte Confluent wire-format header (magic byte + 4-byte schema id)
#      that precedes the Avro body; TODO confirm against the producer.
#   2. Parse the remaining bytes as Avro using the schema fetched from the
#      Schema Registry.
#   3. Flatten the decoded struct so every field becomes a top-level column.
# Note: this cell replaces `df` with the decoded frame, so it expects `df` to
# still hold the raw Kafka rows (with a `value` column) when it runs.
df = (
    df.selectExpr("substring(value, 6) as value")
    .select(AF.from_avro("value", in_schema.schema.schema_str).alias("d"))
    .select("d.*")
)

# Preview the decoded rows.
df.show()

+----+--------------+--------+--------------------+---------+------+------+----+-------------+-------------+----------------+-----------------+----------------+----------------+---------------+-------------+-------------+------------------+
|icao|classification|category|       manufacturers|wing_span|length|height|mtow|fuel_capacity|maximum_range|persons_on_board|take_off_distance|landing_distance|absolute_ceiling|optimum_ceiling|maximum_speed|optimum_speed|maximum_climb_rate|
+----+--------------+--------+--------------------+---------+------+------+----+-------------+-------------+----------------+-----------------+----------------+----------------+---------------+-------------+-------------+------------------+
|A002|           G1P|     L/G|       [IRKUT A-002]|        -|   5.9|   3.3| 0.9|             |          270|               3|               90|              10|             100|             80|          113|             |              1100|
|  A1|           L1P|     M/G|[DOUGL

In [6]:
# Add a `created_ts` column holding Spark's current timestamp
# (an existing column with that name would be replaced).
df = df.withColumn(
    colName="created_ts",
    col=F.current_timestamp(),
)

# Preview the frame including the new column.
df.show()

+----+--------------+--------+--------------------+---------+------+------+----+-------------+-------------+----------------+-----------------+----------------+----------------+---------------+-------------+-------------+------------------+--------------------+
|icao|classification|category|       manufacturers|wing_span|length|height|mtow|fuel_capacity|maximum_range|persons_on_board|take_off_distance|landing_distance|absolute_ceiling|optimum_ceiling|maximum_speed|optimum_speed|maximum_climb_rate|          created_ts|
+----+--------------+--------+--------------------+---------+------+------+----+-------------+-------------+----------------+-----------------+----------------+----------------+---------------+-------------+-------------+------------------+--------------------+
|A002|           G1P|     L/G|       [IRKUT A-002]|        -|   5.9|   3.3| 0.9|             |          270|               3|               90|              10|             100|             80|          113|       

In [7]:
# Append the decoded rows to the dev.raw.aircrafts table.
# Each run of this cell appends again, so re-running it adds the rows a second time.
(
    df.writeTo("dev.raw.aircrafts")
    .append()
)