In [None]:
from libs.configuration import configure
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

from shared.spark_config import create_spark_config

# Project-level setup. `configure()` comes from libs.configuration and is opaque
# from this notebook; its return value is kept in `env`.
env = configure()
# Configuration object handed to SparkSession.builder.config(conf=...) below.
# NOTE(review): the string looks like a job/application identifier — confirm
# against shared.spark_config.create_spark_config.
conf = create_spark_config("M2_Processors.aircraft_model.tier1")

In [None]:
# Attach the prepared configuration to the builder, then obtain the session
# (an existing session is reused if one is already running).
session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()

In [None]:
# Read the raw aircraft-model table through the Iceberg data source and
# preview the first rows.
iceberg_reader = spark.read.format("iceberg")
df = iceberg_reader.load("dev.raw.aircraft_models")
df.show()

In [None]:
# Inspect the column names and types as loaded from the raw table.
df.schema

In [None]:
# Summary statistics for every column of the freshly loaded frame.
raw_summary = df.describe()
raw_summary.show()

In [None]:
# Columns in this set are not profiled; every other column is checked for
# values containing any character other than a digit or a dot.
profiling_exclusions = {"created_ts", "icao", "classification", "category", "manufacturers"}

for column_name in df.columns:
    if column_name not in profiling_exclusions:
        has_unexpected_char = F.col(column_name).rlike(r"[^\d\.]")
        # describe() over only the offending rows shows how many there are
        # and what their min/max values look like.
        df.select(column_name).where(has_unexpected_char).describe().show()

In [None]:
# Columns in which "-" and the empty string are turned into NULL.
placeholder_columns = [
    "wing_span",
    "length",
    "height",
    "mtow",
    "fuel_capacity",
    "maximum_range",
    "take_off_distance",
    "landing_distance",
    "absolute_ceiling",
    "optimum_ceiling",
    "maximum_climb_rate",
]
# Each key is replaced by its value; None becomes NULL.
missing_markers = {"-": None, "": None}

df = df.replace(to_replace=missing_markers, subset=placeholder_columns)

In [None]:
# For each text column, the pattern matches any character outside the set
# considered valid for it; describe() is shown over the matching rows only.
disallowed_char_patterns = {
    "icao": r"[^a-zA-Z0-9]",
    "classification": r"[^a-zA-Z0-9]",
    "category": r"[^a-zA-Z0-9\/]",
}

for column_name, pattern in disallowed_char_patterns.items():
    offending_rows = df.select(column_name).where(F.col(column_name).rlike(pattern))
    offending_rows.describe().show()

In [None]:
# "-" and the empty string in `classification` are replaced with NULL.
classification_placeholders = {"-": None, "": None}
df = df.replace(to_replace=classification_placeholders, subset=["classification"])

In [None]:
# "-/", "-/-" and the empty string in `category` are replaced with NULL.
category_placeholders = {"-/": None, "-/-": None, "": None}
df = df.replace(to_replace=category_placeholders, subset=["category"])

In [None]:
# Break `persons_on_board` on "-" into a temporary array column.
persons_parts = F.split("persons_on_board", "-")
df_with_parts = df.withColumn("capacity_caps", persons_parts)

# Second-to-last piece -> lower cap (try_element_at yields NULL when the array
# has fewer than two pieces); last piece -> upper cap. The source column and
# the temporary array are dropped afterwards.
df = df_with_parts.withColumns(
    {
        "capacity_lower_cap": F.try_element_at("capacity_caps", F.lit(-2)),
        "capacity_upper_cap": F.element_at("capacity_caps", F.lit(-1)),
    }
).drop("persons_on_board", "capacity_caps")

df.show()

In [None]:
# `capacity_upper_cap` is split on "+" and the pieces are cast to integers
# (presumably values such as "4+2" occur — verify against the raw data).
upper_cap_terms = F.split("capacity_upper_cap", r"\+").cast("array<int>")
df = df.withColumn("capacity_upper_capx", upper_cap_terms)

# Fold the integer pieces into their sum, starting from 0, and store the
# result back into `capacity_upper_cap`.
upper_cap_total = F.aggregate(
    "capacity_upper_capx", F.lit(0), lambda running_total, term: running_total + term
)
df = df.withColumn("capacity_upper_cap", upper_cap_total)

# The temporary array column is no longer needed.
df = df.drop("capacity_upper_capx")

In [None]:
# Columns cast to 32-bit float.
float_columns = [
    "wing_span",
    "length",
    "height",
    "mtow",
    "fuel_capacity",
    "take_off_distance",
    "landing_distance",
    "absolute_ceiling",
    "optimum_ceiling",
    "maximum_climb_rate",
]
# Columns cast to integer.
integer_columns = ["maximum_range", "capacity_lower_cap"]

# Build one name -> cast-expression mapping so every cast is applied in a
# single withColumns call.
cast_expressions = {name: F.col(name).cast(T.FloatType()) for name in float_columns}
cast_expressions.update(
    {name: F.col(name).cast(T.IntegerType()) for name in integer_columns}
)

# The two speed columns are not carried forward.
df = df.withColumns(cast_expressions).drop("maximum_speed", "optimum_speed")

In [None]:
# Check the column types again now that the casts have been applied.
df.schema

In [None]:
# Summary statistics of the frame in its current, transformed state.
typed_summary = df.describe()
typed_summary.show()

In [None]:
# Both audit columns are set to the current timestamp; an existing
# `created_ts` column is overwritten by this assignment.
load_timestamp = F.current_timestamp()
df = df.withColumns(
    {
        "created_ts": load_timestamp,
        "updated_ts": load_timestamp,
    }
)
df.show()

In [None]:
# Append the transformed rows to the tier-1 table.
# NOTE(review): append() adds rows on every execution, so re-running this
# notebook would presumably write the same data again — confirm this is intended.
tier1_writer = df.writeTo("dev.tier1.aircraft_models")
tier1_writer.append()