In [None]:
import pandera.pyspark as pa
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from shared.spark_config import create_spark_config

from libs.configuration import configure

# Spark application identifier passed to the shared config factory.
APP_NAME = "M2_Processors.airline.tier1"

# configure() runs before the Spark config is built, as in the original cell
# order; `env` holds whatever it returns.
# NOTE(review): `env` is not referenced in the visible cells — confirm whether
# configure() is needed only for its side effects.
env = configure()
conf = create_spark_config(APP_NAME)

In [None]:
# Attach the prepared config to the builder, then reuse an existing session
# if one is already running or start a new one otherwise.
session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()

In [None]:
# Read the raw airlines table through the Iceberg data source and preview it.
iceberg_reader = spark.read.format("iceberg")
df = iceberg_reader.load("dev.raw.airlines")
df.show()

In [None]:
# Basic per-column statistics of the raw frame, shown as a quick sanity check.
raw_summary = df.describe()
raw_summary.show()

In [None]:
# Placeholder strings that stand for "no value" in the raw data.
NULL_MARKERS = ("\\N", "N/A")

# Columns whose placeholder strings are replaced with real nulls.
MARKER_COLUMNS = ("alias", "icao", "callsign", "country")


def null_if_marker(column_name):
    """Return a Column expression that nulls out placeholder values.

    Args:
        column_name: Name of the column to clean.

    Returns:
        A Column that is null where the source value equals one of
        NULL_MARKERS and the unchanged source value everywhere else
        (values that are already null stay null).
    """
    source = F.col(column_name)
    return F.when(source.isin(*NULL_MARKERS), None).otherwise(source)


# Drop the `id` column, then apply the same placeholder -> null rule to every
# listed column. withColumn on an existing name replaces it in place, so the
# column order is unchanged.
df = df.drop("id")
for column_name in MARKER_COLUMNS:
    df = df.withColumn(column_name, null_if_marker(column_name))
df.show()

In [None]:
# Keep only rows that have an ICAO value. Placeholder strings were turned into
# nulls by the cleaning step, so those rows are removed here as well.
has_icao = F.col("icao").isNotNull()
df = df.filter(has_icao)

In [None]:
# Stamp every row with audit columns; both are filled from the same
# current-timestamp expression.
load_ts = F.current_timestamp()
df = df.withColumn("created_ts", load_ts).withColumn("updated_ts", load_ts)

In [None]:
# Validation schema for the cleaned airlines frame.
#
# The column list covers the fields this notebook actually prepares:
#   - icao: required, because rows with a null icao are filtered out earlier
#   - alias / callsign / country: nullable, because placeholder values are
#     replaced with nulls during cleaning
# NOTE(review): `iata` and `name` are assumed to exist in dev.raw.airlines with
# these exact names — confirm against the table. Columns not listed here are
# not checked.
schema = pa.DataFrameSchema(
    {
        "icao": pa.Column(str),
        "iata": pa.Column(str, nullable=True),
        "name": pa.Column(str),
        "alias": pa.Column(str, nullable=True),
        "callsign": pa.Column(str, nullable=True),
        "country": pa.Column(str, nullable=True),
    },
)

# The validation report is read from the returned frame's `pandera.errors`
# accessor and displayed as the cell output for inspection.
validated = schema.validate(df)
validated.pandera.errors

In [None]:
# Append the prepared rows to the tier-1 airlines table.
# Note: append() adds rows on every execution, so re-running this cell adds
# the rows again.
tier1_writer = df.writeTo("dev.tier1.airlines")
tier1_writer.append()