In [2]:
from libs.configuration import configure
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.avro import functions as AF

from shared.spark_config import create_spark_config

env = configure()
conf = create_spark_config("M2_Processors.airline.raw")

In [3]:
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [4]:
schema = T.StructType(
    [
        T.StructField("id", T.IntegerType(), True),
        T.StructField("name", T.StringType(), True),
        T.StructField("alias", T.StringType(), True),
        T.StructField("iata", T.StringType(), True),
        T.StructField("icao", T.StringType(), True),
        T.StructField("callsign", T.StringType(), True),
        T.StructField("country", T.StringType(), True),
        T.StructField("active", T.StringType(), True),
    ]
)
df = spark.read.csv(
    f"abfss://sampledata@{env.DATASTORAGE_AZURE_ACCOUNTNAME}.dfs.core.windows.net/airlines.csv",
    schema,
    header=False,
    quote='"',
)
df.show()

+---+--------------------+-----+----+----+--------------+--------------+------+
| id|                name|alias|iata|icao|      callsign|       country|active|
+---+--------------------+-----+----+----+--------------+--------------+------+
| -1|             Unknown|   \N|   -| N/A|            \N|            \N|     Y|
|  1|      Private flight|   \N|   -| N/A|          NULL|          NULL|     Y|
|  2|         135 Airways|   \N|NULL| GNL|       GENERAL| United States|     N|
|  3|       1Time Airline|   \N|  1T| RNX|       NEXTIME|  South Africa|     Y|
|  4|2 Sqn No 1 Elemen...|   \N|NULL| WYT|          NULL|United Kingdom|     N|
|  5|     213 Flight Unit|   \N|NULL| TFU|          NULL|        Russia|     N|
|  6|223 Flight Unit S...|   \N|NULL| CHD|CHKALOVSK-AVIA|        Russia|     N|
|  7|   224th Flight Unit|   \N|NULL| TTF|    CARGO UNIT|        Russia|     N|
|  8|         247 Jet Ltd|   \N|NULL| TWF|  CLOUD RUNNER|United Kingdom|     N|
|  9|         3D Aviation|   \N|NULL| SE

In [10]:
df = df.withColumn("created_ts", F.current_timestamp())
df.show()

[Stage 3:>                                                          (0 + 1) / 1]

+---+--------------------+-----+----+----+--------------+--------------+------+--------------------+
| id|                name|alias|iata|icao|      callsign|       country|active|          created_ts|
+---+--------------------+-----+----+----+--------------+--------------+------+--------------------+
| -1|             Unknown|   \N|   -| N/A|            \N|            \N|     Y|2024-01-07 20:22:...|
|  1|      Private flight|   \N|   -| N/A|          NULL|          NULL|     Y|2024-01-07 20:22:...|
|  2|         135 Airways|   \N|NULL| GNL|       GENERAL| United States|     N|2024-01-07 20:22:...|
|  3|       1Time Airline|   \N|  1T| RNX|       NEXTIME|  South Africa|     Y|2024-01-07 20:22:...|
|  4|2 Sqn No 1 Elemen...|   \N|NULL| WYT|          NULL|United Kingdom|     N|2024-01-07 20:22:...|
|  5|     213 Flight Unit|   \N|NULL| TFU|          NULL|        Russia|     N|2024-01-07 20:22:...|
|  6|223 Flight Unit S...|   \N|NULL| CHD|CHKALOVSK-AVIA|        Russia|     N|2024-01-07 2

                                                                                

In [11]:
df.writeTo("dev.raw.airlines").append()

                                                                                