In [17]:
# This Spark ETL notebook builds the Airplanes dimension table (dim_airplanes).

In [1]:
import os
import shutil
from datetime import datetime, timedelta

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Working directory; the input data paths in the cells below are built from it.
cwd = os.getcwd()

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName('DimAirplanes')
    .getOrCreate()
)

In [3]:
# Step 1 - Load the flights data (OpenSky flight lists).
# Every file matching the pattern is read as CSV with a header row, each row is
# tagged with the file it was read from, and the result is exposed to SQL as FDF.
file = f'{cwd}/data/raw_data/flights/flightlist_*.csv.gz'

fdf = (
    spark.read
    .csv(file, header=True)
    .withColumn("filename", input_file_name())
)
fdf.createOrReplaceTempView('FDF')
# (run fdf.printSchema() to inspect the schema documented below)

# flightlist_*.csv.gz (downloaded from - https://zenodo.org/record/4601479#.YLaEU5MzYUH).
# Schema:
# root
#  |-- callsign: string (nullable = true)
#  |-- number: string (nullable = true)
#  |-- icao24: string (nullable = true)
#  |-- registration: string (nullable = true)
#  |-- typecode: string (nullable = true)
#  |-- origin: string (nullable = true)
#  |-- destination: string (nullable = true)
#  |-- firstseen: string (nullable = true)
#  |-- lastseen: string (nullable = true)
#  |-- day: string (nullable = true)
#  |-- latitude_1: string (nullable = true)
#  |-- longitude_1: string (nullable = true)
#  |-- altitude_1: string (nullable = true)
#  |-- latitude_2: string (nullable = true)
#  |-- longitude_2: string (nullable = true)
#  |-- altitude_2: string (nullable = true)
#  |-- filename: string (nullable = false)


# Next: collect the distinct aircraft ICAO type codes seen in the flights data (used for filtering later on)

In [16]:
# Step 2 - Count flights per aircraft type code.
# Only flights that have both an origin and a destination are counted, and
# flights whose origin equals their destination are dropped.
# NOTE(review): the original note described this as removing non-international
# flights; the query itself only compares origin with destination - confirm intent.
# `group by 1` groups by the first select column (typecode, exposed as icao_code).
# The result is registered as temp view `icao` so later SQL cells can join on it.
icao_df = spark.sql("""
select typecode as icao_code,
       count(*) as icao_count
from FDF 
where   1=1 
        and origin is not null and destination is not null
        and origin<>destination
group by 1        
""")

icao_df.createOrReplaceTempView('icao')

# icao_df.printSchema()

# root
#  |-- icao_code: string (nullable = true)
#  |-- icao_count: long (nullable = false)

root
 |-- icao_code: string (nullable = true)
 |-- icao_count: long (nullable = false)



In [11]:
# Step 3 - Load the aircraft reference data (Aircraft.csv).
# The file was created by the notebook author from data downloaded from the FAA
# aircraft characteristics database:
# https://www.faa.gov/airports/engineering/aircraft_char_database/
a_file = f'{cwd}/data/raw_data/airplanes/Aircraft.csv'

# Read with a header row and expose the frame to SQL as ADF.
adf = spark.read.option("header", True).csv(a_file)
adf.createOrReplaceTempView('ADF')
adf.printSchema()

# root
#  |-- manufacturer: string (nullable = true)
#  |-- model: string (nullable = true)
#  |-- engine_type: string (nullable = true)
#  |-- engine_count: string (nullable = true)
#  |-- icao_Code: string (nullable = true)

root
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- engine_type: string (nullable = true)
 |-- engine_count: string (nullable = true)
 |-- icao_Code: string (nullable = true)



In [23]:
# Step 4 - Build the airplane dimension rows.
# Starts from the type codes found in the flights data (view `icao`) and
# left-joins the aircraft reference data (view `ADF`): aircraft that never
# appear in the flights data are left out (the author's stated goal is to avoid
# performance issues), and type codes without a match keep 'N/A' attributes.
#   - stg:          replaces missing values with 'N/A'.
#   - final select: maps the 'tbd' placeholder to 'N/A' and drops rows whose
#                   icao_code is 'N/A' (i.e. the group of flights without a typecode).
# NOTE(review): the ORDER BY sits inside the CTE and icao_count is not selected
# by the outer query - the row order of `airplanes` is presumably not guaranteed;
# confirm whether a sorted output is actually required.
# NOTE(review): if ADF holds several rows for one icao_Code, the result will hold
# several rows for that icao_code - verify against Aircraft.csv.
airplanes = spark.sql("""
With stg
as (select  distinct 
            coalesce(I.icao_code   , 'N/A') as icao_code,
            coalesce(A.manufacturer, 'N/A') as manufacturer,
            coalesce(A.model       , 'N/A') as model,
            coalesce(A.engine_type , 'N/A') as engine_type,
            coalesce(A.engine_count, 'N/A') as engine_count,
            I.icao_count
            --,        
            --coalesce(null, 'N/A')           as passenger_capacity,
            --coalesce(null, 'N/A')           as plane_type        
    from    icao I left join ADF A 
            on I.icao_Code = A.icao_Code 
    where   1=1
    order by I.icao_count desc)

select  icao_code,
        case when manufacturer = 'tbd' then 'N/A' else manufacturer end as manufacturer,
        case when model = 'tbd' then 'N/A' else model end               as model,
        case when engine_type = 'tbd' then 'N/A' else engine_type end   as engine_type,
        case when engine_count = 'tbd' then 'N/A' else engine_count end as engine_count      
        --,
        --passenger_capacity,
        --plane_type        
from    stg   
where   icao_code<>'N/A'
""")

In [29]:
# Step 5 - Export the dimension as a single CSV file: data/dwh/dim_airplanes.csv
#
# Spark always writes a *directory* of part files, so the frame is first written
# to a temporary folder and its single part file is then copied to the DWH folder.
# Everything after the write is plain Python (no shell magics), so any failure
# raises and stops the cell instead of being silently ignored - in particular the
# temporary folder is only deleted after the copy has succeeded.
tmp_dir = os.path.join(cwd, 'airplanes')
dwh_dir = os.path.join(cwd, 'data', 'dwh')
target_file = os.path.join(dwh_dir, 'dim_airplanes.csv')

(
    airplanes.repartition(1)       # one partition -> one part file
    .write
    .mode("overwrite")             # tolerate a leftover folder from an interrupted run
    .option("header", "true")
    .option("quoteAll", "true")
    .csv(tmp_dir)                  # built-in CSV writer (replaces the legacy databricks alias)
)

# Locate the part file; hidden checksum/marker files are ignored.
part_files = [
    name for name in os.listdir(tmp_dir)
    if name.endswith('.csv') and not name.startswith('.')
]
if len(part_files) != 1:
    raise RuntimeError(
        f"Expected exactly one CSV part file in {tmp_dir}, found {len(part_files)}: {part_files}"
    )

# Copy the new file into the proper data folder location (overwrites an old export).
os.makedirs(dwh_dir, exist_ok=True)
shutil.copyfile(os.path.join(tmp_dir, part_files[0]), target_file)

# Delete the Spark output folder.
shutil.rmtree(tmp_dir)

# Check that the folder was deleted and the export is in place.
assert not os.path.exists(tmp_dir), f"temporary folder still exists: {tmp_dir}"
assert os.path.isfile(target_file), f"export file is missing: {target_file}"

zsh:1: no matches found: ./airplanes/*.csv
