In [1]:
# In this Spark ETL process we are creating the flights fact tables (fact flights), enriched with airport and Covid19 data

In [1]:
from pyspark.sql import SparkSession
from datetime import datetime, timedelta
from pyspark.sql.functions import *
import os

In [2]:
# Working directory of the notebook kernel; every data path below is built from it.
cwd = os.getcwd()

# Obtain the Spark session for this job (an already active session is reused).
spark = (
    SparkSession.builder
    .appName('FactFlights')
    .getOrCreate()
)

In [3]:
# Step 1 - Load flights data (OpenSky data)

# Wildcard path matching every flight-list archive under the raw data folder.
file = f'{cwd}/data/raw_data/flights/flightlist_*.csv.gz'

# Read the matching CSV files (first row is the header) and tag every record
# with the path of the file it was read from.
fdf = (
    spark.read
    .csv(path=file, header=True)
    .withColumn("filename", input_file_name())
)

# Expose the flights to Spark SQL under the view name FDF.
fdf.createOrReplaceTempView('FDF')
# Uncomment to inspect the schema:
# fdf.printSchema()

# flightlist_*.csv.gz (downloaded from - https://zenodo.org/record/4601479#.YLaEU5MzYUH).
# Schema:
# root
#  |-- callsign: string (nullable = true)
#  |-- number: string (nullable = true)
#  |-- icao24: string (nullable = true)
#  |-- registration: string (nullable = true)
#  |-- typecode: string (nullable = true)
#  |-- origin: string (nullable = true)
#  |-- destination: string (nullable = true)
#  |-- firstseen: string (nullable = true)
#  |-- lastseen: string (nullable = true)
#  |-- day: string (nullable = true)
#  |-- latitude_1: string (nullable = true)
#  |-- longitude_1: string (nullable = true)
#  |-- altitude_1: string (nullable = true)
#  |-- latitude_2: string (nullable = true)
#  |-- longitude_2: string (nullable = true)
#  |-- altitude_2: string (nullable = true)
#  |-- filename: string (nullable = false)


In [4]:
# Step 2 - Load dim airports

# Airports dimension file stored in the DWH folder.
file = f'{cwd}/data/dwh/dim_airports.csv'

# Read the CSV (first row is the header) and expose it to Spark SQL as ADF.
adf = spark.read.csv(file, header=True)
adf.createOrReplaceTempView('ADF')
# Uncomment to inspect the schema:
# adf.printSchema()

# dim_airports.csv (read from data/dwh/; presumably produced by the dim airports ETL step - TODO confirm).
# Schema:
# root
#  |-- airport_id: string (nullable = true)
#  |-- airport_name: string (nullable = true)
#  |-- airport_type: string (nullable = true)
#  |-- airport_lat: string (nullable = true)
#  |-- airport_lng: string (nullable = true)
#  |-- country_id: string (nullable = true)
#  |-- country_iso_3_code: string (nullable = true)
#  |-- country_name: string (nullable = true)
#  |-- country_lat: string (nullable = true)
#  |-- country_lng: string (nullable = true)

In [5]:
# Step 3 - Load the Covid19 data set

# Raw OWID Covid19 export.
file = f'{cwd}/data/raw_data/covid19/owid-covid-data.csv'

# Read the CSV (first row is the header) and expose it to Spark SQL as CDF.
cdf = spark.read.csv(file, header=True)
cdf.createOrReplaceTempView('CDF')
# Uncomment to inspect the schema:
# cdf.printSchema()

# owid-covid-data.csv (downloaded from - https://github.com/owid/covid-19-data/blob/master/public/data/README.md).
# Schema:
# root
#  |-- iso_code: string (nullable = true)
#  |-- continent: string (nullable = true)
#  |-- location: string (nullable = true)
#  |-- date: string (nullable = true)
#  |-- total_cases: string (nullable = true)
#  |-- new_cases: string (nullable = true)
#  |-- new_cases_smoothed: string (nullable = true)
#  |-- total_deaths: string (nullable = true)
#  |-- new_deaths: string (nullable = true)
#  |-- new_deaths_smoothed: string (nullable = true)
#  |-- total_cases_per_million: string (nullable = true)
#  |-- new_cases_per_million: string (nullable = true)
#  |-- new_cases_smoothed_per_million: string (nullable = true)
#  |-- total_deaths_per_million: string (nullable = true)
#  |-- new_deaths_per_million: string (nullable = true)
#  |-- new_deaths_smoothed_per_million: string (nullable = true)
#  |-- reproduction_rate: string (nullable = true)
#  |-- icu_patients: string (nullable = true)
#  |-- icu_patients_per_million: string (nullable = true)
#  |-- hosp_patients: string (nullable = true)
#  |-- hosp_patients_per_million: string (nullable = true)
#  |-- weekly_icu_admissions: string (nullable = true)
#  |-- weekly_icu_admissions_per_million: string (nullable = true)
#  |-- weekly_hosp_admissions: string (nullable = true)
#  |-- weekly_hosp_admissions_per_million: string (nullable = true)
#  |-- new_tests: string (nullable = true)
#  |-- total_tests: string (nullable = true)
#  |-- total_tests_per_thousand: string (nullable = true)
#  |-- new_tests_per_thousand: string (nullable = true)
#  |-- new_tests_smoothed: string (nullable = true)
#  |-- new_tests_smoothed_per_thousand: string (nullable = true)
#  |-- positive_rate: string (nullable = true)
#  |-- tests_per_case: string (nullable = true)
#  |-- tests_units: string (nullable = true)
#  |-- total_vaccinations: string (nullable = true)
#  |-- people_vaccinated: string (nullable = true)
#  |-- people_fully_vaccinated: string (nullable = true)
#  |-- new_vaccinations: string (nullable = true)
#  |-- new_vaccinations_smoothed: string (nullable = true)
#  |-- total_vaccinations_per_hundred: string (nullable = true)
#  |-- people_vaccinated_per_hundred: string (nullable = true)
#  |-- people_fully_vaccinated_per_hundred: string (nullable = true)
#  |-- new_vaccinations_smoothed_per_million: string (nullable = true)
#  |-- stringency_index: string (nullable = true)
#  |-- population: string (nullable = true)
#  |-- population_density: string (nullable = true)
#  |-- median_age: string (nullable = true)
#  |-- aged_65_older: string (nullable = true)
#  |-- aged_70_older: string (nullable = true)
#  |-- gdp_per_capita: string (nullable = true)
#  |-- extreme_poverty: string (nullable = true)
#  |-- cardiovasc_death_rate: string (nullable = true)
#  |-- diabetes_prevalence: string (nullable = true)
#  |-- female_smokers: string (nullable = true)
#  |-- male_smokers: string (nullable = true)
#  |-- handwashing_facilities: string (nullable = true)
#  |-- hospital_beds_per_thousand: string (nullable = true)
#  |-- life_expectancy: string (nullable = true)
#  |-- human_development_index: string (nullable = true)

In [6]:
# Covid19 figures per airport and report date: each airport row (ADF) is paired
# with the Covid19 rows (CDF) of its country, matched on the ISO-3 country code.
# color_weight is total_cases divided by population.
# NOTE(review): the `C.population is not null` filter removes airports that have
# no matching Covid19 row, so the left join behaves like an inner join - confirm
# this is intended.
# NOTE(review): total_cases is defaulted to 0, but color_weight is computed from
# the raw value and therefore stays NULL where total_cases is missing - confirm.
covid19_query = """
select  distinct
        A.airport_id as airport_id,
        A.country_id as country_id,
--        L.iso_3_code as iso_3_code,
        cast(C.date as date)       as report_date,
        coalesce(C.total_cases, 0) as total_cases,
        C.population               as population,
        cast(C.total_cases as double)/cast(C.population as double) as color_weight
from    ADF A left join CDF C
        ON A.country_iso_3_code = C.iso_code
where   1=1
        and C.population is not null
"""
covid19 = spark.sql(covid19_query)

# Make the result available to the following SQL cells as the `covid19` view.
covid19.createOrReplaceTempView('covid19')

In [12]:
# Sanity check of the date parts derived from `firstseen` before they are used
# in the fact query. Each derived column gets its own alias: previously both the
# year and the year-month prefix were aliased `partition_key`, leaving two result
# columns with the same name. `partition_key` is kept for the year-month prefix,
# which is the expression the fact query uses for partitioning.
# NOTE(review): any recorded output under this cell predates these aliases -
# re-run the cell to refresh it.
spark.sql("""
select  cast(firstseen as date) as take_off_date,
        year(firstseen)         as take_off_year,
        month(firstseen)        as take_off_month,
        left(firstseen, 7)      as partition_key
from    FDF""").show(10,False)

+-------------+-------------+----------+-------+
|take_off_date|partition_key|partit_key|ddd    |
+-------------+-------------+----------+-------+
|2019-07-31   |2019         |7         |2019-07|
|2019-07-31   |2019         |7         |2019-07|
|2019-07-31   |2019         |7         |2019-07|
|2019-07-31   |2019         |7         |2019-07|
|2019-07-31   |2019         |7         |2019-07|
|2019-07-31   |2019         |7         |2019-07|
|2019-07-31   |2019         |7         |2019-07|
|2019-07-31   |2019         |7         |2019-07|
|2019-07-31   |2019         |7         |2019-07|
|2019-07-31   |2019         |7         |2019-07|
+-------------+-------------+----------+-------+
only showing top 10 rows



In [14]:
# Fact flights: one row per distinct combination of the non-aggregated columns
# below, with Weight = number of FDF records collapsed into that row.
# The covid19 view is joined twice on airport id and take-off date:
#   CO - figures for the origin airport, CD - figures for the destination airport.
# Records without an origin or destination, or whose origin equals the
# destination, are excluded.
# The GROUP BY lists every select position except 11 (the count(*) aggregate).
flights_query = """
select  F.callsign      as callsign,
        F.origin        as origin_airport_id,
        CO.country_id   as origin_country_id,
        CO.color_weight as origin_country_color_weight,
        F.destination   as destination_airport_id,
        CD.country_id   as destination_country_id,
        CD.total_cases  as total_cases,
        CD.population   as population,        
        CD.color_weight as destination_country_color_weight,        
        F.typecode      as aircraft_icao_code,
        count(*)        as Weight,
        cast(F.firstseen as date) as take_off_date,
        left(F.firstseen, 7) as partition_key
from    FDF F left join covid19 CO
        ON  F.origin = CO.airport_id
            and cast(F.firstseen as date) = CO.report_date
        left join covid19 CD
        ON  F.destination = CD.airport_id
            and cast(F.firstseen as date) = CD.report_date
where   1=1
        and F.origin is not null and F.destination is not null
        and F.origin<>F.destination
group by 1,2,3,4,5,6,7,8,9,10,12,13   
"""
flights = spark.sql(flights_query)

In [16]:
# Persist the flights fact as quoted CSV with a header row, one sub-folder per
# partition_key value, replacing any previous output in "flights".
# repartition(1) brings all rows into a single partition before writing.
(
    flights
    .repartition(1)
    .write
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .option("quoteAll", "true")
    .mode('overwrite')
    .partitionBy('partition_key')
    .save("flights")
)

# # Copy new file into proper data folder location
# !cp ./flights/*.csv ./data/dwh/fact_flights.csv
# # Delete Spark output folder
# !rm -rf ./flights
# # Check that the folder deleted
# !ls ./flights/*.csv

In [19]:
# Register the flights fact as the temp view `flights` so it can be queried with Spark SQL.
flights.createOrReplaceTempView('flights')

In [21]:
# Roll the flights fact up to airport pairs per partition_key:
# Weight is summed and the origin/destination color weights are averaged.
# The GROUP BY ordinals are the non-aggregated select items
# (1 = origin_airport_id, 3 = destination_airport_id, 6 = partition_key).
flights_by_airports_query = """
select  origin_airport_id,
        AVG(origin_country_color_weight) as origin_country_color_weight,
        destination_airport_id,
        AVG(destination_country_color_weight) as destination_country_color_weight,        
        Sum(Weight) as Weight,
        partition_key
from    flights
group by 1,3,6
"""
flights_by_airports = spark.sql(flights_by_airports_query)

In [23]:
# Persist the airport-level aggregate as quoted CSV with a header row, one
# sub-folder per partition_key value, replacing any previous output in
# "flights_by_airports". repartition(1) brings all rows into a single partition
# before writing.
(
    flights_by_airports
    .repartition(1)
    .write
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .option("quoteAll", "true")
    .mode('overwrite')
    .partitionBy('partition_key')
    .save("flights_by_airports")
)

# # Copy new file into proper data folder location
# !cp ./flights/*.csv ./data/dwh/fact_flights.csv
# # Delete Spark output folder
# !rm -rf ./flights
# # Check that the folder deleted
# !ls ./flights/*.csv

In [20]:
# Roll the flights fact up to country pairs per partition_key:
# Weight is summed and the origin/destination color weights are averaged.
# The GROUP BY ordinals must reference only the non-aggregated select items
# (1 = origin_country_id, 3 = destination_country_id, 6 = partition_key).
# The previous list also contained ordinal 4, which is an AVG(...) expression;
# Spark does not allow an aggregate function in GROUP BY, so the query failed
# during analysis.
flights_by_country = spark.sql("""
select  origin_country_id,
        AVG(origin_country_color_weight) as origin_country_color_weight,
        destination_country_id,
        AVG(destination_country_color_weight) as destination_country_color_weight,
        Sum(Weight) as Weight,
        partition_key
from    flights
group by 1,3,6
""")

In [None]:
# Persist the country-level aggregate as quoted CSV with a header row, one
# sub-folder per partition_key value, replacing any previous output in
# "flights_by_country". repartition(1) brings all rows into a single partition
# before writing.
(
    flights_by_country
    .repartition(1)
    .write
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .option("quoteAll", "true")
    .mode('overwrite')
    .partitionBy('partition_key')
    .save("flights_by_country")
)

# # Copy new file into proper data folder location
# !cp ./flights/*.csv ./data/dwh/fact_flights.csv
# # Delete Spark output folder
# !rm -rf ./flights
# # Check that the folder deleted
# !ls ./flights/*.csv