In [1]:
# In this Spark ETL process we are creating the airports dimension (airport nodes)

In [12]:
from pyspark.sql import SparkSession
from datetime import datetime, timedelta
from pyspark.sql.functions import *
import os

In [13]:
# Input paths in the cells below are built relative to the directory the notebook runs from.
cwd = os.getcwd()

# Get the active Spark session, or create one named 'AirportsNodes'.
session_builder = SparkSession.builder.appName('AirportsNodes')
spark = session_builder.getOrCreate()

In [14]:
# Step 1 - Load the airports data set

# Airports: read the CSV with its header row and expose it to Spark SQL as the view ADF.
airports_file = f'{cwd}/data/raw_data/labels/airports.csv'
adf = spark.read.option('header', True).csv(airports_file)
adf.createOrReplaceTempView('ADF')
# adf.printSchema()

# airports.csv (downloaded from - https://ourairports.com/data/).
# Schema:
# root
#  |-- id: string (nullable = true)
#  |-- ident: string (nullable = true)
#  |-- type: string (nullable = true)
#  |-- name: string (nullable = true)
#  |-- latitude_deg: string (nullable = true)
#  |-- longitude_deg: string (nullable = true)
#  |-- elevation_ft: string (nullable = true)
#  |-- continent: string (nullable = true)
#  |-- iso_country: string (nullable = true)
#  |-- iso_region: string (nullable = true)
#  |-- municipality: string (nullable = true)
#  |-- scheduled_service: string (nullable = true)
#  |-- gps_code: string (nullable = true)
#  |-- iata_code: string (nullable = true)
#  |-- local_code: string (nullable = true)
#  |-- home_link: string (nullable = true)
#  |-- wikipedia_link: string (nullable = true)
#  |-- keywords: string (nullable = true)


In [15]:
# Step 2 - Load the reduced flights data (tau-network-science/data/stg/flights_each_day_week_14_data)

# The glob picks up every CSV file in the staging folder.
file = f'{cwd}/data/stg/flights_each_day_week_14_data/*.csv'
fdf = (
    spark.read
    .option('header', True)
    .csv(file)
    # Keep track of which input file each row was read from.
    .withColumn("filename", input_file_name())
)
fdf.createOrReplaceTempView('FDF')
# fdf.printSchema()

# Reduced from flightlist_*.csv.gz (downloaded from - https://zenodo.org/record/4601479#.YLaEU5MzYUH).
# Schema:
# root
#  |-- flight_date: string (nullable = true)
#  |-- flight_dayofweek: string (nullable = true)
#  |-- airplane_code: string (nullable = true)
#  |-- airport_source_code: string (nullable = true)
#  |-- airport_target_code: string (nullable = true)
#  |-- flights_count: string (nullable = true)
#  |-- flight_year: string (nullable = true)
#  |-- filename: string (nullable = false)


In [16]:
# Step 3 - Create the distinct list of airports that were active during week 14 in years 2019|2020|2021
# An airport counts as active in a year when it appears in FDF as either the source or the
# target of a flight. UNION (not UNION ALL) removes duplicate (airport, year) pairs.
# The result is registered as the view distinct_airports so later queries can join against it.
distinct_airports_query = """

select  airport_source_code as airport_Id, flight_year as flight_year
from    FDF
union
select  airport_target_code as airport_Id, flight_year as flight_year
from    FDF

"""

distinct_airports = spark.sql(distinct_airports_query)
distinct_airports.createOrReplaceTempView('distinct_airports')


In [17]:
# Load the countries / covid19 data set and expose it to Spark SQL as countries_covid19
countries_file = f'{cwd}/data/stg/covid19/countries_covid_19_week_14_agg.csv'
cdf = spark.read.option('header', True).csv(countries_file)
cdf.createOrReplaceTempView('countries_covid19')
# Print the schema so the available column names are visible in the notebook output.
cdf.printSchema()

# Schema:
# root
#  |-- country_iso_2_code: string (nullable = true)
#  |-- report_year: string (nullable = true)
#  |-- country_iso_3_code: string (nullable = true)
#  |-- country_name: string (nullable = true)
#  |-- continent_name: string (nullable = true)
#  |-- country_lat: string (nullable = true)
#  |-- country_lng: string (nullable = true)
#  |-- total_cases: string (nullable = true)
#  |-- population: string (nullable = true)
#  |-- cases_population_ratio: string (nullable = true)
#  |-- covid19_percentage: string (nullable = true)
#  |-- covid19_percentage_label: string (nullable = true)

root
 |-- country_iso_2_code: string (nullable = true)
 |-- report_year: string (nullable = true)
 |-- country_iso_3_code: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- continent_name: string (nullable = true)
 |-- country_lat: string (nullable = true)
 |-- country_lng: string (nullable = true)
 |-- total_cases: string (nullable = true)
 |-- population: string (nullable = true)
 |-- cases_population_ratio: string (nullable = true)
 |-- covid19_percentage: string (nullable = true)
 |-- covid19_percentage_label: string (nullable = true)



In [18]:
# Build the airport nodes: one row per (active airport, flight year), enriched with the
# attributes of the airport's country for that same year.
# - ADF is inner-joined to distinct_airports (on ident = airport_Id), so only airports that
#   appear in the flights data are kept.
# - countries_covid19 is LEFT joined on country code and year, so an airport whose country has
#   no row for that year is still emitted, with NULL country / covid19 columns.
# - report_year / year are taken from D.flight_year rather than C.report_year. The two are equal
#   whenever the left join matches, but C.report_year is NULL when it does not, which would
#   leave the row without a year (the output is later partitioned by `year`).
# - The label is cast through decimal(5, 2): decimal(3, 2) cannot represent values of 10 or
#   more, while values below 10 render exactly as before.
# NOTE(review): countries_covid19 also exposes covid19_percentage and covid19_percentage_label
# columns (see the printed schema); confirm whether those should be used instead of
# re-deriving them from cases_population_ratio here.
airports = spark.sql("""
select  distinct
        A.ident              as airport_id,
        A.name               as airport_name,
        A.type               as airport_type,
        A.latitude_deg       as airport_lat,
        A.longitude_deg      as airport_lng,
        C.country_iso_2_code as country_iso_2_code,
        C.country_iso_3_code as country_iso_3_code,
        C.country_name       as country_name,
        C.continent_name     as continent_name,
        C.country_lat        as country_lat,
        C.country_lng        as country_lng,
        C.total_cases        as cases_per_week_14,
        C.population         as population,
        C.cases_population_ratio as covid19_percentage,
        cast(cast(C.cases_population_ratio as decimal(5, 2)) as string) || '%' as covid19_percentage_label,
        D.flight_year        as report_year,
        D.flight_year        as year
from    ADF as A
        join
        distinct_airports as D
        ON A.ident = D.airport_Id
        left join
        countries_covid19 as C
        ON A.iso_country = C.country_iso_2_code
           and D.flight_year = C.report_year
""")
airports.createOrReplaceTempView('airports')

In [19]:
# Write the airport nodes as CSV under ./data/nodes/airports, in one sub-folder per `year` value.
# - repartition(1) collapses the data into a single partition before writing; the copy cells
#   below rely on each year folder containing a single CSV part file.
# - mode('overwrite') replaces whatever a previous run left at the target path.
# - The built-in CSV writer is used instead of the legacy "com.databricks.spark.csv" package
#   name; the header and quoteAll options are unchanged.
(
    airports
    .repartition(1)
    .write
    .partitionBy('year')
    .mode('overwrite')
    .option("header", "true")
    .option("quoteAll", "true")
    .csv("./data/nodes/airports")
)


In [20]:
# Copy the CSV part file from the 2019 Spark output folder to a stable file name in the nodes folder
!cp ./data/nodes/airports/*2019*/*.csv ./data/nodes/airports/airports_nodes_2019.csv
# Delete the 2019 Spark output folder now that its file has been copied
!rm -rf ./data/nodes/airports/*2019*/
# Check that the folder was deleted: the shell is expected to report that nothing matches
!ls ./data/nodes/airports/*2019*/*.csv

zsh:1: no matches found: ./data/nodes/airports/*2019*/*.csv


In [21]:
# Copy the CSV part file from the 2020 Spark output folder to a stable file name in the nodes folder
!cp ./data/nodes/airports/*2020*/*.csv ./data/nodes/airports/airports_nodes_2020.csv
# Delete the 2020 Spark output folder now that its file has been copied
!rm -rf ./data/nodes/airports/*2020*/
# Check that the folder was deleted: the shell is expected to report that nothing matches
!ls ./data/nodes/airports/*2020*/*.csv

zsh:1: no matches found: ./data/nodes/airports/*2020*/*.csv


In [22]:
# Copy the CSV part file from the 2021 Spark output folder to a stable file name in the nodes folder
!cp ./data/nodes/airports/*2021*/*.csv ./data/nodes/airports/airports_nodes_2021.csv
# Delete the 2021 Spark output folder now that its file has been copied
!rm -rf ./data/nodes/airports/*2021*/
# Check that the folder was deleted: the shell is expected to report that nothing matches
!ls ./data/nodes/airports/*2021*/*.csv

zsh:1: no matches found: ./data/nodes/airports/*2021*/*.csv


In [23]:
# Tests
# adf.count()      #65,562
# airports.count() #12,503


# spark.sql("""

# select  count(*) as row_count,
#         cast(sum( case when  airport_id                             is not null then 1 else 0 end) as double)/count(*) *100 as NN_airport_id,
#         cast(sum( case when  airport_name                           is not null then 1 else 0 end) as double)/count(*) *100 as NN_airport_name,
#         cast(sum( case when  airport_type                           is not null then 1 else 0 end) as double)/count(*) *100 as NN_airport_type,
#         cast(sum( case when  airport_lat                            is not null then 1 else 0 end) as double)/count(*) *100 as NN_airport_lat,
#         cast(sum( case when  airport_lng                            is not null then 1 else 0 end) as double)/count(*) *100 as NN_airport_lng,
#         cast(sum( case when  country_iso_2_code                     is not null then 1 else 0 end) as double)/count(*) *100 as NN_country_iso_2_code,
#         cast(sum( case when  country_iso_3_code                     is not null then 1 else 0 end) as double)/count(*) *100 as NN_country_iso_3_code,
#         cast(sum( case when  country_name                           is not null then 1 else 0 end) as double)/count(*) *100 as NN_country_name,
#         cast(sum( case when  country_lat                            is not null then 1 else 0 end) as double)/count(*) *100 as NN_country_lat,
#         cast(sum( case when  country_lng                            is not null then 1 else 0 end) as double)/count(*) *100 as NN_country_lng,
#         cast(sum( case when  cases_per_week_14                      is not null then 1 else 0 end) as double)/count(*) *100 as NN_cases_per_week_14,
#         cast(sum( case when  population                             is not null then 1 else 0 end) as double)/count(*) *100 as NN_population,
#         cast(sum( case when  covid19_percentage                     is not null then 1 else 0 end) as double)/count(*) *100 as NN_covid19_percentage
# from    airports


# """).show(10,False)

# spark.sql("""

# select  covid19_percentage,
#         covid19_percentage_label
# from    airports


# """).show(10,False)



# No null values



zsh:1: no matches found: ./airports/*.csv
