In [13]:
# In this Spark ETL process we create the nodes and edges files (from the OpenSky data set)
# used to build our network in Gephi.
# ------------------------------------------------------------------------------------------
# IMPORTANT!!! --> Before running this code, make sure to run "00_download_flight_data.sh"
# ------------------------------------------------------------------------------------------

In [1]:
from pyspark.sql import SparkSession
from datetime import datetime, timedelta
from pyspark.sql.functions import *
import os

In [2]:
# Session setup: a single Spark application for the notebook, plus the working
# directory that the input paths in the following cells are built from.
spark = (
    SparkSession.builder
    .appName('International')
    .getOrCreate()
)
cwd = os.getcwd()

In [7]:
# Step 1 - Load flights data (OpenSky data)

# Glob over the flights CSV files; no schema is supplied, so every column is
# read as a string (see the schema dump below).
file = f'{cwd}/data/stg/flights_each_day_week_14_data/flights_*.csv'
fdf = (
    spark.read
    .csv(path=file, header=True)
    .withColumn("filename", input_file_name())  # record the originating file per row
)
# Expose the frame to Spark SQL under the name used by the later queries.
fdf.createOrReplaceTempView('FDF')
# fdf.printSchema()

# root
#  |-- flight_date: string (nullable = true)
#  |-- flight_dayofweek: string (nullable = true)
#  |-- airplane_code: string (nullable = true)
#  |-- airport_source_code: string (nullable = true)
#  |-- airport_target_code: string (nullable = true)
#  |-- flights_count: string (nullable = true)
#  |-- flight_year: string (nullable = true)
#  |-- filename: string (nullable = false)

In [8]:
# Step 2 - Load dim airports data

# Glob over the airports node files; as with the flights data, all columns
# come in as strings (see the schema dump below).
file = f'{cwd}/data/nodes/airports/airports_nodes_*.csv'
adf = (
    spark.read
    .csv(path=file, header=True)
    .withColumn("filename", input_file_name())
)
adf.createOrReplaceTempView('ADF')
# adf.printSchema()

# root
#  |-- airport_id: string (nullable = true)
#  |-- airport_name: string (nullable = true)
#  |-- airport_type: string (nullable = true)
#  |-- airport_lat: string (nullable = true)
#  |-- airport_lng: string (nullable = true)
#  |-- country_iso_2_code: string (nullable = true)
#  |-- country_iso_3_code: string (nullable = true)
#  |-- country_name: string (nullable = true)
#  |-- continent_name: string (nullable = true)
#  |-- country_lat: string (nullable = true)
#  |-- country_lng: string (nullable = true)
#  |-- avg_cases_per_week_14: string (nullable = true)
#  |-- population: string (nullable = true)
#  |-- covid19_percentage: string (nullable = true)
#  |-- covid19_percentage_label: string (nullable = true)
#  |-- report_year: string (nullable = true)
#  |-- filename: string (nullable = false)


# Collapse the dimension to the distinct (airport_id, country_iso_2_code) pairs
# (the "group by 1,2" acts as a de-duplication) and re-register the reduced
# frame under the same view name. From here on the ADF view only exposes
# these two columns.
adf = spark.sql("""

select  airport_id as airport_id,
        country_iso_2_code as country_iso_2_code
from    ADF as A
group by 1,2

""")
adf.createOrReplaceTempView('ADF')

In [18]:
# Step 3 - check distinct values in data set

# spark.sql("""

# select  count(*) as row_count,
#         count(distinct F.flight_date         ) as flight_date,
#         count(distinct F.flight_dayofweek    ) as flight_dayofweek,
#         count(distinct F.airplane_code       ) as airplane_code,
#         count(distinct F.airport_source_code ) as airport_source_code,
#         count(distinct F.airport_target_code ) as airport_target_code,
#         count(distinct F.flights_count       ) as flights_count
# from    FDF as F

# """).show(10,False)

# +---------+-----------+----------------+-------------+-------------------+-------------------+-------------+
# |row_count|flight_date|flight_dayofweek|airplane_code|airport_source_code|airport_target_code|flights_count|
# +---------+-----------+----------------+-------------+-------------------+-------------------+-------------+
# |502819   |21         |7               |681          |10363              |11886              |119          |
# +---------+-----------+----------------+-------------+-------------------+-------------------+-------------+



# Now let's check how many rows we lose after removing non-international flights

# spark.sql("""

# select  count(*) as row_count,
#         count(distinct F.flight_date         ) as flight_date,
#         count(distinct F.flight_dayofweek    ) as dayofweek,
#         count(distinct F.airplane_code       ) as plane_code,
#         count(distinct F.airport_source_code ) as airport_source_code,
#         count(distinct F.airport_target_code ) as airport_target_code,
#         count(distinct S.country_name        ) as s_country_name,
#         count(distinct T.country_name        ) as t_country_name
# from    FDF as F
#         Left join ADF as S
#         On F.airport_source_code = S.airport_id
#         Left join ADF as T
#         On F.airport_target_code = T.airport_id
# where   S.country_name<>T.country_name
# """).show(10,False)

# +---------+-----------+----------------+-------------+-------------------+-------------------+-------------+
# |row_count|flight_date|flight_dayofweek|airplane_code|airport_source_code|airport_target_code|flights_count|
# +---------+-----------+----------------+-------------+-------------------+-------------------+-------------+
# |502819   |21         |7               |681          |10363              |11886              |119          |
# +---------+-----------+----------------+-------------+-------------------+-------------------+-------------+

# All flights after join to airports countries
# +---------+-----------+---------+----------+-------------------+-------------------+--------------+--------------+
# |row_count|flight_date|dayofweek|plane_code|airport_source_code|airport_target_code|s_country_name|t_country_name|
# +---------+-----------+---------+----------+-------------------+-------------------+--------------+--------------+
# |502819   |21         |7        |681       |10363              |11886              |111           |116           |
# +---------+-----------+---------+----------+-------------------+-------------------+--------------+--------------+

# where   S.country_name<>T.country_name
# +---------+-----------+---------+----------+-------------------+-------------------+--------------+--------------+
# |row_count|flight_date|dayofweek|plane_code|airport_source_code|airport_target_code|s_country_name|t_country_name|
# +---------+-----------+---------+----------+-------------------+-------------------+--------------+--------------+
# |117741   |21         |7        |324       |1490               |1908               |109           |114           |
# +---------+-----------+---------+----------+-------------------+-------------------+--------------+--------------+



In [9]:
# Step 4 - Join with dim airports and filter out all non-international flights (for better performance)
#
# A flight is kept only when both its source and target airports are found in
# ADF and their country codes differ. The joins are written as inner joins on
# purpose: the WHERE predicate compares columns from both S and T, so any row
# without a match on either side (NULL country code) is discarded anyway.
# flights_count is cast to bigint and flight_year is derived from the
# flight date.

international_flights_each_day_week_14_data = spark.sql("""

select  F.flight_date         as flight_date,
        F.flight_dayofweek    as dayofweek,
        F.airplane_code       as airplane_code,
        F.airport_source_code as airport_source_code,
        F.airport_target_code as airport_target_code,
        cast(F.flights_count as bigint) as flights_count,
        year(F.flight_date)   as flight_year
from    FDF as F
        Inner join ADF as S
        On F.airport_source_code = S.airport_id
        Inner join ADF as T
        On F.airport_target_code = T.airport_id
where   S.country_iso_2_code<>T.country_iso_2_code
""")

In [10]:
# Write the international flights as CSV, one "flight_year=<year>" sub-folder
# per year. repartition(1) collapses the data to a single partition first, so
# each year folder is expected to contain a single CSV part file.
# mode('overwrite') replaces whatever is already at the target path.
# "csv" is Spark's built-in CSV source; it replaces the legacy
# "com.databricks.spark.csv" package name from Spark 1.x.
(
    international_flights_each_day_week_14_data
    .repartition(1)
    .write
    .partitionBy('flight_year')
    .mode('overwrite')
    .format("csv")
    .option("header", "true")
    .option("quoteAll", "true")
    .save("./data/edges/international_flights_each_day_week_14_data")
)

In [11]:
# Copy new file into proper data folder location
!cp ./data/edges/international_flights_each_day_week_14_data/*2019*/*.csv ./data/edges/international_flights_each_day_week_14_data/flights_2019.csv 
# Delete Spark output folder
!rm -rf ./data/edges/international_flights_each_day_week_14_data/flight_year=2019*
# Check that the folder deleted
!ls ./data/edges/international_flights_each_day_week_14_data/flight_year=2019*/*.csv

zsh:1: no matches found: ./data/edges/international_flights_each_day_week_14_data/flight_year=2019*/*.csv


In [12]:
# Copy new file into proper data folder location
!cp ./data/edges/international_flights_each_day_week_14_data/*2020*/*.csv ./data/edges/international_flights_each_day_week_14_data/flights_2020.csv 
# Delete Spark output folder
!rm -rf ./data/edges/international_flights_each_day_week_14_data/flight_year=2020*
# Check that the folder deleted
!ls ./data/edges/international_flights_each_day_week_14_data/flight_year=2020*/*.csv

zsh:1: no matches found: ./data/edges/international_flights_each_day_week_14_data/flight_year=2020*/*.csv


In [13]:
# Copy new file into proper data folder location
!cp ./data/edges/international_flights_each_day_week_14_data/*2021*/*.csv ./data/edges/international_flights_each_day_week_14_data/flights_2021.csv 
# Delete Spark output folder
!rm -rf ./data/edges/international_flights_each_day_week_14_data/flight_year=2021*
# Check that the folder deleted
!ls ./data/edges/international_flights_each_day_week_14_data/flight_year=2021*/*.csv

zsh:1: no matches found: ./data/edges/international_flights_each_day_week_14_data/flight_year=2021*/*.csv
