In [1]:
# In this Spark ETL process we create the nodes and edges files (from the OpenSky data set)
# that are used to build our network in Gephi.
# ------------------------------------------------------------------------------------------
# IMPORTANT!!! --> Before running this code, make sure to run "00_download_flight_data.sh"
# ------------------------------------------------------------------------------------------

In [1]:
from pyspark.sql import SparkSession
from datetime import datetime, timedelta
from pyspark.sql.functions import *
import os

In [2]:
# Root folder for every input path built in the cells below.
cwd = os.getcwd()

# Create the Spark session for this ETL job, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('InternationalST')
    .getOrCreate()
)

In [5]:
# Step 1 - Load Flights data (OpenSky data)
#
# Reads every flights_*.csv staging file (header row, no schema inference, so
# all data columns are strings), tags each row with the file it came from and
# exposes the result to Spark SQL as the temp view 'FDF'.

file = f'{cwd}/data/stg/source_target_count_week_14_per_year/flights_*.csv'
fdf = (
    spark.read
    .csv(path=file, header=True)
    .withColumn("filename", input_file_name())
)
fdf.createOrReplaceTempView('FDF')
# fdf.printSchema()

# root
#  |-- Source: string (nullable = true)
#  |-- Target: string (nullable = true)
#  |-- flights_count: string (nullable = true)
#  |-- flight_year: string (nullable = true)
#  |-- filename: string (nullable = false)

In [11]:
# Step 2 - Load dim airports data
#
# Reads every airports_nodes_*.csv file, registers it as the temp view 'ADF',
# then replaces both `adf` and the 'ADF' view with the distinct
# (airport_id, country_iso_2_code) pairs, which is all the later join needs.

file = f'{cwd}/data/nodes/airports/airports_nodes_*.csv'
adf = (
    spark.read
    .csv(path=file, header=True)
    .withColumn("filename", input_file_name())
)
adf.createOrReplaceTempView('ADF')
# adf.printSchema()

# root
#  |-- airport_id: string (nullable = true)
#  |-- airport_name: string (nullable = true)
#  |-- airport_type: string (nullable = true)
#  |-- airport_lat: string (nullable = true)
#  |-- airport_lng: string (nullable = true)
#  |-- country_iso_2_code: string (nullable = true)
#  |-- country_iso_3_code: string (nullable = true)
#  |-- country_name: string (nullable = true)
#  |-- continent_name: string (nullable = true)
#  |-- country_lat: string (nullable = true)
#  |-- country_lng: string (nullable = true)
#  |-- cases_per_week_14: string (nullable = true)
#  |-- population: string (nullable = true)
#  |-- covid19_percentage: string (nullable = true)
#  |-- covid19_percentage_label: string (nullable = true)
#  |-- report_year: string (nullable = true)
#  |-- filename: string (nullable = false)

# Grouping by both selected columns de-duplicates the airport -> country pairs.
# NOTE: from here on the 'ADF' view only has the two columns selected below.
adf = spark.sql(
    """

select  airport_id as airport_id,
        country_iso_2_code as country_iso_2_code
from    ADF as A
group by 1,2

"""
)
adf.createOrReplaceTempView('ADF')

In [17]:
# Step 3 - check distinct values in data set

# spark.sql("""

# select  count(*) as row_count,
#         count(distinct F.Source        ) as Source,
#         count(distinct F.Target        ) as Target,
#         count(distinct F.flights_count ) as flights_count
# from    FDF as F

# """).show(10,False)

# +---------+------+------+-------------+
# |row_count|Source|Target|flights_count|
# +---------+------+------+-------------+
# |187989   |10363 |11886 |278          |
# +---------+------+------+-------------+

# Now let's check how many rows we lose after removing non-international (domestic) flights

# spark.sql("""

# select  count(*) as row_count,
#         count(distinct F.Source        ) as Source,
#         count(distinct F.Target        ) as Target,
#         count(distinct F.flights_count ) as flights_count,
#         count(distinct S.country_name  ) as s_country_name,
#         count(distinct T.country_name  ) as t_country_name
# from    FDF as F
#         Left join ADF as S
#         On F.Source = S.airport_id
#         Left join ADF as T
#         On F.Target = T.airport_id
# where   S.country_name<>T.country_name

# """).show(10,False)

# +---------+------+------+-------------+
# |row_count|Source|Target|flights_count|
# +---------+------+------+-------------+
# |187989   |10363 |11886 |278          |
# +---------+------+------+-------------+

# All flights after join to airports countries
# +---------+------+------+-------------+--------------+--------------+
# |row_count|Source|Target|flights_count|s_country_name|t_country_name|
# +---------+------+------+-------------+--------------+--------------+
# |187989   |10363 |11886 |278          |111           |116           |
# +---------+------+------+-------------+--------------+--------------+

# where   S.country_name<>T.country_name
# +---------+------+------+-------------+--------------+--------------+
# |row_count|Source|Target|flights_count|s_country_name|t_country_name|
# +---------+------+------+-------------+--------------+--------------+
# |30416    |1490  |1908  |119          |109           |114           |
# +---------+------+------+-------------+--------------+--------------+


In [13]:
# Step 4 - Join with dim airports and keep only international flights
# (dropping domestic flights here keeps the edge list small, for better performance)
#
# A flight is international when its Source and Target airports map to
# different country codes. Inner joins are used on purpose: the WHERE clause
# compares the country codes of both airports, and a comparison with NULL is
# never true, so a flight whose Source or Target is missing from ADF can never
# pass the filter. An outer join would therefore return exactly the same rows;
# the inner join simply states that intent explicitly.
#
# flights_count is read from CSV as a string, so it is cast to bigint here.

international_source_target_count_week_14_per_year = spark.sql("""

select  F.Source        as Source,
        F.Target        as Target,
        cast(F.flights_count as bigint) as flights_count,
        F.flight_year   as flight_year
from    FDF as F
        Inner join ADF as S
        On F.Source = S.airport_id
        Inner join ADF as T
        On F.Target = T.airport_id
where   S.country_iso_2_code <> T.country_iso_2_code

""")

# Expose the result to Spark SQL for the (optional) test queries.
international_source_target_count_week_14_per_year.createOrReplaceTempView('international_source_target')

In [19]:
# Tests

# spark.sql("""

# select  count(Source) as c_source,
#         count(Target) as c_target,
#         count(flights_count) as c_flights_count,
#         flight_year
# from    international_source_target
# group by 4


# """).show(10,False)

# +--------+--------+---------------+-----------+
# |c_source|c_target|c_flights_count|flight_year|
# +--------+--------+---------------+-----------+
# |15965   |15965   |15965          |2019       |
# |4187    |4187    |4187           |2020       |
# |10264   |10264   |10264          |2021       |
# +--------+--------+---------------+-----------+


In [14]:
# Write the international edges as CSV, one flight_year=<year> folder per year.
# repartition(1) collapses the data into a single partition so that each year
# folder is expected to contain a single part file; the copy cells that follow
# rely on that when they glob for *.csv.
# mode('overwrite') replaces any previous content of the output folder.
# "csv" is Spark's built-in CSV data source (Spark 2.0+), used here instead of
# the legacy external-package name "com.databricks.spark.csv".
(
    international_source_target_count_week_14_per_year
    .repartition(1)
    .write
    .partitionBy('flight_year')
    .mode('overwrite')
    .format("csv")
    .option("header", "true")
    .option("quoteAll", "true")
    .save("./data/edges/international_source_target_count_week_14_per_year")
)

In [15]:
# Copy the 2019 CSV part file written by Spark to a flat, predictable file name (flights_2019.csv)
!cp ./data/edges/international_source_target_count_week_14_per_year/*2019*/*.csv ./data/edges/international_source_target_count_week_14_per_year/flights_2019.csv 
# Delete the Spark partition folder for 2019 now that its CSV has been copied
!rm -rf ./data/edges/international_source_target_count_week_14_per_year/flight_year=2019*
# Verify the folder was deleted: the glob should match nothing, so a "no matches found" message is the expected result
!ls ./data/edges/international_source_target_count_week_14_per_year/flight_year=2019*/*.csv

zsh:1: no matches found: ./data/edges/international_source_target_count_week_14_per_year/flight_year=2019*/*.csv


In [16]:
# Copy the 2020 CSV part file written by Spark to a flat, predictable file name (flights_2020.csv)
!cp ./data/edges/international_source_target_count_week_14_per_year/*2020*/*.csv ./data/edges/international_source_target_count_week_14_per_year/flights_2020.csv 
# Delete the Spark partition folder for 2020 now that its CSV has been copied
!rm -rf ./data/edges/international_source_target_count_week_14_per_year/flight_year=2020*
# Verify the folder was deleted: the glob should match nothing, so a "no matches found" message is the expected result
!ls ./data/edges/international_source_target_count_week_14_per_year/flight_year=2020*/*.csv

zsh:1: no matches found: ./data/edges/international_source_target_count_week_14_per_year/flight_year=2020*/*.csv


In [17]:
# Copy the 2021 CSV part file written by Spark to a flat, predictable file name (flights_2021.csv)
!cp ./data/edges/international_source_target_count_week_14_per_year/*2021*/*.csv ./data/edges/international_source_target_count_week_14_per_year/flights_2021.csv 
# Delete the Spark partition folder for 2021 now that its CSV has been copied
!rm -rf ./data/edges/international_source_target_count_week_14_per_year/flight_year=2021*
# Verify the folder was deleted: the glob should match nothing, so a "no matches found" message is the expected result
!ls ./data/edges/international_source_target_count_week_14_per_year/flight_year=2021*/*.csv

zsh:1: no matches found: ./data/edges/international_source_target_count_week_14_per_year/flight_year=2021*/*.csv
