In [1]:
# In this Spark ETL process we create the nodes and edges files (from the OpenSky data set)
# used to build our network in Gephi.
# ------------------------------------------------------------------------------------------
# IMPORTANT!!! --> Before running this code make sure to run - "00_download_flight_data.sh"
# ------------------------------------------------------------------------------------------

In [1]:
from pyspark.sql import SparkSession
from datetime import datetime, timedelta
from pyspark.sql.functions import *
import os

In [2]:
# Remember the notebook's working directory; the raw-data path is built from it.
cwd = os.getcwd()

# Start (or reuse) the Spark session used by this ETL notebook.
spark = (
    SparkSession.builder
    .appName('Week14')
    .getOrCreate()
)

In [3]:
# Step 1 - Load the flights data (OpenSky data set)

# Glob over every downloaded flight-list archive.
file = f'{cwd}/data/raw_data/flights/flightlist_*.csv.gz'

# Read all matching CSV files (first row is the header) and tag every row
# with the path of the file it was read from.
fdf = (
    spark.read
    .csv(path=file, header=True)
    .withColumn("filename", input_file_name())
)

# Expose the DataFrame to Spark SQL under the name FDF.
fdf.createOrReplaceTempView('FDF')
# fdf.printSchema()

# flightlist_*.csv.gz (downloaded from - https://zenodo.org/record/4893103#.YMHhLDYzYUE).
# Schema:
# root
#  |-- callsign: string (nullable = true)
#  |-- number: string (nullable = true)
#  |-- icao24: string (nullable = true)
#  |-- registration: string (nullable = true)
#  |-- typecode: string (nullable = true)
#  |-- origin: string (nullable = true)
#  |-- destination: string (nullable = true)
#  |-- firstseen: string (nullable = true)
#  |-- lastseen: string (nullable = true)
#  |-- day: string (nullable = true)
#  |-- latitude_1: string (nullable = true)
#  |-- longitude_1: string (nullable = true)
#  |-- altitude_1: string (nullable = true)
#  |-- latitude_2: string (nullable = true)
#  |-- longitude_2: string (nullable = true)
#  |-- altitude_2: string (nullable = true)
#  |-- filename: string (nullable = false)


In [4]:
# Step 2 - For each year, keep only the week-14 flights and aggregate them into a smaller
# data set (for better performance).
#
# The query reads the FDF temp view and:
#   * keeps rows whose origin and destination are both present,
#   * keeps rows whose `firstseen` falls in week 14 of its year (weekofyear),
#   * counts flights per (date, day of week, airplane type, source airport, target airport).
# `firstseen` is loaded as a string, so the date functions rely on Spark's implicit cast.
# NOTE(review): the year is selected twice (`flight_year` and `year`) - presumably so that
# one copy remains in the CSV rows while `year` is consumed by the partitioned write; confirm.
# GROUP BY uses select-list positions: every column except the count (position 6).

flights_each_day_week_14_data = spark.sql("""

select  date(firstseen)       as flight_date,
        dayofweek(firstseen)  as flight_dayofweek,
        typecode              as airplane_code,
        origin                as airport_source_code,
        destination           as airport_target_code,
        count(*)              as flights_count,       
        year(firstseen)       as flight_year,
        year(firstseen)       as year
from    FDF
where   1=1
        and origin is not null and destination is not null
        and weekofyear(firstseen)=14
group by 1,2,3,4,5,7,8

""")

In [5]:
# Write the week-14 aggregate as CSV, one sub-folder per year (year=<YYYY>).
# - repartition(1) collapses the data to a single partition before writing; the copy
#   step that follows expects to find the CSV output of each year folder via one glob.
# - mode('overwrite') replaces any output left by a previous run.
# - header / quoteAll: emit a header row and wrap every value in quotes.
# The built-in "csv" source is used instead of the legacy external package name
# "com.databricks.spark.csv"; SparkSession-based (Spark 2.0+) code ships it natively.
(
    flights_each_day_week_14_data
    .repartition(1)
    .write
    .partitionBy('year')
    .mode('overwrite')
    .format("csv")
    .option("header", "true")
    .option("quoteAll", "true")
    .save("./data/stg/flights_each_day_week_14_data")
)

In [6]:
# Copy new file into proper data folder location
!cp ./data/stg/flights_each_day_week_14_data/*2019*/*.csv ./data/stg/flights_each_day_week_14_data/flights_2019.csv 
# Delete Spark output folder
!rm -rf ./data/stg/flights_each_day_week_14_data/year=2019*
# Check that the folder deleted
!ls ./data/stg/flights_each_day_week_14_data/year=2019*/*.csv

zsh:1: no matches found: ./data/stg/flights_each_day_week_14_data/year=2019*/*.csv


In [7]:
# Copy new file into proper data folder location
!cp ./data/stg/flights_each_day_week_14_data/*2020*/*.csv ./data/stg/flights_each_day_week_14_data/flights_2020.csv 
# Delete Spark output folder
!rm -rf ./data/stg/flights_each_day_week_14_data/year=2020*
# Check that the folder deleted
!ls ./data/stg/flights_each_day_week_14_data/year=2020*/*.csv

zsh:1: no matches found: ./data/stg/flights_each_day_week_14_data/year=2020*/*.csv


In [8]:
# Copy new file into proper data folder location
!cp ./data/stg/flights_each_day_week_14_data/*2021*/*.csv ./data/stg/flights_each_day_week_14_data/flights_2021.csv 
# Delete Spark output folder
!rm -rf ./data/stg/flights_each_day_week_14_data/year=2021*
# Check that the folder deleted
!ls ./data/stg/flights_each_day_week_14_data/year=2021*/*.csv

zsh:1: no matches found: ./data/stg/flights_each_day_week_14_data/year=2021*/*.csv
