In [17]:
# In this Spark ETL process we create the nodes and edges files (from the OpenSky data set)
# that are used to build our network in Gephi.

In [8]:
import glob
import os
import shutil
from datetime import datetime, timedelta

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [9]:
# Directory the notebook was started from; every data path below is built from it.
cwd = os.getcwd()

# Reuse the active Spark session if there is one, otherwise start one named 'GraphData'.
spark = (
    SparkSession.builder
    .appName('GraphData')
    .getOrCreate()
)

In [25]:
# Step 1 - Load the label/attribute files used to enrich the nodes

# Airports: read with a header row and no schema inference, then exposed to Spark SQL as ADF.
airports_file = f'{cwd}/data/raw_data/labels/airports.csv'
adf = spark.read.option('header', True).csv(airports_file)
adf.createOrReplaceTempView('ADF')
# adf.printSchema()

# airports.csv (downloaded from - https://ourairports.com/data/).
# Schema:
# root
#  |-- id: string (nullable = true)
#  |-- ident: string (nullable = true)
#  |-- type: string (nullable = true)
#  |-- name: string (nullable = true)
#  |-- latitude_deg: string (nullable = true)
#  |-- longitude_deg: string (nullable = true)
#  |-- elevation_ft: string (nullable = true)
#  |-- continent: string (nullable = true)
#  |-- iso_country: string (nullable = true)
#  |-- iso_region: string (nullable = true)
#  |-- municipality: string (nullable = true)
#  |-- scheduled_service: string (nullable = true)
#  |-- gps_code: string (nullable = true)
#  |-- iata_code: string (nullable = true)
#  |-- local_code: string (nullable = true)
#  |-- home_link: string (nullable = true)
#  |-- wikipedia_link: string (nullable = true)
#  |-- keywords: string (nullable = true)


In [26]:
# Countries: read with a header row and no schema inference, then exposed to Spark SQL as CDF.
countries_file = f'{cwd}/data/raw_data/labels/countries.csv'
cdf = spark.read.option('header', True).csv(countries_file)
cdf.createOrReplaceTempView('CDF')
# cdf.printSchema()

# countries.csv (downloaded from - https://github.com/google/dspl/blob/master/samples/google/canonical/countries.csv).
# Schema:
# root
#  |-- country: string (nullable = true)
#  |-- latitude: string (nullable = true)
#  |-- longitude: string (nullable = true)
#  |-- name: string (nullable = true)

In [10]:
# Step 2 - Load the edges data (OpenSky flight lists)

# The glob picks up every monthly flightlist_*.csv.gz file in one read. Each row is tagged
# with the file it came from so that later queries can select a single month.
file = f'{cwd}/data/raw_data/flights/flightlist_*.csv.gz'
fdf = (
    spark.read
    .option('header', True)
    .csv(file)
    .withColumn("filename", input_file_name())
)
fdf.createOrReplaceTempView('FDF')
# fdf.printSchema()

# flightlist_*.csv.gz (downloaded from - https://zenodo.org/record/4601479#.YLaEU5MzYUH).
# Schema:
# root
#  |-- callsign: string (nullable = true)
#  |-- number: string (nullable = true)
#  |-- icao24: string (nullable = true)
#  |-- registration: string (nullable = true)
#  |-- typecode: string (nullable = true)
#  |-- origin: string (nullable = true)
#  |-- destination: string (nullable = true)
#  |-- firstseen: string (nullable = true)
#  |-- lastseen: string (nullable = true)
#  |-- day: string (nullable = true)
#  |-- latitude_1: string (nullable = true)
#  |-- longitude_1: string (nullable = true)
#  |-- altitude_1: string (nullable = true)
#  |-- latitude_2: string (nullable = true)
#  |-- longitude_2: string (nullable = true)
#  |-- altitude_2: string (nullable = true)
#  |-- filename: string (nullable = false)


In [30]:
# In the end our nodes will be countries and the edges will be international flights between countries.
# So first we create a new data set by joining airports.csv (ADF) and countries.csv (CDF) on the ISO country code.
# The join is an inner join, so airports whose iso_country has no match in countries.csv are dropped.
# Each resulting row maps one airport ident to its country; the result is registered as the NDF view.

nodes_df = spark.sql("""
select  A.ident         as Airport_Id,
        A.name          as Airport_Name,
        A.type          as Airport_Type,
        A.latitude_deg  as Airport_lat,
        A.longitude_deg as Airport_lng,
        C.country       as Country_Id,
        C.name          as Country_Name,        
        C.latitude      as Country_latitude,        
        C.longitude     as Country_longitude        
from    ADF as A JOIN CDF as C 
        ON A.iso_country = C.country
""")

nodes_df.createOrReplaceTempView('NDF')


In [36]:
# Create Countries (Nodes) CSV file
#
# The Country_* columns only exist on NDF (airports joined with countries); the raw CDF view
# exposes country/latitude/longitude/name, so the query has to read from NDF.
# Because NDF is built from an inner join, only countries with at least one airport are exported.
countries = spark.sql("""
select  distinct Country_Id as Id,
        Country_Name        as Label,
        Country_latitude    as lat,
        Country_longitude   as lng
FROM    NDF
""")

nodes_tmp_dir = f'{cwd}/nodes_countries'
nodes_csv_path = f'{cwd}/data/nodes/nodes_countries.csv'

# repartition(1) makes Spark write a single part file; "overwrite" keeps the cell re-runnable
# when a previous run left the temporary folder behind.
(countries.repartition(1)
          .write.format("com.databricks.spark.csv")
          .mode("overwrite")
          .option("header", "true")
          .save(nodes_tmp_dir))

# Copy the new file into the proper data folder location
nodes_part_files = glob.glob(os.path.join(nodes_tmp_dir, '*.csv'))
if len(nodes_part_files) != 1:
    raise RuntimeError(
        f'Expected exactly one CSV part file in {nodes_tmp_dir}, found {len(nodes_part_files)}'
    )
os.makedirs(os.path.dirname(nodes_csv_path), exist_ok=True)
shutil.copyfile(nodes_part_files[0], nodes_csv_path)

# Delete the Spark output folder and check that it is gone
shutil.rmtree(nodes_tmp_dir)
assert not os.path.exists(nodes_tmp_dir)

In [79]:
# Preview the first 10 airport/country rows without truncating long values.
nodes_df.show(n=10, truncate=False)

+----------+----------------------------------+-------------+------------------+-------------------+----------+-------------+----------------+-----------------+
|Airport_Id|Airport_Name                      |Airport_Type |Airport_lat       |Airport_lng        |Country_Id|Country_Name |Country_latitude|Country_longitude|
+----------+----------------------------------+-------------+------------------+-------------------+----------+-------------+----------------+-----------------+
|00A       |Total Rf Heliport                 |heliport     |40.07080078125    |-74.93360137939453 |US        |United States|37.09024        |-95.712891       |
|00AA      |Aero B Ranch Airport              |small_airport|38.704022         |-101.473911        |US        |United States|37.09024        |-95.712891       |
|00AK      |Lowell Field                      |small_airport|59.94919968       |-151.695999146     |US        |United States|37.09024        |-95.712891       |
|00AL      |Epps Airpark          

In [85]:
# Each query aggregates one month of flight data into a DataFrame. We are:
# - counting, for each route, the number of flights per month
# - filtering out incomplete flight rows (missing the origin or the destination attribute)
# - filtering out domestic flights (source and target country are the same)

def preper_edges_query(f_year, f_month):
    """Build the country-to-country edge list for one monthly flight list file.

    Args:
        f_year: Four-digit year as a string, e.g. '2019'.
        f_month: Zero-padded month as a string, e.g. '01'.

    Returns:
        A Spark DataFrame with the columns Source and Target (country ids) and
        Weight (number of flights between the two countries in that month).

    Relies on the FDF (flights) and NDF (airport -> country) temp views.
    """
    # Match on the base file name only, so the filter does not depend on the absolute
    # location of the project on the machine running the notebook.
    flights_file = f'flightlist_{f_year}_{f_month}.csv.gz'
    query = f"""With stg
                as (select origin                   as Source,
                           destination              as Target,
                           count(*)                 as Weight
                    from   FDF
                    where  1=1
                           and origin is not null and destination is not null
                           and origin<>destination
                           and substring_index(filename, '/', -1) = '{flights_file}'
                    group by 1,2)

                select  S.Country_Id as Source,
                        T.Country_Id as Target,
                        Sum(E.Weight)as Weight
                from    stg E join NDF S
                        on E.Source = S.Airport_Id
                        join NDF T
                        on E.Target = T.Airport_Id
                where   S.Country_Id<>T.Country_Id
                group by 1,2"""
    return spark.sql(query)


In [87]:
# Export one edges CSV per month to data/edges/flights_<year>_<month>.csv.
YEARS = ['2019', '2020', '2021']
MONTHS = ['01', '02', '03', '04']
# Last (year, month) to export; later periods are skipped.
LAST_PERIOD = ('2021', '02')

os.makedirs(f'{cwd}/data/edges', exist_ok=True)

for year in YEARS:
    for month in MONTHS:
        # Zero-padded strings compare in calendar order, so a tuple comparison is enough.
        if (year, month) > LAST_PERIOD:
            continue
        print(f'Year - {year} Month - {month}')
        res_df = preper_edges_query(f_year=year, f_month=month)

        folder_path = f'{cwd}/flights_{year}_{month}'
        to_path = f'{cwd}/data/edges/flights_{year}_{month}.csv'

        # Write to the same absolute folder that is read back below. repartition(1) yields
        # a single part file; "overwrite" keeps the loop re-runnable after an interrupted run.
        (res_df.repartition(1)
               .write.format("com.databricks.spark.csv")
               .mode("overwrite")
               .option("header", "true")
               .save(folder_path))

        # Copy the new file into the proper data folder location
        part_files = glob.glob(os.path.join(folder_path, '*.csv'))
        if len(part_files) != 1:
            raise RuntimeError(
                f'Expected exactly one CSV part file in {folder_path}, found {len(part_files)}'
            )
        shutil.copyfile(part_files[0], to_path)
        # Delete the Spark output folder
        shutil.rmtree(folder_path)

        
        

Year - 2019 Month - 01
Year - 2019 Month - 02
Year - 2019 Month - 03
Year - 2019 Month - 04
Year - 2020 Month - 01
Year - 2020 Month - 02
Year - 2020 Month - 03
Year - 2020 Month - 04
Year - 2021 Month - 01
Year - 2021 Month - 02


In [69]:
# folder_path = f'{cwd}/flights_{year}_{month}'
# file_path = f'{folder_path}/*.csv'
# to_path = f'{cwd}/data/edges/flights_{year}_{month}.csv'
# # Copy new file into proper data folder location
# !cp $file_path $to_path
# # Delete Spark output folder
# !rm -rf $folder_path


In [28]:
# flight_between_countries = spark.sql("""
# With edges
# as( select  E.Source       as Source,
#             S.Country_Id as S_Country_Id,
#             E.Target       as Target,
#             T.Country_Id as T_Country_Id,        
#             Weight
#     from   EDF E join NDF S
#             on E.Source = S.Airport_Id
#            join NDF T
#             on E.Target = T.Airport_Id)

# select  S_Country_Id as Source,
#         T_Country_Id as Target,        
#         Sum(Weight)    as Weight
# from    edges
# where   S_Country_Id<>T_Country_Id
# group by 1,2
            
# """)



In [29]:
# flight_between_countries.show(10,False)

# flight_between_countries.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("flight_between_countries")

In [89]:
# tmp_edges = '/Users/ybatash/PycharmProjects/jupyter/tau-network-science/b_01_2019/flight_between_countries.csv'
# tdf = spark.read.csv(path=tmp_edges, header=True)
# tdf.createOrReplaceTempView('TDF')
# tdf.printSchema()


In [36]:
# countries = spark.sql("""
# With Con_filter
# as ( select  Source as Country_Id
#      from    TDF
#      union
#      select  Target as Country_Id
#      from    TDF)
     
# select  distinct N.Country_Id as Id,
#         N.Country_Name as Label,        
#         N.Country_latitude as lat,        
#         N.Country_longitude as lng        
# FROM    NDF as N join Con_filter as F
#         on N.Country_Id=F.Country_Id
     
# """)

# countries.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("countries")

# # nodes_df.createOrReplaceTempView('NDF')

In [3]:
# nodes_df = 
# spark.sql("""
# select  A.ident         as Airport_Id,
#         A.name          as Airport_Name,
#         A.type          as Airport_Type,
#         A.latitude_deg  as Airport_lat,
#         A.longitude_deg as Airport_lng,
#         C.country       as Country_Id,
#         C.name          as Country_Name,        
#         C.latitude      as Country_latitude,        
#         C.longitude     as Country_longitude        
# from    ADF as A JOIN CDF as C 
#         ON A.iso_country = C.country
# """)

# nodes_df.createOrReplaceTempView('NDF')

In [4]:
# res_df.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("b_01_2019")


In [88]:
# spark.sql("""
# select origin                   as origin_airport,
#        destination              as destination_airport,
#        count(*)                 as flights_count,
#        count(distinct callsign) as distinct_callsign_count,
#        count(distinct icao24)   as distinct_icao24_count,
#        min(firstseen)           as min_firstseen,
#        max(firstseen)           as max_firstseen 
# from   T       
# where   1=1
#         and origin is not null and destination  is not null
#         and filename = 'file:///Users/ybatash/PycharmProjects/jupyter/tau-network-science/data/raw_data/flightlist_2019_01.csv.gz'
# group by 1,2        
# """).show(10, False)
