# __Setup__

### _Imports and Reading in Data_

In [37]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import scipy.spatial.distance

In [2]:
# read in data: both CSV files are loaded from the local data/ directory
airports_df = pd.read_csv("data/airports.csv")  # information about each airport (one row per airport)
routes_df = pd.read_csv("data/routes.csv")  # information about all routes between airports (one row per route)

### _Transforming Airports Data_

In [3]:
# preview the first rows of the airports table as loaded from the CSV
airports_df.head()

Unnamed: 0,index,Airport ID,Name,City,Country,IATA,ICAO,Latitude,Longitude,Altitude,Timezone,DST,Tz database time zone,Type,Source
0,0,1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.08169,145.391998,5282,10,U,Pacific/Port_Moresby,airport,OurAirports
1,1,2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.20708,145.789001,20,10,U,Pacific/Port_Moresby,airport,OurAirports
2,2,3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.82679,144.296005,5388,10,U,Pacific/Port_Moresby,airport,OurAirports
3,3,4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10,U,Pacific/Port_Moresby,airport,OurAirports
4,4,5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.44338,147.220001,146,10,U,Pacific/Port_Moresby,airport,OurAirports


In [4]:
# normalise column names: lower-case them and join whitespace-separated words with
# underscores (e.g. "Airport ID" -> "airport_id")
airports_df = airports_df.rename(columns=lambda label: "_".join(label.lower().split()))

In [5]:
# use the CSV's own "index" column as the row index so it is not kept as a redundant
# data column (airport_id stays a regular column at this point)
airports_df = airports_df.set_index("index")

In [6]:
# inspect dtypes and non-null counts before cleaning
airports_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7698 entries, 0 to 7697
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   airport_id             7698 non-null   int64  
 1   name                   7698 non-null   object 
 2   city                   7649 non-null   object 
 3   country                7698 non-null   object 
 4   iata                   7698 non-null   object 
 5   icao                   7698 non-null   object 
 6   latitude               7698 non-null   float64
 7   longitude              7698 non-null   float64
 8   altitude               7698 non-null   int64  
 9   timezone               7698 non-null   object 
 10  dst                    7698 non-null   object 
 11  tz_database_time_zone  7698 non-null   object 
 12  type                   7698 non-null   object 
 13  source                 7698 non-null   object 
dtypes: float64(2), int64(2), object(10)
memory usage: 902.1+ KB


In [7]:
# keep only the airports that have a city recorded
airports_df = airports_df.dropna(subset=["city"])

In [8]:
# the literal string "\N" is the dataset's placeholder for a missing value;
# drop every airport that carries it in any column
no_placeholder = airports_df.ne("\\N").all(axis=1)
airports_df = airports_df[no_placeholder]

In [9]:
# convert timezone from object dtype to float
# (assign() builds a new frame, so nothing is written into the filtered slice
# produced by the previous cells and no SettingWithCopyWarning can be triggered)
airports_df = airports_df.assign(timezone=airports_df["timezone"].astype(float))

In [10]:
# convert airport_id to str so it can be matched against the (string) airport IDs
# in routes_df when both are used as graph node keys
# (assign() builds a new frame instead of writing into a filtered slice)
airports_df = airports_df.assign(airport_id=airports_df["airport_id"].astype(str))

In [45]:
# preview the airports table after the cleaning steps above
airports_df.head()

Unnamed: 0_level_0,airport_id,name,city,country,iata,icao,latitude,longitude,altitude,timezone,dst,tz_database_time_zone,type,source
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.08169,145.391998,5282,10.0,U,Pacific/Port_Moresby,airport,OurAirports
1,2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.20708,145.789001,20,10.0,U,Pacific/Port_Moresby,airport,OurAirports
2,3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.82679,144.296005,5388,10.0,U,Pacific/Port_Moresby,airport,OurAirports
3,4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10.0,U,Pacific/Port_Moresby,airport,OurAirports
4,5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.44338,147.220001,146,10.0,U,Pacific/Port_Moresby,airport,OurAirports


In [11]:
# confirm the row count, dtypes and non-null counts after cleaning
airports_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5515 entries, 0 to 6806
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   airport_id             5515 non-null   object 
 1   name                   5515 non-null   object 
 2   city                   5515 non-null   object 
 3   country                5515 non-null   object 
 4   iata                   5515 non-null   object 
 5   icao                   5515 non-null   object 
 6   latitude               5515 non-null   float64
 7   longitude              5515 non-null   float64
 8   altitude               5515 non-null   int64  
 9   timezone               5515 non-null   float64
 10  dst                    5515 non-null   object 
 11  tz_database_time_zone  5515 non-null   object 
 12  type                   5515 non-null   object 
 13  source                 5515 non-null   object 
dtypes: float64(3), int64(1), object(10)
memory usage: 646.3+ KB


In [47]:
# location-only frame: latitude/longitude of every airport, keyed by airport_id
airports_loc_df = airports_df.set_index("airport_id")[["latitude", "longitude"]]
airports_loc_df.head()

Unnamed: 0_level_0,latitude,longitude
airport_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-6.08169,145.391998
2,-5.20708,145.789001
3,-5.82679,144.296005
4,-6.569803,146.725977
5,-9.44338,147.220001


### _Transforming Routes Data_

In [15]:
# preview the first rows of the routes table as loaded from the CSV
routes_df.head()

Unnamed: 0,index,Airline,Airline ID,Source airport,Source airport ID,Destination airport,Destination airport ID,Codeshare,Stops,Equipment
0,0,2B,410,AER,2965,KZN,2990,,0,CR2
1,1,2B,410,ASF,2966,KZN,2990,,0,CR2
2,2,2B,410,ASF,2966,MRV,2962,,0,CR2
3,3,2B,410,CEK,2968,KZN,2990,,0,CR2
4,4,2B,410,CEK,2968,OVB,4078,,0,CR2


In [16]:
# use the CSV's own "index" column as the row index so it is not kept as a redundant data column
routes_df = routes_df.set_index("index")

In [17]:
# normalise column names: lower-case them and join whitespace-separated words with
# underscores (e.g. "Source airport ID" -> "source_airport_id")
routes_df = routes_df.rename(columns=lambda label: "_".join(label.lower().split()))

In [18]:
# keep only the endpoint columns (airport code and airport ID for each end of a route)
route_columns = [
    "source_airport",
    "source_airport_id",
    "destination_airport",
    "destination_airport_id",
]
routes_df = routes_df[route_columns]

In [19]:
# drop routes whose source or destination airport ID is null
routes_df = routes_df.dropna(subset=["source_airport_id", "destination_airport_id"])

In [20]:
# drop routes whose source or destination airport ID is the "\N" placeholder
# (the dataset's marker for a missing value)
id_cols = ["source_airport_id", "destination_airport_id"]
ids_present = routes_df[id_cols].ne("\\N").all(axis=1)
routes_df = routes_df[ids_present]

In [21]:
# preview the routes table after the cleaning steps above
routes_df.head()

Unnamed: 0_level_0,source_airport,source_airport_id,destination_airport,destination_airport_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,AER,2965,KZN,2990
1,ASF,2966,KZN,2990
2,ASF,2966,MRV,2962
3,CEK,2968,KZN,2990
4,CEK,2968,OVB,4078


In [22]:
# confirm the row count, dtypes and non-null counts after cleaning
routes_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 67240 entries, 0 to 67662
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   source_airport          67240 non-null  object
 1   source_airport_id       67240 non-null  object
 2   destination_airport     67240 non-null  object
 3   destination_airport_id  67240 non-null  object
dtypes: object(4)
memory usage: 2.6+ MB


### _Graph Building and Export_

In [38]:
# create graph: a directed multigraph, so several parallel edges between the same
# ordered pair of airports can coexist
g = nx.MultiDiGraph()

In [39]:
# add one node per airport, keyed by airport_id and labelled "<name> - <city>, <country>"
node_labels = airports_df["name"] + " - " + airports_df["city"] + ", " + airports_df["country"]
g.add_nodes_from(
    (airport_id, {"name": label})
    for airport_id, label in zip(airports_df["airport_id"], node_labels)
)

In [40]:
# sanity check: the graph should hold exactly one node per airport row
print("Expected Nodes:", len(airports_df), "\t Total Nodes:", g.number_of_nodes())

Expected Nodes: 5515 	 Total Nodes: 5515


In [41]:
# add edges: one directed edge per route row, keyed by a running counter so that
# repeated (source, destination) pairs are kept as separate parallel edges
# NOTE(review): add_edge() silently creates any endpoint that is not already a node;
# route IDs that are missing from airports_df therefore become nodes without a "name"
# attribute - confirm whether such routes should be filtered out beforehand
route_pairs = zip(routes_df["source_airport_id"], routes_df["destination_airport_id"])
for count, (left_airport, right_airport) in enumerate(route_pairs):
    g.add_edge(left_airport, right_airport, key=count)

In [42]:
# sanity check: the graph should hold exactly one edge per route row
print("Expected Edges:", len(routes_df), "\t Total Edges:", g.number_of_edges())

Expected Edges: 67240 	 Total Edges: 67240


In [44]:
# estimate betweenness centrality for all nodes from k=1000 sampled source nodes;
# the fixed seed pins the sample so the ranking is reproducible across runs
between_centrality = nx.betweenness_centrality(g, k=1000, seed=42)
n = 20

print("TOP", n, "NODES BY BETWEENNESS CENTRALITY\n-------------------------")
# display top n nodes by betweenness centrality - how often a node lies on the
# shortest paths between other pairs of nodes
for u in sorted(between_centrality, key=between_centrality.get, reverse=True)[:n]:
    # nodes that exist only because a route referenced them carry no "name" attribute,
    # so fall back to a placeholder instead of raising KeyError
    node_name = g.nodes[u].get("name", "<unknown airport>")
    print("ID:", u, "   NAME:", node_name, "-", between_centrality[u])

TOP 20 NODES BY BETWEENNESS CENTRALITY
-------------------------
ID: 3774    NAME: Ted Stevens Anchorage International Airport - Anchorage, United States - 0.024153782747979947
ID: 3484    NAME: Los Angeles International Airport - Los Angeles, United States - 0.022809833642693702
ID: 2188    NAME: Dubai International Airport - Dubai, United Arab Emirates - 0.020065968051046518
ID: 1382    NAME: Charles de Gaulle International Airport - Paris, France - 0.019783703035380726
ID: 3364    NAME: Beijing Capital International Airport - Beijing, China - 0.018589468401249143
ID: 340    NAME: Frankfurt am Main Airport - Frankfurt, Germany - 0.017179222486781977
ID: 580    NAME: Amsterdam Airport Schiphol - Amsterdam, Netherlands - 0.01665296391494695
ID: 2564    NAME: Guarulhos - Governador André Franco Montoro International Airport - Sao Paulo, Brazil - 0.01651914464519577
ID: 3830    NAME: Chicago O'Hare International Airport - Chicago, United States - 0.016233727241634938
ID: 3577    NAME: Se

In [36]:
# export to GraphML file (written to the current working directory) so the graph
# can be opened outside this notebook
nx.write_graphml(g, "airport_routes.graphml")

In [61]:
# For the top airport (by betweenness centrality) of three different regions/continents,
# list the ten airports nearest to it. "Nearest" means smallest Euclidean distance over
# the raw (latitude, longitude) values in degrees, not a geodesic distance.
# 3774 - Ted Stevens Anchorage International Airport - Anchorage, United States
# 2188 - Dubai International Airport - Dubai, United Arab Emirates
# 1382 - Charles de Gaulle International Airport - Paris, France
query_nodes = ["3774", "2188", "1382"]

for node in query_nodes:
    # coordinates of the airport being queried
    target_node = airports_loc_df.loc[node]

    # distance from every airport to the queried one (single column -> flat array)
    distances = scipy.spatial.distance.cdist(airports_loc_df, [target_node], metric="euclidean")[:, 0]

    # pair each airport_id with its distance, nearest first
    ranked = sorted(zip(airports_loc_df.index, distances), key=lambda pair: pair[1])

    print("CLOSEST AIRPORTS TO", g.nodes[node]["name"])
    # position 0 is skipped: it is expected to be the queried airport itself (distance 0)
    for other_airport, _ in ranked[1:11]:
        print(" "*4, ">", g.nodes[other_airport]["name"])
    print("---"*20)

CLOSEST AIRPORTS TO Ted Stevens Anchorage International Airport - Anchorage, United States
     > Merrill Field - Anchorage, United States
     > Elmendorf Air Force Base - Anchorage, United States
     > Bryant Army Heliport - Fort Richardson, United States
     > Warren "Bud" Woods Palmer Municipal Airport - Palmer, United States
     > Talkeetna Airport - Talkeetna, United States
     > Seward Airport - Seward, United States
     > Soldotna Airport - Soldotna, United States
     > Kenai Municipal Airport - Kenai, United States
     > Homer Airport - Homer, United States
     > Chenega Bay Airport - Chenega, United States
------------------------------------------------------------
CLOSEST AIRPORTS TO Dubai International Airport - Dubai, United Arab Emirates
     > Sharjah International Airport - Sharjah, United Arab Emirates
     > Al Minhad Air Base - Minhad AB, United Arab Emirates
     > Al Maktoum International Airport - Dubai, United Arab Emirates
     > Ras Al Khaimah Internat