# __Setup__

### _Imports and Reading in Data_

In [96]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx

In [97]:
# read in data
airports_df = pd.read_csv("data/airports.csv")  # information about each airport
routes_df = pd.read_csv("data/routes.csv")  # information about all routes between airports

### _Transforming Airports Data_

In [98]:
airports_df.head()

Unnamed: 0,index,Airport ID,Name,City,Country,IATA,ICAO,Latitude,Longitude,Altitude,Timezone,DST,Tz database time zone,Type,Source
0,0,1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.08169,145.391998,5282,10,U,Pacific/Port_Moresby,airport,OurAirports
1,1,2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.20708,145.789001,20,10,U,Pacific/Port_Moresby,airport,OurAirports
2,2,3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.82679,144.296005,5388,10,U,Pacific/Port_Moresby,airport,OurAirports
3,3,4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10,U,Pacific/Port_Moresby,airport,OurAirports
4,4,5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.44338,147.220001,146,10,U,Pacific/Port_Moresby,airport,OurAirports


In [99]:
# removing spaces in column names
airports_df.columns = ["_".join(c.lower().split()) for c in airports_df.columns]

In [100]:
# setting index to airport_id column and dropping index to avoid redundant data
airports_df = airports_df.set_index("airport_id")
airports_df = airports_df.drop(columns=["index"])

In [101]:
airports_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7698 entries, 1 to 14110
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   name                   7698 non-null   object 
 1   city                   7649 non-null   object 
 2   country                7698 non-null   object 
 3   iata                   7698 non-null   object 
 4   icao                   7698 non-null   object 
 5   latitude               7698 non-null   float64
 6   longitude              7698 non-null   float64
 7   altitude               7698 non-null   int64  
 8   timezone               7698 non-null   object 
 9   dst                    7698 non-null   object 
 10  tz_database_time_zone  7698 non-null   object 
 11  type                   7698 non-null   object 
 12  source                 7698 non-null   object 
dtypes: float64(2), int64(1), object(10)
memory usage: 842.0+ KB


In [102]:
# removing any airports where the City value is null
BM = airports_df["city"].notnull()
airports_df = airports_df[BM]

In [103]:
# removing any airports with value \N for main columns (basically null)
for col in airports_df.columns:
    BM = airports_df[col] != "\\N"
    airports_df = airports_df[BM]

In [104]:
# changing timezone datatype to int
airports_df.timezone = airports_df.timezone.apply(float)

In [105]:
airports_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5515 entries, 1 to 10952
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   name                   5515 non-null   object 
 1   city                   5515 non-null   object 
 2   country                5515 non-null   object 
 3   iata                   5515 non-null   object 
 4   icao                   5515 non-null   object 
 5   latitude               5515 non-null   float64
 6   longitude              5515 non-null   float64
 7   altitude               5515 non-null   int64  
 8   timezone               5515 non-null   float64
 9   dst                    5515 non-null   object 
 10  tz_database_time_zone  5515 non-null   object 
 11  type                   5515 non-null   object 
 12  source                 5515 non-null   object 
dtypes: float64(3), int64(1), object(9)
memory usage: 603.2+ KB


In [112]:
# create new df with columns for location only
airports_loc_df = airports_df[["latitude", "longitude"]]#, "timezone", "dst"]]
airports_loc_df.head()

Unnamed: 0_level_0,latitude,longitude
airport_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-6.08169,145.391998
2,-5.20708,145.789001
3,-5.82679,144.296005
4,-6.569803,146.725977
5,-9.44338,147.220001


In [109]:
# # one hot encoding categorical dst feature
# dst_df = pd.get_dummies(airports_loc_df.dst)

# # remove spaces in column names and indicate category
# dst_df.columns = ["dst_"+x for x in dst_df.columns]

# # add boolean category variables to airports_loc_df
# airports_loc_df = airports_loc_df.merge(dst_df, left_index=True, right_index=True)
# airports_loc_df = airports_loc_df.drop(columns=["dst"])

In [113]:
# # normalize high-magnitude columns
# norm_cols = ["latitude", "longitude"]#, "timezone"]

# for col in norm_cols:
#     airports_loc_df[col] = (
#         (airports_loc_df[col] - airports_loc_df[col].min()) / (airports_loc_df[col].max() - airports_loc_df[col].min())
#     )

### _Transforming Routes Data_

### _Graph Building and Export_

In [13]:
# create graph
g = nx.Graph()

In [14]:
# add nodes
for index, row in airports_df.iterrows():
    airport_desc = row["Name"] + " - " + row["City"] + ", " + row["Country"]
    g.add_node(row["Airport_ID"], name=airport_desc)

In [15]:
# check that all airports were added to the graph
print("Expected Nodes:", airports_df.shape[0], "\t Total Nodes:", len(g.nodes))

Expected Nodes: 7649 	 Total Nodes: 7649


In [16]:
# add edges
for i in routes_df.index:
    left_airport = routes_df["Source_airport_ID"][i]
    right_airport = routes_df["Destination_airport_ID"][i]
    
    # check for current weight
    current_weight = g.get_edge_data(left_airport, right_airport, default={"weight":0})["weight"]

    # add edge
    g.add_edge(left_airport, right_airport, weight=current_weight+1)

In [17]:
print(g.nodes["1"]["name"])

Goroka Airport - Goroka, Papua New Guinea


In [21]:
# calculate degree centrality for all nodes
degree_centrality = nx.degree_centrality(g)
n = 20

print("TOP", n, "NODES BY DEGREE CENTRALITY\n-------------------------")
# display top n nodes by degree centrality - how many connections they have
for u in sorted(degree_centrality, key=degree_centrality.get, reverse=True)[:n]:
    print("ID:", u, "   NAME:", g.nodes[u]["name"], "-", degree_centrality[u])

TOP 20 NODES BY DEGREE CENTRALITY
-------------------------
ID: 580    NAME: Amsterdam Airport Schiphol - Amsterdam, Netherlands - 0.031958762886597936
ID: 340    NAME: Frankfurt am Main Airport - Frankfurt, Germany - 0.031443298969072164
ID: 1382    NAME: Charles de Gaulle International Airport - Paris, France - 0.030927835051546393
ID: 1701    NAME: Atatürk International Airport - Istanbul, Turkey - 0.030154639175257732
ID: 3682    NAME: Hartsfield Jackson Atlanta International Airport - Atlanta, United States - 0.027963917525773197
ID: 3364    NAME: Beijing Capital International Airport - Beijing, China - 0.026675257731958762
ID: 3830    NAME: Chicago O'Hare International Airport - Chicago, United States - 0.02654639175257732
ID: 346    NAME: Munich Airport - Munich, Germany - 0.024742268041237116
ID: 3670    NAME: Dallas Fort Worth International Airport - Dallas-Fort Worth, United States - 0.02422680412371134
ID: 4029    NAME: Domodedovo International Airport - Moscow, Russia - 0.0

In [None]:
# export to GraphML file
# sub_g_addyosmani = nx.ego_graph(g, 35773, radius=1)