In [1]:
import neo4j

import pandas as pd

from IPython.display import display

### General imports, function rips from week 8 lab as well as custom airport & route functions

In [2]:
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","ucb_mids_w205"))

In [3]:
session = driver.session(database="neo4j")

In [18]:
def my_neo4j_wipe_out_database():
    "wipe out database by deleting all nodes and relationships"
    
    query = "match (node)-[relationship]->() delete node, relationship"
    session.run(query)
    
    query = "match (node) delete node"
    session.run(query)

In [4]:
def my_neo4j_run_query_pandas(query, **kwargs):
    "run a query and return the results in a pandas dataframe"
    
    result = session.run(query, **kwargs)
    
    df = pd.DataFrame([r.values() for r in result], columns=result.keys())
    
    return df

In [20]:
def my_neo4j_nodes_relationships():
    "print all the nodes and relationships"
   
    print("-------------------------")
    print("  Nodes:")
    print("-------------------------")
    
    query = """
        match (n) 
        return n.code as node_code, labels(n) as labels
        order by n.code
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_nodes = df.shape[0]
    
    display(df)
    
    print("-------------------------")
    print("  Relationships:")
    print("-------------------------")
    
    query = """
        match (n1)-[r]->(n2) 
        return n1.code as node_code_1, labels(n1) as node_1_labels, 
            type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
        order by node_name_1, node_name_2
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_relationships = df.shape[0]
    
    display(df)
    
    density = (2 * number_relationships) / (number_nodes * (number_nodes - 1))
    
    print("-------------------------")
    print("  Density:", f'{density:.1f}')
    print("-------------------------")
    

In [21]:
def insert_airports(airport_list):
    query = """
    UNWIND $airports AS airport_code
    MERGE (:Airport {code: airport_code})
    """
    session.run(query, airports=airport_list)


In [22]:
def insert_routes(route_pairs):
    query = """
    UNWIND $routes AS pair
    MATCH (a1:Airport {code: pair.from}), (a2:Airport {code: pair.to})
    MERGE (a1)-[:FLIES_TO]->(a2)
    """
    session.run(query, routes=route_pairs)


### Loading, formatting and cleaning routes data

In [23]:
routes_df = pd.read_csv('routes.csv')
routes_df.columns = [
    'airline',
    'airline_id',
    'source_airport',
    'source_airport_id',
    'destination_airport',
    'destination_airport_id',
    'codeshare',
    'stops',
    'equipment'
]

In [24]:
routes_df.head()

Unnamed: 0,airline,airline_id,source_airport,source_airport_id,destination_airport,destination_airport_id,codeshare,stops,equipment
0,2B,410,AER,2965,KZN,2990,,0,CR2
1,2B,410,ASF,2966,KZN,2990,,0,CR2
2,2B,410,ASF,2966,MRV,2962,,0,CR2
3,2B,410,CEK,2968,KZN,2990,,0,CR2
4,2B,410,CEK,2968,OVB,4078,,0,CR2


In [25]:
routes_df_dedupe = routes_df.drop_duplicates()

routes_df_clean = routes_df.dropna(subset=[
    'airline', 
    'airline_id', 
    'source_airport', 
    'source_airport_id', 
    'destination_airport', 
    'destination_airport_id', 
    'stops', 
    'equipment'
])

In [26]:
print(f'original shape:{routes_df.shape}')
print(f'deduped shape:{routes_df_dedupe.shape}')
print(f'non-null shape:{routes_df_clean.shape}')

routes_df = routes_df_clean

original shape:(67663, 9)
deduped shape:(67663, 9)
non-null shape:(67645, 9)


In [27]:
routes_df

Unnamed: 0,airline,airline_id,source_airport,source_airport_id,destination_airport,destination_airport_id,codeshare,stops,equipment
0,2B,410,AER,2965,KZN,2990,,0,CR2
1,2B,410,ASF,2966,KZN,2990,,0,CR2
2,2B,410,ASF,2966,MRV,2962,,0,CR2
3,2B,410,CEK,2968,KZN,2990,,0,CR2
4,2B,410,CEK,2968,OVB,4078,,0,CR2
...,...,...,...,...,...,...,...,...,...
67658,ZL,4178,WYA,6334,ADL,3341,,0,SF3
67659,ZM,19016,DME,4029,FRU,2912,,0,734
67660,ZM,19016,FRU,2912,DME,4029,,0,734
67661,ZM,19016,FRU,2912,OSS,2913,,0,734


### Create airports, routes

In [28]:
airport_list = pd.unique(routes_df[['source_airport', 'destination_airport']].values.ravel()).tolist()
route_pairs = routes_df[['source_airport', 'destination_airport']].rename(
    columns={'source_airport': 'from', 'destination_airport': 'to'}
).to_dict('records')

insert_airports(airport_list)
insert_routes(route_pairs)


In [29]:
airport_list[:10]

['AER', 'KZN', 'ASF', 'MRV', 'CEK', 'OVB', 'DME', 'NBC', 'TGK', 'UUA']

### Algorithm 1: Page Rank 
   * useful to determine importance of individual airports in terms of moving traffic
   

In [5]:
query = """
CALL gds.graph.project(
  'flightGraph',
  'Airport',
  {
    FLIES_TO: {
      type: 'FLIES_TO',
      orientation: 'NATURAL'
    }
  }
)
"""
session.run(query)


<neo4j._sync.work.result.Result at 0x7f2c08497820>

In [6]:
query = """
CALL gds.pageRank.stream('flightGraph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).code AS airport_code, score
ORDER BY score DESC
LIMIT 100
"""
pagerank_df = my_neo4j_run_query_pandas(query)
display(pagerank_df)


Unnamed: 0,airport_code,score
0,ATL,15.139925
1,IST,14.248203
2,DEN,13.867724
3,ORD,13.860655
4,DFW,13.583567
...,...,...
95,CMN,5.471904
96,POM,5.439679
97,DCA,5.360816
98,DMK,5.300346


### Algorithm 2: Shortest path

In [7]:
source = "SFO"
target = "MGA"

query = f"""
MATCH (src:Airport {{code: '{source}'}}), (dst:Airport {{code: '{target}'}})
CALL gds.shortestPath.dijkstra.stream('flightGraph', {{
  sourceNode: id(src),
  targetNode: id(dst)
}})
YIELD totalCost, nodeIds
RETURN totalCost, 
       [nodeId IN nodeIds | gds.util.asNode(nodeId).code] AS path
"""
shortest_path_df = my_neo4j_run_query_pandas(query)
display(shortest_path_df)



Unnamed: 0,totalCost,path
0,2.0,"[SFO, IAH, MGA]"


###  Algorithm 3.1 Louvain Community Detection

In [11]:
query = """
CALL gds.louvain.stream('flightGraph')
YIELD nodeId, communityId
RETURN gds.util.asNode(nodeId).code AS airport_code, communityId
ORDER BY communityId
LIMIT 100
"""
result = session.run(query)
for record in result:
    print(record)

<Record airport_code='ADQ' communityId=87>
<Record airport_code='AOS' communityId=87>
<Record airport_code='SYB' communityId=87>
<Record airport_code='KZB' communityId=87>
<Record airport_code='KYK' communityId=87>
<Record airport_code='OLH' communityId=87>
<Record airport_code='KLN' communityId=87>
<Record airport_code='AKK' communityId=87>
<Record airport_code='KKB' communityId=87>
<Record airport_code='ORI' communityId=87>
<Record airport_code='KOZ' communityId=87>
<Record airport_code='KPR' communityId=87>
<Record airport_code='FUN' communityId=293>
<Record airport_code='AVV' communityId=293>
<Record airport_code='AYQ' communityId=293>
<Record airport_code='RUS' communityId=293>
<Record airport_code='IRA' communityId=293>
<Record airport_code='SCZ' communityId=293>
<Record airport_code='HBA' communityId=293>
<Record airport_code='EGM' communityId=293>
<Record airport_code='ATD' communityId=293>
<Record airport_code='RBV' communityId=293>
<Record airport_code='HIR' communityId=293>


### Algorithm 3.2 Louvain Community Detection (Collect, count and sort by community id)

In [5]:
query = """
CALL gds.louvain.stream('flightGraph')
YIELD nodeId, communityId
WITH communityId, collect(gds.util.asNode(nodeId).code) AS airports
RETURN communityId AS Community,
       size(airports) AS AirportCount,
       airports[0..10] AS SampleAirports   // show first 10 airport codes
ORDER BY AirportCount DESC
LIMIT 10
"""
result = session.run(query)
for record in result:
    print(record)

<Record Community=2858 AirportCount=687 SampleAirports=['BRL', 'ORD', 'STL', 'DEC', 'JBR', 'YUL', 'ATL', 'LWB', 'MCN', 'MEI']>
<Record Community=2408 AirportCount=511 SampleAirports=['BDS', 'ZRH', 'BOD', 'BRS', 'GVA', 'LPA', 'LCA', 'RMF', 'TFS', 'AJR']>
<Record Community=2731 AirportCount=502 SampleAirports=['BSO', 'MNL', 'BXU', 'CBO', 'CGY', 'CRM', 'DGT', 'GES', 'KLO', 'LGP']>
<Record Community=649 AirportCount=308 SampleAirports=['JIB', 'DXB', 'JED', 'SAW', 'CAI', 'DAC', 'BZL', 'CCU', 'CGP', 'MCT']>
<Record Community=2693 AirportCount=279 SampleAirports=['AYP', 'LIM', 'CUZ', 'PEM', 'HUU', 'IQT', 'PCL', 'TPP', 'BOG', 'GYE']>
<Record Community=293 AirportCount=277 SampleAirports=['DRW', 'PER', 'SYD', 'MEL', 'ADL', 'BNE', 'AKL', 'PPT', 'ASP', 'CBR']>
<Record Community=3038 AirportCount=211 SampleAirports=['ABJ', 'BOY', 'OUA', 'ACC', 'BKO', 'DKR', 'COO', 'LFW', 'NIM', 'LOS']>
<Record Community=745 AirportCount=182 SampleAirports=['ADQ', 'AOS', 'KKB', 'KLN', 'KOZ', 'OLH', 'KZB', 'SYB', 'K

### Algorithm 4: Breadth-First Search (BFS)

In [21]:
query = """
MATCH (start:Airport {code: 'SFO'})
CALL gds.bfs.stream('flightGraph', {
    sourceNode: id(start)
})
YIELD nodeIds
RETURN [nodeId IN nodeIds | gds.util.asNode(nodeId).code] AS path
"""
bfs_df = my_neo4j_run_query_pandas(query)
display(bfs_df)



Unnamed: 0,path
0,"[SFO, ZRH, MNL, ORD, STL, YUL, DXB, HKG, KIX, ..."


### Algorithm 5: Betweenness Centrality 
- find major hubs in the network

In [22]:
query = """
CALL gds.betweenness.stream('flightGraph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).code AS airport, score
ORDER BY score DESC
"""
betweenness_centrality_df = my_neo4j_run_query_pandas(query)
display(betweenness_centrality_df)

Unnamed: 0,airport,score
0,ANC,815780.210546
1,LAX,773285.385769
2,CDG,722535.785870
3,DXB,695342.411108
4,FRA,597231.583169
...,...,...
3418,GFN,0.000000
3419,RCM,0.000000
3420,LSY,0.000000
3421,MYA,0.000000


### Algorithm 6: Weakly Connected Components 
- detect isolated flight clusters

In [23]:
query = """
CALL gds.wcc.stream('flightGraph')
YIELD nodeId, componentId
RETURN gds.util.asNode(nodeId).code AS airport, componentId
ORDER BY componentId
"""
weakly_connected_df = my_neo4j_run_query_pandas(query)
display(weakly_connected_df)

Unnamed: 0,airport,componentId
0,AER,0
1,KZN,0
2,ASF,0
3,MRV,0
4,CEK,0
...,...,...
3418,LIF,3130
3419,MEE,3130
3420,TGJ,3130
3421,TOU,3130
