# Graph Types and Structures

In [1]:
import csv
import math
import numpy as np
import pandas as pd
import psycopg2

In [2]:
import neo4j
from neo4j import GraphDatabase
#from neo4j.exceptions import SessionError
#from graphdatascience import GraphDataScience

In [3]:
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","ucb_mids_w205"))

In [4]:
driver.verify_connectivity()
print("Connection to Neo4j established successfully!")

Connection to Neo4j established successfully!


In [5]:
session = driver.session(database="neo4j")

In [6]:
df_routes = pd.read_csv('routes.csv')

df_airports = pd.read_csv('airports.csv')

In [7]:
#removing leading/trailing spaces
df_routes.columns = df_routes.columns.str.strip()
df_airports.columns = df_airports.columns.str.strip()

# columns from the airports data
df_airports_coords = df_airports[[
    'iata_code', 'name', 'iso_country', 'municipality',
    'latitude_deg', 'longitude_deg'
]].copy()

# Merge for the source airport coordinates and details
df_merged = pd.merge(df_routes, df_airports_coords,
                     left_on='source airport', right_on='iata_code',
                     how='left', suffixes=('_source', None))
df_merged.rename(columns={
    'name': 'source_name',
    'iso_country': 'source_iso_country',
    'municipality': 'source_municipality',
    'latitude_deg': 'source_lat',
    'longitude_deg': 'source_lon'
}, inplace=True)
df_merged.drop('iata_code', axis=1, inplace=True)

#Merge for the destination airport coordinates and details
df_merged = pd.merge(df_merged, df_airports_coords,
                     left_on='destination apirport', right_on='iata_code',
                     how='left', suffixes=(None, '_dest'))
df_merged.rename(columns={
    'name': 'dest_name',
    'iso_country': 'dest_iso_country',
    'municipality': 'dest_municipality',
    'latitude_deg': 'dest_lat',
    'longitude_deg': 'dest_lon'
}, inplace=True)
df_merged.drop('iata_code', axis=1, inplace=True)

In [8]:
#Define and apply the distance calculation function
def haversine_distance(lat1, lon1, lat2, lon2):

    R = 3958.8 # Earth's radius in miles

    # Convert degrees to radians
    lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    distance = R * c
    return distance

#Calculate the distance for each route
df_merged['distance_miles'] = df_merged.apply(
    lambda row: haversine_distance(row['source_lat'], row['source_lon'],
                                   row['dest_lat'], row['dest_lon']),
    axis=1
)

In [9]:
#Select columns for the final output
final_columns = [
    'airline', 'airline ID',
    'source airport id', 'source airport',
    'destination airport id', 'destination apirport',
    'source_name', 'source_iso_country', 'source_municipality',
    'dest_name', 'dest_iso_country', 'dest_municipality',
    'source_lat', 'source_lon', 'dest_lat', 'dest_lon', 'distance_miles',
    'codeshare', 'stops', 'equipment'
]
df_routes_all = df_merged[final_columns].copy()

In [10]:
#Save to csv
df_routes_all.to_csv('routes_distances.csv', index=False)

In [11]:
#convert IDs to ints
for col in ['airline ID', 'stops']:
    df_routes_all[col] = df_routes_all[col].replace('\\N', np.nan)
    df_routes_all[col] = df_routes_all[col].apply(lambda x: int(x) if pd.notna(x) else None)

#Drop rows with blank/NaN in ID columns
critical_id_columns = ['airline ID', 'source_name', 'dest_name']
df_routes_all.dropna(subset=critical_id_columns, inplace=True)
rows_after_dropping = len(df_routes_all)
print(f"Remaining rows after cleaning: {rows_after_dropping}")
print("-" * 50)

df_routes_all['codeshare_boolean'] = df_routes_all['codeshare'].apply(
    lambda x: True if (isinstance(x, str) and x.strip().upper() == 'Y') else False
)

df_routes_all.rename(columns={'destination apirport': 'destination airport'}, inplace=True)


Remaining rows after cleaning: 65989
--------------------------------------------------


In [12]:
df_routes_all.head()

Unnamed: 0,airline,airline ID,source airport id,source airport,destination airport id,destination airport,source_name,source_iso_country,source_municipality,dest_name,...,dest_municipality,source_lat,source_lon,dest_lat,dest_lon,distance_miles,codeshare,stops,equipment,codeshare_boolean
0,2B,410.0,2965,AER,2990,KZN,Sochi International Airport,RU,Sochi,Kazan International Airport,...,Kazan,43.449902,39.9566,55.606201,49.278702,936.308455,,0,CR2,False
1,2B,410.0,2966,ASF,2990,KZN,Astrakhan Narimanovo Boris M. Kustodiev Intern...,RU,Astrakhan,Kazan International Airport,...,Kazan,46.282843,48.010511,55.606201,49.278702,646.521493,,0,CR2,False
2,2B,410.0,2966,ASF,2962,MRV,Astrakhan Narimanovo Boris M. Kustodiev Intern...,RU,Astrakhan,Mineralnyye Vody Airport,...,Mineralnyye Vody,46.282843,48.010511,44.225101,43.081902,278.640578,,0,CR2,False
3,2B,410.0,2968,CEK,2990,KZN,Chelyabinsk Balandino Airport,RU,Chelyabinsk,Kazan International Airport,...,Kazan,55.305801,61.5033,55.606201,49.278702,478.777115,,0,CR2,False
4,2B,410.0,2968,CEK,4078,OVB,Chelyabinsk Balandino Airport,RU,Chelyabinsk,Novosibirsk Tolmachevo Airport,...,Novosibirsk,55.305801,61.5033,55.019756,82.618675,830.460631,,0,CR2,False


In [13]:
df_routes_all.shape

(65989, 21)

In [14]:
df_routes_all.dtypes

airline                    object
airline ID                float64
source airport id          object
source airport             object
destination airport id     object
destination airport        object
source_name                object
source_iso_country         object
source_municipality        object
dest_name                  object
dest_iso_country           object
dest_municipality          object
source_lat                float64
source_lon                float64
dest_lat                  float64
dest_lon                  float64
distance_miles            float64
codeshare                  object
stops                       int64
equipment                  object
codeshare_boolean            bool
dtype: object

In [15]:
df = df_routes_all.head(800)
data_rows = df.to_dict('records')

In [16]:
def Monopartite_graph(tx, data):

    query = """
    UNWIND $rows AS row
    MERGE (departure:Airport {id: row.`source airport id`, name: row.`source airport`})
    MERGE (arrival:Airport {id: row.`destination airport id`, name: row.`destination airport`})
    MERGE (departure)-[:FLIGHT {distance_miles: toFloat(row.distance_miles)}]->(arrival)
    """
    tx.run(query, rows=data)
    print("Monopartite graph created.")

In [36]:
def Bipartite_graph(tx, data):

    query = """
    UNWIND $rows AS row

    MERGE (airline:Airline {id: row.`airline ID`})
      ON CREATE SET airline.name = row.airline

    MERGE (departure_airport:Airport {id: row.`source airport id`})
      ON CREATE SET departure_airport.name = row.`source airport`

    MERGE (arrival_airport:Airport {id: row.`destination airport id`})
      ON CREATE SET arrival_airport.name = row.`destination airport`

    MERGE (airline)-[:OPERATES_AT]->(departure_airport)
    MERGE (airline)-[:OPERATES_AT]->(arrival_airport)
    """
    tx.run(query, rows=data)
    print("Bipartite graph created.")

In [None]:
def Threepartite_graph(tx, data):

    query = """
    UNWIND $rows AS row
    MERGE (departure:Airport {id: row.`source airport id`, name: row.`source airport`})
    MERGE (arrival:Airport {id: row.`destination airport id`, name: row.`destination airport`})
    MERGE (flight:Flight {airline: row.airline, airline_id: row.`airline ID`})
    MERGE (departure)-[:DEPARTS_AS]->(flight)
    MERGE (flight)-[:ARRIVES_AT]->(arrival)
    """
    tx.run(query, rows=data)
    print("Bipartite graph created.")

In [20]:
def my_neo4j_wipe_out_database():
    "wipe out database by deleting all nodes and relationships"
    
    query = "match (node)-[relationship]->() delete node, relationship"
    session.run(query)
    
    query = "match (node) delete node"
    session.run(query)

In [28]:
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","ucb_mids_w205"))

In [34]:
session = driver.session(database="neo4j")

In [38]:
print("Creating Monopartite Graph")
with driver.session() as session:
    my_neo4j_wipe_out_database()
    session.execute_write(Monopartite_graph, data=data_rows)

Creating Monopartite Graph
Monopartite graph created.


In [37]:
print("Creating Bipartite Graph")
with driver.session() as session:
    my_neo4j_wipe_out_database()
    session.execute_write(Bipartite_graph, data=data_rows)

Creating Bipartite Graph
Bipartite graph created.


In [41]:
def my_neo4j_run_query_pandas(query, **kwargs):
    "run a query and return the results in a pandas dataframe"
    
    result = session.run(query, **kwargs)
    
    df = pd.DataFrame([r.values() for r in result], columns=result.keys())
    
    return df

In [42]:
def my_neo4j_nodes_relationships():
    "print all the nodes and relationships"
   
    print("-------------------------")
    print("  Nodes:")
    print("-------------------------")
    
    query = """
        match (n) 
        return n.name as node_name, labels(n) as labels
        order by n.name
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_nodes = df.shape[0]
    
    display(df)
    
    print("-------------------------")
    print("  Relationships:")
    print("-------------------------")
    
    query = """
        match (n1)-[r]->(n2) 
        return n1.name as node_name_1, labels(n1) as node_1_labels, 
            type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
        order by node_name_1, node_name_2
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_relationships = df.shape[0]
    
    display(df)
    
    density = (2 * number_relationships) / (number_nodes * (number_nodes - 1))
    
    print("-------------------------")
    print("  Density:", f'{density:.1f}')
    print("-------------------------")
    

In [43]:
session = driver.session(database="neo4j")

In [44]:
my_neo4j_nodes_relationships()

-------------------------
  Nodes:
-------------------------


Unnamed: 0,node_name,labels
0,ABJ,[Airport]
1,ACC,[Airport]
2,ACH,[Airport]
3,ADQ,[Airport]
4,AER,[Airport]
...,...,...
310,ZAM,[Airport]
311,ZHA,[Airport]
312,ZHY,[Airport]
313,ZRH,[Airport]


-------------------------
  Relationships:
-------------------------


Unnamed: 0,node_name_1,node_1_labels,relationship_type,node_name_2,node_2_labels
0,ABJ,[Airport],FLIGHT,BOY,[Airport]
1,ABJ,[Airport],FLIGHT,OUA,[Airport]
2,ACC,[Airport],FLIGHT,DKR,[Airport]
3,ACC,[Airport],FLIGHT,LOS,[Airport]
4,ACC,[Airport],FLIGHT,OUA,[Airport]
...,...,...,...,...,...
793,ZRH,[Airport],FLIGHT,GRZ,[Airport]
794,ZRH,[Airport],FLIGHT,LCA,[Airport]
795,ZRH,[Airport],FLIGHT,LPA,[Airport]
796,ZRH,[Airport],FLIGHT,RMF,[Airport]


-------------------------
  Density: 0.0
-------------------------


In [45]:
if 'driver' in locals() and driver:
    driver.close()
    print("Neo4j driver closed.")

Neo4j driver closed.


## Density Calculations:

* Maximum Density = (nodes (nodes - 1) ) / 2


* Actual Density = (2 * relationships) / (nodes * (nodes - 1) )

## Monopartitie - 1 node label, 1 relationship type, all graphs we have seen so far are monopartitie

## Bipartite - two sets, nodes from one set only connect to nodes in the other set

## k-Partite - k sets, nodes from one set only connect to nodes in another set, most real world graphs have a high k value; in this example k=4: Person, Club, Course, Day