In [None]:
### Description ###
# This script extracts the entire UK driving network from an OSM PBF file,
# saves the driving graph as a gpickle file,
# and saves the nodes and edges (smallest, unsimplified segments) as GeoJSON for later feature enrichment.

In [None]:
from pyrosm import OSM
import osmnx as ox
import geopandas as gpd
import networkx as nx

# === 1. Load OSM PBF file ===
pbf_path = "uk_highway_only.pbf"
osm = OSM(pbf_path)

# === 2. Extract the driving network edges and nodes ===
# nodes=True makes get_network return a (nodes, edges) pair of GeoDataFrames.
nodes, edges = osm.get_network(network_type="driving", nodes=True)

# === 3. Index nodes by OSM id and add the x, y columns required by osmnx ===
# graph_from_gdfs takes the node ids from the GeoDataFrame index, while the
# edge u/v columns hold OSM node ids. The default positional index would not
# match them, so the "id" column has to become the index first.
nodes = nodes.set_index("id")
nodes["x"] = nodes.geometry.x
nodes["y"] = nodes.geometry.y

# === 4. Prepare edge index for osmnx ===
# "key" numbers parallel edges that share the same (u, v) pair: 0, 1, 2, ...
edges["key"] = edges.groupby(["u", "v"]).cumcount()
edges = edges.set_index(["u", "v", "key"])

# === 5. Convert to NetworkX graph ===
G = ox.graph_from_gdfs(nodes, edges)

# === 6. Save graph as .gpickle (preserves full topology & attributes) ===
# nx.write_gpickle was removed in NetworkX 3.0; it was a thin wrapper around
# pickle.dump, so the standard library is used directly.
import pickle

with open("uk_driving_graph.gpickle", "wb") as f:
    pickle.dump(G, f, protocol=pickle.HIGHEST_PROTOCOL)
print("✅ Graph saved as uk_driving_graph.gpickle")

# === 7. Save nodes and edges as GeoJSON for external use ===
# (currently disabled)
# nodes.to_file("uk_driving_nodes.geojson", driver="GeoJSON")
# edges.to_file("uk_driving_edges.geojson", driver="GeoJSON")
# print("✅ Nodes and edges saved as GeoJSON")

In [None]:
### Description ###
# This script extracts the entire UK driving network from an OSM PBF file,
# saves the driving graph as a gpickle file,
# and saves the nodes and edges (simplified / merged) as GeoJSON for later feature enrichment.

In [1]:
from pyrosm import OSM
import osmnx as ox
import networkx as nx
import geopandas as gpd
import gc

# Input OSM extract in PBF format, read through pyrosm's OSM reader.
pbf_path = "uk_highway_only.pbf"
print("📥 Loading PBF file...")
osm = OSM(pbf_path)

# nodes=True makes get_network return a (nodes, edges) pair rather than
# edges only; both are used below to build the graph.
print("🚦 Extracting driving network nodes and edges...")
nodes, edges = osm.get_network(network_type="driving", nodes=True)
print("➡️ Extracted:", len(nodes), "nodes and", len(edges), "edges")

📥 Loading PBF file...
🚦 Extracting driving network nodes and edges...
➡️ Extracted: 26213394 nodes and 27137057 edges


In [2]:
# Peek at the first rows of the raw node table to see which columns pyrosm returned.
nodes.head(n=5)

Unnamed: 0,timestamp,lat,visible,lon,tags,changeset,version,id,geometry
0,1291662509,52.555798,False,-1.826748,,0,2,200511,POINT (-1.82675 52.55580)
1,1291662508,52.555653,False,-1.826456,,0,1,1025338193,POINT (-1.82646 52.55565)
2,1291662513,52.555515,False,-1.826152,,0,2,177231081,POINT (-1.82615 52.55552)
3,1291662511,52.555271,False,-1.825647,,0,2,177081428,POINT (-1.82565 52.55527)
4,1291662509,52.555187,False,-1.825409,,0,1,1025338209,POINT (-1.82541 52.55519)


In [3]:
# Reduce the node table to what OSMnx needs:
#   - keep only the OSM id and the point geometry,
#   - expose the point coordinates as plain x / y columns,
#   - use the OSM node id as the index.
nodes = (
    nodes[["id", "geometry"]]
    .assign(
        x=lambda gdf: gdf.geometry.x,
        y=lambda gdf: gdf.geometry.y,
    )
    .set_index("id")
)

In [4]:
# Peek at the first rows of the raw edge table to see which columns pyrosm returned.
edges.head(n=5)

Unnamed: 0,access,area,bicycle,bicycle_road,bridge,busway,cycleway,est_width,foot,footway,...,width,id,timestamp,version,tags,osm_type,geometry,u,v,length
0,,,,,,,,,,,...,,37,1424557057,21,"{""visible"":false,""abutters"":""residential"",""gri...",way,"LINESTRING (-1.82675 52.55580, -1.82646 52.55565)",200511,1025338193,25.483
1,,,,,,,,,,,...,,37,1424557057,21,"{""visible"":false,""abutters"":""residential"",""gri...",way,"LINESTRING (-1.82646 52.55565, -1.82615 52.55552)",1025338193,177231081,25.61
2,,,,,,,,,,,...,,37,1424557057,21,"{""visible"":false,""abutters"":""residential"",""gri...",way,"LINESTRING (-1.82615 52.55552, -1.82565 52.55527)",177231081,177081428,43.604
3,,,,,,,,,,,...,,37,1424557057,21,"{""visible"":false,""abutters"":""residential"",""gri...",way,"LINESTRING (-1.82565 52.55527, -1.82541 52.55519)",177081428,1025338209,18.646
4,,,,,,,,,,,...,,37,1424557057,21,"{""visible"":false,""abutters"":""residential"",""gri...",way,"LINESTRING (-1.82541 52.55519, -1.82517 52.55511)",1025338209,177081440,18.31


In [5]:
# Trim the edge table to the attributes used later on; everything else is
# dropped early to reduce memory use.
keep_edge_cols = ["u", "v", "geometry", "highway", "lanes", "maxspeed", "length"]

# Select in the table's own column order and tolerate columns that are absent.
wanted = set(keep_edge_cols)
present = [name for name in edges.columns if name in wanted]
edges = edges[present]

In [6]:
# Give the edges a (u, v, key) MultiIndex.
# "key" numbers the edges that share the same (u, v) pair: 0, 1, 2, ...
parallel_rank = edges.groupby(["u", "v"]).cumcount()
edges = edges.assign(key=parallel_rank).set_index(["u", "v", "key"])

In [7]:
# Build the NetworkX graph from the prepared node and edge GeoDataFrames
print("🔗 Building NetworkX graph...")
G = ox.graph_from_gdfs(nodes, edges)

# Free raw GeoDataFrames as soon as graph is built
del nodes, edges
gc.collect()

🔗 Building NetworkX graph...


0

In [8]:
# Simplify the graph topology. The simplified graph is returned as a separate
# object (G_simplified); the original G stays in memory until it is deleted
# explicitly below.
# NOTE(review): the recorded output of this cell echoes the simplify_graph
# line the way Python warnings do - presumably the OSMnx deprecation warning
# for `strict`. Confirm against the installed OSMnx version and switch to the
# replacement keyword (`edge_attrs_differ=["osmid"]` in OSMnx 2.x) when
# upgrading.
# NOTE(review): the edges only carry the columns listed in keep_edge_cols (no
# OSM way id), so the non-strict rule may have nothing to compare - TODO confirm.
print("🛠️ Simplifying graph...")
G_simplified = ox.simplify_graph(G, strict=False)
print(
    "➡️ Simplified:",
    G_simplified.number_of_nodes(),
    "nodes and",
    G_simplified.number_of_edges(),
    "edges",
)

# Free original graph
del G
gc.collect()


🛠️ Simplifying graph...


  G_simplified = ox.simplify_graph(G, strict=False)


➡️ Simplified: 6301788 nodes and 7220434 edges


15

In [9]:
# Convert the simplified graph back into node and edge GeoDataFrames
print("📊 Converting to GeoDataFrames...")
nodes_simplified, edges_simplified = ox.graph_to_gdfs(G_simplified, nodes=True, edges=True)


📊 Converting to GeoDataFrames...


In [10]:
# Save the simplified nodes as GeoJSON. The edges are written in a later cell,
# after their list-valued columns have been flattened.
print("💾 Saving GeoJSON files...")
nodes_simplified.to_file("uk_driving_nodes_simplified.geojson", driver="GeoJSON")

💾 Saving GeoJSON files...


In [11]:
import pandas as pd

# Flatten list-valued edge attributes into comma-separated strings before
# export (presumably because the GeoJSON writer cannot serialise Python lists
# - confirm). Non-list values are left untouched.
geometry_col = edges_simplified.geometry.name
for col in edges_simplified.columns:
    if col == geometry_col:
        # Geometries are shapely objects, never Python lists; skip the per-row scan.
        continue
    if edges_simplified[col].apply(lambda value: isinstance(value, list)).any():
        edges_simplified[col] = edges_simplified[col].apply(
            lambda value: ",".join(map(str, value)) if isinstance(value, list) else value
        )

edges_simplified.to_file("uk_driving_edges_simplified.geojson", driver="GeoJSON")
print("✅ Saved simplified nodes and edges GeoJSON")

# Cleanup: both GeoDataFrames are on disk now; the graph itself is kept.
del nodes_simplified, edges_simplified
gc.collect()

✅ Saved simplified nodes and edges GeoJSON


4

In [12]:
import pickle

# Persist the simplified graph object itself so later notebooks can reload it
# without repeating the extraction and simplification steps.
graph_path = "uk_driving_graph_simplified.gpickle"
with open(graph_path, "wb") as out_file:
    pickle.dump(G_simplified, out_file)

# To reload later (only unpickle files you created yourself - loading a pickle
# can execute arbitrary code):
# with open("uk_driving_graph_simplified.gpickle", "rb") as f:
#     G = pickle.load(f)