# Testing extracting OSM data using Osmium

In [3]:
import os, sys, time, importlib
import osmnx

import geopandas as gpd
import pandas as pd
import networkx as nx
import numpy as np
sys.path.append("../../../GOSTNets")
import GOSTnets as gn

# pip install osmium
import osmium, logging
import shapely.wkb as wkblib

from shapely.geometry import LineString, Point

import time

In [4]:
# set file
some_file = './colombo.osm.pbf'

## simplest example of using Osmium

In [2]:
class HotelHandler(osmium.SimpleHandler):
    def __init__(self):
        super(HotelHandler, self).__init__()
        self.hotels = []

    def node(self, o):
        if o.tags.get('tourism') == 'hotel' and 'name' in o.tags:
            self.hotels.append(o.tags['name'])


h = HotelHandler()
h.apply_file(some_file)

print(sorted(h.hotels))

['Allahar Dan Hotel', 'BRAC INN', 'Bengal Inn', 'BestWestern LaVinci', 'Bismillah hotel', 'Blossom Hotel', 'Chittagong Hotel', 'Contemporary Heights Hotel', 'Element By Westin', 'Grand Dhaka Hotel', 'Grand Prince Hotel', 'Hotel Abakash', 'Hotel Al Razzak International', 'Hotel Ambala Inn', 'Hotel Arif', 'Hotel De Crystal Crown', 'Hotel Delux', 'Hotel Givenci', 'Hotel Givenci', 'Hotel Indropuri International', 'Hotel Khan', 'Hotel Lake Breeze', 'Hotel Milina', 'Hotel Mohammadia', 'Hotel New Savar হোটেল নিউ সাভার', 'Hotel Nidra', 'Hotel Razmoni Isha Kha', 'Hotel Sweet Dreams Dhaka', 'Hotel Washington', 'Innotel Luxury Business Hotel', 'La Villa', 'Le Meridien Dhaka', 'Manhattan Hotel', 'Nordic Hotels', 'Olyampia Palace', 'Platinum Grande', 'Platinum Suites', 'Quality Inn', 'Rawnak Palace', 'Richmont hotel', 'Robin ar Hotel রবিনের হোটেল', 'Royal Park Hotel', 'Sarina Hotel', 'Savar Internation', 'The Waterfront Hotel', 'Tripti Niloy', 'hotel Golden Deer', 'khaja Ajmiri Hotel', 'victory', '

## Extracting highways and nodes using Osmium

In [5]:
start_time = time.time()

wkbfab = osmium.geom.WKBFactory()

# extract highways
class HighwayExtractor(osmium.SimpleHandler):
    def __init__(self):
        osmium.SimpleHandler.__init__(self) 
        self.nodes = []
        #self.raw_h = []
        self.highways = []
        self.broken_highways = []
        self.total = 0
        self.num_nodes = 0
        
    def node(self, n):
        wkb = wkbfab.create_point(n)
        shp = wkblib.loads(wkb, hex = True)
        self.nodes.append([n.id, shp, shp.x, shp.y])
        #self.num_nodes += 1
        #self.nodes.append(shp)
    
    def way(self, w):
        #self.raw_h.append(w)
        try:
            nodes = [x.ref for x in w.nodes]
            wkb = wkbfab.create_linestring(w)
            shp = wkblib.loads(wkb, hex=True)
            if 'highway' in w.tags:
                info = [w.id, nodes, shp, w.tags['highway']]
                self.highways.append(info)
#                 self.highways.append(w.tags['highway'])
#                 self.total += shp.length
        except:
            print('hit exception')
            nodes = [x for x in w.nodes if x.location.valid()]
            if len(nodes) > 1:
                shp = LineString([Point(x.location.x, x.location.y) for x in nodes])
                info = [w.id, nodes, shp, w.tags['highway']]
                self.highways.append(info)
            else:
                self.broken_highways.append(w)
            logging.warning("Error Processing OSM Way %s" % w.id)
            
        
h = HighwayExtractor()
h.apply_file(some_file, locations=True)
#print(len(h.nodes))
print(len(h.highways))
print(len(h.broken_highways))

end_time = time.time()
print(end_time - start_time)

54063
0
184.9615442752838


## Even though we cannot get the nodes of ways using the ogr osm driver, still time it

In [6]:
start_time = time.time()

from osgeo import ogr
from shapely.wkt import loads

driver = ogr.GetDriverByName("OSM")

data = driver.Open(some_file)
sql_lyr = data.ExecuteSQL("SELECT * FROM lines WHERE highway IS NOT NULL")
roads = []

for feature in sql_lyr:
    if feature.GetField("highway") is not None:
        osm_id = feature.GetField("osm_id")
        shapely_geo = loads(feature.geometry().ExportToWkt())
        if shapely_geo is None:
            continue
        highway = feature.GetField("highway")
        roads.append([osm_id,highway,shapely_geo])

data = driver.Open(some_file)
sql_lyr = data.ExecuteSQL("SELECT * FROM points")
curRes = {}

for nodes in sql_lyr:
    nodes_vals = nodes.items()
    nodes_vals['geometry'] = loads(feature.geometry().ExportToWkt())
    curRes[nodes_vals['osm_id']] = nodes_vals

end_time = time.time()
print(end_time - start_time)

6.457503080368042


### results: 
Extracting highways and nodes independently using the OGR OSM driver took about 6.5 seconds. While using Osmium to extract just the highways with their nodes took about 44 seconds for Colombo. Using Osmium to extract the highway nodes and highways seperately took about 185 seconds.

In [8]:
nodes_df = pd.DataFrame(h.nodes, columns = ["osm_id", "geometry", "x", "y"])

## Split up the highways into seperate edges between each node. This involves doing a lookup based on osm_id from the nodes_df for each segment

In [9]:
start_time = time.time()

all_h = []

for x in h.highways:
    for n_idx in range(0, (len(x[1]) - 1)):
        try:
            osm_id_from = x[1][n_idx].ref
        except:
            osm_id_from = x[1][n_idx]
        try:
            osm_id_to   = x[1][n_idx+1].ref
        except:
            osm_id_to   = x[1][n_idx+1]
        try:
            from_pt = nodes_df.loc[nodes_df['osm_id'] == osm_id_from,'geometry'].iloc[0]
            to_pt   = nodes_df.loc[nodes_df['osm_id'] == osm_id_to  ,'geometry'].iloc[0]
            edge = LineString([from_pt, to_pt])
            attr = {'osm_id':x[0], 'Wkt':edge, 'length':edge.length, 'infra_type':x[3]}
            #Create an edge from the list of nodes in both directions
            all_h.append([osm_id_from, osm_id_to, attr])
            all_h.append([osm_id_to, osm_id_from, attr])
        except:
            logging.warning(f"Error adding edge between nodes {osm_id_from} and {osm_id_to}")

end_time = time.time()
print(end_time - start_time)

2404.4707980155945


### results: 
Took about 2404 seconds to run

In [12]:
all_h[:5]

[[60796641,
  2419600689,
  {'osm_id': 8111662,
   'Wkt': <shapely.geometry.linestring.LineString at 0x11dd5eed0>,
   'length': 0.00042626476514233825,
   'infra_type': 'tertiary'}],
 [2419600689,
  60796641,
  {'osm_id': 8111662,
   'Wkt': <shapely.geometry.linestring.LineString at 0x11dd5eed0>,
   'length': 0.00042626476514233825,
   'infra_type': 'tertiary'}],
 [2419600689,
  3830183348,
  {'osm_id': 8111662,
   'Wkt': <shapely.geometry.linestring.LineString at 0x11dd7edd0>,
   'length': 0.00028653378509489497,
   'infra_type': 'tertiary'}],
 [3830183348,
  2419600689,
  {'osm_id': 8111662,
   'Wkt': <shapely.geometry.linestring.LineString at 0x11dd7edd0>,
   'length': 0.00028653378509489497,
   'infra_type': 'tertiary'}],
 [91030247,
  3825202158,
  {'osm_id': 8111663,
   'Wkt': <shapely.geometry.linestring.LineString at 0x11d458b10>,
   'length': 6.193197881564978e-05,
   'infra_type': 'trunk'}]]

In [13]:
start_time = time.time()

G = nx.MultiDiGraph()
G.add_nodes_from([[osm_id, {'shape':shp, 'x':x, 'y':y}] for osm_id, shp, x, y in h.nodes])
G.add_edges_from(all_h)

end_time = time.time()
print(end_time - start_time)

26.02744698524475


### results: 
Took about 26 seconds to run

In [16]:
len(G.edges)

954374

In [17]:
gn.save(G, "osmium_graph", "./")