# Road network availability

---

### Objective & method

In this notebook, we analyze road data availability by measuring the actual length of the road network, rather than just the number of bytes of information. In the OSM database, roads are modelled as "way" objects (*i.e.* polylines) with a non-null value for the `highway` tag, which describes the type of road (*e.g.* primary, secondary, residential, track, pedestrian).

---

In [14]:
import concurrent.futures
import json
import os
import subprocess
import tempfile
from datetime import datetime

import geojson
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import osmium
import pandas as pd
import pytz
import seaborn as sns
from fiona import crs

%matplotlib inline

In [2]:
# Absolute paths of the input/output data folders (resolved relative to the
# current working directory) and of the OSM history file.
INPUT_DIR, OUTPUT_DIR = (
    os.path.abspath(os.path.join('..', 'data', subdir))
    for subdir in ('input', 'output')
)
OSM_DATAFILE = os.path.join(INPUT_DIR, 'africa-internal-190515.osh.pbf')

In [3]:
# Country geometries, indexed by the `ADMIN` column (used as the country name
# in the cells below); the `ISO_A3` column is not needed here.
africa = (
    gpd.read_file('../data/input/africa.geojson')
    .set_index('ADMIN', drop=True)
    .drop(columns='ISO_A3')
)

# Output data from previous notebook
densities = pd.read_csv('../data/output/osmdb_densities.csv', index_col=0)

* Source of the OSM history file for Africa: https://osm-internal.download.geofabrik.de/africa-internal.osh.pbf

## Geographic extracts

* https://oslandia.com/en/2017/07/03/openstreetmap-data-analysis-how-to-parse-the-data-with-python/
* https://osmcode.org/osmium-tool/manual.html#creating-geographic-extracts
* https://github.com/jazzband/python-geojson/issues/84

The following analysis focuses on the road network, so we first filter input data based on the `highway` OSM tag in order to reduce the size of the dataset.

In [4]:
# Keep only the ways carrying a `highway` tag. The step is skipped when the
# filtered file already exists, so the cell can be re-run cheaply.
highways_pbf = '../data/input/africa-highways.osh.pbf'

if not os.path.isfile(highways_pbf):

    # NOTE(review): the input file name below differs from OSM_DATAFILE
    # ('africa-internal-190515.osh.pbf') -- confirm which one is intended.
    try:
        # check=True: a failing osmium run must stop the notebook instead of
        # being silently ignored.
        subprocess.run([
            'osmium', 'tags-filter', '../data/input/africa-190605.osh.pbf', 'w/highway',
            '-o', highways_pbf
        ], check=True)
    except subprocess.CalledProcessError:
        # Remove any partial output, otherwise the guard above would treat
        # it as a finished file on the next run.
        if os.path.isfile(highways_pbf):
            os.remove(highways_pbf)
        raise

Then we perform a geographical extract for each country using the `osmium extract` subcommand.

In [6]:
# Create geographical extracts for each country
osmdir = '../data/input/osm'
os.makedirs(osmdir, exist_ok=True)
for country in africa.index:

    # Generate ID from country name
    cid = country.replace(' ', '_')
    extract_path = os.path.join(osmdir, cid + '.osh.pbf')

    # Avoid if file already exists
    if os.path.isfile(extract_path):
        continue

    geom = africa.at[country, 'geometry'].__geo_interface__

    # A private temporary directory replaces the fixed '/tmp/geom.geojson':
    # it is portable, cannot clash with another run, and is cleaned up even
    # when the extraction fails.
    with tempfile.TemporaryDirectory() as tmpdir:

        # Dump geometry into a GeoJSON file (the '.geojson' extension is kept
        # because osmium infers the polygon file type from it)
        geom_path = os.path.join(tmpdir, 'geom.geojson')
        with open(geom_path, 'w') as f:
            geojson.dump(geojson.Feature(geometry=geom), f)

        # Extract from main OSM datafile with osmium
        try:
            subprocess.run([
                'osmium', 'extract', '--with-history',
                '-p', geom_path,
                '../data/input/africa-highways.osh.pbf',
                '-o', extract_path
            ], check=True)
        except subprocess.CalledProcessError:
            # Drop any partial extract so that it is not skipped as
            # "already done" on the next run.
            if os.path.isfile(extract_path):
                os.remove(extract_path)
            raise

## Road network analysis

* https://wiki.openstreetmap.org/wiki/Highways

In [7]:
# OSM tag keys of interest for roads.
# NOTE(review): not referenced by the cells visible here -- presumably used by
# the later "Semantic information" section; confirm.
TAGS = [
    'highway', 'maxspeed', 'oneway', 'width', 'surface',
    'incline', 'foot', 'bicycle', 'access', 'bus',
    'tracktype', 'lit', 'lanes', 'sidewalk', 'crossing',
]

In [10]:
class RoadsHandler(osmium.SimpleHandler):
    """Collect a record for every way object that carries a `highway` tag.

    Records are accumulated in `self.data` as lists of
    `[id, version, timestamp, highway value, length]`.
    """

    def __init__(self):
        super().__init__()
        self.data = []

    def way(self, w):
        """Store the way `w` if it is a road whose length can be computed."""
        if 'highway' not in w.tags:
            return

        try:
            length = osmium.geom.haversine_distance(w.nodes)
        except osmium.InvalidLocationError:
            # Incomplete way (a node location is missing): skip it
            return

        record = [
            w.id,
            w.version,
            pd.Timestamp(w.timestamp),
            w.tags['highway'],
            length,
        ]
        self.data.append(record)

In [11]:
def snapshot(roads, date):
    """State of the road network at `date`.

    For each `osmid`, keeps the row holding the highest `version` whose
    `timestamp` is not later than `date`.
    """
    known = roads.loc[roads['timestamp'] <= date]
    latest = known.groupby('osmid')['version'].max().reset_index()
    # Join back on (osmid, version) to recover the other columns
    return latest.merge(roads, on=['osmid', 'version'])

In [12]:
def summarize(roads, date):
    """Total road length per `highway` type at `date`.

    The per-type sums are rounded and then divided by 1000, i.e. the result
    is in kilometres if `length` is in metres (presumably the case -- TODO
    confirm against the length computation).
    """
    per_type = snapshot(roads, date).groupby('highway')['length'].sum()
    return per_type.round() / 1000

In [13]:
def history(roads, start_date, end_date, freq='3M'):
    """Road length per road type over time.

    Returns a table with one row per `highway` type and one column per date
    of the range `start_date`..`end_date` (step `freq`), labelled 'YYYY-MM'.
    """
    dates = pd.date_range(start_date, end_date, freq=freq, tz=pytz.UTC)

    per_date = []
    for date in dates:
        per_date.append(summarize(roads, date))

    # One summary per row, then transpose: road types as rows, dates as columns
    table = pd.DataFrame(per_date).T
    table.columns = [date.strftime('%Y-%m') for date in dates]
    return table

In [None]:
# Removed an incomplete `from mu` import statement: it was a syntax error that
# broke "Restart & Run All". The parallel run below relies on
# `concurrent.futures`, which is imported in the first code cell.

In [16]:
roads_availability = {}


def process(country):
    """Road length history of one country.

    Reads the country's extract from `osmdir`, collects its roads with a
    `RoadsHandler` and returns the table built by `history` for the period
    2010-01-01 to 2019-05-01 with a '3M' frequency.
    """
    cid = country.replace(' ', '_')
    extract = os.path.join(osmdir, f'{cid}.osh.pbf')

    handler = RoadsHandler()
    handler.apply_file(extract, locations=True)

    roads = pd.DataFrame(
        handler.data,
        columns=['osmid', 'version', 'timestamp', 'highway', 'length'],
    )

    return history(
        roads,
        datetime(2010, 1, 1, tzinfo=pytz.UTC),
        datetime(2019, 5, 1, tzinfo=pytz.UTC),
        freq='3M',
    )

In [19]:
# Compute the road length history of every country in parallel and store one
# CSV file per country.
outdir = '../data/output/roads_availability'
os.makedirs(outdir, exist_ok=True)
results = {}

# Countries whose CSV already exists are skipped (same convention as the
# extraction cells), so an interrupted run can be resumed instead of
# recomputing everything. Delete a CSV to force its recomputation.
pending = [
    country for country in africa.index
    if not os.path.isfile(os.path.join(outdir, country.replace(' ', '_') + '.csv'))
]

with concurrent.futures.ProcessPoolExecutor(max_workers=8) as executor:
    for country, evolution in zip(pending, executor.map(process, pending)):
        cid = country.replace(' ', '_')
        csv_path = os.path.join(outdir, cid + '.csv')
        # Write under a temporary name, then rename: an interruption during
        # the write cannot leave a truncated CSV that would later be skipped.
        evolution.to_csv(csv_path + '.tmp')
        os.replace(csv_path + '.tmp', csv_path)

KeyboardInterrupt: 

### Semantic information