In [2]:
import os
import requests
import subprocess

#from pyrosm import OSM
import geopandas as gpd
import pandas as pd

In [2]:
## check if osmium is installed

try:
    result = subprocess.run(['osmium', '--version'], check=True, capture_output=True, text=True)
    print(f"Osmium version: {result.stdout.strip()}")
except FileNotFoundError:
    # subprocess raises FileNotFoundError (not CalledProcessError) when the
    # executable cannot be found, which is exactly the "not installed" case.
    print("Osmium is not installed or not on PATH (see https://osmcode.org/osmium-tool/)")
except subprocess.CalledProcessError as e:
    # osmium was found but exited with a non-zero status.
    print(f"Error running Osmium: {e}")

Osmium version: osmium version 1.16.0
libosmium version 2.20.0
Supported PBF compression types: none zlib lz4

Copyright (C) 2013-2023  Jochen Topf <jochen@topf.org>
License: GNU GENERAL PUBLIC LICENSE Version 3 <https://gnu.org/licenses/gpl.html>.
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.


what this does:

1. Download OSM data from geofabrik
2. Extract all highways using osmium
3. Convert pbf to geoparquet using ogr2ogr

OUTPUT: Filtered OSM network (highways only) as GeoParquet

In [6]:

#### Downloading OSM data from Geofabrik

# Snapshot date of the Geofabrik extract in YYMMDD form. It is interpolated into
# the download file name and into the name of the filtered output file below.
#set_date = "250509" # 2025-05-09
set_date = "251006" 


# Examples of dated Geofabrik file names / URLs:
#https://download.geofabrik.de/europe/germany/berlin-250401.osm.pbf
#	germany-250405.osm.pbf

def download_geofabrik_pbf(filename, base_url="https://download.geofabrik.de/europe/"):
    """Download a Geofabrik extract into the local ``osm_geofabrik_pbf/`` folder.

    Args:
        filename: File name appended to ``base_url`` and used as the local
            file name, e.g. ``"germany-251006.osm.pbf"``.
        base_url: URL prefix the file name is appended to.

    Returns:
        None. Progress and failures are reported with ``print``: an already
        existing local file is skipped, and a non-200 HTTP status is reported
        without raising.

    Raises:
        Any exception raised while requesting or streaming the body is
        propagated after the partially written file has been removed.
    """
    folder_download = "osm_geofabrik_pbf"
    os.makedirs(folder_download, exist_ok=True)

    file_path = os.path.join(folder_download, filename)
    file_url = base_url + filename

    if os.path.exists(file_path):
        print(f"File already exists: {file_path}, skipping download.")
        return

    print(f"Downloading: {file_url}")
    # Stream into a side file first: an interrupted download must never leave a
    # truncated file at file_path, because the existence check above would then
    # treat it as a finished download on the next run.
    partial_path = file_path + ".part"
    with requests.get(file_url, stream=True, timeout=60) as response:
        if response.status_code != 200:
            print(f"Failed to download {file_url} (Status code: {response.status_code})")
            return
        try:
            with open(partial_path, "wb") as f:
                # 1 MiB chunks keep the Python-level loop short for multi-GB files.
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        except BaseException:
            # Includes KeyboardInterrupt (kernel "interrupt"): drop the partial file.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

    # Only a completely written file is moved to its final name.
    os.replace(partial_path, file_path)
    print(f"Downloaded: {file_path}")


# osmium needs to be installed on your system in order to run this code/filtering
# https://osmcode.org/osmium-tool/
# for my win11 machine i used https://trac.osgeo.org/osgeo4w/

def run_osmium(filename):
    """Filter a downloaded extract down to ways tagged ``highway`` with osmium.

    Reads ``osm_geofabrik_pbf/<filename>`` and writes
    ``processed_osm_files/processed_highways_germany_<set_date>.pbf``.
    NOTE: the output name is built from the notebook-level variable
    ``set_date``, not from ``filename``, so ``set_date`` must be defined and
    should match the date in ``filename``.

    Args:
        filename: Name of the input file inside ``osm_geofabrik_pbf/``.

    Returns:
        None. If the output file already exists the run is skipped. A missing
        osmium executable or a non-zero osmium exit status is reported with
        ``print`` instead of being raised.
    """
    folder_download = "osm_geofabrik_pbf"
    folder_processed = "processed_osm_files"
    os.makedirs(folder_processed, exist_ok=True)

    input_pbf = os.path.join(folder_download, filename)
    filtered_pbf = os.path.join(folder_processed, f"processed_highways_germany_{set_date}.pbf")

    if os.path.exists(filtered_pbf):
        print(f"Processed file already exists: {filtered_pbf}, skipping processing.")
        return

    def _discard_partial_output():
        # The output did not exist before this run (checked above), so any file
        # at that path now is an incomplete result. Remove it so the existence
        # check does not skip processing on the next run.
        if os.path.exists(filtered_pbf):
            os.remove(filtered_pbf)

    filter_command = [
        "osmium", "tags-filter",
        input_pbf,
        "w/highway",
        "-o", filtered_pbf
    ]
    print("🔹 Running: ", " ".join(filter_command))
    try:
        subprocess.run(filter_command, check=True)
    except FileNotFoundError as e:
        # Raised by subprocess when the osmium executable itself cannot be found.
        print("❌ Osmium executable not found (is osmium-tool installed and on PATH?):", e)
    except subprocess.CalledProcessError as e:
        _discard_partial_output()
        print("❌ Error running Osmium:", e)
    except BaseException:
        # E.g. KeyboardInterrupt from interrupting the kernel mid-run.
        _discard_partial_output()
        raise
    else:
        print("✅ Osmium processing complete! Files saved in 'processed_osm_files/'")




# Build the file name of the Germany extract for the configured snapshot date,
# then download it and filter it down to highways.
filename = "germany-" + set_date + ".osm.pbf"
download_geofabrik_pbf(filename)
run_osmium(filename)


Downloading: https://download.geofabrik.de/europe/germany-251006.osm.pbf
Downloaded: osm_geofabrik_pbf/germany-251006.osm.pbf
🔹 Running:  osmium tags-filter osm_geofabrik_pbf/germany-251006.osm.pbf w/highway -o processed_osm_files/processed_highways_germany_251006.pbf
✅ Osmium processing complete! Files saved in 'processed_osm_files/'


____________________________________

In [None]:
# --- OSM-PBF zu Parquet konvertieren ---

# Es gibt verschiedene Möglichkeiten:

# Variante 1: Mit geopandas (einfach, aber langsamer und benötigt viel RAM) hier ~5min

# Variante 2: Mit ogr2ogr (am schnellsten, benötigt GDAL mit Parquet-Support) hier ~2min

# --- Hinweise zur Umgebung ---

# Für Variante 2 (ogr2ogr) wird eine GDAL-Version mit Parquet-Support benötigt.
# Empfohlene Installation mit micromamba:
# 1. micromamba installieren
# 2. Umgebung erstellen:
#    micromamba create -y -n gdal_parquet_env -c conda-forge python=3.10 pyarrow gdal libgdal-arrow-parquet ipykernel

# Alternativ geht auch mit miniconda:
# 1. miniconda installieren
# 2. Umgebung erstellen:
#    source ~/miniconda3/bin/activate
#    conda create -y -n gdal_parquet_env -c conda-forge python=3.10 pyarrow gdal libgdal-arrow-parquet ipykernel


In [4]:
# Snapshot date (YYMMDD) used in the file names of the conversion cells below.
# Same value as in the download cell; presumably repeated so this section can
# be run on its own - keep both in sync.
set_date = "251006" 

In [5]:
# Variante 1 (geopandas):

In [None]:
### via geopandas: 5min

# Read the "lines" layer of the filtered extract and write it out as GeoParquet.
highways_pbf = f"processed_osm_files/processed_highways_germany_{set_date}.pbf"
highways_parquet = f"processed_osm_files/processed_highways_germany_{set_date}_GP.parquet"

gdf = gpd.read_file(highways_pbf, layer="lines")
gdf.to_parquet(highways_parquet)

In [None]:
#Variante 2 (ogr2ogr):

In [1]:
import subprocess
from pathlib import Path
import os

def ogr2ogr_parquet(input_file, output_parquet, layer="lines", osmconf_path=None):
    """Convert ``input_file`` to a Parquet file by calling ``ogr2ogr``.

    Args:
        input_file: Path of the source file; resolved to an absolute path.
        output_parquet: Path of the Parquet file to write; resolved to an
            absolute path.
        layer: Layer name appended to the command line; omitted when falsy.
        osmconf_path: Optional path that is resolved and passed to the child
            process through the ``OSM_CONFIG_FILE`` environment variable.

    Returns:
        None. The captured stdout of ``ogr2ogr`` is printed.

    Raises:
        subprocess.CalledProcessError: re-raised after printing the captured
            stderr/stdout when ``ogr2ogr`` exits with a non-zero status.
    """
    source_path = Path(input_file).resolve()
    target_path = Path(output_parquet).resolve()

    # "ogr2ogr" is looked up on PATH (system-installed binary).
    cmd = ["ogr2ogr", "-f", "Parquet", str(target_path), str(source_path)]
    if layer:
        cmd += [layer]

    # The child process gets a copy of the current environment, optionally
    # extended with the OSM config file location.
    child_env = dict(os.environ)
    if osmconf_path:
        child_env["OSM_CONFIG_FILE"] = str(Path(osmconf_path).resolve())

    print("Running:", " ".join(cmd))
    try:
        completed = subprocess.run(
            cmd, check=True, capture_output=True, text=True, env=child_env
        )
    except subprocess.CalledProcessError as err:
        print("❌ ogr2ogr failed:")
        print("STDERR:", err.stderr)
        print("STDOUT:", err.stdout)
        raise
    print(completed.stdout)

In [2]:
# Usage: convert the filtered Germany highways extract to GeoParquet via ogr2ogr.
# (Earlier runs used the Berlin extract instead:
#  processed_osm_files/processed_highways_berlin_250401.pbf -> ..._250401.parquet)

set_date = "251006"

processed_dir = Path("processed_osm_files")
input_pbf = processed_dir / f"processed_highways_germany_{set_date}.pbf"
output_file = processed_dir / f"processed_highways_germany_{set_date}_env_new_mamba.parquet"

ogr2ogr_parquet(input_file=input_pbf, output_parquet=output_file)

Running: ogr2ogr -f Parquet /home/simon/mapillary_coverage/processed_osm_files/processed_highways_germany_251006_env_new_mamba.parquet /home/simon/mapillary_coverage/processed_osm_files/processed_highways_germany_251006.pbf lines
0...10...20...30...40...50...60...70...80...90...100 - done in 00:01:51.

