# Was tut dieses Skript?

Dieses Skript lädt OSM-Daten von Geofabrik herunter,
filtert alle Straßen mit Osmium,
und konvertiert das Ergebnis in das Parquet-Format
(wahlweise mit geopandas oder ogr2ogr).

In [None]:
import os
import requests
import subprocess
import json
from datetime import datetime

import geopandas as gpd
import pandas as pd

from pathlib import Path

from config import GEOFABRIK_CONFIG, PROCESSING_CONFIG, MAPILLARY_CONFIG, TILES_CONFIG

## Check if osmium is installed

In [3]:
# Sanity check: every later step shells out to the `osmium` command-line tool,
# so report its version here, or a readable hint when it cannot be run.
try:
    result = subprocess.run(['osmium', '--version'], check=True, capture_output=True, text=True)
    print(f"Osmium version: {result.stdout.strip()}")
except FileNotFoundError:
    # subprocess raises FileNotFoundError (not CalledProcessError) when the
    # executable itself is missing from PATH.
    print("Error running Osmium: 'osmium' executable not found on PATH "
          "(see https://osmcode.org/osmium-tool/)")
except subprocess.CalledProcessError as e:
    print(f"Error running Osmium: {e}")

Osmium version: osmium version 1.16.0
libosmium version 2.20.0
Supported PBF compression types: none zlib lz4

Copyright (C) 2013-2023  Jochen Topf <jochen@topf.org>
License: GNU GENERAL PUBLIC LICENSE Version 3 <https://gnu.org/licenses/gpl.html>.
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.


## Downloading OSM data from Geofabrik

In [4]:
def get_osm_timestamp(pbf_file):
    """Return the header timestamp of a PBF file as reported by osmium.

    Runs ``osmium fileinfo <pbf_file> -g header.option.timestamp`` and returns
    its standard output with surrounding whitespace stripped.

    Args:
        pbf_file: Path of the PBF file to inspect.

    Returns:
        The stripped output of the osmium call, or ``None`` when osmium is not
        installed or exits with a non-zero status. A message is printed in
        both failure cases.
    """
    command = ['osmium', 'fileinfo', str(pbf_file), '-g', 'header.option.timestamp']
    try:
        result = subprocess.run(command, capture_output=True, text=True, check=True)
    except FileNotFoundError:
        # Raised when the osmium executable itself cannot be found on PATH.
        print(f"Error getting timestamp from {pbf_file}: 'osmium' executable not found on PATH")
        return None
    except subprocess.CalledProcessError as e:
        print(f"Error getting timestamp from {pbf_file}: {e}")
        # stderr was captured, so surface it explicitly; otherwise the actual
        # reason for the failure is lost.
        details = (e.stderr or "").strip()
        if details:
            print(details)
        return None
    return result.stdout.strip()

def save_metadata(metadata, filename="osm_metadata.json"):
    """Save metadata to a JSON file for use in other notebooks.

    Args:
        metadata: JSON-serializable mapping. Values that ``json`` cannot encode
            natively are converted with ``str``.
        filename: Target path; an existing file is overwritten.

    Raises:
        TypeError / ValueError: If ``metadata`` cannot be serialized. In that
            case an existing file at ``filename`` is left untouched.
    """
    # Serialize before opening the file: opening with 'w' truncates it, so a
    # serialization error afterwards would destroy previously saved metadata.
    serialized = json.dumps(metadata, indent=2, default=str)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(serialized)
    print(f"Metadata saved to {filename}")

def load_metadata(filename="osm_metadata.json"):
    """Read the metadata JSON file and return its parsed content.

    Returns ``None`` (after printing a notice) when the file does not exist.
    """
    try:
        handle = open(filename, 'r')
    except FileNotFoundError:
        print(f"Metadata file {filename} not found")
        return None
    with handle:
        return json.load(handle)

def download_geofabrik_pbf(pbf_url=None):
    """Download a PBF extract into the configured download folder.

    The target file name is the last path component of the URL. If a file with
    that name already exists in the download folder, nothing is downloaded.

    The body is streamed into a temporary ``<name>.part`` file and only moved
    to its final name once the transfer has finished. This way an interrupted
    download never leaves a truncated file that a later run would mistake for
    a finished one.

    Args:
        pbf_url: URL of the PBF file. Defaults to ``GEOFABRIK_CONFIG["pbf_url"]``.

    Returns:
        None. Progress and non-200 responses are reported via ``print``.
        Network errors raised by ``requests`` propagate to the caller.
    """
    if pbf_url is None:
        pbf_url = GEOFABRIK_CONFIG["pbf_url"]
    folder_download = GEOFABRIK_CONFIG["download_folder"]
    os.makedirs(folder_download, exist_ok=True)

    # Extract filename from URL
    filename = os.path.basename(pbf_url)
    file_path = os.path.join(folder_download, filename)

    if os.path.exists(file_path):
        print(f"File already exists: {file_path}, skipping download.")
        return

    print(f"Downloading: {pbf_url}")
    partial_path = file_path + ".part"

    # The context manager releases the streamed connection on every exit path.
    with requests.get(pbf_url, stream=True, timeout=60, allow_redirects=True) as response:
        if response.status_code != 200:
            print(f"Failed to download {pbf_url} (Status code: {response.status_code})")
            return

        # Show the final URL after redirects
        if response.url != pbf_url:
            print(f"Redirected to: {response.url}")

        try:
            with open(partial_path, "wb") as f:
                # 1 MiB chunks: the extracts are large, and tiny chunks only
                # add per-iteration overhead.
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        except BaseException:
            # Also covers KeyboardInterrupt (kernel interrupt): drop the
            # incomplete file, then re-raise unchanged.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

    os.replace(partial_path, file_path)
    print(f"Downloaded: {file_path}")


# Osmium must be installed on your system to run the filtering step below:
# https://osmcode.org/osmium-tool/
# On my Windows 11 machine I installed it via OSGeo4W: https://trac.osgeo.org/osgeo4w/

def run_osmium(filename):
    """Filter a downloaded PBF extract down to ways that carry a ``highway`` tag.

    Reads ``filename`` from ``GEOFABRIK_CONFIG["download_folder"]`` and writes
    ``processed_highways_latest.pbf`` into
    ``GEOFABRIK_CONFIG["processed_folder"]`` via ``osmium tags-filter``.
    If the output file already exists, the step is skipped.

    Args:
        filename: File name (not full path) of the PBF inside the download folder.

    Returns:
        None. Failures are printed rather than raised.
    """
    folder_download = GEOFABRIK_CONFIG["download_folder"]
    folder_processed = GEOFABRIK_CONFIG["processed_folder"]
    os.makedirs(folder_processed, exist_ok=True)

    input_pbf = os.path.join(folder_download, filename)
    filtered_pbf = os.path.join(folder_processed, "processed_highways_latest.pbf")

    if os.path.exists(filtered_pbf):
        print(f"Processed file already exists: {filtered_pbf}, skipping processing.")
        return

    filter_command = [
        "osmium", "tags-filter",
        input_pbf,
        "w/highway",
        "-o", filtered_pbf
    ]
    print("🔹 Running: ", " ".join(filter_command))

    try:
        subprocess.run(filter_command, check=True)
    except FileNotFoundError:
        # Raised when the osmium executable itself is missing from PATH.
        print("❌ Error running Osmium: 'osmium' executable not found on PATH")
        return
    except subprocess.CalledProcessError as e:
        print("❌ Error running Osmium:", e)
        # The output did not exist before this call (checked above). Remove
        # anything a failed run may have left behind, so the next run does not
        # skip filtering because of an incomplete file.
        if os.path.exists(filtered_pbf):
            os.remove(filtered_pbf)
        return

    # Report the real output location instead of a hard-coded folder name.
    print(f"✅ Osmium processing complete! File saved: {filtered_pbf}")




# Fetch the configured Geofabrik extract (skipped when it is already on disk),
# then reduce it to highway ways with osmium. The file name is taken from the
# last path component of the configured URL.
filename = os.path.basename(GEOFABRIK_CONFIG["pbf_url"])
download_geofabrik_pbf()
run_osmium(filename)


Downloading: https://download.geofabrik.de/europe/germany-latest.osm.pbf
Downloaded: osm_geofabrik_pbf/germany-latest.osm.pbf
🔹 Running:  osmium tags-filter osm_geofabrik_pbf/germany-latest.osm.pbf w/highway -o processed_osm_files/processed_highways_latest.pbf
✅ Osmium processing complete! Files saved in 'processed_osm_files/'


In [5]:
# Show whatever metadata has been stored so far; keys that are missing from
# the file are reported as 'Unknown'.
metadata = load_metadata()
if not metadata:
    print("No metadata found. Run the download first.")
else:
    print(
        "OSM Data Metadata:",
        f"  Data from: {metadata.get('osm_data_from', 'Unknown')}",
        f"  Download date: {metadata.get('download_date', 'Unknown')}",
        f"  File path: {metadata.get('file_path', 'Unknown')}",
        f"  Download URL: {metadata.get('download_url', 'Unknown')}",
        sep="\n",
    )


Metadata file osm_metadata.json not found
No metadata found. Run the download first.


In [6]:
# Extract OSM data date from downloaded file
def extract_osm_date():
    """Read the OSM data timestamp from the downloaded PBF and store it as metadata.

    The PBF path is derived from ``GEOFABRIK_CONFIG`` (download folder plus the
    file name of the configured URL). On success the metadata file is written
    with these keys, which the metadata display cell reads:

    * ``osm_data_from`` - timestamp reported by osmium for the PBF header
    * ``download_date`` - modification time of the local PBF file, used as a
      proxy for when it was downloaded
    * ``file_path``     - local path of the PBF file
    * ``download_url``  - configured source URL

    Returns:
        None. Outcome is reported via ``print``.
    """
    pbf_url = GEOFABRIK_CONFIG["pbf_url"]
    file_path = os.path.join(GEOFABRIK_CONFIG["download_folder"], os.path.basename(pbf_url))

    if not os.path.exists(file_path):
        print(f"❌ PBF file not found: {file_path}")
        print("   Run the download first!")
        return

    print(f"Extracting OSM data date from: {file_path}")
    timestamp = get_osm_timestamp(file_path)
    if not timestamp:
        print("❌ Could not extract timestamp from PBF file")
        return

    # Local time of the file's last modification; the best available stand-in
    # for the download time, since the download step does not record one.
    downloaded_at = datetime.fromtimestamp(os.path.getmtime(file_path))

    metadata = {
        "osm_data_from": timestamp,
        "download_date": downloaded_at.isoformat(timespec="seconds"),
        "file_path": file_path,
        "download_url": pbf_url,
    }
    save_metadata(metadata)
    print(f"✅ OSM data date: {timestamp}")

# Extract OSM data date
extract_osm_date()


Extracting OSM data date from: osm_geofabrik_pbf/germany-latest.osm.pbf
Metadata saved to osm_metadata.json
✅ OSM data date: 2025-10-17T20:21:35Z


In [7]:
# Re-read the metadata file and report the stored OSM data timestamp.
metadata = load_metadata()
if not metadata or "osm_data_from" not in metadata:
    print("❌ No OSM data date found. Run the extraction first.")
else:
    print(f"📅 OSM data from: {metadata['osm_data_from']}")


📅 OSM data from: 2025-10-17T20:21:35Z


____________________________________

### --- OSM-PBF zu Parquet konvertieren ---


In [None]:
# --- OSM-PBF zu Parquet konvertieren ---

# Es gibt verschiedene Möglichkeiten:

# Variante 1: Mit geopandas (einfach, aber langsamer und benötigt viel RAM) hier ~5min

# Variante 2: Mit ogr2ogr (am schnellsten, benötigt GDAL mit Parquet-Support) hier ~2min

# --- Hinweise zur Umgebung ---

# Für Variante 2 (ogr2ogr) wird eine GDAL-Version mit Parquet-Support benötigt.
# Empfohlene Installation mit micromamba:
# 1. micromamba installieren
# 2. Umgebung erstellen:
#    micromamba create -y -n gdal_parquet_env -c conda-forge python=3.10 pyarrow gdal libgdal-arrow-parquet ipykernel


In [None]:


def ogr2ogr_parquet(input_file, output_parquet, layer="lines"):
    """Convert a layer of ``input_file`` to Parquet by calling ``ogr2ogr``.

    Both paths are resolved to absolute paths before the call. The command is
    echoed with paths below the home directory abbreviated to ``~/...``.

    Args:
        input_file: Source file passed to ogr2ogr (str or Path).
        output_parquet: Destination Parquet file (str or Path).
        layer: Name of the source layer to convert. A falsy value omits the
            layer argument from the command.

    Raises:
        FileNotFoundError: If the ``ogr2ogr`` executable is not on PATH.
        subprocess.CalledProcessError: If ogr2ogr exits with a non-zero status
            (its stderr/stdout are printed before re-raising).
    """
    ogr2ogr_path = "ogr2ogr"  # Use system-installed ogr2ogr

    input_file = Path(input_file).resolve()
    output_parquet = Path(output_parquet).resolve()
    home = Path.home()

    def shorten_path(p: Path) -> str:
        """Return ``p`` as ``~/relative`` when it lies below the home directory."""
        try:
            # as_posix keeps the separator consistent with the "~/" prefix.
            return "~/" + p.relative_to(home).as_posix()
        except ValueError:
            return str(p)

    cmd = [
        ogr2ogr_path,
        "-f", "Parquet",
        str(output_parquet),
        str(input_file),
    ]
    # Same command for display only, with the two path arguments shortened.
    printable_cmd = [
        ogr2ogr_path,
        "-f", "Parquet",
        shorten_path(output_parquet),
        shorten_path(input_file),
    ]

    if layer:
        cmd.append(layer)
        printable_cmd.append(layer)

    print("Running:", " ".join(printable_cmd))

    try:
        result = subprocess.run(
            cmd,
            check=True,
            capture_output=True,
            text=True,
        )
    except FileNotFoundError:
        print("❌ ogr2ogr failed: 'ogr2ogr' executable not found on PATH")
        raise
    except subprocess.CalledProcessError as e:
        print("❌ ogr2ogr failed:")
        print("STDERR:", e.stderr)
        print("STDOUT:", e.stdout)
        raise

    print(result.stdout)
    # stderr is captured, so anything ogr2ogr reported there on a successful
    # run would otherwise be silently discarded.
    if result.stderr and result.stderr.strip():
        print("ogr2ogr stderr:", result.stderr.strip())

## Run the conversion

In [None]:

# Convert the filtered highway PBF to Parquet with the method selected in
# PROCESSING_CONFIG['pbf_to_parquet'].
# The paths are derived from the same configured folder that the osmium
# filtering step writes to, so both steps stay in sync when the config changes.
processed_dir = Path(GEOFABRIK_CONFIG["processed_folder"])
conversion_method = PROCESSING_CONFIG['pbf_to_parquet']

if conversion_method == 'geopandas':
    # Takes roughly 5 minutes and loads the whole layer into memory.
    print("Using geopandas to convert PBF to Parquet")
    input_file = str(processed_dir / "processed_highways_latest.pbf")
    output_parquet = str(processed_dir / "processed_highways_latest.parquet")

    gdf = gpd.read_file(input_file, layer="lines")
    gdf.to_parquet(output_parquet)

elif conversion_method == 'ogr2ogr':
    # Takes roughly 2 minutes; needs a GDAL build with Parquet support.
    print("Using ogr2ogr to convert PBF to Parquet")
    input_pbf = processed_dir / "processed_highways_latest.pbf"
    output_file = processed_dir / "processed_highways_latest.parquet"

    ogr2ogr_parquet(
        input_pbf,
        output_file,
    )

else:
    # Make a misconfiguration visible instead of silently doing nothing.
    print(
        f"⚠️ Unknown PROCESSING_CONFIG['pbf_to_parquet'] value {conversion_method!r}; "
        "expected 'geopandas' or 'ogr2ogr'. No Parquet file was written."
    )
