In [6]:
# ------------------------------------------
# What does this script do?
# This script processes Mapillary coverage data for OSM highways in Germany.
# It reads the coverage data for "pano" and "regular", keeps rows with a coverage ratio >= 0.6
# (the threshold is read from PROCESSING_CONFIG['mp_coverage_ratio_threshold']),
# combines the results, removes duplicates (preferring "pano"), and saves the final table as CSV.
# see https://github.com/vizsim/mapillary_coverage/issues/1
# ------------------------------------------

In [7]:
import json
from datetime import datetime
from pathlib import Path

import pandas as pd
import geopandas as gpd

from config import TILES_CONFIG, PROCESSING_CONFIG, MAPILLARY_CONFIG


In [8]:
# Read the data-freshness timestamps from the JSON metadata files.
#
# NOTE(review): `files` is not defined in any visible cell of this notebook.
# The loop below treats it as a mapping of metadata-file path -> key to read
# from that file. Confirm where it is defined and make sure it is set before
# this cell runs, otherwise a fresh kernel fails here with a NameError.
metadata_values = {"osm_data_from": None, "ml_data_from": None}

for fname, primary_key in files.items():
    p = Path(fname)
    if not p.exists():
        print(f"{p} not found")
        continue

    with p.open("r", encoding="utf-8") as f:
        meta = json.load(f)

    if primary_key not in meta:
        # Report the gap: otherwise the value silently stays None and is
        # later written into the generated README as the text "None".
        print(f"{p}: key '{primary_key}' not found")
        continue

    # Only the two known target keys are stored; any other key is ignored.
    if primary_key in metadata_values:
        metadata_values[primary_key] = meta[primary_key]

# Expose the results under the names the later cells use.
osm_data_from = metadata_values["osm_data_from"]
ml_data_from = metadata_values["ml_data_from"]

# Show the assigned values.
print("osm_data_from: ", osm_data_from)
print("ml_data_from: ", ml_data_from)

osm_data_from:  2025-10-17T20:21:35Z
ml_data_from:  2025-10-18T00:00:00Z


In [9]:

# Load the latest coverage tables for both image types.
# Both parquet files are read from the folder configured in PROCESSING_CONFIG["output_folder"].
coverage_dir = PROCESSING_CONFIG["output_folder"]

gdf_mp_pano = pd.read_parquet(coverage_dir + "/germany_osm-highways_mp_pano_coverage_latest.parquet")
gdf_mp_regular = pd.read_parquet(coverage_dir + "/germany_osm-highways_mp_regular_coverage_latest.parquet")



### pano

In [10]:

# Reduce the pano table to osm_id and its coverage ratio.
gdf_mp_pano = gdf_mp_pano.loc[:, ["osm_id", "mp_coverage_ratio"]].copy()

# Keep rows whose coverage ratio reaches the configured threshold and
# label them "pano" in a new "mapillary_coverage" column.
pano_threshold = PROCESSING_CONFIG['mp_coverage_ratio_threshold']
gdf_mp_pano_above_threshold = (
    gdf_mp_pano
    .loc[gdf_mp_pano["mp_coverage_ratio"] >= pano_threshold]
    .assign(mapillary_coverage="pano")
)

gdf_mp_pano_above_threshold

Unnamed: 0,osm_id,mp_coverage_ratio,mapillary_coverage
0,680,1.000000,pano
1,1978,1.000000,pano
5,2293021,1.000000,pano
7,2413197,1.000000,pano
8,2413202,1.000000,pano
...,...,...,...
775648,1442884707,1.000000,pano
775649,1442884708,1.000000,pano
775650,1442886071,1.000000,pano
775652,1442893495,1.000000,pano


### regular

In [11]:

# Reduce the regular table to osm_id and its coverage ratio.
gdf_mp_regular = gdf_mp_regular.loc[:, ["osm_id", "mp_coverage_ratio"]].copy()

# Keep rows whose coverage ratio reaches the configured threshold and
# label them "regular" in a new "mapillary_coverage" column.
regular_threshold = PROCESSING_CONFIG['mp_coverage_ratio_threshold']
gdf_mp_regular_above_threshold = (
    gdf_mp_regular
    .loc[gdf_mp_regular["mp_coverage_ratio"] >= regular_threshold]
    .assign(mapillary_coverage="regular")
)

gdf_mp_regular_above_threshold

Unnamed: 0,osm_id,mp_coverage_ratio,mapillary_coverage
2,104,1.000000,regular
6,111,1.000000,regular
7,117,1.000000,regular
8,122,1.000000,regular
9,123,0.868701,regular
...,...,...,...
3004947,1442897173,1.000000,regular
3004948,1442898138,0.656622,regular
3004954,1442900516,0.731486,regular
3004955,1442900517,1.000000,regular


In [12]:
# Stack the pano and regular selections into one table with a fresh 0..n-1 index.
frames_above_threshold = [gdf_mp_pano_above_threshold, gdf_mp_regular_above_threshold]
both_concat = pd.concat(frames_above_threshold, ignore_index=True)

# Row count per coverage label.
both_concat["mapillary_coverage"].value_counts()

mapillary_coverage
regular    1645430
pano        434631
Name: count, dtype: int64

In [13]:
# De-duplicate on osm_id, preferring "pano" over "regular".
# The ascending sort on the label places "pano" rows ahead of "regular" rows
# (plain string ordering), so keeping the first occurrence per osm_id keeps
# the pano row whenever an osm_id appears under both labels.
both_concat = (
    both_concat
    .sort_values("mapillary_coverage")
    .loc[lambda df: ~df.duplicated(subset="osm_id", keep="first")]
)

# Row count per coverage label after de-duplication.
both_concat["mapillary_coverage"].value_counts()

mapillary_coverage
regular    1459682
pano        434631
Name: count, dtype: int64

In [14]:
# Enforce the invariant instead of relying on a visual check of the output:
# every osm_id must occur exactly once in the combined table.
assert both_concat["osm_id"].is_unique, "duplicate osm_id values found in both_concat"

# Occurrences per osm_id (every count is expected to be 1).
both_concat["osm_id"].value_counts()

osm_id
680           1
1066813155    1
1066793353    1
1066793352    1
1066793351    1
             ..
188451547     1
188451544     1
188451542     1
188454501     1
1442901032    1
Name: count, Length: 1894313, dtype: int64

In [15]:
# Reduce the table to osm_id and its coverage label; the ratio column is dropped.
export_columns = ["osm_id", "mapillary_coverage"]
both_concat = both_concat.loc[:, export_columns].copy()
both_concat

Unnamed: 0,osm_id,mapillary_coverage
0,680,pano
289760,1066813155,pano
289759,1066793353,pano
289758,1066793352,pano
289757,1066793351,pano
...,...,...
983101,188451547,regular
983100,188451544,regular
983099,188451542,regular
983109,188454501,regular


In [16]:
# Write the final table as CSV without the row index.
# The target folder is created first so the export does not fail with
# "No such file or directory" when "output/" does not exist yet.
csv_path = Path("output/germany_osm-highways_mp-coverage_latest.csv")
csv_path.parent.mkdir(parents=True, exist_ok=True)
both_concat.to_csv(csv_path, index=False)




In [24]:
# Date stamp for the README (local date of this run, YYYY-MM-DD).
current_date = datetime.now().strftime("%Y-%m-%d")

# Derive the displayed percentage from the configured ratio so the README can
# never disagree with the config. The "g" format also absorbs float noise
# (e.g. 0.57 * 100 == 56.99999999999999, which int() would truncate to 56).
coverage_threshold = PROCESSING_CONFIG['mp_coverage_ratio_threshold']
coverage_threshold_pct = f"{coverage_threshold * 100:g}"
buffer_distance = PROCESSING_CONFIG['buffer_distance']

# One metadata item per rendered line. The items are joined with an explicit
# Markdown hard line break (two trailing spaces + newline) so the layout does
# not depend on invisible trailing whitespace inside a multi-line literal.
metadata_lines = [
    f"**Data created:** {current_date}",
    f"**OSM data:** {osm_data_from}",
    f"**Mapillary data:** {PROCESSING_CONFIG['min_capture_date']} → {ml_data_from}",
    f"**Buffer distance:** {buffer_distance} meters",
    f"**Coverage ratio threshold:** {coverage_threshold} ({coverage_threshold_pct}%)",
]
metadata_block = "  \n".join(metadata_lines)

# Create the content for the README file.
readme_content = f"""# Mapillary Coverage per OSM Highway — Output

This folder contains the **latest** output file for *Mapillary coverage per OSM highway analysis*.

{metadata_block}

Segments are considered *covered* if at least {coverage_threshold_pct}% of their length falls within {buffer_distance} meters of Mapillary images.
"""

# Write the README file (overwrites any existing one).
with open("output/README.md", "w", encoding="utf-8") as readme_file:
    readme_file.write(readme_content)