In [None]:
import json
from pathlib import Path
import pandas as pd
from collections import Counter
import requests
import os
from tqdm import tqdm

In [None]:
# Base directory under which this notebook writes its CSV exports and the
# Nominatim response cache.
data_base = Path("data")

In [None]:
# Load the raw campus-plan export. The path is built from `data_base` so that
# all inputs and outputs of this notebook move together if the directory changes.
data = json.loads((data_base / "campusplan.json").read_text())
# Show the first entry to get a feel for the record structure.
data[0]

In [None]:
# Turn the loaded records into a DataFrame and report how many rows it has.
df = pd.DataFrame(data=data)
df.shape[0]

## Some entries have 2 positions

In [None]:
# Histogram of how many positions each entry carries.
Counter(len(positions) for positions in df.positionList)

In [None]:
# Show the entries whose positionList holds exactly two positions.
df.loc[df.positionList.map(len).eq(2)]

We handle this in pandas by exploding the frame on `positionList`, so that every row holds exactly one position (latitude and longitude). The data enrichment below relies on this one-position-per-row layout.

Later, we may want to move to a normalized SQL schema instead.

In [None]:
# One row per position: entries with several positions are repeated once per position.
# NOTE: this rebinds `df`, so the cell is not idempotent — re-running it would
# explode the already-exploded column again. Re-run from the load cell instead.
df = df.explode(column="positionList")
df.shape[0]

In [None]:
# Preview the first rows of the exploded frame.
df.head(n=5)

In [None]:
# Persist the exploded (not yet enriched) table; the index is not written.
campusplan_csv = data_base / "campusplan.csv"
df.to_csv(campusplan_csv, index=False)

## Data enrichment

In [None]:
# Look at everything stored under index label 0. `explode` keeps the original
# index, so a label can be shared by several rows after the explode step.
df.loc[0, :]

In [None]:
def reverse_geocode(lat, lon, timeout=30):
    """Reverse-geocode one coordinate with the public Nominatim API.

    Args:
        lat: Latitude, interpolated into the ``lat`` query parameter.
        lon: Longitude, interpolated into the ``lon`` query parameter.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        The decoded JSON body of the response (requested as ``format=geojson``).

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    url = f"https://nominatim.openstreetmap.org/reverse?format=geojson&lat={lat}&lon={lon}"
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection, which would hang the whole enrichment loop.
    # NOTE(review): the Nominatim usage policy asks for a User-Agent that
    # identifies the application; consider replacing the generic browser string.
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout)
    return response.json()

def get_details(osm_type, osm_id, timeout=30):
    """Fetch the Nominatim details record of one OSM object.

    Args:
        osm_type: OSM object type, one of ``node``, ``way`` or ``relation``
            (case-insensitive).
        osm_id: Id of the OSM object, interpolated into the ``osmid`` query
            parameter.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        The decoded JSON body of the response (requested as ``format=json``).

    Raises:
        KeyError: If ``osm_type`` is not one of the three known types.
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    # Convert osm_type to API-required short form
    osm_type_map = {'node': 'N', 'way': 'W', 'relation': 'R'}
    type_code = osm_type_map[osm_type.lower()]
    url = f"https://nominatim.openstreetmap.org/details?osmtype={type_code}&osmid={osm_id}&format=json"
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection, which would hang the whole enrichment loop.
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout)
    return response.json()

def enrich_and_save(df, output_dir: Path, refetch: bool = False, delay_seconds: float = 1.0):
    """Fetch Nominatim data for every row of ``df`` and cache the raw JSON on disk.

    For each row the position in ``positionList`` is reverse-geocoded, the OSM
    object named in that response is looked up via the details endpoint, and
    both raw responses are written to ``<output_dir>/<id>_reverse.json`` and
    ``<output_dir>/<id>_details.json``.

    Args:
        df: DataFrame with an ``id`` column and a ``positionList`` column that
            holds one two-element position per row (unpacked as ``lat, lon``).
        output_dir: Cache directory; created (including parents) if missing.
        refetch: If False, rows whose two cache files already existed when the
            call started are skipped, so the function can be re-run cheaply.
            If True, every row is fetched again and its files are overwritten.
        delay_seconds: Pause after each HTTP request, to stay within the
            Nominatim usage policy of at most one request per second. Pass 0
            to disable.

    Notes:
        * Rows whose reverse-geocoding response contains no usable feature are
          reported through ``tqdm.write`` and skipped. No files are written for
          them, so a later call retries them.
        * Cache files are keyed by ``id`` only. Rows that share an id (entries
          with several positions after ``explode``) write to the same files,
          and the last such row wins.
    """
    import time

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Snapshot of what earlier runs already cached. Taking it once, before the
    # loop, means files written during this call never cause later rows with
    # the same id to be skipped.
    cached_before = set() if refetch else {entry.name for entry in output_dir.iterdir()}

    def _pause():
        if delay_seconds > 0:
            time.sleep(delay_seconds)

    for index, row in tqdm(df.iterrows(), total=len(df)):
        reverse_name = f"{row['id']}_reverse.json"
        details_name = f"{row['id']}_details.json"
        if reverse_name in cached_before and details_name in cached_before:
            continue

        lat, lon = row["positionList"]
        rev_data = reverse_geocode(lat, lon)
        _pause()

        # Extract osm_type and osm_id
        try:
            properties = rev_data["features"][0]["properties"]
            osm_type = properties["osm_type"]
            osm_id = properties["osm_id"]
        except (KeyError, IndexError) as e:
            tqdm.write(f"Problem for {index} (id={row['id']}): {e}")
            # Without an OSM object there is nothing to look up; moving on also
            # avoids reusing osm_type/osm_id left over from a previous row.
            continue

        details_data = get_details(osm_type, osm_id)
        _pause()

        # Save raw data
        with open(output_dir / reverse_name, "w") as f:
            json.dump(rev_data, f, indent=2)
        with open(output_dir / details_name, "w") as f:
            json.dump(details_data, f, indent=2)

In [None]:
# Show the last row of the frame (kept as a one-row DataFrame).
df.tail(1)

In [None]:
# Query Nominatim for every row and store the raw responses in the cache directory.
nominatim_cache_dir = data_base / "nominatim_cache"
enrich_and_save(df, nominatim_cache_dir, refetch=False)

## Transform / merge

Explore the data in `explore_data.ipynb`. Adjust transformation based on insights.

- `rev_`-prefixed columns are taken from the reverse-geocoding responses.
- `det_`-prefixed columns are taken from the OSM details responses.

In [None]:
def get_cached_details(id: str):
    """Load the cached OSM details response for one entry.

    Reads ``<data_base>/nominatim_cache/<id>_details.json`` and returns the
    parsed JSON.
    """
    cache_file = data_base / "nominatim_cache" / f"{id}_details.json"
    with cache_file.open() as handle:
        return json.load(handle)

def get_cached_reverse(id: str):
    """Load the cached reverse-geocoding response for one entry.

    Reads ``<data_base>/nominatim_cache/<id>_reverse.json`` and returns the
    parsed JSON.
    """
    cache_file = data_base / "nominatim_cache" / f"{id}_reverse.json"
    with cache_file.open() as handle:
        return json.load(handle)
    

In [None]:
# Attach the raw cached responses to each row; they are parsed into flat
# columns in the cells below.
df["reverse"] = df["id"].map(get_cached_reverse)
df["details"] = df["id"].map(get_cached_details)

In [None]:
# Preview the frame with the raw `reverse` and `details` columns attached.
df.head(n=5)

### from reverse mapping
`["features"][0]["properties"][[type, category, name, display_name]]`

In [None]:
# Copy selected properties of the first reverse-geocoding feature into
# `rev_`-prefixed columns.
for property_name in ["type", "category", "name", "display_name"]:
    df[f"rev_{property_name}"] = df["reverse"].map(
        lambda geojson: geojson["features"][0]["properties"][property_name]
    )

In [None]:
# List the columns to confirm the `rev_` columns were added.
df.columns

### from details
`["extratags"][[opening_hours, contact:phone, wheelchair, wheelchair:description]]`

In [None]:
# Copy selected OSM extratags into `det_`-prefixed columns; a tag that is
# absent from a row's extratags yields None.
for extra_tag_name in ["opening_hours", "contact:phone", "wheelchair", "wheelchair:description"]:
    df[f"det_{extra_tag_name}"] = df["details"].map(
        lambda details: details["extratags"].get(extra_tag_name)
    )

In [None]:
# List the columns to confirm the `det_` columns were added.
df.columns

In [None]:
# Drop the raw JSON payload columns; only the extracted `rev_`/`det_` columns
# go into the export.
df = df.drop(columns=["reverse", "details"])

enriched_csv = data_base / "campusplan_enriched.csv"
df.to_csv(enriched_csv, index=False)