In [3]:
import json
import os
import re

import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import requests
from shapely.geometry import shape, Polygon, MultiPolygon
from shapely.wkt import loads

In [12]:
# Print the process ID of the running kernel.
print(f"{os.getpid()}")

23122


: 

In [None]:
# Load the merged table and parse the WKT text in its `geometry` column into
# shapely geometry objects.
INPUT_CSV = "/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/merged.csv"

# Positional (iloc) row window handled by this run.
# NOTE(review): 22507 looks like a manual resume offset from an earlier,
# interrupted run -- confirm before re-running from scratch.
ROW_START, ROW_STOP = 22507, 100000

# `.copy()` detaches the window from the full frame: adding a column to a bare
# iloc slice triggers pandas' SettingWithCopyWarning and keeps the whole CSV
# alive in memory for as long as the slice exists.
df = pd.read_csv(INPUT_CSV, sep=",").iloc[ROW_START:ROW_STOP].copy()
df['geometry'] = df['geometry'].apply(loads)

In [None]:
# Number of rows selected for this run.
df.shape[0]

In [None]:
# Preview the first five rows of the selected window.
df.head(n=5)

In [8]:
# Output JSON Lines file to save results dynamically.
# The processing loop opens this path in append mode for every row, so output
# from repeated or resumed runs accumulates in the same file instead of
# overwriting it.
output_file = "/mntssd/mnt3/shanshanbai/my_storage_from_qian/results/0_100000.jsonl"


In [9]:
# Define Overpass API endpoint.
# The processing loop sends each bounding-box query to this URL as the `data`
# parameter of an HTTP GET request.
url = "https://overpass-api.de/api/interpreter"

In [10]:
# Function to decode Unicode escape sequences
def decode_unicode(string):
    r"""Decode literal JSON-style escapes in ``string`` (e.g. ``\u00fc`` -> ``ü``).

    The value is first parsed as the body of a JSON string literal, which
    handles every JSON escape. That parse fails for text that is not a valid
    JSON string body (an embedded double quote, a raw control character, a
    trailing backslash, ...). In that case only ``\uXXXX`` sequences are
    expanded, best-effort, so that a stray quote does not prevent the rest of
    the text from being decoded.

    Args:
        string: The text to decode.

    Returns:
        The decoded text. If nothing can be decoded safely, the input is
        returned unchanged.
    """
    try:
        return json.loads(f'"{string}"')  # Decodes strings like "\u00fc" to "ü"
    except json.JSONDecodeError:
        pass  # Not a valid JSON string body; fall through to the regex path.

    if not isinstance(string, str):
        return string  # Nothing sensible to decode; return as-is.

    expanded = re.sub(
        r'\\u([0-9a-fA-F]{4})',
        lambda match: chr(int(match.group(1), 16)),
        string,
    )
    try:
        # A UTF-16 round trip merges surrogate pairs (e.g. emoji written as two
        # \uXXXX escapes) into single code points. An unpaired surrogate raises
        # here, which is wanted: it could not be written to a UTF-8 file.
        return expanded.encode('utf-16', 'surrogatepass').decode('utf-16')
    except UnicodeError:
        return string  # Return the string as-is if decoding fails

# Function to extract building information
def extract_building_info(element):
    """Pull the building type, name and operator out of one response element.

    Args:
        element: A dict for a single element; its optional ``'tags'`` entry is
            read as a dict of tag values.

    Returns:
        A ``(building_type, name, operator)`` tuple. A tag that is absent
        yields ``None``. Non-empty ``name`` / ``operator`` values are passed
        through ``decode_unicode``; empty ones are returned untouched.
    """
    tag_map = element.get('tags', {})

    decoded = []
    for key in ('name', 'operator'):
        raw_value = tag_map.get(key)
        # Only truthy values are decoded; None and '' pass through unchanged.
        decoded.append(decode_unicode(raw_value) if raw_value else raw_value)

    name, operator = decoded
    return tag_map.get('building'), name, operator


In [11]:
# Process each geometry in the DataFrame.
#
# For every row: query Overpass for building features inside the geometry's
# bounding box, collect the distinct building tags / names / operators of the
# returned ways and relations, and append exactly ONE JSON object for that row
# to `output_file`. Failed and empty rows are written as JSON objects too
# (with their 'error' field set), so the file stays valid JSON Lines.

# Client-side timeout in seconds. Without one, a single stalled connection
# blocks the whole multi-thousand-row run indefinitely.
REQUEST_TIMEOUT_S = 200


def _overpass_building_query(south, west, north, east):
    """Return the Overpass QL query for building features inside a bounding box."""
    # NOTE(review): the trailing `>; out skel qt;` also returns member
    # geometry, which the loop below never reads -- consider dropping it to
    # shrink responses. Left unchanged here.
    return f"""
    [out:json];
    (
      way["building"]({south},{west},{north},{east});
      relation["building"]({south},{west},{north},{east});
      node["building"]({south},{west},{north},{east});
    );
    out body;
    >;
    out skel qt;
    """


def _append_jsonl(path, record):
    """Append `record` to the file at `path` as a single JSON line.

    Sets are written as sorted lists (sets are not JSON-serializable) and
    float NaN values as null (bare `NaN` is not valid JSON).
    """
    cleaned = {}
    for key, value in record.items():
        if isinstance(value, set):
            value = sorted(value, key=str)
        elif isinstance(value, float) and value != value:  # NaN check
            value = None
        cleaned[key] = value
    with open(path, "a", encoding="utf-8") as f:
        json.dump(cleaned, f, ensure_ascii=False)
        f.write("\n")


total_rows = len(df)

# One Session reuses the underlying HTTP connection across all requests.
with requests.Session() as session:
    for position, (index, row) in enumerate(df.iterrows(), start=1):
        geometry = row['geometry']
        west, south, east, north = geometry.bounds  # (minx, miny, maxx, maxy)

        # Overpass query to retrieve relations, ways, and nodes
        query = _overpass_building_query(south, west, north, east)

        # Initialize combined data for the current geometry
        combined_data = {
            'index': index,
            'building id': row.get("osm_building_id", None),
            'building class': row.get("building_class", None),
            'building city': row.get("building_city", None),
            'building tags': set(),
            'building names': set(),
            'building operators': set(),
            'tweet language distribution': row.get("tweet_lang", None),  # Include additional column if available
            'error': None  # Track errors if they occur
        }

        # Query the Overpass API
        try:
            response = session.get(url, params={"data": query}, timeout=REQUEST_TIMEOUT_S)
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON decode error is not a RequestException.
            print(f"Error querying Overpass API for geometry {index}: {e}")
            combined_data['error'] = f"HTTP error: {str(e)}"
            _append_jsonl(output_file, combined_data)
            continue  # Skip processing this row

        # Check if data is empty
        elements = data.get("elements")
        if not elements:
            print(f"No data retrieved for geometry {index}")
            combined_data['error'] = "No data retrieved from Overpass API"
            _append_jsonl(output_file, combined_data)
            continue

        # Process elements from the response; only ways and relations
        # contribute tags, nodes are ignored.
        for element in elements:
            if element.get("type") in ("relation", "way"):
                building_type, name, operator = extract_building_info(element)
                # A bare "yes" carries no information about the building type.
                if building_type and building_type != "yes":
                    combined_data["building tags"].add(building_type)
                if name:
                    combined_data["building names"].add(name)
                if operator:
                    combined_data["building operators"].add(operator)

        # Convert sets to (deterministically ordered) lists for JSON compatibility
        for key in ("building tags", "building names", "building operators"):
            combined_data[key] = sorted(combined_data[key], key=str)

        # Use fallback from 'building_class' if 'building tags' is empty
        if not combined_data['building tags']:
            fallback_class = row.get("building_class", None)
            if pd.notna(fallback_class):
                combined_data['building tags'].append(fallback_class)

        # Save results dynamically to JSON Lines file
        _append_jsonl(output_file, combined_data)

        # `index` is the DataFrame label; `position` is the 1-based progress
        # counter that is actually comparable to the row total.
        print(f"Saved results for geometry {index} ({position}/{total_rows})")



Saved results for geometry 22507/77493
Saved results for geometry 22508/77493
Saved results for geometry 22509/77493
Saved results for geometry 22510/77493
Saved results for geometry 22511/77493
Saved results for geometry 22512/77493
Saved results for geometry 22513/77493
Saved results for geometry 22514/77493
Saved results for geometry 22515/77493
Saved results for geometry 22516/77493
Saved results for geometry 22517/77493
Saved results for geometry 22518/77493
Saved results for geometry 22519/77493
Saved results for geometry 22520/77493
Saved results for geometry 22521/77493
Saved results for geometry 22522/77493
Saved results for geometry 22523/77493
Saved results for geometry 22524/77493
Saved results for geometry 22525/77493
Saved results for geometry 22526/77493
Saved results for geometry 22527/77493
Saved results for geometry 22528/77493
Saved results for geometry 22529/77493
Saved results for geometry 22530/77493
Saved results for geometry 22531/77493
Saved results for geometr