In [1]:
import numpy as np
import requests
import json
import time
import pandas as pd
from math import radians, cos, sin, asin, sqrt
from rtree import index

# --------------------------
# Helper Functions
# --------------------------
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance (in km) between two points on Earth.

    Args:
        lon1, lat1: Longitude and latitude of the first point, in decimal degrees.
        lon2, lat2: Longitude and latitude of the second point, in decimal degrees.

    Returns:
        The distance in kilometres on a sphere of radius 6371 km.
    """
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can leave `a` a hair outside [0, 1] for
    # (near-)antipodal points, which would make sqrt/asin raise
    # "math domain error"; clamp it to the valid range first.
    a = min(1.0, max(0.0, a))
    c = 2 * asin(sqrt(a))
    r = 6371  # Radius of Earth in kilometers
    return c * r

# Overpass QL query templates, keyed by amenity category name.
#
# Each template carries the placeholders {min_lat}, {min_lon}, {max_lat} and
# {max_lon}; download_amenity_points() fills them in with str.format() from a
# (min_lat, min_lon, max_lat, max_lon) bounding box. All templates request
# JSON output and end with `out center;`: download_amenity_points() reads
# lat/lon directly from nodes and from the `center` member of other elements.
# The dictionary's insertion order is the order in which
# get_amenity_data_for_bbox() issues the requests.
amenity_queries = {
    # Hospitals (nodes and ways) plus clinics.
    # NOTE(review): clinics are matched as nodes only, unlike hospitals which
    # are matched as node + way -- confirm that this asymmetry is intended.
    'hospital': """
        [out:json];
        (
          node["amenity"="hospital"]({min_lat},{min_lon},{max_lat},{max_lon});
          way["amenity"="hospital"]({min_lat},{min_lon},{max_lat},{max_lon});
          node["amenity"="clinic"]({min_lat},{min_lon},{max_lat},{max_lon});
        );
        out center;
    """,
    # Public-transport stations, bus stops and railway stations (nodes only).
    'public_transport': """
        [out:json];
        (
          node["public_transport"="station"]({min_lat},{min_lon},{max_lat},{max_lon});
          node["highway"="bus_stop"]({min_lat},{min_lon},{max_lat},{max_lon});
          node["railway"="station"]({min_lat},{min_lon},{max_lat},{max_lon});
        );
        out center;
    """,
    # Schools, colleges and universities; the regex is anchored so only these
    # exact amenity values match.
    'education': """
        [out:json];
        (
          node["amenity"~"^(school|college|university)$"]({min_lat},{min_lon},{max_lat},{max_lon});
          way["amenity"~"^(school|college|university)$"]({min_lat},{min_lon},{max_lat},{max_lon});
        );
        out center;
    """,
    # Government offices (nodes and ways).
    'govt_office': """
        [out:json];
        (
          node["office"="government"]({min_lat},{min_lon},{max_lat},{max_lon});
          way["office"="government"]({min_lat},{min_lon},{max_lat},{max_lon});
        );
        out center;
    """,
    # Major roads, from motorway down to tertiary (ways only).
    # NOTE(review): each road way is reduced to a single `center` coordinate
    # downstream, so the distance computed is to that point rather than to the
    # closest point on the road -- confirm this approximation is acceptable.
    'highway': """
        [out:json];
        (
          way["highway"~"^(motorway|trunk|primary|secondary|tertiary)$"]({min_lat},{min_lon},{max_lat},{max_lon});
        );
        out center;
    """,
    # Shopping malls (nodes and ways).
    'mall': """
        [out:json];
        (
          node["shop"="mall"]({min_lat},{min_lon},{max_lat},{max_lon});
          way["shop"="mall"]({min_lat},{min_lon},{max_lat},{max_lon});
        );
        out center;
    """,
    # Parks (nodes and ways).
    'park': """
        [out:json];
        (
          node["leisure"="park"]({min_lat},{min_lon},{max_lat},{max_lon});
          way["leisure"="park"]({min_lat},{min_lon},{max_lat},{max_lon});
        );
        out center;
    """,
    # Aerodromes (nodes and ways).
    'airport': """
        [out:json];
        (
          node["aeroway"="aerodrome"]({min_lat},{min_lon},{max_lat},{max_lon});
          way["aeroway"="aerodrome"]({min_lat},{min_lon},{max_lat},{max_lon});
        );
        out center;
    """
}

# Overpass API endpoint that download_amenity_points() sends each query to.
overpass_url = "https://overpass-api.de/api/interpreter"

def download_amenity_points(bbox, query_template, retries=3):
    """
    Download amenity data for a given bounding box using the provided query template.

    Args:
        bbox: (min_lat, min_lon, max_lat, max_lon) in decimal degrees.
        query_template: Overpass QL text containing the {min_lat}, {min_lon},
            {max_lat} and {max_lon} placeholders.
        retries: Maximum number of request attempts.

    Returns:
        A list of (lat, lon) tuples. Nodes contribute their own position,
        other elements their `center`; elements without a usable position are
        skipped. An empty list is returned both when nothing matches and when
        every attempt failed (the failure is reported on stdout).
    """
    min_lat, min_lon, max_lat, max_lon = bbox
    query = query_template.format(min_lat=min_lat, min_lon=min_lon, max_lat=max_lat, max_lon=max_lon)
    for attempt in range(retries):
        try:
            response = requests.get(overpass_url, params={'data': query}, timeout=60)
            # Rate limiting / overload come back as HTTP error statuses; turn
            # them into an exception instead of parsing an error page.
            response.raise_for_status()
            # If response is empty, raise an error to trigger a retry
            if response.text.strip() == "":
                raise ValueError("Empty response")
            data = response.json()
            # A server-side timeout or memory limit is reported inside an
            # otherwise successful JSON body via "remark"; without this check
            # it would be indistinguishable from "no amenities here".
            remark = data.get('remark')
            if remark:
                raise ValueError(f"Overpass remark: {remark}")
            points = []
            for element in data.get('elements', []):
                # Nodes carry lat/lon themselves; `out center` puts it under
                # "center" for everything else.
                position = element if element.get('type') == 'node' else element.get('center')
                if position and 'lat' in position and 'lon' in position:
                    points.append((position['lat'], position['lon']))
            return points
        except Exception as e:
            # Deliberately broad: this is a best-effort download and any
            # failure should lead to a retry rather than abort the whole run.
            print(f"Error downloading amenity data for bbox {bbox} on attempt {attempt+1}: {e}")
            if attempt + 1 < retries:
                # Back off a little longer each time (2s, 4s, ...); no point
                # in sleeping after the final attempt.
                time.sleep(2 * 2 ** attempt)
    print(f"Giving up on bbox {bbox} after {retries} attempts; returning no points.")
    return []

def get_amenity_data_for_bbox(bbox):
    """
    Fetch every amenity category for one bounding box.

    Walks `amenity_queries` in order, downloads the points for each category,
    removes duplicate coordinates and pauses one second between requests.
    Returns a dict mapping category name -> list of unique (lat, lon) tuples.
    """
    collected = {}
    for amenity_type in amenity_queries:
        template = amenity_queries[amenity_type]
        print(f"Downloading {amenity_type} data for bbox {bbox}...")
        raw_points = download_amenity_points(bbox, template)
        # Identical coordinates can be returned more than once; keep one each.
        collected[amenity_type] = list(set(raw_points))
        print(f"Found {len(collected[amenity_type])} unique {amenity_type} points.")
        # Throttle so consecutive queries do not hammer the public API.
        time.sleep(1)
    return collected

def build_spatial_index(points):
    """
    Create an R-tree over a sequence of (lat, lon) points.

    Every point is inserted as a zero-area rectangle in (lon, lat, lon, lat)
    order and is keyed by its position in `points`, so an id returned by the
    index can be used directly to look the point up again.
    """
    tree = index.Index()
    for position, point in enumerate(points):
        lat, lon = point
        degenerate_box = (lon, lat, lon, lat)
        tree.insert(position, degenerate_box)
    return tree

def get_nearest_distance(lat, lon, idx, points, num_candidates=8):
    """
    Get the nearest distance (in km) between (lat, lon) and a set of points using the provided index.

    Args:
        lat, lon: Query position in decimal degrees.
        idx: Spatial index built by build_spatial_index(points), or None.
        points: The (lat, lon) tuples the index was built from.
        num_candidates: How many index neighbours to compare by great-circle
            distance before picking the closest one.

    Returns:
        The smallest haversine distance among the candidates, rounded to
        3 decimals, or -1 if no points (or no index) are available.
    """
    if not points or idx is None:
        return -1
    # The index orders neighbours by planar distance in degrees, but a degree
    # of longitude covers less ground than a degree of latitude away from the
    # equator. Its single best hit is therefore not always the closest point
    # on the ground, so several candidates are compared by haversine distance.
    wanted = max(1, min(int(num_candidates), len(points)))
    best_km = None
    for candidate_id in idx.nearest((lon, lat, lon, lat), wanted):
        amenity_lat, amenity_lon = points[candidate_id]
        distance_km = haversine(lon, lat, amenity_lon, amenity_lat)
        if best_km is None or distance_km < best_km:
            best_km = distance_km
    if best_km is None:
        return -1
    return round(best_km, 3)

def process_chunk(chunk_df, buffer=0.1):
    """
    Process a chunk (subset) of the CSV data:
      1. Compute the chunk's bounding box (with a small buffer).
      2. Download amenity data for that bbox.
      3. Build spatial indexes for each amenity type.
      4. Compute the nearest amenity distance for each row in the chunk.

    Args:
        chunk_df: DataFrame with 'latitude' and 'longitude' columns.
        buffer: Margin, in degrees, added on every side of the chunk's extent
            before downloading. Amenities outside the buffered box are never
            seen, so a reported distance is the nearest one *within* the box.

    Returns:
        One dict per row, holding "latitude", "longitude" and a
        "nearest_<amenity type>" distance in km for every downloaded amenity
        type (-1 when no point of that type was found). An empty chunk yields
        an empty list without contacting the API.
    """
    # An empty chunk has a NaN extent; do not send a meaningless query.
    if chunk_df.empty:
        return []

    # Compute bounding box for the chunk and add a buffer
    latitudes = chunk_df['latitude']
    longitudes = chunk_df['longitude']
    bbox = (
        latitudes.min() - buffer,
        longitudes.min() - buffer,
        latitudes.max() + buffer,
        longitudes.max() + buffer,
    )

    # Download amenity data for this bbox
    amenity_data = get_amenity_data_for_bbox(bbox)

    # Build spatial indexes for each amenity type
    spatial_indexes = {
        amenity_type: (build_spatial_index(points), points)
        for amenity_type, points in amenity_data.items()
    }

    # For each coordinate in the chunk, find the nearest distance for each
    # amenity type. The result keys are derived from the downloaded types, so
    # adding a query template automatically adds a column here.
    results = []
    for lat, lon in zip(latitudes, longitudes):
        result = {"latitude": lat, "longitude": lon}
        for amenity_type, (idx, points) in spatial_indexes.items():
            result[f"nearest_{amenity_type}"] = get_nearest_distance(lat, lon, idx, points)
        results.append(result)
    return results

# --------------------------
# Main Workflow: Process CSV in chunks
# --------------------------
def main(csv_path="lat and long.csv", chunk_size=3000, resume=False):
    """
    Compute nearest-amenity distances for every coordinate in a CSV file.

    The file is processed in chunks; each chunk's results are written to
    results_chunk_<n>.json and the combined list to
    locations_amenities_list_total.json.

    Args:
        csv_path: CSV file with 'latitude' and 'longitude' columns.
        chunk_size: Number of rows handled per chunk (must be positive).
        resume: When True, a chunk whose results file already exists, parses
            as JSON and has one entry per row of the chunk is reused instead
            of being downloaded again. Leave it False after changing the
            input file or the chunk size.

    Raises:
        ValueError: If chunk_size is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    coordinates_df = pd.read_csv(csv_path)
    all_results = []
    num_chunks = (len(coordinates_df) + chunk_size - 1) // chunk_size
    print(f"Total rows: {len(coordinates_df)}, processing in {num_chunks} chunks of {chunk_size} rows each.")

    for i in range(num_chunks):
        print(f"\nProcessing chunk {i+1}/{num_chunks}...")
        chunk_path = f"results_chunk_{i+1}.json"
        chunk_df = coordinates_df.iloc[i*chunk_size:(i+1)*chunk_size]

        chunk_results = None
        if resume:
            try:
                with open(chunk_path, "r", encoding="utf-8") as infile:
                    cached = json.load(infile)
            except (OSError, ValueError):
                # Missing or half-written file: fall through and recompute.
                cached = None
            if isinstance(cached, list) and len(cached) == len(chunk_df):
                print(f"Reusing existing results from {chunk_path}.")
                chunk_results = cached

        if chunk_results is None:
            chunk_results = process_chunk(chunk_df)
            # Save intermediate results for each chunk
            with open(chunk_path, "w", encoding="utf-8") as outfile:
                json.dump(chunk_results, outfile, indent=4)

        all_results.extend(chunk_results)

    # Save the final combined results
    with open("locations_amenities_list_total.json", "w", encoding="utf-8") as outfile:
        json.dump(all_results, outfile, indent=4)
    print("All chunks processed and combined results saved.")

# Start the full pipeline when this code is executed directly. In a notebook
# cell `__name__` is "__main__" as well, so running the cell launches the
# whole (network-heavy) run immediately.
if __name__ == "__main__":
    main()


Total rows: 271466, processing in 91 chunks of 3000 rows each.

Processing chunk 1/91...
Downloading hospital data for bbox (28.05451431274414, 76.93120422363282, 28.7666660308838, 77.54757989999999)...
Found 1800 unique hospital points.
Downloading public_transport data for bbox (28.05451431274414, 76.93120422363282, 28.7666660308838, 77.54757989999999)...
Found 2390 unique public_transport points.
Downloading education data for bbox (28.05451431274414, 76.93120422363282, 28.7666660308838, 77.54757989999999)...
Found 1605 unique education points.
Downloading govt_office data for bbox (28.05451431274414, 76.93120422363282, 28.7666660308838, 77.54757989999999)...
Found 238 unique govt_office points.
Downloading highway data for bbox (28.05451431274414, 76.93120422363282, 28.7666660308838, 77.54757989999999)...
Found 18166 unique highway points.
Downloading mall data for bbox (28.05451431274414, 76.93120422363282, 28.7666660308838, 77.54757989999999)...
Found 116 unique mall points.
Down

In [4]:
import os
import re
import json
import pandas as pd

# Chunk result files are named results_chunk_<n>.json.
chunk_file_pattern = re.compile(r"results_chunk_(\d+)\.json")

# Discover the chunk files in the current directory rather than hard-coding
# how many one particular run produced. Sorting on the numeric part keeps
# chunk 10 after chunk 9 (a plain string sort would put it after chunk 1).
chunk_files = sorted(
    (int(match.group(1)), match.group(0))
    for match in map(chunk_file_pattern.fullmatch, os.listdir("."))
    if match
)

# Chunk numbers start at 1, so any gap below the highest number found means a
# file is missing and the merged output would silently lack those rows.
found_numbers = {number for number, _ in chunk_files}
missing_numbers = sorted(set(range(1, max(found_numbers, default=0) + 1)) - found_numbers)

# Initialize an empty list to store data
data_list = []
failed_files = []

# Loop through all chunk JSON files in the current directory
for _, file_name in chunk_files:
    try:
        with open(file_name, "r", encoding="utf-8") as file:
            data = json.load(file)  # Load JSON file
    except (OSError, ValueError) as e:
        # Unreadable or malformed file: report it and keep merging the rest.
        print(f"Error reading {file_name}: {e}")
        failed_files.append(file_name)
        continue
    if isinstance(data, list):
        data_list.extend(data)  # Append list of data
    else:
        data_list.append(data)  # Append single JSON object

# Convert list to DataFrame
df = pd.DataFrame(data_list)

# Save DataFrame to CSV in the same folder
df.to_csv("merged_output.csv", index=False, encoding="utf-8")

print("Merged CSV saved as merged_output.csv")
if failed_files or missing_numbers:
    print(
        f"Warning: merged output is incomplete "
        f"({len(failed_files)} unreadable file(s), missing chunk numbers: {missing_numbers})."
    )


Merged CSV saved as merged_output.csv
