In [1]:
import requests
import time
import csv
import pandas as pd
import numpy as np
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
# Replace with your actual API token
MAPILLARY_TOKEN = 'MLY|TOKEN'


# Define San Francisco bounding box (approximate)
bbox = "-122.5149,37.7081,-122.3569,37.8324"  # [west,south,east,north]

# Base URL for image search
BASE_URL = "https://graph.mapillary.com/images"

# Fields to request
FIELDS = "id,thumb_1024_url,captured_at,geometry"

# URL to request images
def get_images(bbox, limit=1000):
    params = {
        "access_token": MAPILLARY_TOKEN,
        "bbox": bbox,
        "fields": FIELDS,
        "limit": limit
    }
    image_urls = []

    while True:
        response = requests.get(BASE_URL, params=params)
        data = response.json()
        
        for feature in data.get("data", []):
            image_urls.append(feature.get("thumb_1024_url"))
        
        # Handle pagination if more results exist
        next_url = data.get("links", {}).get("next")
        if not next_url:
            break
        else:
            # wait to avoid hitting rate limits
            time.sleep(1)
            response = requests.get(next_url)
            data = response.json()
    
    return image_urls

In [8]:
# Run it
image_urls = get_images(bbox)

# Print first few
print(f"Total images: {len(image_urls)}")
for url in image_urls[:5]:
    print(url)


Total images: 1000
https://scontent-iad3-1.xx.fbcdn.net/m1/v/t6/An_UCqlX-sQm4vJzE3J_vqSitlHi9sTWRvOQioNyl_uyIPxmpLGNksocNoEy9aRp_p4BTlCf6mL6z5uA5etaTfm8m5zN5TRr91AT51kYprLQD4q-yvxNb0vTFURQHQPSXVhDl0wA_k7vXTH3gP_WNg?stp=s1024x512&edm=AOnQwmMEAAAA&_nc_gid=6G4-2qrZBVr8Vv-YUiNpcw&_nc_oc=Adn7Ow7NIhI2pZNeOXspifY7aWU20vT9jkjGVjdcWNu6v_ePUf-AsAvHct029F7l7Jg&ccb=10-5&oh=00_AfLUPFxmibBOmvKxXrBaqHhtoFDhpzsS06KSZGRjJBUzwg&oe=6845670A&_nc_sid=201bca
https://scontent-iad3-1.xx.fbcdn.net/m1/v/t6/An8FTYwqBDcF1WBTTw2yqwJQy9UOK5dGNS7X1UTaGkH8KCl0CVsTld6c6-MrQ6iP7UvNJYp9C3LhYkY17BaTKUHpbOz4lxckaJ_ctaRnesxrg8-BMaXkNJdbvDK7wIii7_9FncK_9sCApDLaOkeZjg?stp=s1024x576&edm=AOnQwmMEAAAA&_nc_gid=6G4-2qrZBVr8Vv-YUiNpcw&_nc_oc=Adn8WUgm-2wQFzBlGdxQ8Gk7GwGttb5wOy3zqFq6etBHDW8FJAvjpzuQs2yqxhVBCvo&ccb=10-5&oh=00_AfIHPXY44vYSqMz70vF3G8UE-lRUy-dbWoGUEjlr15gyyA&oe=68458A51&_nc_sid=201bca
https://scontent-iad3-1.xx.fbcdn.net/m1/v/t6/An9HbjYSrz0F57xfUAcjEBSDwBPfV0Hl6fMVPkdhNof7LqlFCQcllDhqbsibVvfIRO8k61O0qf4eQefajU59n_SV7QFg

In [9]:
import requests
import time
import csv


# Define bounding box for San Francisco: [west,south,east,north]
bbox = "-122.5149,37.7081,-122.3569,37.8324"

# API base and parameters
BASE_URL = "https://graph.mapillary.com/images"
FIELDS = "id,thumb_1024_url,geometry,captured_at"
LIMIT = 1000  # max per request

def get_images_with_location(bbox):
    params = {
        "access_token": MAPILLARY_TOKEN,
        "bbox": bbox,
        "fields": FIELDS,
        "limit": LIMIT
    }

    image_data = []

    while True:
        response = requests.get(BASE_URL, params=params)
        data = response.json()

        for item in data.get("data", []):
            image_id = item.get("id")
            url = item.get("thumb_1024_url")
            lat = item["geometry"]["coordinates"][1]
            lon = item["geometry"]["coordinates"][0]
            timestamp = item.get("captured_at")
            image_data.append({
                "id": image_id,
                "url": url,
                "latitude": lat,
                "longitude": lon,
                "captured_at": timestamp
            })

        next_url = data.get("links", {}).get("next")
        if not next_url:
            break
        else:
            time.sleep(1)  # Be polite to the API
            response = requests.get(next_url)
            data = response.json()

    return image_data

In [None]:
# Collect the data
results = get_images_with_location(bbox)

In [12]:
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,id,url,latitude,longitude,captured_at
0,1729318050735725,https://scontent-iad3-1.xx.fbcdn.net/m1/v/t6/A...,37.784378,-122.407647,1658225672000
1,1752079222322441,https://scontent-iad3-1.xx.fbcdn.net/m1/v/t6/A...,37.784772,-122.397862,1734431794976
2,1813573022738727,https://scontent-iad3-1.xx.fbcdn.net/m1/v/t6/A...,37.786997,-122.398421,1734085023514
3,4136912486558424,https://scontent-iad3-1.xx.fbcdn.net/m1/v/t6/A...,37.785637,-122.404098,1736245009516
4,1728656104740193,https://scontent-iad3-1.xx.fbcdn.net/m1/v/t6/A...,37.786803,-122.402693,1734096252604
...,...,...,...,...,...
995,1424721848497954,https://scontent-iad3-1.xx.fbcdn.net/m1/v/t6/A...,37.789927,-122.391190,1733239549579
996,1409777943342621,https://scontent-iad3-1.xx.fbcdn.net/m1/v/t6/A...,37.791293,-122.392478,1733236125359
997,1412135273547707,https://scontent-iad3-1.xx.fbcdn.net/m1/v/t6/A...,37.792785,-122.393958,1733322898063
998,2923327124496592,https://scontent-iad3-1.xx.fbcdn.net/m1/v/t6/A...,37.790593,-122.396740,1733482245475


In [None]:
# Save to CSV
with open("sf_mapillary_images.csv", "w", newline="") as csvfile:
    fieldnames = ["id", "url", "latitude", "longitude", "captured_at"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results)

print(f"Saved {len(results)} images to sf_mapillary_images.csv")

In [13]:
import requests
import time
import csv


# San Francisco bounding box (west, south, east, north)
bbox = "-122.5149,37.7081,-122.3569,37.8324"

# Base URL and query fields
BASE_URL = "https://graph.mapillary.com/images"
FIELDS = "id,thumb_1024_url,geometry,captured_at"
LIMIT = 1000

def get_all_images(bbox):
    image_data = []
    url = BASE_URL
    params = {
        "access_token": MAPILLARY_TOKEN,
        "bbox": bbox,
        "fields": FIELDS,
        "limit": LIMIT
    }

    while url:
        response = requests.get(url, params=params if url == BASE_URL else {})
        data = response.json()

        for item in data.get("data", []):
            try:
                image_data.append({
                    "id": item.get("id"),
                    "url": item.get("thumb_1024_url"),
                    "latitude": item["geometry"]["coordinates"][1],
                    "longitude": item["geometry"]["coordinates"][0],
                    "captured_at": item.get("captured_at")
                })
            except (KeyError, TypeError):
                continue  # skip problematic entries

        print(f"Collected: {len(image_data)} images so far...")

        # Prepare for next loop
        url = data.get("links", {}).get("next")
        params = {}  # only needed for the first request
        time.sleep(0.5)  # be polite to the API

    return image_data


In [14]:
# Fetch data
results = get_all_images(bbox)

Collected: 1000 images so far...


In [None]:
# Save to CSV
with open("sf_mapillary_all_images.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["id", "url", "latitude", "longitude", "captured_at"])
    writer.writeheader()
    writer.writerows(results)

print(f"Done! Total images saved: {len(results)}")


In [15]:
import requests
import time
import csv

# San Francisco bounding box (west, south, east, north)
bbox = "-122.5149,37.7081,-122.3569,37.8324"

# Base URL and query fields
BASE_URL = "https://graph.mapillary.com/images"
FIELDS = "id,thumb_1024_url,geometry,captured_at"
LIMIT = 1000

def get_all_images(bbox):
    image_data = []
    url = BASE_URL
    params = {
        "access_token": MAPILLARY_TOKEN,
        "bbox": bbox,
        "fields": FIELDS,
        "limit": LIMIT
    }

    while url:
        response = requests.get(url, params=params if url == BASE_URL else {})
        data = response.json()

        for item in data.get("data", []):
            try:
                image_data.append({
                    "id": item.get("id"),
                    "url": item.get("thumb_1024_url"),
                    "latitude": item["geometry"]["coordinates"][1],
                    "longitude": item["geometry"]["coordinates"][0],
                    "captured_at": item.get("captured_at")
                })
            except (KeyError, TypeError):
                continue  # skip problematic entries

        print(f"Collected: {len(image_data)} images so far...")

        # Prepare for next loop
        url = data.get("links", {}).get("next")
        params = {}  # only needed for the first request
        time.sleep(0.5)  # be polite to the API

    return image_data

# Fetch data
results = get_all_images(bbox)

# Save to CSV
with open("sf_mapillary_all_images.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["id", "url", "latitude", "longitude", "captured_at"])
    writer.writeheader()
    writer.writerows(results)

print(f"Done! Total images saved: {len(results)}")


Collected: 1000 images so far...
Done! Total images saved: 1000


In [None]:

BASE_URL = "https://graph.mapillary.com/images"
FIELDS = "id,thumb_1024_url,geometry,captured_at"
LIMIT = 1000

# Define bounding box for San Francisco
lon_min, lat_min = -122.5149, 37.7081
lon_max, lat_max = -122.3569, 37.8324
tile_size = 0.001  # approx ~100m tiles

def get_images_in_tile(tile_bbox):
    url = BASE_URL
    params = {
        "access_token": MAPILLARY_TOKEN,
        "bbox": tile_bbox,
        "fields": FIELDS,
        "limit": LIMIT
    }
    images = []
    while url:
        resp = requests.get(url, params=params if url == BASE_URL else {})
        data = resp.json()

        for item in data.get("data", []):
            try:
                images.append({
                    "id": item["id"],
                    "url": item["thumb_1024_url"],
                    "latitude": item["geometry"]["coordinates"][1],
                    "longitude": item["geometry"]["coordinates"][0],
                    "captured_at": item.get("captured_at")
                })
            except (KeyError, TypeError):
                continue
        url = data.get("links", {}).get("next")
        params = {}
        time.sleep(0.5)
    return images

def generate_tiles(lon_min, lat_min, lon_max, lat_max, step):
    lats = [round(lat_min + i * step, 5) for i in range(int((lat_max - lat_min) / step) + 1)]
    lons = [round(lon_min + i * step, 5) for i in range(int((lon_max - lon_min) / step) + 1)]
    tiles = []
    for i in range(len(lats) - 1):
        for j in range(len(lons) - 1):
            tile = f"{lons[j]},{lats[i]},{lons[j+1]},{lats[i+1]}"
            tiles.append(tile)
    return tiles

# Build tiles and fetch images
tiles = generate_tiles(lon_min, lat_min, lon_max, lat_max, tile_size)
all_images = []

for i, tile in enumerate(tiles):
    print(f"Fetching tile {i+1}/{len(tiles)}: {tile}")
    images = get_images_in_tile(tile)
    all_images.extend(images)
    print(f" → {len(images)} images collected")

# Save to CSV
with open("sf_mapillary_full.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["id", "url", "latitude", "longitude", "captured_at"])
    writer.writeheader()
    writer.writerows(all_images)

print(f"\n Done! Total images saved: {len(all_images)}")

In [7]:
import requests
import time
import csv
from concurrent.futures import ThreadPoolExecutor, as_completed

# Basic parameters
MAPILLARY_TOKEN = 'MLY|25519453337668549|19ddb43d09d1996ff1ceaec8b0693efc'
BASE_URL = "https://graph.mapillary.com/images"
FIELDS = "id,thumb_1024_url,geometry,captured_at"
LIMIT = 1000
tile_size = 0.0005 # approx ~50m tiles
MAX_WORKERS = 10

# Define SF bounding box
lon_min, lat_min = -122.5149, 37.7081
lon_max, lat_max = -122.3569, 37.8324

def generate_tiles(lon_min, lat_min, lon_max, lat_max, step):
    lats = [round(lat_min + i * step, 5) for i in range(int((lat_max - lat_min) / step) + 1)]
    lons = [round(lon_min + i * step, 5) for i in range(int((lon_max - lon_min) / step) + 1)]
    tiles = []
    for i in range(len(lats) - 1):
        for j in range(len(lons) - 1):
            tile = f"{lons[j]},{lats[i]},{lons[j+1]},{lats[i+1]}"
            tiles.append(tile)
    return tiles

def fetch_tile(tile_bbox):
    url = BASE_URL
    params = {
        "access_token": MAPILLARY_TOKEN,
        "bbox": tile_bbox,
        "fields": FIELDS,
        "limit": LIMIT
    }
    images = []
    while url:
        try:
            resp = requests.get(url, params=params if url == BASE_URL else {})
            data = resp.json()
            for item in data.get("data", []):
                images.append({
                    "id": item["id"],
                    "url": item["thumb_1024_url"],
                    "latitude": item["geometry"]["coordinates"][1],
                    "longitude": item["geometry"]["coordinates"][0],
                    "captured_at": item.get("captured_at")
                })
            url = data.get("links", {}).get("next")
            params = {}  # clear params for pagination
            time.sleep(0.2)  # rate limiting
        except Exception as e:
            print(f"Error on tile {tile_bbox}: {e}")
            break
    return images

# Generate tiles
tiles = generate_tiles(lon_min, lat_min, lon_max, lat_max, tile_size)

all_images = []
maxed_out_tiles = []  # List to hold tiles that hit the 1000 image cap

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = {executor.submit(fetch_tile, tile): tile for tile in tiles}
    for i, future in enumerate(as_completed(futures), 1):
        tile = futures[future]
        try:
            result = future.result()
            all_images.extend(result)
            print(f"[{i}/{len(tiles)}] {len(result)} images from tile {tile}")
            if len(result) == 1000:
                maxed_out_tiles.append(tile)
        except Exception as exc:
            print(f"Tile {tile} generated an exception: {exc}")

# Write to CSV
with open("sf_mapillary_parallel.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["id", "url", "latitude", "longitude", "captured_at"])
    writer.writeheader()
    writer.writerows(all_images)

print(f"\nDone! Total images saved: {len(all_images)}")

[1/78368] 0 images from tile -122.5109,37.7081,-122.5104,37.7086
[2/78368] 0 images from tile -122.5139,37.7081,-122.5134,37.7086
[3/78368] 0 images from tile -122.5144,37.7081,-122.5139,37.7086
[4/78368] 0 images from tile -122.5099,37.7081,-122.5094,37.7086
[5/78368] 0 images from tile -122.5119,37.7081,-122.5114,37.7086
[6/78368] 0 images from tile -122.5104,37.7081,-122.5099,37.7086
[7/78368] 0 images from tile -122.5134,37.7081,-122.5129,37.7086
[8/78368] 0 images from tile -122.5149,37.7081,-122.5144,37.7086
[9/78368] 0 images from tile -122.5114,37.7081,-122.5109,37.7086
[10/78368] 0 images from tile -122.5129,37.7081,-122.5124,37.7086
[11/78368] 0 images from tile -122.5124,37.7081,-122.5119,37.7086
[12/78368] 0 images from tile -122.5094,37.7081,-122.5089,37.7086
[13/78368] 0 images from tile -122.5069,37.7081,-122.5064,37.7086
[14/78368] 0 images from tile -122.5089,37.7081,-122.5084,37.7086
[15/78368] 0 images from tile -122.5084,37.7081,-122.5079,37.7086
[16/78368] 0 images

In [8]:
with open("sf_maxed_out_tiles.txt", "w") as f:
    for tile in maxed_out_tiles:
        f.write(f"{tile}\n")

print(f"\n{len(maxed_out_tiles)} tiles returned exactly 1000 images. Saved to 'sf_maxed_out_tiles.txt'")


130 tiles returned exactly 1000 images. Saved to 'sf_maxed_out_tiles.txt'


In [9]:
def subdivide_tile(tile_str, split_factor=4):
    """Split a tile into smaller tiles by dividing lat/lon ranges."""
    lon_min, lat_min, lon_max, lat_max = map(float, tile_str.split(","))
    lon_step = (lon_max - lon_min) / split_factor
    lat_step = (lat_max - lat_min) / split_factor
    subtiles = []

    for i in range(split_factor):
        for j in range(split_factor):
            sub_lon_min = lon_min + j * lon_step
            sub_lon_max = sub_lon_min + lon_step
            sub_lat_min = lat_min + i * lat_step
            sub_lat_max = sub_lat_min + lat_step
            subtile = f"{sub_lon_min},{sub_lat_min},{sub_lon_max},{sub_lat_max}"
            subtiles.append(subtile)
    
    return subtiles


In [10]:
def revisit_maxed_out_tiles(maxed_out_tiles_file, output_csv="sf_mapillary_retry.csv", workers=5):
    all_retried_images = []

    # Read tiles from file
    with open(maxed_out_tiles_file, "r") as f:
        original_tiles = [line.strip() for line in f.readlines()]
    
    # Subdivide each into smaller tiles
    refined_tiles = []
    for tile in original_tiles:
        refined_tiles.extend(subdivide_tile(tile, split_factor=4))  # creates 16 subtiles

    print(f"Revisiting {len(original_tiles)} maxed-out tiles as {len(refined_tiles)} subtiles...")

    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = {executor.submit(fetch_tile, tile): tile for tile in refined_tiles}
        for i, future in enumerate(as_completed(futures), 1):
            tile = futures[future]
            try:
                result = future.result()
                all_retried_images.extend(result)
                print(f"[{i}/{len(refined_tiles)}] {len(result)} images from refined tile {tile}")
            except Exception as exc:
                print(f"Refined tile {tile} failed: {exc}")

    # Save retried images
    with open(output_csv, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["id", "url", "latitude", "longitude", "captured_at"])
        writer.writeheader()
        writer.writerows(all_retried_images)

    print(f"\n Retry complete. Saved {len(all_retried_images)} additional images to {output_csv}")


In [11]:
# Run the retry script
revisit_maxed_out_tiles("sf_maxed_out_tiles.txt")

Revisiting 130 maxed-out tiles as 2080 subtiles...
[1/2080] 38 images from refined tile -122.4079,37.741225,-122.407775,37.74135
[2/2080] 26 images from refined tile -122.40752499999999,37.7411,-122.4074,37.741225
[3/2080] 50 images from refined tile -122.4079,37.7411,-122.407775,37.741225
[4/2080] 62 images from refined tile -122.40764999999999,37.7411,-122.40752499999999,37.741225
[5/2080] 24 images from refined tile -122.4079,37.74135,-122.407775,37.741474999999994
[6/2080] 126 images from refined tile -122.407775,37.7411,-122.40765,37.741225
[7/2080] 90 images from refined tile -122.407775,37.741225,-122.40765,37.74135
[8/2080] 99 images from refined tile -122.40764999999999,37.741225,-122.40752499999999,37.74135
[9/2080] 42 images from refined tile -122.40752499999999,37.741225,-122.4074,37.74135
[10/2080] 4 images from refined tile -122.4079,37.741475,-122.407775,37.7416
[11/2080] 59 images from refined tile -122.407775,37.74135,-122.40765,37.741474999999994
[12/2080] 41 images f