In [1]:
import h5py
import osmnx as ox
import pandas as pd
import numpy as np
import networkx as nx
from tqdm import tqdm
from itertools import product
from functools import cache
from scipy.spatial import KDTree
from haversine import haversine

In [30]:
# Referenced:
# https://towardsdatascience.com/finding-time-dependent-travel-times-between-every-pair-of-locations-in-manhattan-c3c48b0db7ba
# https://towardsdatascience.com/shortest-path-algorithm-with-osm-walking-network-6d2863ae96be
# https://osmnx.readthedocs.io/en/stable/osmnx.html and https://github.com/gboeing/osmnx
# https://movement.uber.com/?lang=en-US

### Update the city name and speed files, if available. If not, just make sure they point to a real speed file ###
place = 'New York, NY'
savename = r'C:\Users\willd\Documents\Georgia Tech\CSE6424\Project\ZCTA_New_York.csv'
speed_path = r'C:\Users\willd\Documents\Georgia Tech\CSE6424\Project\2020_speeds_new_york.csv'

In [31]:
# Get the graph and the speeds associated with all edges
graph = ox.graph_from_place(place, network_type='drive')
print('Got graph')
graph = ox.add_edge_speeds(graph)
print('Got speeds')
graph = ox.add_edge_travel_times(graph)
print('Got travel times')
graph = ox.utils_graph.get_largest_component(graph, strongly=True)
print('Got largest connected component')
# If you want to save for future use
#ox.save_graphml(graph, r'C:\Users\willd\Documents\Georgia Tech\CSE6424\Project\new_york_metro_area_cleaned.graphml')
#graph = ox.load_graphml(r'C:\Users\willd\Documents\Georgia Tech\CSE6424\Project\new_york_metro_area_cleaned.graphml')

  gdf = gdf.append(_geocode_query_to_gdf(q, wr, by_osmid))


Got graph
Got speeds
Got travel times
Got largest connected component


In [32]:
# How big is the graph?
print(len(graph.edges), len(graph.nodes))

139975 55228


In [33]:
# Find all food stores
food_tags = {'shop': 'supermarket', 'amenity': 'marketplace'}
food_places = ox.geometries_from_place(place, food_tags)

  gdf = gdf.append(_geocode_query_to_gdf(q, wr, by_osmid))


In [34]:
# Replace Polygons with a single point
food_places.loc[food_places['geometry'].type == 'Polygon', 'geometry'] = food_places.loc[food_places['geometry'].type == 'Polygon', 'geometry'].representative_point()

In [35]:
# Find all major green places or recreational areas
physical_tags = {'leisure': ['park', 'recreation_ground', 'playground', 'fitness_station', 'sports_centre', 'nature_reserve', 'pitch']}
physical_places = ox.geometries_from_place(place, physical_tags)
physical_places.loc[physical_places['geometry'].type == 'Polygon', 'geometry'] = physical_places.loc[physical_places['geometry'].type == 'Polygon', 'geometry'].representative_point()

  gdf = gdf.append(_geocode_query_to_gdf(q, wr, by_osmid))
  for merged_outer_linestring in list(merged_outer_linestrings):
  for merged_outer_linestring in list(merged_outer_linestrings):


In [36]:
# Find public transit
transport_tags = {'public_transport': ['platform', 'stop_position'], 'highway': ['bus_stop', 'platform'],
                 'railway': ['subway_entrance', 'station', 'tram', 'tram_stop'], 'station': 'subway'}
transport_places = ox.geometries_from_place(place, transport_tags)
transport_places.loc[transport_places['geometry'].type == 'Polygon', 'geometry'] = transport_places.loc[transport_places['geometry'].type == 'Polygon', 'geometry'].representative_point()

  gdf = gdf.append(_geocode_query_to_gdf(q, wr, by_osmid))


In [37]:
# Find libraries and schools
education_tags = {'amenity': ['library', 'school', 'kindergarten']}
education_places = ox.geometries_from_place(place, education_tags)
education_places.loc[education_places['geometry'].type == 'Polygon', 'geometry'] = education_places.loc[education_places['geometry'].type == 'Polygon', 'geometry'].representative_point()

  gdf = gdf.append(_geocode_query_to_gdf(q, wr, by_osmid))
  for merged_outer_linestring in list(merged_outer_linestrings):
  for merged_outer_linestring in list(merged_outer_linestrings):


In [38]:
# Find places of worship
worship_tags = {'amenity': 'place_of_worship'}
worship_places = ox.geometries_from_place(place, worship_tags)
worship_places.loc[worship_places['geometry'].type == 'Polygon', 'geometry'] = worship_places.loc[worship_places['geometry'].type == 'Polygon', 'geometry'].representative_point()

  gdf = gdf.append(_geocode_query_to_gdf(q, wr, by_osmid))
  for merged_outer_linestring in list(merged_outer_linestrings):
  for merged_outer_linestring in list(merged_outer_linestrings):


In [39]:
# Simplify everything
food_places = food_places['geometry'].droplevel(0)
physical_places = physical_places['geometry'].droplevel(0)
transport_places = transport_places['geometry'].droplevel(0)
education_places = education_places['geometry'].droplevel(0)
worship_places = worship_places['geometry'].droplevel(0)
food_places = food_places[food_places.type == 'Point']
physical_places = physical_places[physical_places.type == 'Point']
transport_places = transport_places[transport_places.type == 'Point']
education_places = education_places[education_places.type == 'Point']
worship_places = worship_places[worship_places.type == 'Point']

In [40]:
# Load the speed data
speed_raw = pd.read_csv(speed_path)

In [41]:
# Remove unnecessary columns
speed_raw.drop(columns=['quarter', 'year', 'segment_id', 'start_junction_id', 'end_junction_id'], inplace=True)
# Noon seems like a fair time
speed_raw = speed_raw[speed_raw['hour_of_day'] == 12]
speed_raw.set_index('osm_way_id', drop=True, inplace=True)

In [43]:
# Find the real speed information by matching up OSM with UberMovement if possible
diffs = []
dists = []
real_calculation = 0
for edge in tqdm(graph.edges):
    # length is meters, speed_kph is kph (duh), maxspeed has units in string, and travel_time is seconds
    e = graph[edge[0]][edge[1]][edge[2]]
    if isinstance(e['osmid'], list):
        # Some graph edges are made up of multiple OSM ways apparently
        for osmid in e['osmid']:
            try:
                meters_per_second = speed_raw.at[osmid, 'speed_mph_mean']*0.44704    # Convert to meters/sec
                time = e['length']/meters_per_second
                if isinstance(time, float) and isinstance(e['travel_time'], float):
                    diffs.append(time - e['travel_time'])
                    dists.append(e['length'])
                real_calculation += 1
                break
            except (KeyError, ZeroDivisionError):
                time = e['travel_time']     # Backup (i.e. length/speed limit) if Uber data isn't available
    else:
        try:
            meters_per_second = speed_raw.at[e['osmid'], 'speed_mph_mean']*0.44704
            time = e['length']/meters_per_second
            real_calculation += 1
        except (KeyError, ZeroDivisionError):
            time = e['travel_time']
    if isinstance(time, pd.Series):
        # This is an occasional error when reading in the CSV
        time = time.mean()
    graph[edge[0]][edge[1]][edge[2]]['actual_travel_time'] = time

100%|████████████████████████████████████████████████████████████████████████| 139975/139975 [01:10<00:00, 1985.86it/s]


In [44]:
# What's the discrepancy between streets with and without real travel times?
print(np.nanmean(diffs), np.nanmean(dists))

7.5413888370318505 287.114783339047


In [14]:
### Update with the Census health tracts data ###
health = pd.read_csv(r'C:\Users\willd\Documents\Georgia Tech\CSE6424\Project\PLACES__Census_Tract_Data__GIS_Friendly_Format___2021_release.csv')

In [15]:
# Parse the lat/long
health['zcta_x'] = health['Geolocation'].map(lambda x: x.lstrip('POINT (').rstrip(')').split(' ')[0]).astype(np.float32)
health['zcta_y'] = health['Geolocation'].map(lambda x: x.lstrip('POINT (').rstrip(')').split(' ')[1]).astype(np.float32)

In [16]:
# Find the nearest OSM nodes to the census tract centers
all_nearest_nodes, dists = ox.distance.nearest_nodes(graph, health['zcta_x'].to_numpy(), health['zcta_y'].to_numpy(), return_dist=True)
# If it's more than 1km away from anything in the graph, then we probably don't want it
all_nearest_nodes = np.asarray(all_nearest_nodes)[np.asarray(dists) < 1000]
health = health[np.asarray(dists) < 1000]
print('Number left:', all_nearest_nodes.shape[0])
food_closest_travel_times = np.full(all_nearest_nodes.shape[0], np.nan, dtype=np.float32)
physical_closest_dist = food_closest_travel_times.copy()
transport_closest_dist = food_closest_travel_times.copy()
education_closest_travel_times = food_closest_travel_times.copy()
worship_closest_travel_times = food_closest_travel_times.copy()

Number: 75


In [17]:
# Find the nearest nodes to each of the features of interest
food_nodes, dists = ox.distance.nearest_nodes(graph, [x.x for x in food_places], [x.y for x in food_places], return_dist=True)
print('Found food', 'Number within 1km:', np.count_nonzero(np.asarray(dists) < 1000), 'Number total:', len(food_nodes))
food_nodes = np.asarray(food_nodes)[np.asarray(dists) < 1000]
physical_nodes, dists = ox.distance.nearest_nodes(graph, [x.x for x in physical_places], [x.y for x in physical_places], return_dist=True)
print('Found physical', 'Number within 1km:', np.count_nonzero(np.asarray(dists) < 1000), 'Number total:', len(physical_nodes))
physical_nodes = np.asarray(physical_nodes)[np.asarray(dists) < 1000]
transport_nodes, dists = ox.distance.nearest_nodes(graph, [x.x for x in transport_places], [x.y for x in transport_places], return_dist=True)
print('Found transport', 'Number within 1km:', np.count_nonzero(np.asarray(dists) < 1000), 'Number total:', len(transport_nodes))
transport_nodes = np.asarray(transport_nodes)[np.asarray(dists) < 1000]
education_nodes, dists = ox.distance.nearest_nodes(graph, [x.x for x in education_places], [x.y for x in education_places], return_dist=True)
print('Found education', 'Number within 1km:', np.count_nonzero(np.asarray(dists) < 1000), 'Number total:', len(education_nodes))
education_nodes = np.asarray(education_nodes)[np.asarray(dists) < 1000]
worship_nodes, dists = ox.distance.nearest_nodes(graph, [x.x for x in worship_places], [x.y for x in worship_places], return_dist=True)
print('Found worship', 'Number within 1km:', np.count_nonzero(np.asarray(dists) < 1000), 'Number total:', len(worship_nodes))
worship_nodes = np.asarray(worship_nodes)[np.asarray(dists) < 1000]

Found food Number within 1km: 132 Number total: 140
Found physical Number within 1km: 2814 Number total: 3050
Found transport Number within 1km: 484 Number total: 488
Found education Number within 1km: 573 Number total: 605
Found worship Number within 1km: 1273 Number total: 1311


In [18]:
# Convert their lats/lons to an array
food_lat_lons = np.array([[graph.nodes[x]['x'] for x in food_nodes], [graph.nodes[x]['y'] for x in food_nodes]], dtype=np.float32).T
physical_lat_lons = np.array([[graph.nodes[x]['x'] for x in physical_nodes], [graph.nodes[x]['y'] for x in physical_nodes]], dtype=np.float32).T
transport_lat_lons = np.array([[graph.nodes[x]['x'] for x in transport_nodes], [graph.nodes[x]['y'] for x in transport_nodes]], dtype=np.float32).T
education_lat_lons = np.array([[graph.nodes[x]['x'] for x in education_nodes], [graph.nodes[x]['y'] for x in education_nodes]], dtype=np.float32).T
worship_lat_lons = np.array([[graph.nodes[x]['x'] for x in worship_nodes], [graph.nodes[x]['y'] for x in worship_nodes]], dtype=np.float32).T

In [19]:
# Create the k-d trees for quick querying
food_tree = KDTree(food_lat_lons)
physical_tree = KDTree(physical_lat_lons)
transport_tree = KDTree(transport_lat_lons)
education_tree = KDTree(education_lat_lons)
worship_tree = KDTree(worship_lat_lons)

In [20]:
# Easy lat/lon distance calculations
def haversine_wrapper(lat1, lon1, lat2, lon2):
    return haversine((lat1, lon1), (lat2, lon2))

In [21]:
# Find the nearest nodes using the k-d trees
closest_food_nodes = []
closest_education_nodes = []
closest_worship_nodes = []
for i, sample_node in tqdm(enumerate(all_nearest_nodes), total=len(all_nearest_nodes)):
    idxes = food_tree.query((graph.nodes[sample_node]['x'], graph.nodes[sample_node]['y']), k=5)[1]
    closest_food_nodes.append(food_nodes[idxes])
    idx = physical_tree.query((graph.nodes[sample_node]['x'], graph.nodes[sample_node]['y']), k=1)[1]
    dist = haversine_wrapper(graph.nodes[sample_node]['y'], graph.nodes[sample_node]['x'], physical_lat_lons[idx, 1], physical_lat_lons[idx, 0])
    physical_closest_dist[i] = dist
    idx = transport_tree.query((graph.nodes[sample_node]['x'], graph.nodes[sample_node]['y']), k=1)[1]
    dist = haversine_wrapper(graph.nodes[sample_node]['y'], graph.nodes[sample_node]['x'], transport_lat_lons[idx, 1], transport_lat_lons[idx, 0])
    transport_closest_dist[i] = dist
    idxes = education_tree.query((graph.nodes[sample_node]['x'], graph.nodes[sample_node]['y']), k=5)[1]
    closest_education_nodes.append(education_nodes[idxes])
    idxes = worship_tree.query((graph.nodes[sample_node]['x'], graph.nodes[sample_node]['y']), k=5)[1]
    closest_worship_nodes.append(worship_nodes[idxes])

100%|████████████████████████████████████████████████████████████████████████████████| 75/75 [00:00<00:00, 2491.65it/s]


In [22]:
# Calculate the shortest path using Dijkstra's
@cache    # Trying to speed things up a little
def shortest_path(source, target):
    return nx.shortest_path_length(graph, source=source, target=target, weight='actual_travel_time')

In [23]:
# Find the shortest path/closest node to each feature node from each census node
for i, sample_node in enumerate(tqdm(all_nearest_nodes)):
    current_shortest = np.inf
    shortest_node = 0
    for food_node in closest_food_nodes[i]:
        shortest = shortest_path(sample_node, food_node)
        if shortest < current_shortest:
            current_shortest = shortest
            shortest_node = food_node
    food_closest_travel_times[i] = current_shortest
    
    current_shortest = np.inf
    shortest_node = 0
    for education_node in closest_education_nodes[i]:
        shortest = shortest_path(sample_node, education_node)
        if shortest < current_shortest:
            current_shortest = shortest
            shortest_node = education_node
    education_closest_travel_times[i] = current_shortest
    
    current_shortest = np.inf
    shortest_node = 0
    for worship_node in closest_worship_nodes[i]:
        shortest = shortest_path(sample_node, worship_node)
        if shortest < current_shortest:
            current_shortest = shortest
            shortest_node = worship_node
    worship_closest_travel_times[i] = current_shortest

100%|██████████████████████████████████████████████████████████████████████████████████| 75/75 [00:05<00:00, 13.50it/s]


In [24]:
# Get the results ready
health['food_closest_travel_times'] = food_closest_travel_times
health['physical_closest_dist'] = physical_closest_dist
health['transport_closest_dist'] = transport_closest_dist
health['education_closest_travel_times'] = education_closest_travel_times
health['worship_closest_travel_times'] = worship_closest_travel_times
health

Unnamed: 0,ZCTA5,TotalPopulation,ACCESS2_CrudePrev,ACCESS2_Crude95CI,ARTHRITIS_CrudePrev,ARTHRITIS_Crude95CI,BINGE_CrudePrev,BINGE_Crude95CI,BPHIGH_CrudePrev,BPHIGH_Crude95CI,...,TEETHLOST_CrudePrev,TEETHLOST_Crude95CI,Geolocation,zcta_x,zcta_y,food_closest_travel_times,physical_closest_dist,transport_closest_dist,education_closest_travel_times,worship_closest_travel_times
19038,77047,21077,28.4,"(26.3, 30.6)",20.8,"(20.3, 21.3)",17.6,"(17.2, 17.9)",37.0,"(36.3, 37.7)",...,18.9,"(13.9, 24.3)",POINT (-95.3865954 29.6084493),-95.386597,29.608450,506.399994,0.805984,0.814393,268.000000,0.000000
19067,77088,49660,37.6,"(36.2, 38.8)",23.3,"(23.0, 23.6)",16.0,"(15.8, 16.2)",39.6,"(39.2, 39.9)",...,23.4,"(21.6, 25.4)",POINT (-95.45356169 29.88180657),-95.453560,29.881807,287.100006,0.345974,1.839086,20.000000,55.900002
19242,77087,36399,49.2,"(47.3, 51.2)",20.3,"(20.0, 20.6)",16.7,"(16.5, 17.0)",34.4,"(34.0, 34.8)",...,25.1,"(23.1, 27.2)",POINT (-95.30343992 29.68607999),-95.303436,29.686081,149.399994,0.398267,2.335928,70.400002,39.900002
19446,77055,41989,36.5,"(34.9, 38.2)",19.2,"(18.9, 19.4)",19.0,"(18.8, 19.3)",29.8,"(29.4, 30.1)",...,16.3,"(14.4, 18.1)",POINT (-95.4917735 29.80742053),-95.491776,29.807421,120.900002,0.488102,2.790692,20.200001,39.599998
19523,77022,29557,49.9,"(47.8, 51.7)",23.3,"(23.0, 23.6)",15.6,"(15.4, 15.9)",38.5,"(38.1, 38.9)",...,30.5,"(28.0, 32.9)",POINT (-95.37699027 29.83056201),-95.376991,29.830563,0.000000,0.367838,0.199864,33.700001,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27517,77077,52151,20.5,"(19.7, 21.3)",19.4,"(19.1, 19.7)",19.1,"(19.0, 19.3)",29.2,"(28.9, 29.6)",...,8.4,"( 6.9, 10.7)",POINT (-95.6154378 29.75014795),-95.615440,29.750149,81.300003,0.625678,0.266252,88.000000,62.099998
27554,77027,14331,14.0,"(13.1, 15.2)",18.0,"(17.6, 18.4)",22.1,"(21.8, 22.4)",25.2,"(24.8, 25.7)",...,5.5,"( 4.2, 7.5)",POINT (-95.44591312 29.74019055),-95.445915,29.740191,0.000000,0.344911,0.371581,44.200001,39.299999
27918,77048,15294,29.7,"(27.4, 32.0)",26.2,"(25.6, 26.9)",14.9,"(14.6, 15.2)",45.3,"(44.5, 46.0)",...,23.9,"(19.1, 28.9)",POINT (-95.33029018 29.62042202),-95.330292,29.620422,583.000000,1.851361,2.752591,48.900002,94.800003
28037,77092,33745,40.4,"(38.4, 42.2)",21.6,"(21.3, 21.9)",17.6,"(17.4, 17.9)",33.9,"(33.5, 34.3)",...,19.9,"(18.0, 21.8)",POINT (-95.47380503 29.82965393),-95.473808,29.829655,107.500000,0.028031,4.236812,38.700001,33.400002


In [25]:
# Save the results off
health.to_csv(savename)