In [1]:
import h5py
import osmnx as ox
import pandas as pd
import numpy as np
import networkx as nx
from tqdm import tqdm
from itertools import product
from functools import cache
from scipy.spatial import KDTree
from haversine import haversine
from shapely.geometry import Point

In [2]:
tractcenters = pd.read_csv("ny_tracts.csv", dtype=str)
tractcenters['CountyFIPS'] = tractcenters['STATEFP']+tractcenters['COUNTYFP']
tractcenters['TractFIPS'] = tractcenters['STATEFP']+tractcenters['COUNTYFP']+tractcenters['TRACTCE']
tractcenters = tractcenters[tractcenters['CountyFIPS'].isin(['36081', '36061', '36005', '36047', '36085'])]
#tractcenters = tractcenters.astype({'TractFIPS':'int'})
tracts = tractcenters['TractFIPS'].to_numpy()
tractcenters.drop(['STATEFP','COUNTYFP', 'TRACTCE'], inplace=True, axis=1)
tractcenters.head()

Unnamed: 0,POPULATION,LATITUDE,LONGITUDE,CountyFIPS,TractFIPS
88,11091,40.793356,-73.881698,36005,36005000100
89,4334,40.811766,-73.859826,36005,36005000200
90,5503,40.810343,-73.853387,36005,36005000400
91,5643,40.81975,-73.859176,36005,36005001600
92,1917,40.804472,-73.918385,36005,36005001900


In [3]:
lats = tractcenters.copy()['LATITUDE'].astype(float).to_numpy()
lons = tractcenters.copy()['LONGITUDE'].astype(float).to_numpy()
tractfips = tractcenters.copy()['TractFIPS'].to_numpy()

In [4]:
# Referenced:
# https://towardsdatascience.com/finding-time-dependent-travel-times-between-every-pair-of-locations-in-manhattan-c3c48b0db7ba
# https://towardsdatascience.com/shortest-path-algorithm-with-osm-walking-network-6d2863ae96be
# https://osmnx.readthedocs.io/en/stable/osmnx.html and https://github.com/gboeing/osmnx
# https://movement.uber.com/?lang=en-US
place = 'New York, New York'
savename = r'tracts_NYC.csv'
speed_path = '2020_speeds_new_york.csv'

In [5]:
# Get the graph and the speeds associated with all edges
graph = ox.graph_from_place(place, network_type='drive')
print('Got graph')
graph = ox.add_edge_speeds(graph)
print('Got speeds')
graph = ox.add_edge_travel_times(graph)
print('Got travel times')
graph = ox.utils_graph.get_largest_component(graph, strongly=True)
print('Got largest connected component')
#ox.save_graphml(graph, r'C:\Users\willd\Documents\Georgia Tech\CSE6424\Project\new_york_metro_area_cleaned.graphml')
#graph = ox.load_graphml(r'C:\Users\willd\Documents\Georgia Tech\CSE6424\Project\new_york_metro_area_cleaned.graphml')

  gdf = gdf.append(_geocode_query_to_gdf(q, wr, by_osmid))


Got graph
Got speeds
Got travel times
Got largest connected component


In [6]:
# Find all food stores
food_tags = {'shop': 'supermarket', 'amenity': 'marketplace'}
food_places = ox.geometries_from_place(place, food_tags)

  gdf = gdf.append(_geocode_query_to_gdf(q, wr, by_osmid))


In [7]:
# Replace Polygons with a single point
food_places.loc[food_places['geometry'].type == 'Polygon', 'geometry'] = food_places.loc[food_places['geometry'].type == 'Polygon', 'geometry'].representative_point()

In [8]:
# Find all major green places or recreational areas
physical_tags = {'leisure': ['park', 'recreation_ground', 'playground', 'fitness_station', 'sports_centre', 'nature_reserve', 'pitch']}
physical_places = ox.geometries_from_place(place, physical_tags)
physical_places.loc[physical_places['geometry'].type == 'Polygon', 'geometry'] = physical_places.loc[physical_places['geometry'].type == 'Polygon', 'geometry'].representative_point()

  gdf = gdf.append(_geocode_query_to_gdf(q, wr, by_osmid))
  for merged_outer_linestring in list(merged_outer_linestrings):
  for merged_outer_linestring in list(merged_outer_linestrings):


In [9]:
# Find public transit
transport_tags = {'public_transport': ['platform', 'stop_position'], 'highway': ['bus_stop', 'platform'],
                 'railway': ['subway_entrance', 'station', 'tram', 'tram_stop'], 'station': 'subway'}
transport_places = ox.geometries_from_place(place, transport_tags)
transport_places.loc[transport_places['geometry'].type == 'Polygon', 'geometry'] = transport_places.loc[transport_places['geometry'].type == 'Polygon', 'geometry'].representative_point()

  gdf = gdf.append(_geocode_query_to_gdf(q, wr, by_osmid))


In [10]:
# Find libraries and schools
education_tags = {'amenity': ['library', 'school', 'kindergarten']}
education_places = ox.geometries_from_place(place, education_tags)
education_places.loc[education_places['geometry'].type == 'Polygon', 'geometry'] = education_places.loc[education_places['geometry'].type == 'Polygon', 'geometry'].representative_point()

  gdf = gdf.append(_geocode_query_to_gdf(q, wr, by_osmid))
  for merged_outer_linestring in list(merged_outer_linestrings):
  for merged_outer_linestring in list(merged_outer_linestrings):


In [11]:
# Find places of worship
worship_tags = {'amenity': 'place_of_worship'}
worship_places = ox.geometries_from_place(place, worship_tags)
worship_places.loc[worship_places['geometry'].type == 'Polygon', 'geometry'] = worship_places.loc[worship_places['geometry'].type == 'Polygon', 'geometry'].representative_point()

  gdf = gdf.append(_geocode_query_to_gdf(q, wr, by_osmid))
  for merged_outer_linestring in list(merged_outer_linestrings):
  for merged_outer_linestring in list(merged_outer_linestrings):


In [12]:
# Simplify everything
food_places = food_places['geometry'].droplevel(0)
physical_places = physical_places['geometry'].droplevel(0)
transport_places = transport_places['geometry'].droplevel(0)
education_places = education_places['geometry'].droplevel(0)
worship_places = worship_places['geometry'].droplevel(0)
food_places = food_places[food_places.type == 'Point']
physical_places = physical_places[physical_places.type == 'Point']
transport_places = transport_places[transport_places.type == 'Point']
education_places = education_places[education_places.type == 'Point']
worship_places = worship_places[worship_places.type == 'Point']

In [13]:
speed_raw = pd.read_csv(speed_path)

In [14]:
# Remove unnecessary columns
speed_raw.drop(columns=['quarter', 'year', 'segment_id', 'start_junction_id', 'end_junction_id'], inplace=True)
# Noon seems like a fair time
speed_raw = speed_raw[speed_raw['hour_of_day'] == 12]
speed_raw.set_index('osm_way_id', drop=True, inplace=True)

In [15]:
real_calculation = 0
for edge in tqdm(graph.edges):
   # length is meters, speed_kph is kph (duh), maxspeed has units in string, and travel_time is seconds
    e = graph[edge[0]][edge[1]][edge[2]]
    if isinstance(e['osmid'], list):
       # Some graph edges are made up of multiple OSM ways apparently
       for osmid in e['osmid']:
            try:
                meters_per_second = speed_raw.at[osmid, 'speed_mph_mean']*0.44704    # Convert to meters/sec
                time = e['length']/meters_per_second
                real_calculation += 1
                break
            except (KeyError, ZeroDivisionError):
                time = e['travel_time']     # Backup (i.e. length/speed limit) if Uber data isn't available
    else:
        try:
            meters_per_second = speed_raw.at[e['osmid'], 'speed_mph_mean']*0.44704
            time = e['length']/meters_per_second
            real_calculation += 1
        except (KeyError, ZeroDivisionError):
            time = e['travel_time']
    if isinstance(time, pd.Series):
       # TODO Bug check why this is happening, but not late at night
        time = time.mean()
    graph[edge[0]][edge[1]][edge[2]]['actual_travel_time'] = time

100%|█████████████████████████████████| 139975/139975 [00:28<00:00, 4895.40it/s]


In [16]:
#health = pd.read_csv(r'C:\Users\willd\Documents\Georgia Tech\CSE6424\Project\PLACES__Census_Tract_Data__GIS_Friendly_Format___2021_release.csv')
health = pd.read_csv(r'PLACES__Census_Tract_Data__GIS_Friendly_Format___2021_release.csv')

In [17]:
health

Unnamed: 0,StateAbbr,StateDesc,CountyName,CountyFIPS,TractFIPS,TotalPopulation,ACCESS2_CrudePrev,ACCESS2_Crude95CI,ARTHRITIS_CrudePrev,ARTHRITIS_Crude95CI,...,OBESITY_Crude95CI,PHLTH_CrudePrev,PHLTH_Crude95CI,SLEEP_CrudePrev,SLEEP_Crude95CI,STROKE_CrudePrev,STROKE_Crude95CI,TEETHLOST_CrudePrev,TEETHLOST_Crude95CI,Geolocation
0,AZ,Arizona,Maricopa,4013,4013422643,5789,11.9,"(10.1, 14.0)",17.0,"(16.1, 17.9)",...,"(26.9, 29.7)",9.2,"( 8.2, 10.2)",34.5,"(33.0, 36.3)",1.8,"( 1.6, 2.0)",8.0,"( 5.2, 11.6)",POINT (-111.61853 33.35726769)
1,CA,California,Sacramento,6067,6067007402,6180,15.4,"(13.5, 17.3)",24.6,"(23.8, 25.3)",...,"(29.6, 31.4)",15.1,"(14.2, 16.2)",35.7,"(34.8, 36.7)",3.9,"( 3.6, 4.3)",18.2,"(13.7, 23.5)",POINT (-121.3791473 38.6869681)
2,AL,Alabama,Madison,1089,1089000201,760,25.4,"(21.2, 30.1)",36.0,"(34.6, 37.3)",...,"(46.6, 49.7)",22.5,"(20.4, 24.7)",50.3,"(48.8, 51.3)",7.6,"( 6.8, 8.6)",33.3,"(24.1, 43.8)",POINT (-86.55005486 34.77465775)
3,AL,Alabama,Montgomery,1101,1101002202,1185,25.2,"(21.2, 29.4)",36.1,"(35.0, 37.3)",...,"(45.3, 47.9)",20.7,"(19.1, 22.7)",49.7,"(48.3, 51.1)",7.2,"( 6.4, 8.0)",34.0,"(25.6, 42.0)",POINT (-86.30555503 32.31774882)
4,AL,Alabama,Butler,1013,1013952800,1394,14.3,"(11.9, 17.1)",36.0,"(34.5, 37.4)",...,"(32.3, 35.2)",15.2,"(13.5, 16.9)",35.7,"(33.9, 37.5)",4.2,"( 3.6, 4.8)",15.1,"( 8.9, 22.9)",POINT (-86.62833756 31.83774723)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72332,WI,Wisconsin,Milwaukee,55079,55079120101,3953,9.0,"( 7.2, 11.1)",30.4,"(28.9, 31.9)",...,"(28.4, 31.5)",10.3,"( 9.0, 11.8)",30.0,"(28.2, 31.3)",3.0,"( 2.5, 3.6)",8.6,"( 4.3, 15.3)",POINT (-88.04807056 42.95920626)
72333,WI,Wisconsin,Barron,55005,55005000300,4215,12.1,"(11.1, 13.2)",30.8,"(30.3, 31.4)",...,"(34.9, 36.4)",13.7,"(13.1, 14.3)",33.4,"(32.6, 34.3)",3.7,"( 3.5, 3.9)",14.4,"(12.2, 16.9)",POINT (-92.03012814 45.44804305)
72334,WI,Wisconsin,Ozaukee,55089,55089620100,6079,9.9,"( 8.5, 11.4)",25.3,"(24.3, 26.1)",...,"(33.7, 36.4)",10.8,"( 9.9, 11.7)",32.5,"(31.2, 33.6)",2.7,"( 2.5, 2.9)",10.8,"( 8.3, 13.6)",POINT (-87.98059035 43.41140163)
72335,WI,Wisconsin,Waushara,55137,55137960800,4515,15.8,"(13.8, 17.8)",31.9,"(30.9, 32.9)",...,"(38.3, 40.7)",15.5,"(14.3, 16.7)",34.0,"(33.1, 35.5)",4.4,"( 4.0, 4.8)",18.6,"(13.4, 23.7)",POINT (-89.31255284 44.06818511)


In [18]:
all_nearest_nodes, dists = ox.distance.nearest_nodes(graph, lons, lats, return_dist=True)
# If it's more than 1km away from anything in the graph, then we probably don't want it
all_nearest_nodes = np.asarray(all_nearest_nodes)#[np.asarray(dists) < 1700]
#health = health[np.asarray(dists) < 1700]
print('Number:', all_nearest_nodes.shape[0])
food_closest_travel_times = np.full(all_nearest_nodes.shape[0], np.nan, dtype=np.float32)
physical_closest_dist = food_closest_travel_times.copy()
transport_closest_dist = food_closest_travel_times.copy()
education_closest_travel_times = food_closest_travel_times.copy()
worship_closest_travel_times = food_closest_travel_times.copy()

Number: 2168


In [27]:
food_nodes, dists = ox.distance.nearest_nodes(graph, [x.x for x in food_places], [x.y for x in food_places], return_dist=True)
print('Found food', 'Number within 1km:', np.count_nonzero(np.asarray(dists) < 17000000), 'Number total:', len(food_nodes))
food_nodes = np.asarray(food_nodes)[np.asarray(dists) < 17000000]
physical_nodes, dists = ox.distance.nearest_nodes(graph, [x.x for x in physical_places], [x.y for x in physical_places], return_dist=True)
print('Found physical', 'Number within 1km:', np.count_nonzero(np.asarray(dists) < 1700000), 'Number total:', len(physical_nodes))
physical_nodes = np.asarray(physical_nodes)[np.asarray(dists) < 1700000]
transport_nodes, dists = ox.distance.nearest_nodes(graph, [x.x for x in transport_places], [x.y for x in transport_places], return_dist=True)
print('Found transport', 'Number within 1km:', np.count_nonzero(np.asarray(dists) < 17000000), 'Number total:', len(transport_nodes))
transport_nodes = np.asarray(transport_nodes)[np.asarray(dists) < 1700000]
education_nodes, dists = ox.distance.nearest_nodes(graph, [x.x for x in education_places], [x.y for x in education_places], return_dist=True)
print('Found education', 'Number within 1km:', np.count_nonzero(np.asarray(dists) < 1700000), 'Number total:', len(education_nodes))
education_nodes = np.asarray(education_nodes)[np.asarray(dists) < 1700000]
worship_nodes, dists = ox.distance.nearest_nodes(graph, [x.x for x in worship_places], [x.y for x in worship_places], return_dist=True)
print('Found worship', 'Number within 1km:', np.count_nonzero(np.asarray(dists) < 1700000), 'Number total:', len(worship_nodes))
worship_nodes = np.asarray(worship_nodes)[np.asarray(dists) < 17000000]

Found food Number within 1km: 914 Number total: 914
Found physical Number within 1km: 7927 Number total: 7927
Found transport Number within 1km: 16487 Number total: 16487
Found education Number within 1km: 1978 Number total: 1978
Found worship Number within 1km: 2488 Number total: 2488


In [28]:
food_lat_lons = np.array([[graph.nodes[x]['x'] for x in food_nodes], [graph.nodes[x]['y'] for x in food_nodes]], dtype=np.float32).T
physical_lat_lons = np.array([[graph.nodes[x]['x'] for x in physical_nodes], [graph.nodes[x]['y'] for x in physical_nodes]], dtype=np.float32).T
transport_lat_lons = np.array([[graph.nodes[x]['x'] for x in transport_nodes], [graph.nodes[x]['y'] for x in transport_nodes]], dtype=np.float32).T
education_lat_lons = np.array([[graph.nodes[x]['x'] for x in education_nodes], [graph.nodes[x]['y'] for x in education_nodes]], dtype=np.float32).T
worship_lat_lons = np.array([[graph.nodes[x]['x'] for x in worship_nodes], [graph.nodes[x]['y'] for x in worship_nodes]], dtype=np.float32).T

In [29]:
food_tree = KDTree(food_lat_lons)
physical_tree = KDTree(physical_lat_lons)
transport_tree = KDTree(transport_lat_lons)
education_tree = KDTree(education_lat_lons)
worship_tree = KDTree(worship_lat_lons)

In [30]:
def haversine_wrapper(lat1, lon1, lat2, lon2):
    return haversine((lat1, lon1), (lat2, lon2))

In [31]:
closest_food_nodes = []
closest_education_nodes = []
closest_worship_nodes = []
for i, sample_node in tqdm(enumerate(all_nearest_nodes), total=len(all_nearest_nodes)):
    idxes = food_tree.query((graph.nodes[sample_node]['x'], graph.nodes[sample_node]['y']), k=5)[1]
    closest_food_nodes.append(food_nodes[idxes])
    idx = physical_tree.query((graph.nodes[sample_node]['x'], graph.nodes[sample_node]['y']), k=1)[1]
    dist = haversine_wrapper(graph.nodes[sample_node]['y'], graph.nodes[sample_node]['x'], physical_lat_lons[idx, 1], physical_lat_lons[idx, 0])
    physical_closest_dist[i] = dist
    idx = transport_tree.query((graph.nodes[sample_node]['x'], graph.nodes[sample_node]['y']), k=1)[1]
    dist = haversine_wrapper(graph.nodes[sample_node]['y'], graph.nodes[sample_node]['x'], transport_lat_lons[idx, 1], transport_lat_lons[idx, 0])
    transport_closest_dist[i] = dist
    idxes = education_tree.query((graph.nodes[sample_node]['x'], graph.nodes[sample_node]['y']), k=5)[1]
    closest_education_nodes.append(education_nodes[idxes])
    idxes = worship_tree.query((graph.nodes[sample_node]['x'], graph.nodes[sample_node]['y']), k=5)[1]
    closest_worship_nodes.append(worship_nodes[idxes])

100%|█████████████████████████████████████| 2168/2168 [00:00<00:00, 7310.44it/s]


In [32]:
@cache    # Trying to speed things up a little
def shortest_path(source, target):
    return nx.shortest_path_length(graph, source=source, target=target, weight='actual_travel_time')

In [33]:
for i, sample_node in enumerate(tqdm(all_nearest_nodes)):
    current_shortest = np.inf
    shortest_node = 0
    for food_node in closest_food_nodes[i]:
        shortest = shortest_path(sample_node, food_node)
        if shortest < current_shortest:
            current_shortest = shortest
            shortest_node = food_node
    food_closest_travel_times[i] = current_shortest
    
    current_shortest = np.inf
    shortest_node = 0
    for education_node in closest_education_nodes[i]:
        shortest = shortest_path(sample_node, education_node)
        if shortest < current_shortest:
            current_shortest = shortest
            shortest_node = education_node
    education_closest_travel_times[i] = current_shortest
    
    current_shortest = np.inf
    shortest_node = 0
    for worship_node in closest_worship_nodes[i]:
        shortest = shortest_path(sample_node, worship_node)
        if shortest < current_shortest:
            current_shortest = shortest
            shortest_node = worship_node
    worship_closest_travel_times[i] = current_shortest

100%|██████████████████████████████████████| 2168/2168 [00:18<00:00, 118.19it/s]


In [36]:
deserts_wtracts = np.vstack((food_closest_travel_times, physical_closest_dist, transport_closest_dist, education_closest_travel_times,
                worship_closest_travel_times, tracts)).T
columns = ['food_closest_travel_times', 'physical_closest_dist','transport_closest_dist', 
           'education_closest_travel_times', 'worship_closest_travel_times', 'GEOID']
desert_measures = pd.DataFrame(deserts_wtracts, columns=columns)


In [38]:
desert_measures.to_csv('nyc_desert_tracts.csv', index=False)