# cuSpatial API demo
GTC April 2023 Michael Wang and Thomson Comer


The following notebook demonstrates the use of cuSpatial to perform analytics using large datasets.

The structure of the notebook is as follows:
1. Imports
2. Read datasets: National Address Database (NAD), NYC Taxi Boroughs Polygons, 2015 NYC Taxi pickup/dropoff information with lon/lat. Also convert epsg:2263 (NYC Long Island) to WSG.
3. Convert separate lon/lat columns in DataFrames into cuspatial.GeoSeries
4. Count the number of pickups and dropoffs per zone
5. Compute street names for each pickup and dropoff
6. Calculate the number of addresses per zone

In [1]:
import cudf
import cuspatial
import geopandas
import cupy as cp
import pandas as pd

In [2]:
# I/O (18GB NAD, 265 borough polygons, 7m taxi pickups and 16m taxi dropoffs.
NAD_Street = cudf.read_csv('NAD_r11.txt', usecols=[
    'State',
    'StN_PreDir',
    'StreetName',
    'StN_PosTyp',
    'Add_Number'
])
NAD = cudf.read_csv('NAD_r11.txt', usecols=[
    'State',
    'Longitude',
    'Latitude',
])
NAD = NAD[NAD['State'] == 'NY']
NAD_Street = NAD_Street[NAD_Street['State'] == 'NY']
# Read taxi_zones.zip shapefile with GeoPandas, then convert to epsg:4326 for lon/lat
host_zones = geopandas.read_file('taxi_zones.zip')
host_lonlat = host_zones.to_crs(epsg=4326)
zones = cuspatial.from_geopandas(host_lonlat)
taxi2015 = cudf.read_csv('taxi2015.csv')

In [3]:
# Utility function to convert dataframes into GeoSeries

def make_geoseries_from_lon_lat(lon, lat):
    # Scatter the two columns into one column
    assert len(lon) == len(lat)
    xy = cudf.Series(cp.zeros(len(lon) * 2))
    xy[::2] = lon
    xy[1::2] = lat

    return cuspatial.GeoSeries(cuspatial.core._column.geocolumn.GeoColumn._from_points_xy(xy._column))


In [4]:
# Convert DataFrames to GeoSeries

pickups = make_geoseries_from_lon_lat(
    taxi2015['pickup_longitude'],
    taxi2015['pickup_latitude']
)
addresses = make_geoseries_from_lon_lat(
    NAD['Longitude'],
    NAD['Latitude']
)

In [None]:
# Count the number of dropoffs and pickups per zone, one at a time.

pickup_counts = zones['geometry'].contains_properly(pickups, align=False).sum()
zones['pickup_counts'] = pickup_counts

In [5]:
def quadtree(polygons, points):
    poly_points_x = polygons.polygons.x
    poly_points_y = polygons.polygons.y
    poly_offsets = polygons.polygons.part_offset
    poly_ring_offsets = polygons.polygons.ring_offset
    test_points_x = points.points.x
    test_points_y = points.points.y
    scale = 50
    max_depth = 7
    min_size = 125
    x_max = poly_points_x.max()
    x_min = poly_points_x.min()
    y_max = poly_points_y.max()
    y_min = poly_points_y.min()
    point_indices, quadtree = cuspatial.quadtree_on_points(
        test_points_x,
        test_points_y,
        x_min,
        x_max,
        y_min,
        y_max,
        scale,
        max_depth,
        min_size,
    )
    poly_bboxes = cuspatial.polygon_bounding_boxes(
        poly_offsets, poly_ring_offsets, poly_points_x, poly_points_y
    )
    intersections = cuspatial.join_quadtree_and_bounding_boxes(
        quadtree, poly_bboxes, x_min, x_max, y_min, y_max, scale, max_depth
    )
    polygons_and_points = cuspatial.quadtree_point_in_polygon(
        intersections,
        quadtree,
        point_indices,
        test_points_x,
        test_points_y,
        poly_offsets,
        poly_ring_offsets,
        poly_points_x,
        poly_points_y,
    )
    polygons_and_points['point_index'] = point_indices.iloc[
        polygons_and_points['point_index']
    ].reset_index(drop=True)
    return polygons_and_points

In [6]:
addresses_pip = quadtree(zones['geometry'], addresses)
addresses_pip

Unnamed: 0,polygon_index,point_index
0,1,5648100
1,1,5648101
2,34,5202801
3,34,5202802
4,34,5202803
...,...,...
966784,353,5368821
966785,353,5368822
966786,353,5368823
966787,353,5368824


In [7]:
pickups_pip = quadtree(zones['geometry'].iloc[0:120], pickups)
pickups_pip

Unnamed: 0,polygon_index,point_index
0,0,44084
1,0,76169
2,0,129737
3,0,177939
4,0,219859
...,...,...
3408316,167,12253904
3408317,167,12574064
3408318,167,12634955
3408319,167,12666699


In [8]:
# a mapping from parts to polygons

def pip_result_to_id_map(polygons, pip_result):
    offsets = cp.array(polygons.polygons.geometry_offset)
    polygon_lengths = offsets[1:] - offsets[:-1]
    parts = polygons.polygons.part_offset
    polygon_map = cp.arange(len(polygon_lengths)).repeat(polygon_lengths.tolist())
    idx_df = cudf.DataFrame({
        'OBJECTID': polygon_map,
        'polygon_index': cp.arange(len(parts)-1)
    })
    return pip_result.merge(idx_df, on="polygon_index").drop('polygon_index', axis=1)
borough_addresses = pip_result_to_id_map(zones['geometry'], addresses_pip)
borough_pickups = pip_result_to_id_map(zones['geometry'], pickups_pip)
borough_addresses

Unnamed: 0,point_index,OBJECTID
0,5209351,2
1,5209352,2
2,5209353,2
3,5209354,2
4,5209355,2
...,...,...
966784,5368821,262
966785,5368822,262
966786,5368823,262
966787,5368824,262


In [9]:
# Let's compute the practical limit for actual boroughs.
pickup_counts = borough_pickups.groupby('OBJECTID').count()
address_counts = borough_addresses.groupby('OBJECTID').count()
pickup_counts = pickup_counts.fillna(0)
address_counts = address_counts.fillna(0)
comparison_size = pickup_counts.sort_index()['point_index'] * address_counts.sort_index()['point_index']
comparison_size.unique()

0           <NA>
1              1
2             28
3             35
4             36
         ...    
110    265547996
111    396374132
112    414797526
113    640879498
114    931986782
Name: point_index, Length: 115, dtype: int32

In [36]:
# Let's make two GeoSeries: For each borough, create a GeoSeries with all address Points
# repeated the number of times there are pickups in that borough, and another GeoSeries with
# the opposite: all pickups Points repeated the number of times there are addresses in that
# borough.

# addresses
borough_address_point_ids = borough_addresses['point_index'][borough_addresses['OBJECTID'] == 2]
pickups_count = len(borough_pickups[borough_pickups['OBJECTID'] == 2])
addresses_tiled = NAD.iloc[
    borough_address_point_ids
].tile(pickups_count)

# pickups
addresses_ids = borough_address_point_ids.tile(pickups_count).reset_index(drop=True)
borough_pickup_point_ids = borough_pickups['point_index'][borough_pickups['OBJECTID'] == 2]
addresses_count = len(borough_addresses[borough_addresses['OBJECTID'] == 2])
pickups_tiled = taxi2015[[
    'pickup_longitude',
    'pickup_latitude'
]].iloc[
    borough_pickup_point_ids
].tile(addresses_count)

# map of pickup ids so we can reconstruct which are the closets
pickups_ids = borough_pickup_point_ids.tile(addresses_count).reset_index(drop=True)

pickup_points = make_geoseries_from_lon_lat(
    pickups_tiled['pickup_longitude'],
    pickups_tiled['pickup_latitude']
)
address_points = make_geoseries_from_lon_lat(
    addresses_tiled['Longitude'],
    addresses_tiled['Latitude']
)
addresses_tiled

Unnamed: 0,State,Longitude,Latitude
42741776,NY,-73.859091,40.868947
42741777,NY,-73.859002,40.869004
42741778,NY,-73.859340,40.869020
42741779,NY,-73.859323,40.869150
42741780,NY,-73.859682,40.869085
...,...,...,...
42739134,NY,-73.847768,40.869917
42739135,NY,-73.847419,40.869652
42739136,NY,-73.847767,40.869841
42739137,NY,-73.848233,40.869519


In [55]:
# get the list of addresses and their indices that are closest to a pickup point

distances = cuspatial.pairwise_point_distance(pickup_points, address_points)

pickups_indices = cp.arange((borough_pickups['OBJECTID'] == 2).sum())
addresses_indices = cudf.Series(cp.arange((borough_addresses['OBJECTID'] == 2).sum()))
pickups_index_map = pickups_indices.repeat((borough_addresses['OBJECTID'] ==2).sum())
address_index_map = addresses_indices.tile((borough_pickups['OBJECTID'] ==2).sum())
gb_df = cudf.DataFrame({
    'address': addresses_tiled.index,
    'pickup': pickups_tiled.index,
    'distances': distances
}) 
address_indices_of_nearest = gb_df[['address', 'distances']].groupby('address').idxmin()
pickup_indices_of_nearest = gb_df[['pickup', 'distances']].groupby('pickup').idxmin()
address_pickup_minimum_correspondence = gb_df.iloc[pickup_indices_of_nearest['distances']]
address_pickup_minimum_correspondence

Unnamed: 0,address,pickup,distances
70426,42742477,12253546,0.000268
79911,42742156,2902909,0.000557
79882,42741303,3499198,0.000403
69134,42739089,11924673,0.000227
59560,42738181,10678598,0.00124
59017,42739040,10078590,0.000433
62486,42737336,5004763,0.000201
67707,42743798,5340190,0.000547
76471,42736058,5858291,0.000187
70473,42735262,6452048,0.000254


In [60]:
nearest_pickups = taxi2015.iloc[address_pickup_minimum_correspondence['pickup']]
nearest_addresses = NAD_Street.loc[address_pickup_minimum_correspondence['address']]

Unnamed: 0,State,StN_PreDir,StreetName,StN_PosTyp,Add_Number
42742477,NY,,Allerton,Avenue,1079
42742156,NY,,Paulding,Avenue,2715
42741303,NY,,Hone,Avenue,2940
42739089,NY,East,Gun Hill,Road,1392
42738181,NY,,Woodhull,Avenue,2766
42739040,NY,,Morgan,Avenue,2931
42737336,NY,,Allerton,Avenue,1339
42743798,NY,,Laconia,Avenue,2290
42736058,NY,,Lodovick,Avenue,2317
42735262,NY,,Bruner,Avenue,2315


In [74]:
# concatenate address fields

def build_address_string(NAD_Street):
    blanks = cudf.Series([' '] * len(NAD_Street))
    blanks.index = NAD_Street.index
    NAD_Street['StN_PreDir'] = NAD_Street['StN_PreDir'].fillna('')
    NAD_Street['StN_PosTyp'] = NAD_Street['StN_PosTyp'].fillna('')
    street_names = NAD_Street['Add_Number'].astype('str').str.cat(
        blanks
    ).str.cat(
        NAD_Street['StN_PreDir']
    ).str.cat(
        blanks
    ).str.cat(
        NAD_Street['StreetName']
    ).str.cat(
        blanks
    ).str.cat(
        NAD_Street['StN_PosTyp']
    )
    return street_names.str.replace('  ', ' ')

build_address_string(nearest_addresses)

42742477       1079 Allerton Avenue
42742156       2715 Paulding Avenue
42741303           2940 Hone Avenue
42739089    1392 East Gun Hill Road
42738181       2766 Woodhull Avenue
42739040         2931 Morgan Avenue
42737336       1339 Allerton Avenue
42743798        2290 Laconia Avenue
42736058       2317 Lodovick Avenue
42735262         2315 Bruner Avenue
42735405    1990 East Gun Hill Road
42741639           930 Burke Avenue
42740972       3233 Pearsall Avenue
42737851       1502 Allerton Avenue
42739205         2922 Wilson Avenue
42740843         3252 Hering Avenue
42735977        2455 Wickham Avenue
42742418        2573 Laconia Avenue
42735423    1770 East Gun Hill Road
42735422           1875 Mace Avenue
42735478       1654 Allerton Avenue
42741753           2955 Boston Road
42737034         2450 Morgan Avenue
42735403            2401 Ely Avenue
42741306           2990 Boston Road
42743776            2222 Esplanade 
42742441        2702 Laconia Avenue
42735414        2498 Delanoy

In [86]:
no_index = nearest_pickups.reset_index()
no_index['addresses'] = build_address_string(nearest_addresses).reset_index(drop=True)
taxi_pickups_with_address = no_index.set_index(no_index['index'])
taxi_pickups_with_address
zones[zones['OBJECTID' == 2]]

KeyError: False

In [None]:
# The number of addresses per borough

address_counts = borough_addresses.groupby('OBJECTID').count()
count_df = cudf.DataFrame({
    'OBJECTID': address_counts.index,
    "Address Count": address_counts.polygon_index
})

In [None]:
# Add the address counts back to the zones dataframe

# Cudf doesn't know how to print `geometry` columns, so put it back into cuspatial
merged_zones = cuspatial.GeoDataFrame(zones.merge(count_df))
merged_zones.head()

In [None]:
# We have the street names in NAD above, let's add the street name to each pickup

borough_addresses
print(NAD.head())
print(taxi2015.head())
print(pickups.head())