# cuSpatial API demo
GTC April 2023 Michael Wang and Thomson Comer


The following notebook demonstrates the use of cuSpatial to perform analytics using large datasets.

The structure of the notebook is as follows:
1. Imports
1. Read datasets: National Address Database (NAD), NYC Taxi Boroughs Polygons, 2015 NYC Taxi pickup/dropoff information with lon/lat. Also convert epsg:2263 (NYC Long Island) to WSG.
1. Convert separate lon/lat columns in DataFrames into cuspatial.GeoSeries
1. Compute street names for each pickup and dropoff

In [1]:
import cudf
import cuspatial
import geopandas
import cupy as cp
import pandas as pd
cudf.set_option("spill", False) 

In [2]:
# I/O (18GB NAD, 265 borough polygons, 13m taxi pickups and 16m taxi dropoffs.
# Possible to use cudf spillover here?

NAD = cudf.read_csv('NAD_r11.txt', usecols=[
    'State',
    'Longitude',
    'Latitude',
])
NAD = NAD[NAD['State'] == 'NY']
# Read taxi_zones.zip shapefile with GeoPandas, then convert to epsg:4326 for lon/lat
host_zones = geopandas.read_file('taxi_zones.zip')
host_lonlat = host_zones.to_crs(epsg=4326)
zones = cuspatial.from_geopandas(host_lonlat)
taxi2015 = cudf.read_csv('taxi2015.csv')

In [3]:
# Utility function to convert dataframes into GeoSeries

def make_geoseries_from_lon_lat(lon, lat):
    # Scatter the two columns into one column
    assert len(lon) == len(lat)
    xy = cudf.Series(cp.zeros(len(lon) * 2))
    xy[::2] = lon
    xy[1::2] = lat

    return cuspatial.GeoSeries(cuspatial.core._column.geocolumn.GeoColumn._from_points_xy(xy._column))


In [4]:
# Convert DataFrames to GeoSeries

pickups = make_geoseries_from_lon_lat(
    taxi2015['pickup_longitude'],
    taxi2015['pickup_latitude']
)
addresses = make_geoseries_from_lon_lat(
    NAD['Longitude'],
    NAD['Latitude']
)

In [6]:
# addresses_pip = quadtree(zones['geometry'], addresses)
addresses_pip = zones['geometry'].contains_properly(addresses, allpairs=True)
addresses_pip

Unnamed: 0,polygon_index,point_index
25184,1,5648100
25185,1,5648101
25186,2,5202801
25187,2,5202802
25188,2,5202803
...,...,...
966784,262,5368821
966785,262,5368822
966786,262,5368823
966787,262,5368824


In [15]:
# pickups_pip = quadtree(zones['geometry'], pickups)
pickups_pip = zones['geometry'].iloc[0:120].contains_properly(pickups, allpairs=True)
pickups_pip

# You can do it one of two ways: .contains_properly, or write the pip yourself.

Unnamed: 0,polygon_index,point_index
21936,0,44084
21937,0,76169
21938,0,129737
21939,0,177939
21940,0,219859
...,...,...
3408268,119,12253904
3408269,119,12574064
3408270,119,12634955
3408271,119,12666699


In [18]:
# Let's compute the practical limit for actual boroughs.
# `value_counts()` may be faster than groupby, and is easier for the reader to understand.

pickup_counts = borough_pickups.groupby('OBJECTID').count()
address_counts = borough_addresses.groupby('OBJECTID').count()
pickup_counts = pickup_counts.fillna(0)
address_counts = address_counts.fillna(0)
comparison_size = pickup_counts.sort_index()['point_index'] * address_counts.sort_index()['point_index']
zones.index = cp.arange(1, len(zones) + 1)
BOROUGH_ID = 4 # Alphabet City, Manhattan

## Tile
## Cartesian Product

Descriptions of the work that is upcoming, to prepare for `cuspatial.pairwise_point_distance`

Create a diagram showing why this is useful.

A drawing of an addresses table and a pickups table, with a line connecting two rows together and
adding the address where it belongs in the pickups table.

In [19]:
# Let's make two GeoSeries: For each borough, create a GeoSeries with all address Points
# repeated the number of times there are pickups in that borough, and another GeoSeries with
# the opposite: all pickups Points repeated the number of times there are addresses in that
# borough.

# addresses
borough_address_point_ids = borough_addresses['point_index'][borough_addresses['OBJECTID'] == BOROUGH_ID]
pickups_count = len(borough_pickups[borough_pickups['OBJECTID'] == BOROUGH_ID])
addresses_tiled = NAD.iloc[
    borough_address_point_ids
].tile(pickups_count)

# pickups
addresses_ids = borough_address_point_ids.tile(pickups_count).reset_index(drop=True)
borough_pickup_point_ids = borough_pickups['point_index'][borough_pickups['OBJECTID'] == BOROUGH_ID]
addresses_count = len(borough_addresses[borough_addresses['OBJECTID'] == BOROUGH_ID])
pickups_tiled = taxi2015[[
    'pickup_longitude',
    'pickup_latitude'
]].iloc[
    borough_pickup_point_ids
].tile(addresses_count)

pickup_points = make_geoseries_from_lon_lat(
    pickups_tiled['pickup_longitude'],
    pickups_tiled['pickup_latitude']
)
address_points = make_geoseries_from_lon_lat(
    addresses_tiled['Longitude'],
    addresses_tiled['Latitude']
)

## What is pairwise point_distance, a drawing of two tables with a line connecting each cell.

In [20]:
# get the list of addresses and their indices that are closest to a pickup point

distances = cuspatial.pairwise_point_distance(pickup_points, address_points)

gb_df = cudf.DataFrame({
    'address': addresses_tiled.index,
    'pickup': pickups_tiled.index,
    'distances': distances
})

address_indices_of_nearest = gb_df[['address', 'distances']].groupby('address').idxmin()
pickup_indices_of_nearest = gb_df[['pickup', 'distances']].groupby('pickup').idxmin()
address_pickup_minimum_correspondence = gb_df.iloc[pickup_indices_of_nearest['distances']]

# List of pickups in ascending order of distance from that address.

# We're almost there

### We have the index of the addresses and their pickups

In [21]:
NAD_Street = cudf.read_csv('NAD_r11.txt', usecols=[
    'State',
    'StN_PreDir',
    'StreetName',
    'StN_PosTyp',
    'Add_Number',
    'Addr_Type'
])
NAD_Street = NAD_Street[NAD_Street['State'] == NY]
nearest_pickups = taxi2015.iloc[address_pickup_minimum_correspondence['pickup']]
nearest_addresses = NAD_Street.loc[address_pickup_minimum_correspondence['address']]

In [22]:
# concatenate address fields

def build_address_string(NAD_Street):
    blanks = cudf.Series([' '] * len(NAD_Street))
    blanks.index = NAD_Street.index
    NAD_Street['StN_PreDir'] = NAD_Street['StN_PreDir'].fillna('')
    NAD_Street['StN_PosTyp'] = NAD_Street['StN_PosTyp'].fillna('')
    street_names = NAD_Street['Add_Number'].astype('str').str.cat(
        blanks
    ).str.cat(
        NAD_Street['StN_PreDir']
    ).str.cat(
        blanks
    ).str.cat(
        NAD_Street['StreetName']
    ).str.cat(
        blanks
    ).str.cat(
        NAD_Street['StN_PosTyp']
    ).str.cat(
        NAD_Street['Addr_Type']
    )
    return street_names.str.replace('  ', ' ')

build_address_string(nearest_addresses)

43424481    828 Woodrow RoadUnknown
43422263     95 Forest GreenUnknown
43422261     88 Forest GreenUnknown
Name: Add_Number, dtype: object

In [23]:
no_index = nearest_pickups.reset_index()
no_index['address'] = build_address_string(nearest_addresses).reset_index(drop=True)
taxi_pickups_with_address = no_index.set_index(no_index['index'])
taxi_pickups_with_address.drop('index', inplace=True, axis=1)
taxi_pickups_with_address[[
    'VendorID',
    'tpep_pickup_datetime',
    'passenger_count',
    'trip_distance',
    'RateCodeID',
    'pickup_longitude',
    'pickup_latitude',
    'fare_amount',
    'tip_amount',
    'address'
]].iloc[0:10]
#taxi_pickups_with_address.columns

Unnamed: 0_level_0,VendorID,tpep_pickup_datetime,passenger_count,trip_distance,RateCodeID,pickup_longitude,pickup_latitude,fare_amount,tip_amount,address
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1291896,2,2015-01-07 19:56:41,1,0.19,5,-74.192703,40.545837,79.38,0.0,828 Woodrow RoadUnknown
41259,2,2015-01-07 03:22:46,5,0.0,1,-74.196159,40.554604,3.0,0.0,95 Forest GreenUnknown
178089,2,2015-01-07 03:25:32,5,0.0,1,-74.196159,40.554623,2.5,0.0,88 Forest GreenUnknown


# Use cuXfilter to display these coordinates