In [14]:
import pandas as pd
from shapely.wkt import loads

# source 2018 yellow cab taxi rides (112M lines; may sample down to 1M for convenience):
# https://data.cityofnewyork.us/Transportation/2018-Yellow-Taxi-Trip-Data/t29m-gskq
# source taxi zones:
# https://data.cityofnewyork.us/Transportation/NYC-Taxi-Zones/d3c5-ddgc
rides_df = pd.read_csv("./taxis.csv", delimiter=";", parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"])
zones_df = pd.read_csv("./taxi_zones.csv")

# generate a representative geo point for each taxi ride's pick-up and drop-off location, so that 
# we can linearize by geospace
representativeX = []
representativeY = []

for i, zone in enumerate(zones_df["the_geom"]):
  representative = loads(zone).representative_point()
  representativeX += [representative.xy[0][0]]
  representativeY += [representative.xy[1][0]]

zones_df = zones_df[["zone", "LocationID"]].set_index("LocationID")
zones_df["representativeX"] = representativeX
zones_df["representativeY"] = representativeY

# for convenience, append the representatives as columns to the rides table via join
zones_df = zones_df.rename(columns={
  "zone": "PUZone", 
  "representativeX": "PURepresentativeX", 
  "representativeY": "PURepresentativeY"
})
rides_df = rides_df.join(zones_df, on="PULocationID")
zones_df = zones_df.rename(columns={
  "PUZone": "DOZone", 
  "PURepresentativeX": "DORepresentativeX", 
  "PURepresentativeY": "DORepresentativeY"
})
rides_df = rides_df.join(zones_df, on="DOLocationID", rsuffix="DO")

# write the update data back to the file
rides_df.to_csv("./taxisData.csv", sep=";", index=False)

In [27]:
import pymorton as pm
import numpy as np

pos = rides_df[["PURepresentativeX", "PURepresentativeY"]]
hashes = pos.apply(lambda row: pm.interleave_latlng(row[0], row[1]), axis=1)
np.argsort(hashes)

0         164430
1         204086
2         338771
3         338763
4         711583
           ...  
999995    265693
999996    786733
999997    197280
999998    624809
999999    723491
Length: 1000343, dtype: int64