In [1]:
import pandas as pd
from shapely.wkt import loads

# source 2018 yellow cab taxi rides (112M lines; may sample down to 1M for convenience):
# https://data.cityofnewyork.us/Transportation/2018-Yellow-Taxi-Trip-Data/t29m-gskq
# source taxi zones:
# https://data.cityofnewyork.us/Transportation/NYC-Taxi-Zones/d3c5-ddgc
rides_df = pd.read_csv("./taxis.csv", delimiter=";", parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"])
zones_df = pd.read_csv("./taxi_zones.csv")

# generate a representative geo point for each taxi ride's pick-up and drop-off location, so that 
# we can linearize by geospace
representativeX = []
representativeY = []

for i, zone in enumerate(zones_df["the_geom"]):
  representative = loads(zone).representative_point()
  representativeX += [representative.xy[0][0]]
  representativeY += [representative.xy[1][0]]

zones_df = zones_df[["zone", "LocationID"]].set_index("LocationID")
zones_df["representativeX"] = representativeX
zones_df["representativeY"] = representativeY

# for convenience, append the representatives as columns to the rides table via join
zones_df = zones_df.rename(columns={
  "zone": "PUZone", 
  "representativeX": "PURepresentativeX", 
  "representativeY": "PURepresentativeY"
})
rides_df = rides_df.join(zones_df, on="PULocationID")
zones_df = zones_df.rename(columns={
  "PUZone": "DOZone", 
  "PURepresentativeX": "DORepresentativeX", 
  "PURepresentativeY": "DORepresentativeY"
})
rides_df = rides_df.join(zones_df, on="DOLocationID", rsuffix="DO")

# filter out NaN values
rides_df = rides_df[
  rides_df["PURepresentativeX"].notnull() & rides_df["PURepresentativeY"].notnull()
]

# write the update data back to the file
rides_df.to_csv("./taxisData.csv", sep=";", index=False)

Unnamed: 0,tripID,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,...,tip_amount,tolls_amount,improvement_surcharge,total_amount,PUZone,PURepresentativeX,PURepresentativeY,DOZone,DORepresentativeX,DORepresentativeY
0,31852922,2,2018-07-18 06:17:11,2018-07-18 06:23:06,2,2.00,1,N,249,161,...,1.66,0.0,0.3,9.96,West Village,-74.002497,40.734611,Midtown Center,-73.977432,40.758226
1,16785706,2,2018-01-09 02:35:11,2018-01-09 02:49:07,1,5.10,5,N,125,265,...,15.32,10.5,0.3,76.62,Hudson Sq,-74.007176,40.725376,,,
2,66379394,2,2018-11-24 17:07:34,2018-11-24 17:13:47,2,1.03,1,N,239,238,...,0.00,0.0,0.3,6.80,Upper West Side South,-73.978273,40.784107,Upper West Side North,-73.972814,40.791766
3,10428271,2,2018-06-15 08:41:34,2018-06-15 08:47:41,1,0.82,1,N,249,114,...,1.70,0.0,0.3,8.50,West Village,-74.002497,40.734611,Greenwich Village South,-73.998678,40.728612
4,23940933,1,2018-02-02 07:43:08,2018-02-02 07:50:18,2,1.80,1,N,239,166,...,0.42,0.0,0.3,8.72,Upper West Side South,-73.978273,40.784107,Morningside Heights,-73.961815,40.809570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,48567407,2,2018-09-20 22:37:20,2018-09-20 22:46:31,4,1.90,1,N,141,142,...,1.96,0.0,0.3,11.76,Lenox Hill West,-73.959713,40.766839,Lincoln Square East,-73.981352,40.773906
999996,67297900,2,2018-11-28 11:09:59,2018-11-28 11:24:48,6,1.78,1,N,246,164,...,0.00,0.0,0.3,11.80,West Chelsea/Hudson Yards,-74.004513,40.752437,Midtown South,-73.985929,40.748808
999997,19204703,1,2018-01-17 11:41:44,2018-01-17 11:48:00,1,0.70,1,N,162,237,...,2.00,0.0,0.3,8.80,Midtown East,-73.972145,40.756816,Upper East Side South,-73.965691,40.768542
999998,53762823,4,2018-10-10 07:51:14,2018-10-10 07:56:34,1,0.84,1,N,249,125,...,1.26,0.0,0.3,7.56,West Village,-74.002497,40.734611,Hudson Sq,-74.007176,40.725376


In [27]:
import pymorton as pm
import numpy as np

# testing the pymorton sort function
pos = rides_df[["PURepresentativeX", "PURepresentativeY"]]
hashes = pos.apply(lambda row: pm.interleave_latlng(row[0], row[1]), axis=1)
np.argsort(hashes)

0         164430
1         204086
2         338771
3         338763
4         711583
           ...  
999995    265693
999996    786733
999997    197280
999998    624809
999999    723491
Length: 1000343, dtype: int64