# Analyze NYC Taxi RideShare Data

The RideShare data consists of the following companies:  Juno, Uber, Via, Lyft.

In [1]:
from datetime import timedelta
import os
import xgt

# For cloud instances, replace the localhost with the instance's IP address or use ssh tunneling
server = xgt.Connection(host='localhost', auth=xgt.BasicAuth('xgtd'))
server.set_default_namespace('nyctaxi')

In [2]:
# Select one month of NYC Taxi data to explore
try:
  year = int(input("Enter year (2019 to 2022)") or "2020")
  month = int(input("Enter month number (1 to 12)") or "6")
except:
  year = 2020
  month = 6

# Establish URLs for the selected month
fhv_trip_url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_{year}-{int(month):02d}.parquet"
fhvhv_trip_url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_{year}-{int(month):02d}.parquet"

Enter year (2019 to 2022)2020
Enter month number (1 to 12)6


## Establish graph schema

Trovares xGT is a "schema-first" graph application.
The schemas are created from within the Python environment as shown below.

In [3]:
# Establish the schemas for the NYC Taxi dataset as a graph.
# Locations are the nodes of the graph, and each rideshare trip is an edge.
server.drop_frame('RideShare')
server.drop_frame('Locations')

locations = server.create_vertex_frame(
    name='Locations',
    schema=[['id', xgt.INT]],
    key='id')

rideshare_trip = server.create_edge_frame(
    name='RideShare',
    schema=[
          ['hvfhs_license_num', xgt.TEXT],
          ['dispatching_base_num', xgt.TEXT],
          ['pickup_datetime', xgt.DATETIME],
          ['dropoff_datetime', xgt.DATETIME],
          ['PULocationID', xgt.INT],
          ['DOLocationID', xgt.INT],
          ['SR_Flag', xgt.TEXT],
        ],
    source=locations,
    target=locations,
    source_key='PULocationID',
    target_key='DOLocationID',
)

In [4]:
# Ingest data from the selected month.
rideshare_trip.load(fhvhv_trip_url, frame_to_input_column_mapping =
    {'hvfhs_license_num' : 'hvfhs_license_num',  'dispatching_base_num' : 'dispatching_base_num',
     'pickup_datetime' : 'pickup_datetime', 'dropoff_datetime' : 'dropoff_datetime',
     'PULocationID' : 'PULocationID', 'DOLocationID' : 'DOLocationID', 'SR_Flag' : 'shared_request_flag'})

print(f"Location ID count: {locations.num_rows:,}")
print(f"Rideshare Trip edges: {rideshare_trip.num_rows:,}")
max_memory = server.max_user_memory_size
print(f"Memory footprint: {max_memory - server.free_user_memory_size:,.3f} GiB used out of {max_memory:,.3f} GiB available.")

Location ID count: 261
Rideshare Trip edges: 7,555,193
Memory footprint: 0.011 GiB used out of 32.000 GiB available.


### Explore basic data characterstics

In [5]:
# Produce a histogram of out-degree counts on each node
import pandas

job = server.run_job("""
MATCH (v:Locations)
RETURN v.id, outdegree(v) AS outdegree
ORDER BY outdegree DESC
""")
pdframe = job.get_data(format='pandas')
pdframe

Unnamed: 0,v_id,outdegree
0,61,159175
1,76,135150
2,42,116570
3,37,102990
4,35,93165
...,...,...
256,99,139
257,1,103
258,2,41
259,110,41


In [6]:
# Produce a histogram of in-degree counts on each node
import pandas

job = server.run_job("""
MATCH (v:Locations)
RETURN v.id, indegree(v) AS indegree
ORDER BY indegree DESC
""")
pdframe = job.get_data(format='pandas')
pdframe

Unnamed: 0,v_id,indegree
0,265,235144
1,61,155927
2,76,139159
3,42,107216
4,37,102174
...,...,...
256,30,431
257,99,201
258,2,50
259,110,44


In [7]:
# Compute ride time statistics
job = server.run_job("""
MATCH (start)-[t1:RideShare]->(finish)
RETURN
    avg((t1.dropoff_datetime - t1.pickup_datetime) / duration({second: 1})) AS average_duration,
    max(t1.dropoff_datetime - t1.pickup_datetime) AS max_duration,
    min(t1.dropoff_datetime - t1.pickup_datetime) AS min_duration
""")
pdframe = job.get_data(format='pandas')
pdframe

Unnamed: 0,average_duration,max_duration,min_duration
0,989.528503,0 days 23:49:55,0 days


### Search for some interesting patterns

In [8]:
# Find highest number of edges between two vertices
job = server.run_job("""
MATCH (start)-[:RideShare]->(finish)
RETURN start.id, finish.id, count(*) AS multiple_edges_count
ORDER BY multiple_edges_count DESC
LIMIT 10
""")
pdframe = job.get_data(format = 'pandas')
pdframe

Unnamed: 0,start_id,finish_id,multiple_edges_count
0,76,76,33184
1,26,26,20649
2,39,39,19452
3,61,61,19139
4,35,76,10972
5,42,42,10798
6,76,35,9993
7,35,35,9833
8,213,213,9476
9,89,89,9211


In [9]:
# Find bad data where pickup time is after dropoff time
job = server.run_job("""
MATCH (start)-[t1:RideShare]->(finish)
WHERE t1.pickup_datetime > t1.dropoff_datetime
WITH t1, tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime) AS duration
RETURN
     t1.pickup_datetime,
     t1.dropoff_datetime,
     duration
ORDER BY duration DESC
LIMIT 100
""")
pdframe = job.get_data(format='pandas')
pdframe

Unnamed: 0,t1_pickup_datetime,t1_dropoff_datetime,duration


In [10]:
# Find the longest rides
job = server.run_job("""
MATCH (start)-[t1:RideShare]->(finish)
WITH t1, t1.dropoff_datetime - t1.pickup_datetime AS duration
RETURN
     t1.dispatching_base_num,
     t1.pickup_datetime,
     t1.dropoff_datetime,
     duration
ORDER BY duration DESC
LIMIT 10
""")
pdframe = job.get_data(format='pandas')
pdframe

Unnamed: 0,t1_dispatching_base_num,t1_pickup_datetime,t1_dropoff_datetime,duration
0,B02800,2020-06-21 09:24:42,2020-06-22 09:14:37,0 days 23:49:55
1,B02800,2020-06-15 18:24:23,2020-06-16 16:16:55,0 days 21:52:32
2,B02800,2020-06-19 15:41:37,2020-06-20 13:15:27,0 days 21:33:50
3,B02800,2020-06-10 14:17:27,2020-06-11 11:10:51,0 days 20:53:24
4,B02510,2020-06-07 23:07:48,2020-06-08 17:24:30,0 days 18:16:42
5,B02800,2020-06-20 16:48:01,2020-06-21 10:56:49,0 days 18:08:48
6,B02800,2020-06-23 19:45:36,2020-06-24 11:50:46,0 days 16:05:10
7,B02510,2020-06-04 23:48:22,2020-06-05 11:45:30,0 days 11:57:08
8,B02836,2020-06-10 12:05:38,2020-06-10 21:54:11,0 days 09:48:33
9,B02510,2020-06-13 13:04:34,2020-06-13 22:07:43,0 days 09:03:09


In [11]:
# Find back-to-back long rides
# This assumes a histogram with lots of short duration rides.
# The gap between the end of the first ride and the beginning of the second ride
# must be less than some threshold.  We call this the down_time.
job = server.run_job("""
MATCH (start)-[t1:RideShare]->(finish)
WITH max(t1.dropoff_datetime - t1.pickup_datetime) AS max_duration
MATCH (start)-[t1:RideShare]->(mid)-[t2:RideShare]->(finish)
WHERE start <> mid AND mid <> finish
  AND t1.dispatching_base_num = t2.dispatching_base_num
  AND t2.pickup_datetime > t1.dropoff_datetime
  AND t2.pickup_datetime - t1.dropoff_datetime <= $down_time
  AND t1.dropoff_datetime - t1.pickup_datetime > max_duration / 10
  AND t2.dropoff_datetime - t2.pickup_datetime > max_duration / 10
RETURN
  t1.dispatching_base_num AS base_num,
  t1.pickup_datetime,
  t1.dropoff_datetime,
  t1.dropoff_datetime - t1.pickup_datetime AS duration1,
  t2.pickup_datetime,
  t2.dropoff_datetime,
  t2.dropoff_datetime - t2.pickup_datetime AS duration2
""", parameters={'down_time':timedelta(hours = 1)})
pdframe = job.get_data(format='pandas')
pdframe

Unnamed: 0,base_num,t1_pickup_datetime,t1_dropoff_datetime,duration1,t2_pickup_datetime,t2_dropoff_datetime,duration2
0,B02510,2020-06-03 12:40:20,2020-06-03 16:21:04,0 days 03:40:44,2020-06-03 16:39:43,2020-06-03 20:03:57,0 days 03:24:14


In [12]:
# Find back-to-back long rides ending where they began (two-cycle)
# This assumes a histogram with lots of short duration rides.
# The gap between the end of the first ride and the beginning of the second ride
# must be less than some threshold.  We call this the down_time.
job = server.run_job("""
MATCH (start)-[t1:RideShare]->(finish)
WITH max(t1.dropoff_datetime - t1.pickup_datetime) AS max_duration
MATCH (start)-[t1:RideShare]->(mid)-[t2:RideShare]->(start)
WHERE start <> mid
  AND t1.dispatching_base_num = t2.dispatching_base_num
  AND t2.pickup_datetime > t1.dropoff_datetime
  AND t2.pickup_datetime - t1.dropoff_datetime <= $down_time
  AND t1.dropoff_datetime - t1.pickup_datetime > max_duration / 10
  AND t2.dropoff_datetime - t2.pickup_datetime > max_duration / 10
RETURN
  t1.dispatching_base_num AS base_num,
  t1.pickup_datetime,
  t1.dropoff_datetime,
  t1.dropoff_datetime - t1.pickup_datetime AS duration1,
  t2.pickup_datetime,
  t2.dropoff_datetime,
  t2.dropoff_datetime - t2.pickup_datetime AS duration2
""", parameters={'down_time':timedelta(hours = 1)})
pdframe = job.get_data(format='pandas')
pdframe

Unnamed: 0,base_num,t1_pickup_datetime,t1_dropoff_datetime,duration1,t2_pickup_datetime,t2_dropoff_datetime,duration2


In [13]:
# Find back-to-back-to-back long rides ending at the beginning location (three-cycles)
# This assumes a histogram with lots of short duration rides.
# The gap between the end of the first ride and the beginning of the second ride
# as well as between the end of the second ride and the beginning of the third ride
# must be less than some threshold.  We call this the down_time.
job = server.run_job("""
MATCH (start)-[t1:RideShare]->(finish)
WITH max(t1.dropoff_datetime - t1.pickup_datetime) AS max_duration
MATCH (start)-[t1:RideShare]->(mid1)-[t3:RideShare]->(mid2)-[t2:RideShare]->(finish)
WHERE start <> mid1 AND start <> mid2 AND mid1 <> mid2 and unique_vertices(start, mid1, mid2, finish)
  AND t1.dispatching_base_num = t2.dispatching_base_num
  AND t2.dispatching_base_num = t3.dispatching_base_num
  AND t2.pickup_datetime > t1.dropoff_datetime
  AND t3.pickup_datetime > t2.dropoff_datetime
  AND t2.pickup_datetime - t1.dropoff_datetime <= $down_time
  AND t3.pickup_datetime - t2.dropoff_datetime <= $down_time
  AND t1.dropoff_datetime - t1.pickup_datetime > max_duration / 20
  AND t2.dropoff_datetime - t2.pickup_datetime > max_duration / 20
  AND t3.dropoff_datetime - t3.pickup_datetime > max_duration / 20
RETURN
  t1.dispatching_base_num AS base_num,
  t1.pickup_datetime,
  t1.dropoff_datetime,
  t2.pickup_datetime,
  t2.dropoff_datetime,
  t3.pickup_datetime,
  t3.dropoff_datetime,
  t1.dropoff_datetime - t1.pickup_datetime AS duration1,
  t2.dropoff_datetime - t2.pickup_datetime AS duration2,
  t2.dropoff_datetime - t2.pickup_datetime AS duration3
""", parameters={'down_time':timedelta(hours = 1)})
pdframe = job.get_data(format='pandas')
pdframe

Unnamed: 0,base_num,t1_pickup_datetime,t1_dropoff_datetime,t2_pickup_datetime,t2_dropoff_datetime,t3_pickup_datetime,t3_dropoff_datetime,duration1,duration2,duration3
0,B02510,2020-06-19 14:24:28,2020-06-19 15:42:54,2020-06-19 15:46:41,2020-06-19 17:31:54,2020-06-19 17:38:32,2020-06-19 19:02:04,0 days 01:18:26,0 days 01:45:13,0 days 01:45:13
1,B02510,2020-06-26 13:44:22,2020-06-26 15:15:58,2020-06-26 15:57:24,2020-06-26 17:22:08,2020-06-26 17:29:08,2020-06-26 18:41:12,0 days 01:31:36,0 days 01:24:44,0 days 01:24:44
2,B02510,2020-06-26 14:06:42,2020-06-26 15:28:39,2020-06-26 15:59:16,2020-06-26 17:16:30,2020-06-26 17:36:37,2020-06-26 18:50:58,0 days 01:21:57,0 days 01:17:14,0 days 01:17:14
3,B02510,2020-06-26 14:06:42,2020-06-26 15:28:39,2020-06-26 16:17:00,2020-06-26 17:29:12,2020-06-26 17:36:37,2020-06-26 18:50:58,0 days 01:21:57,0 days 01:12:12,0 days 01:12:12
4,B02764,2020-06-29 12:16:43,2020-06-29 13:33:26,2020-06-29 14:23:15,2020-06-29 15:39:35,2020-06-29 16:03:01,2020-06-29 17:18:52,0 days 01:16:43,0 days 01:16:20,0 days 01:16:20
5,B02510,2020-06-19 13:59:53,2020-06-19 15:19:51,2020-06-19 15:29:16,2020-06-19 16:46:16,2020-06-19 17:04:47,2020-06-19 18:19:51,0 days 01:19:58,0 days 01:17:00,0 days 01:17:00
6,B02510,2020-06-19 13:45:06,2020-06-19 15:02:24,2020-06-19 15:02:50,2020-06-19 16:18:43,2020-06-19 16:22:43,2020-06-19 17:34:25,0 days 01:17:18,0 days 01:15:53,0 days 01:15:53
7,B02510,2020-06-19 12:18:23,2020-06-19 13:50:34,2020-06-19 14:34:03,2020-06-19 15:50:41,2020-06-19 15:54:48,2020-06-19 17:16:35,0 days 01:32:11,0 days 01:16:38,0 days 01:16:38
8,B02510,2020-06-19 14:47:44,2020-06-19 16:44:49,2020-06-19 16:46:32,2020-06-19 18:03:12,2020-06-19 18:33:44,2020-06-19 19:45:51,0 days 01:57:05,0 days 01:16:40,0 days 01:16:40
9,B02510,2020-06-12 13:59:25,2020-06-12 15:12:35,2020-06-12 15:24:54,2020-06-12 16:38:20,2020-06-12 16:39:43,2020-06-12 17:59:08,0 days 01:13:10,0 days 01:13:26,0 days 01:13:26
