# Analyze NYC Taxi RideShare Data

The RideShare data consists of the following companies:  Juno, Uber, Via, Lyft.

In [1]:
import os
import requests
import xgt
# For cloud instances, replace the localhost with the instance's IP address
server = xgt.Connection(host='localhost', userid='xgtd')
server.set_default_namespace('nyctaxi')

In [2]:
# Select one month of NYC Taxi data to explore
year = input("Enter year (2019 to 2022)")
month = input("Enter month number (1 to 12)")

# Establish URLs for the selected month
fhv_trip_url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_{year}-{int(month):02d}.parquet"
fhvhv_trip_url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_{year}-{int(month):02d}.parquet"

Enter year (2019 to 2022)2020
Enter month number (1 to 12)6


In [3]:
file_object = requests.get(fhvhv_trip_url)
with open('fhvhv.parquet', 'wb') as local_file:
    local_file.write(file_object.content)

In [4]:
directory_path = os.getcwd()
fhvhv_file = os.path.join(directory_path, 'fhvhv.parquet')

## Establish graph schema

Trovares xGT is a "schema-first" graph application.
The schemas are created from within the Python environment as shown below.

In [5]:
# Establish the schemas for the NYC Taxi dataset as a graph.
# Locations are the nodes of the graph, and each rideshare trip is an edge.
server.drop_frame('RideShare')
server.drop_frame('Locations')

locations = server.create_vertex_frame(
    name='Locations',
    schema=[['id', xgt.INT]],
    key='id')

rideshare_trip = server.create_edge_frame(
    name='RideShare',
    schema=[
          ['hvfhs_license_num', xgt.TEXT],
          ['dispatching_base_num', xgt.TEXT],
          ['pickup_datetime', xgt.DATETIME],
          ['dropoff_datetime', xgt.DATETIME],
          ['PULocationID', xgt.INT],
          ['DOLocationID', xgt.INT],
          ['SR_Flag', xgt.TEXT],
        ],
    source=locations,
    target=locations,
    source_key='PULocationID',
    target_key='DOLocationID',
)

In [6]:
# Ingest data from the selected month.
rideshare_trip.load('xgt://' + fhvhv_file, frame_to_file_column_mapping =
    {'hvfhs_license_num' : 'hvfhs_license_num',  'dispatching_base_num' : 'dispatching_base_num',
     'pickup_datetime' : 'pickup_datetime', 'dropoff_datetime' : 'dropoff_datetime',
     'PULocationID' : 'PULocationID', 'DOLocationID' : 'DOLocationID', 'SR_Flag' : 'shared_request_flag'})

print(f"Location ID count: {locations.num_rows:,}")
print(f"Rideshare Trip edges: {rideshare_trip.num_rows:,}")
max_memory = server.max_user_memory_size
print(f"Memory footprint: {max_memory - server.free_user_memory_size:,.3f} GiB used out of {max_memory:,.3f} GiB available.")

Location ID count: 261
Rideshare Trip edges: 7,555,193
Memory footprint: 0.750 GiB used out of 52.000 GiB available.


### Explore basic data characterstics

In [7]:
# Produce a histogram of out-degree counts on each node
import pandas

job = server.run_job("""
MATCH (v:Locations)
RETURN v.id, outdegree(v) AS outdegree
ORDER BY outdegree DESC
""")
pdframe = job.get_data_pandas()
pdframe

Unnamed: 0,v_id,outdegree
0,61,159175
1,76,135150
2,42,116570
3,37,102990
4,35,93165
...,...,...
256,99,139
257,1,103
258,110,41
259,2,41


In [8]:
# Produce a histogram of in-degree counts on each node
import pandas

job = server.run_job("""
MATCH (v:Locations)
RETURN v.id, indegree(v) AS indegree
ORDER BY indegree DESC
""")
pdframe = job.get_data_pandas()
pdframe

Unnamed: 0,v_id,indegree
0,265,235144
1,61,155927
2,76,139159
3,42,107216
4,37,102174
...,...,...
256,30,431
257,99,201
258,2,50
259,110,44


In [9]:
# Compute ride time statistics
job = server.run_job("""
MATCH (start)-[t1:RideShare]->(finish)
RETURN
    avg(tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime)) AS average_duration,
    max(tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime)) AS max_duration,
    min(tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime)) AS min_duration
""")
pdframe = job.get_data_pandas()
pdframe

Unnamed: 0,average_duration,max_duration,min_duration
0,989,85795,0


### Search for some interesting patterns

In [10]:
# Find highest number of edges between two vertices
job = server.run_job("""
MATCH (start)-[:RideShare]->(finish)
RETURN start.id, finish.id, count(*) AS multiple_edges_count
ORDER BY multiple_edges_count DESC
LIMIT 10
""")
pdframe = job.get_data_pandas()
pdframe

Unnamed: 0,start_id,finish_id,multiple_edges_count
0,76,76,33184
1,26,26,20649
2,39,39,19452
3,61,61,19139
4,35,76,10972
5,42,42,10798
6,76,35,9993
7,35,35,9833
8,213,213,9476
9,89,89,9211


In [11]:
# Find bad data where pickup time is after dropoff time
job = server.run_job("""
MATCH (start)-[t1:RideShare]->(finish)
WHERE t1.pickup_datetime > t1.dropoff_datetime
WITH t1, tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime) AS duration
RETURN
     t1.pickup_datetime,
     t1.dropoff_datetime,
     duration
ORDER BY duration DESC
LIMIT 100
""")
pdframe = job.get_data_pandas()
pdframe

Unnamed: 0,t1_pickup_datetime,t1_dropoff_datetime,duration


In [12]:
# Find the longest rides
job = server.run_job("""
MATCH (start)-[t1:RideShare]->(finish)
WITH t1, tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime) AS duration
RETURN
     t1.dispatching_base_num,
     t1.pickup_datetime,
     t1.dropoff_datetime,
     duration
ORDER BY duration DESC
LIMIT 10
""")
pdframe = job.get_data_pandas()
pdframe

Unnamed: 0,t1_dispatching_base_num,t1_pickup_datetime,t1_dropoff_datetime,duration
0,B02800,2020-06-21T09:24:42,2020-06-22T09:14:37,85795
1,B02800,2020-06-15T18:24:23,2020-06-16T16:16:55,78752
2,B02800,2020-06-19T15:41:37,2020-06-20T13:15:27,77630
3,B02800,2020-06-10T14:17:27,2020-06-11T11:10:51,75204
4,B02510,2020-06-07T23:07:48,2020-06-08T17:24:30,65802
5,B02800,2020-06-20T16:48:01,2020-06-21T10:56:49,65328
6,B02800,2020-06-23T19:45:36,2020-06-24T11:50:46,57910
7,B02510,2020-06-04T23:48:22,2020-06-05T11:45:30,43028
8,B02836,2020-06-10T12:05:38,2020-06-10T21:54:11,35313
9,B02510,2020-06-13T13:04:34,2020-06-13T22:07:43,32589


In [13]:
# Find back-to-back long rides
# This assumes a histogram with lots of short duration rides.
# The gap between the end of the first ride and the beginning of the second ride
# must be less than some threshold.  We call this the down_time.
job = server.run_job("""
MATCH (start)-[t1:RideShare]->(finish)
WITH max(tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime)) AS max_duration
MATCH (start)-[t1:RideShare]->(mid)-[t2:RideShare]->(finish)
WHERE start <> mid AND mid <> finish
  AND t1.dispatching_base_num = t2.dispatching_base_num
  AND t2.pickup_datetime > t1.dropoff_datetime
  AND tointeger(t2.pickup_datetime) - tointeger(t1.dropoff_datetime) <= $down_time
  AND tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime) > max_duration / 10
  AND tointeger(t2.dropoff_datetime) - tointeger(t2.pickup_datetime) > max_duration / 10
RETURN
  t1.dispatching_base_num AS base_num,
  t1.pickup_datetime,
  t1.dropoff_datetime,
  tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime) AS duration1,
  t2.pickup_datetime,
  t2.dropoff_datetime,
  tointeger(t2.dropoff_datetime) - tointeger(t2.pickup_datetime) AS duration2
""", parameters={'down_time':3600})
pdframe = job.get_data_pandas()
pdframe

Unnamed: 0,base_num,t1_pickup_datetime,t1_dropoff_datetime,duration1,t2_pickup_datetime,t2_dropoff_datetime,duration2
0,B02510,2020-06-03T12:40:20,2020-06-03T16:21:04,13244,2020-06-03T16:39:43,2020-06-03T20:03:57,12254


In [14]:
# Find back-to-back long rides ending where they began (two-cycle)
# This assumes a histogram with lots of short duration rides.
# The gap between the end of the first ride and the beginning of the second ride
# must be less than some threshold.  We call this the down_time.
job = server.run_job("""
MATCH (start)-[t1:RideShare]->(finish)
WITH max(tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime)) AS max_duration
MATCH (start)-[t1:RideShare]->(mid)-[t2:RideShare]->(start)
WHERE start <> mid
  AND t1.dispatching_base_num = t2.dispatching_base_num
  AND t2.pickup_datetime > t1.dropoff_datetime
  AND tointeger(t2.pickup_datetime) - tointeger(t1.dropoff_datetime) <= $down_time
  AND tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime) > max_duration / 10
  AND tointeger(t2.dropoff_datetime) - tointeger(t2.pickup_datetime) > max_duration / 10
RETURN
  t1.dispatching_base_num AS base_num,
  t1.pickup_datetime,
  t1.dropoff_datetime,
  tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime) AS duration1,
  t2.pickup_datetime,
  t2.dropoff_datetime,
  tointeger(t2.dropoff_datetime) - tointeger(t2.pickup_datetime) AS duration2
""", parameters={'down_time':3600})
pdframe = job.get_data_pandas()
pdframe

Unnamed: 0,base_num,t1_pickup_datetime,t1_dropoff_datetime,duration1,t2_pickup_datetime,t2_dropoff_datetime,duration2


In [15]:
# Find back-to-back-to-back long rides ending at the beginning location (three-cycles)
# This assumes a histogram with lots of short duration rides.
# The gap between the end of the first ride and the beginning of the second ride
# as well as between the end of the second ride and the beginning of the third ride
# must be less than some threshold.  We call this the down_time.
job = server.run_job("""
MATCH (start)-[t1:RideShare]->(finish)
WITH max(tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime)) AS max_duration
MATCH (start)-[t1:RideShare]->(mid1)-[t3:RideShare]->(mid2)-[t2:RideShare]->(finish)
WHERE start <> mid1 AND start <> mid2 AND mid1 <> mid2 and unique_vertices(start, mid1, mid2, finish)
  AND t1.dispatching_base_num = t2.dispatching_base_num
  AND t2.dispatching_base_num = t3.dispatching_base_num
  AND t2.pickup_datetime > t1.dropoff_datetime
  AND t3.pickup_datetime > t2.dropoff_datetime
  AND tointeger(t2.pickup_datetime) - tointeger(t1.dropoff_datetime) <= $down_time
  AND tointeger(t3.pickup_datetime) - tointeger(t2.dropoff_datetime) <= $down_time
  AND tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime) > max_duration / 20
  AND tointeger(t2.dropoff_datetime) - tointeger(t2.pickup_datetime) > max_duration / 20
  AND tointeger(t3.dropoff_datetime) - tointeger(t3.pickup_datetime) > max_duration / 20
RETURN
  t1.dispatching_base_num AS base_num,
  t1.pickup_datetime,
  t1.dropoff_datetime,
  t2.pickup_datetime,
  t2.dropoff_datetime,
  t3.pickup_datetime,
  t3.dropoff_datetime,
  tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime) AS duration1,
  tointeger(t2.dropoff_datetime) - tointeger(t2.pickup_datetime) AS duration2,
  tointeger(t2.dropoff_datetime) - tointeger(t2.pickup_datetime) AS duration3
""", parameters={'down_time':3600})
pdframe = job.get_data_pandas()
pdframe

Unnamed: 0,base_num,t1_pickup_datetime,t1_dropoff_datetime,t2_pickup_datetime,t2_dropoff_datetime,t3_pickup_datetime,t3_dropoff_datetime,duration1,duration2,duration3
0,B02510,2020-06-19T14:24:28,2020-06-19T15:42:54,2020-06-19T15:46:41,2020-06-19T17:31:54,2020-06-19T17:38:32,2020-06-19T19:02:04,4706,6313,6313
1,B02510,2020-06-12T13:59:25,2020-06-12T15:12:35,2020-06-12T15:24:54,2020-06-12T16:38:20,2020-06-12T16:39:43,2020-06-12T17:59:08,4390,4406,4406
2,B02510,2020-06-05T13:06:01,2020-06-05T15:54:00,2020-06-05T16:46:47,2020-06-05T18:07:58,2020-06-05T18:26:20,2020-06-05T21:04:18,10079,4871,4871
3,B02510,2020-06-05T13:06:01,2020-06-05T15:54:00,2020-06-05T16:39:32,2020-06-05T18:18:51,2020-06-05T18:26:20,2020-06-05T21:04:18,10079,5959,5959
4,B02510,2020-06-19T14:07:13,2020-06-19T15:50:58,2020-06-19T16:20:29,2020-06-19T17:43:17,2020-06-19T18:27:22,2020-06-19T19:50:17,6225,4968,4968
5,B02510,2020-06-19T14:07:13,2020-06-19T15:50:58,2020-06-19T16:09:55,2020-06-19T17:40:09,2020-06-19T18:27:22,2020-06-19T19:50:17,6225,5414,5414
6,B02510,2020-06-19T14:07:13,2020-06-19T15:50:58,2020-06-19T16:12:55,2020-06-19T17:59:43,2020-06-19T18:27:22,2020-06-19T19:50:17,6225,6408,6408
7,B02510,2020-06-19T14:07:13,2020-06-19T15:50:58,2020-06-19T16:17:14,2020-06-19T17:43:51,2020-06-19T18:27:22,2020-06-19T19:50:17,6225,5197,5197
8,B02510,2020-06-10T15:51:29,2020-06-10T17:23:18,2020-06-10T18:18:23,2020-06-10T19:32:27,2020-06-10T20:05:19,2020-06-10T21:18:49,5509,4444,4444
9,B02510,2020-06-19T13:38:15,2020-06-19T14:53:57,2020-06-19T15:15:54,2020-06-19T16:28:00,2020-06-19T17:04:47,2020-06-19T18:19:51,4542,4326,4326
