# Analyze NYC Taxi RideShare Data

The RideShare data consists of the following companies:  Juno, Uber, Via, Lyft.

In [1]:
import xgt
# For cloud instances, replace the localhost with the instance's IP address
server = xgt.Connection(host='localhost', userid='xgtd')
server.set_default_namespace('nyctaxi')

In [2]:
# Select one month of NYC Taxi data to explore
year = input("Enter year (2010 to 2020)")
month = input("Enter month number (1 to 12)")

# Establish URLs for the selected month
fhv_trip_url = f"https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_{year}-{int(month):02d}.csv"
fhvhv_trip_url = f"https://nyc-tlc.s3.amazonaws.com/trip+data/fhvhv_tripdata_{year}-{int(month):02d}.csv"

Enter year (2010 to 2020)2020
Enter month number (1 to 12)6


## Establish graph schema

Trovares xGT is a "schema-first" graph application.
The schemas are created from within the Python environment as shown below.

In [3]:
# Establish the schemas for the NYC Taxi dataset as a graph.
# Locations are the nodes of the graph, and each rideshare trip is an edge.
locations = server.create_vertex_frame(
    name='Locations',
    schema=[['id', xgt.INT]],
    key='id')

rideshare_trip = server.create_edge_frame(
    name='RideShare',
    schema=[
          ['hvfhs_license_num', xgt.TEXT],
          ['dispatching_base_num', xgt.TEXT],
          ['pickup_datetime', xgt.DATETIME],
          ['dropoff_datetime', xgt.DATETIME],
          ['PULocationID', xgt.INT],
          ['DOLocationID', xgt.INT],
          ['SR_Flag', xgt.INT],
        ],
    source=locations,
    target=locations,
    source_key='PULocationID',
    target_key='DOLocationID',
)

In [4]:
# Ingest data from the selected month.
rideshare_trip.load(fhvhv_trip_url, xgt.HeaderMode.NORMAL)
print(f"Location ID count: {locations.num_rows:,}")
print(f"Rideshare Trip edges: {rideshare_trip.num_rows:,}")
max_memory = server.max_user_memory_size
print(f"Memory footprint: {max_memory - server.free_user_memory_size:,.3f} GiB used out of {max_memory:,.3f} GiB available.")

Location ID count: 261
Rideshare Trip edges: 7,550,935
Memory footprint: 0.744 GiB used out of 8.000 GiB available.


### Explore basic data characterstics

In [5]:
# Produce a histogram of out-degree counts on each node
import pandas

job = server.run_job("""
MATCH (v:Locations)
RETURN v.id, outdegree(v) AS outdegree
ORDER BY outdegree DESC
""")
pdframe = job.get_data_pandas()
pdframe

Unnamed: 0,v_id,outdegree
0,61,159075
1,76,135072
2,42,116533
3,37,102936
4,35,93117
...,...,...
256,99,139
257,1,103
258,110,41
259,2,41


In [6]:
# Produce a histogram of in-degree counts on each node
import pandas

job = server.run_job("""
MATCH (v:Locations)
RETURN v.id, indegree(v) AS indegree
ORDER BY indegree DESC
""")
pdframe = job.get_data_pandas()
pdframe

Unnamed: 0,v_id,outdegree
0,265,235062
1,61,155822
2,76,139083
3,42,107167
4,37,102107
...,...,...
256,30,431
257,99,201
258,2,50
259,110,44


In [7]:
# Compute ride time statistics
job = server.run_job("""
MATCH (start)-[t1:RideShare]->(finish)
RETURN
    avg(tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime)) AS average_duration,
    max(tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime)) AS max_duration,
    min(tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime)) AS min_duration
""")
pdframe = job.get_data_pandas()
pdframe

Unnamed: 0,average_duration,max_duration,min_duration
0,989,85795,0


### Search for some interesting patterns

In [8]:
# Find highest number of edges between two vertices
job = server.run_job("""
MATCH (start)-[:RideShare]->(finish)
RETURN start.id, finish.id, count(*) AS multiple_edges_count
ORDER BY multiple_edges_count DESC
LIMIT 10
""")
pdframe = job.get_data_pandas()
pdframe

Unnamed: 0,start_id,finish_id,multiple_edges_count
0,76,76,33163
1,26,26,20622
2,39,39,19438
3,61,61,19127
4,35,76,10966
5,42,42,10795
6,76,35,9986
7,35,35,9828
8,213,213,9473
9,89,89,9200


In [9]:
# Find bad data where pickup time is after dropoff time
job = server.run_job("""
MATCH (start)-[t1:RideShare]->(finish)
WHERE t1.pickup_datetime > t1.dropoff_datetime
WITH t1, tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime) AS duration
RETURN
     t1.pickup_datetime,
     t1.dropoff_datetime,
     duration
ORDER BY duration DESC
LIMIT 100
""")
pdframe = job.get_data_pandas()
pdframe

Unnamed: 0,t1_pickup_datetime,t1_dropoff_datetime,duration


In [10]:
# Find the longest rides
job = server.run_job("""
MATCH (start)-[t1:RideShare]->(finish)
WITH t1, tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime) AS duration
RETURN
     t1.dispatching_base_num,
     t1.pickup_datetime,
     t1.dropoff_datetime,
     duration
ORDER BY duration DESC
LIMIT 10
""")
pdframe = job.get_data_pandas()
pdframe

Unnamed: 0,t1_dispatching_base_num,t1_pickup_datetime,t1_dropoff_datetime,duration
0,B02800,2020-06-21T09:24:42,2020-06-22T09:14:37,85795
1,B02800,2020-06-15T18:24:23,2020-06-16T16:16:55,78752
2,B02800,2020-06-19T15:41:37,2020-06-20T13:15:27,77630
3,B02800,2020-06-10T14:17:27,2020-06-11T11:10:51,75204
4,B02510,2020-06-07T23:07:48,2020-06-08T17:24:30,65802
5,B02800,2020-06-20T16:48:01,2020-06-21T10:56:49,65328
6,B02800,2020-06-23T19:45:36,2020-06-24T11:50:46,57910
7,B02510,2020-06-04T23:48:22,2020-06-05T11:45:30,43028
8,B02836,2020-06-10T12:05:38,2020-06-10T21:54:11,35313
9,B02510,2020-06-13T13:04:34,2020-06-13T22:07:43,32589


In [11]:
# Find back-to-back long rides
# This assumes a histogram with lots of short duration rides.
# The gap between the end of the first ride and the beginning of the second ride
# must be less than some threshold.  We call this the down_time.
job = server.run_job("""
MATCH (start)-[t1:RideShare]->(finish)
WITH max(tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime)) AS max_duration
MATCH (start)-[t1:RideShare]->(mid)-[t2:RideShare]->(finish)
WHERE start <> mid AND mid <> finish
  AND t1.dispatching_base_num = t2.dispatching_base_num
  AND t2.pickup_datetime > t1.dropoff_datetime
  AND tointeger(t2.pickup_datetime) - tointeger(t1.dropoff_datetime) <= $down_time
  AND tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime) > max_duration / 10
  AND tointeger(t2.dropoff_datetime) - tointeger(t2.pickup_datetime) > max_duration / 10
RETURN
  t1.dispatching_base_num AS base_num,
  t1.pickup_datetime,
  t1.dropoff_datetime,
  tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime) AS duration1,
  t2.pickup_datetime,
  t2.dropoff_datetime,
  tointeger(t2.dropoff_datetime) - tointeger(t2.pickup_datetime) AS duration2
""", parameters={'down_time':3600})
pdframe = job.get_data_pandas()
pdframe

Unnamed: 0,base_num,t1_pickup_datetime,t1_dropoff_datetime,duration1,t2_pickup_datetime,t2_dropoff_datetime,duration2
0,B02510,2020-06-03T12:40:20,2020-06-03T16:21:04,13244,2020-06-03T16:39:43,2020-06-03T20:03:57,12254


In [12]:
# Find back-to-back long rides ending where they began (two-cycle)
# This assumes a histogram with lots of short duration rides.
# The gap between the end of the first ride and the beginning of the second ride
# must be less than some threshold.  We call this the down_time.
job = server.run_job("""
MATCH (start)-[t1:RideShare]->(finish)
WITH max(tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime)) AS max_duration
MATCH (start)-[t1:RideShare]->(mid)-[t2:RideShare]->(start)
WHERE start <> mid
  AND t1.dispatching_base_num = t2.dispatching_base_num
  AND t2.pickup_datetime > t1.dropoff_datetime
  AND tointeger(t2.pickup_datetime) - tointeger(t1.dropoff_datetime) <= $down_time
  AND tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime) > max_duration / 10
  AND tointeger(t2.dropoff_datetime) - tointeger(t2.pickup_datetime) > max_duration / 10
RETURN
  t1.dispatching_base_num AS base_num,
  t1.pickup_datetime,
  t1.dropoff_datetime,
  tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime) AS duration1,
  t2.pickup_datetime,
  t2.dropoff_datetime,
  tointeger(t2.dropoff_datetime) - tointeger(t2.pickup_datetime) AS duration2
""", parameters={'down_time':3600})
pdframe = job.get_data_pandas()
pdframe

Unnamed: 0,base_num,t1_pickup_datetime,t1_dropoff_datetime,duration1,t2_pickup_datetime,t2_dropoff_datetime,duration2


In [13]:
# Find back-to-back-to-back long rides ending at the beginning location (three-cycles)
# This assumes a histogram with lots of short duration rides.
# The gap between the end of the first ride and the beginning of the second ride
# as well as between the end of the second ride and the beginning of the third ride
# must be less than some threshold.  We call this the down_time.
job = server.run_job("""
MATCH (start)-[t1:RideShare]->(finish)
WITH max(tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime)) AS max_duration
MATCH (start)-[t1:RideShare]->(mid1)-[t3:RideShare]->(mid2)-[t2:RideShare]->(finish)
WHERE start <> mid1 AND start <> mid2 AND mid1 <> mid2 and unique_vertices(start, mid1, mid2, finish)
  AND t1.dispatching_base_num = t2.dispatching_base_num
  AND t2.dispatching_base_num = t3.dispatching_base_num
  AND t2.pickup_datetime > t1.dropoff_datetime
  AND t3.pickup_datetime > t2.dropoff_datetime
  AND tointeger(t2.pickup_datetime) - tointeger(t1.dropoff_datetime) <= $down_time
  AND tointeger(t3.pickup_datetime) - tointeger(t2.dropoff_datetime) <= $down_time
  AND tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime) > max_duration / 20
  AND tointeger(t2.dropoff_datetime) - tointeger(t2.pickup_datetime) > max_duration / 20
  AND tointeger(t3.dropoff_datetime) - tointeger(t3.pickup_datetime) > max_duration / 20
RETURN
  t1.dispatching_base_num AS base_num,
  t1.pickup_datetime,
  t1.dropoff_datetime,
  t2.pickup_datetime,
  t2.dropoff_datetime,
  t3.pickup_datetime,
  t3.dropoff_datetime,
  tointeger(t1.dropoff_datetime) - tointeger(t1.pickup_datetime) AS duration1,
  tointeger(t2.dropoff_datetime) - tointeger(t2.pickup_datetime) AS duration2,
  tointeger(t2.dropoff_datetime) - tointeger(t2.pickup_datetime) AS duration3
""", parameters={'down_time':3600})
pdframe = job.get_data_pandas()
pdframe

Unnamed: 0,base_num,t1_pickup_datetime,t1_dropoff_datetime,t2_pickup_datetime,t2_dropoff_datetime,t3_pickup_datetime,t3_dropoff_datetime,duration1,duration2,duration3
0,B02510,2020-06-19T14:24:28,2020-06-19T15:42:54,2020-06-19T15:46:41,2020-06-19T17:31:54,2020-06-19T17:38:32,2020-06-19T19:02:04,4706,6313,6313
1,B02510,2020-06-19T13:45:06,2020-06-19T15:02:24,2020-06-19T15:02:50,2020-06-19T16:18:43,2020-06-19T16:22:43,2020-06-19T17:34:25,4638,4553,4553
2,B02510,2020-06-19T13:59:53,2020-06-19T15:19:51,2020-06-19T15:29:16,2020-06-19T16:46:16,2020-06-19T17:04:47,2020-06-19T18:19:51,4798,4620,4620
3,B02510,2020-06-19T12:18:23,2020-06-19T13:50:34,2020-06-19T14:34:03,2020-06-19T15:50:41,2020-06-19T15:54:48,2020-06-19T17:16:35,5531,4598,4598
4,B02510,2020-06-19T12:10:42,2020-06-19T13:27:06,2020-06-19T13:45:06,2020-06-19T15:02:24,2020-06-19T15:07:20,2020-06-19T17:02:08,4584,4638,4638
5,B02510,2020-06-11T12:58:58,2020-06-11T14:27:17,2020-06-11T14:49:05,2020-06-11T16:07:05,2020-06-11T16:18:24,2020-06-11T17:39:00,5299,4680,4680
6,B02510,2020-06-19T12:07:28,2020-06-19T13:22:37,2020-06-19T13:46:09,2020-06-19T15:06:23,2020-06-19T15:38:08,2020-06-19T17:07:23,4509,4814,4814
7,B02510,2020-06-19T13:03:56,2020-06-19T14:20:28,2020-06-19T14:58:17,2020-06-19T16:15:16,2020-06-19T16:44:24,2020-06-19T17:59:03,4592,4619,4619
8,B02510,2020-06-26T12:42:18,2020-06-26T14:06:07,2020-06-26T14:56:14,2020-06-26T16:17:06,2020-06-26T16:36:38,2020-06-26T17:57:42,5029,4852,4852
9,B02510,2020-06-19T13:38:15,2020-06-19T14:53:57,2020-06-19T15:15:54,2020-06-19T16:28:00,2020-06-19T17:04:47,2020-06-19T18:19:51,4542,4326,4326
