In [1]:
import requests
import numpy as np
import json
import math
from datetime import datetime, timezone, timedelta
import time
import os

In [2]:
def list_string(arr):
    """Render ``arr`` as a double-quoted list literal for embedding in a GraphQL query.

    Every element is converted with ``str`` first, so ``[1, 'N']`` becomes
    ``["1", "N"]``.  ``json.dumps`` is used instead of rewriting the quotes of
    Python's ``repr`` so that elements containing ``'`` or ``"`` are escaped
    correctly rather than producing a malformed literal.

    Parameters
    ----------
    arr : iterable
        Values to render (typically route ids).

    Returns
    -------
    str
        Text such as ``["1", "14"]`` (``[]`` for an empty input).
    """
    # ensure_ascii=False keeps non-ASCII characters verbatim, as repr() did.
    return json.dumps([str(item) for item in arr], ensure_ascii=False)

In [3]:
def _routes_from_response(text, start_time, end_time):
    """Return the ``routes`` list of a raw trynState response, or ``[]`` after printing the problem."""
    try:
        payload = json.loads(text)
    except ValueError as err:  # json.JSONDecodeError is a ValueError subclass
        print(f"Error for time range {start_time}-{end_time}: response is not valid JSON ({err})")
        return []

    state = None
    if isinstance(payload, dict):
        body = payload.get('data')
        # GraphQL servers may answer with "data": null plus an "errors" list.
        if isinstance(body, dict):
            state = body.get('trynState')

    if not isinstance(state, dict):
        if isinstance(payload, dict):
            detail = payload.get('errors', "no 'data' in response")
        else:
            detail = payload
        print(f"Error for time range {start_time}-{end_time}: {detail}")
        return []

    return state.get('routes') or []


def get_some_times(route, dt_params=(2018,10,15,0,1,0), hours=20, per_hour=1):
    """Scrape vehicle states for one or more routes from the trynState GraphQL API.

    The window that starts at ``dt_params`` is cut into ``hours * per_hour``
    equal slices and one HTTP GET is issued per slice.  A slice whose response
    cannot be used (invalid JSON, missing or null ``data``) is reported with
    ``print`` and skipped; the remaining slices are still fetched.

    Parameters
    ----------
    route : str, list or tuple
        A single route id, or a list/tuple of route ids.
    dt_params : tuple
        Positional arguments for ``datetime`` (year, month, day[, hour, ...]).
        The resulting time is interpreted at a fixed UTC-8 offset.
    hours : int
        Length of the window in hours.
    per_hour : int
        Number of requests per hour; each slice covers ``1/per_hour`` hours.

    Returns
    -------
    list of dict
        One ``{'rid', 'routeStates', 'stops'}`` dict per route id, in order of
        first appearance.  ``stops`` comes from the first response that
        contained the route; ``routeStates`` holds each response's states
        sorted by ``int(vtime)``, appended in request order.
    """
    # A tuple used to be wrapped as a single (bogus) route; treat it like a list.
    routes = list(route) if isinstance(route, (list, tuple)) else [route]

    per_hour=int(per_hour)
    timestamp=int(datetime(*dt_params, tzinfo = timezone(timedelta(hours = -8))).timestamp())*1000 # use PST for tzinfo
    route_data=[]
    T0 = time.time()
    for i in range(hours*per_hour):
        t0=time.time()
        # 36e5 ms = one hour.  These are floats, so they reach the query as e.g. "1539590460000.0".
        start_time = timestamp + i*36e5/per_hour
        end_time   = start_time + 36e5/per_hour

        # added direction id to query, needed to obtain stops
        query = f"""{{
          trynState(agency: "muni", startTime: "{start_time}", endTime: "{end_time}", routes: {list_string(routes)}) {{
            agency
            startTime
            routes {{
              rid
              stops {{
                sid
                name
                lat
                lon
              }}
              routeStates {{
                vtime
                vehicles {{
                  vid
                  lat
                  lon
                  heading
                  did
                }}
              }}
            }}
          }}
        }}
        """ # Braces need to be doubled for f-string

        query_url = "https://06o8rkohub.execute-api.us-west-2.amazonaws.com/dev/graphql?query="+query
        print('Getting data...',i)
        r = requests.get(query_url)
        print(f'That took {round(time.time()-t0,2)} seconds.')

        route_data.extend(_routes_from_response(r.text, start_time, end_time))

    # Merge the per-slice results into a single entry per route id.
    r_sort = {}
    print('Final formatting...')
    for x in route_data:
        merged = r_sort.setdefault(x['rid'], {'rid':x['rid'],'routeStates':[],'stops':x['stops']})
        merged['routeStates'].extend(sorted(x['routeStates'], key=lambda rs: int(rs['vtime'])))
    r_sort=list(r_sort.values())
    print('Done')
    print(f'That all took {round(time.time()-T0,2)} seconds.')
    return r_sort

Testing main scraping function:

In [14]:
# Smoke test: five routes over the two hours starting 2018-10-15 12:00.
a = get_some_times(
    ['1', '14', 'N', '14X', '12'],
    dt_params=(2018, 10, 15, 12),
    hours=2,
)

Getting data... 0
That took 9.77 seconds.
Getting data... 1
That took 5.93 seconds.
Final formatting...
Done
That all took 15.81 seconds.


List of all routes (scraped from https://github.com/trynmaps/opentransit-map/blob/master/src/res/muniRoutes2.json)

In [15]:
# Muni route ids scraped from opentransit-map's muniRoutes2.json.
all_routes = [
    "1", "2", "3", "5", "6", "7", "8", "9", "10", "12",
    "14", "18", "19", "21", "22", "23", "24", "25", "27", "28",
    "29", "30", "31", "33", "35", "36", "37", "38", "39", "41",
    "43", "44", "45", "47", "48", "49", "52", "54", "55", "56",
    "57", "66", "67", "88", "90", "91", "83X", "30X", "7R", "J",
    "N", "14R", "14X", "PH", "E", "38R", "82X", "81X", "T-OWL", "M-OWL",
    "7X", "N-OWL", "L-OWL", "76X", "9R", "C", "38AX", "38BX", "31BX", "1AX",
    "31AX", "1BX", "K-OWL", "F", "8AX", "8BX", "M", "L", "5R", "PM",
    "K/T", "NX", "28R", "BUS",
]

Get 24 hours of data for every route, one chunk of routes at a time (more than ~30 routes per call gave an error; the current, larger query needs even smaller chunks).

In [16]:
def get24all(dt_params=(2018,10,15,7), chunk_size=21):
    """Scrape 24 hours of data for every route in ``all_routes``, a chunk of routes at a time.

    Parameters
    ----------
    dt_params : tuple
        Start of the 24-hour window, forwarded unchanged to ``get_some_times``.
    chunk_size : int
        Number of routes requested per ``get_some_times`` call.  The default of
        21 splits the 84-entry ``all_routes`` list into 4 chunks.

    Returns
    -------
    list
        The concatenated results of every ``get_some_times`` call.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # changing the query reduced the number of routes that it could handle before causing errors
    # Slice the whole list rather than a fixed 4 x 21, so routes added to
    # all_routes later are not silently dropped.
    chunks = [all_routes[start:start + chunk_size]
              for start in range(0, len(all_routes), chunk_size)]

    result = []
    t0=time.time()
    for number, chunk in enumerate(chunks, start=1):
        print(f'Chunk {number} of {len(chunks)}:')
        result.extend(get_some_times(chunk,dt_params,hours=24))
    print(f'Whole thing took {round(time.time()-t0,2)} seconds.')
    return result

Set `dt_params` to the start of the 24-hour window you want, as `(year, month, day, hour)`. The time is interpreted at a fixed UTC−8 offset (PST).

In [17]:
# Scrape every route for the 24 hours starting 2018-10-15 00:00.
allofem = get24all(dt_params=(2018, 10, 15, 0))

Chunk 1 of 4:
Getting data... 0
That took 3.26 seconds.
Getting data... 1
That took 9.12 seconds.
Getting data... 2
That took 13.53 seconds.
Getting data... 3
That took 3.93 seconds.
Getting data... 4
That took 3.95 seconds.
Getting data... 5
That took 6.75 seconds.
Getting data... 6
That took 6.95 seconds.
Getting data... 7
That took 7.57 seconds.
Getting data... 8
That took 6.15 seconds.
Getting data... 9
That took 7.62 seconds.
Getting data... 10
That took 7.35 seconds.
Getting data... 11
That took 7.44 seconds.
Getting data... 12
That took 5.76 seconds.
Getting data... 13
That took 7.37 seconds.
Getting data... 14
That took 26.8 seconds.
Getting data... 15
That took 10.94 seconds.
Getting data... 16
That took 7.85 seconds.
Getting data... 17
That took 8.01 seconds.
Getting data... 18
That took 7.98 seconds.
Getting data... 19
That took 6.75 seconds.
Getting data... 20
That took 10.97 seconds.
Getting data... 21
That took 10.42 seconds.
Getting data... 22
That took 5.8 seconds.
Gett

Dump the data to a JSON file. For the full 24-hour, all-routes scrape, the final output is 185 MB and zips to 8 MB.

In [4]:
# testing sample routes scraping with tzinfo fix
# Fetch first, then open the file: mode 'w' truncates the file as soon as it is
# opened, so scraping inside the `with` block would leave an empty (or clobbered)
# file behind if any of the 24 requests raised.
sample_routes = get_some_times(['1', '14'], hours = 24)
with open('sample_routes_data_pst_15s.json', 'w') as outfile:
    json.dump(sample_routes, outfile)

Getting data... 0
That took 4.88 seconds.
Getting data... 1
That took 3.28 seconds.
Getting data... 2
That took 3.95 seconds.
Getting data... 3
That took 3.84 seconds.
Getting data... 4
That took 3.9 seconds.
Getting data... 5
That took 4.55 seconds.
Getting data... 6
That took 4.29 seconds.
Getting data... 7
That took 4.34 seconds.
Getting data... 8
That took 4.89 seconds.
Getting data... 9
That took 3.99 seconds.
Getting data... 10
That took 3.32 seconds.
Getting data... 11
That took 4.28 seconds.
Getting data... 12
That took 4.0 seconds.
Getting data... 13
That took 4.18 seconds.
Getting data... 14
That took 5.43 seconds.
Getting data... 15
That took 4.32 seconds.
Getting data... 16
That took 4.39 seconds.
Getting data... 17
That took 4.2 seconds.
Getting data... 18
That took 4.12 seconds.
Getting data... 19
That took 3.75 seconds.
Getting data... 20
That took 9.7 seconds.
Getting data... 21
That took 3.84 seconds.
Getting data... 22
That took 3.64 seconds.
Getting data... 23
That t

In [18]:
# Write the all-routes scrape held in `allofem` to disk.
# Final output is 185 MB, zips to 8 MB.
with open('routes24h20181015v2.json', mode='w') as outfile:
    json.dump(allofem, fp=outfile)

In [5]:
# get a week's worth of data from route 14
# One (year, month, day, hour) tuple per day, 2019-01-14 through 2019-01-20.
daterange = [(2019, 1, day_of_month, 0) for day_of_month in range(14, 21)]

result = []

for day in daterange:
    started_at = datetime.now()
    print(f"{started_at}: Starting query for {datetime(*day)}.")
    day_routes = get_some_times(['14'], day, hours = 24)
    result.extend(day_routes)

2019-02-27 19:30:18.818793: Starting query for 2019-01-14 00:00:00.
Getting data... 0
That took 3.99 seconds.
Getting data... 1
That took 2.53 seconds.
Getting data... 2
That took 3.63 seconds.
Getting data... 3
That took 3.56 seconds.
Getting data... 4
That took 3.88 seconds.
Getting data... 5
That took 3.51 seconds.
Getting data... 6
That took 4.69 seconds.
Getting data... 7
That took 5.2 seconds.
Getting data... 8
That took 5.7 seconds.
Getting data... 9
That took 7.45 seconds.
Getting data... 10
That took 5.22 seconds.
Getting data... 11
That took 5.08 seconds.
Getting data... 12
That took 4.96 seconds.
Getting data... 13
That took 5.21 seconds.
Getting data... 14
That took 6.18 seconds.
Getting data... 15
That took 4.9 seconds.
Getting data... 16
That took 6.05 seconds.
Getting data... 17
That took 5.57 seconds.
Getting data... 18
That took 5.2 seconds.
Getting data... 19
That took 3.05 seconds.
Getting data... 20
That took 4.33 seconds.
Getting data... 21
That took 4.45 seconds.


In [7]:
# Persist the per-day route 14 scrapes collected in `result`.
with open('route_14_week_data.json', mode='w') as f:
    json.dump(result, fp=f)