In [1]:
import requests
import numpy as np
import json
import math
from datetime import datetime
import time
import os

In [2]:
def list_string(arr):
    """Render ``arr`` as a list literal of double-quoted strings.

    Every element is converted with ``str`` and the result is serialised with
    ``json.dumps``, e.g. ``['1', 'N']`` -> ``["1", "N"]``. The output is
    interpolated into the GraphQL ``routes:`` argument by ``get_some_times``.

    Using ``json.dumps`` (rather than swapping quote characters in the list's
    ``repr``) keeps the literal well-formed when an element itself contains
    an apostrophe or a double quote.

    Parameters
    ----------
    arr : iterable
        Values to render; each one is passed through ``str``.

    Returns
    -------
    str
        The list literal, with items separated by ``", "``.
    """
    # ensure_ascii=False leaves non-ASCII characters unescaped, which is what
    # the previous repr-based implementation produced for printable text.
    return json.dumps([str(item) for item in arr], ensure_ascii=False)

In [3]:
def get_some_times(route, dt_params=(2018,10,15,0,1,0), hours=20, per_hour=1):
    """Fetch vehicle states for one or more routes, one request per time slice.

    The period starting at ``datetime(*dt_params)`` is cut into
    ``hours * per_hour`` consecutive slices of ``1/per_hour`` hour each. For
    every slice a ``trynState`` GraphQL query (agency ``"muni"``) is sent and
    the returned routes are collected, then merged per route id.

    Parameters
    ----------
    route : str, or list/tuple of str
        Route id(s) to query. A single value is wrapped in a list.
    dt_params : tuple
        Positional arguments for ``datetime(...)`` giving the start of the
        first slice. The datetime is naive, so ``datetime.timestamp``
        interprets it in the machine's local timezone.
    hours : int
        Number of hours to cover.
    per_hour : int
        Number of requests (slices) per hour; coerced with ``int``.

    Returns
    -------
    list of dict
        One dict per route id, in order of first appearance, with keys
        ``'rid'``, ``'routeStates'`` (the states of every slice, each slice
        sorted by integer ``vtime`` and appended in request order) and
        ``'stops'`` (taken from the first slice in which the route appeared).

    Notes
    -----
    A slice whose response is not valid JSON, or that does not contain
    ``data.trynState.routes``, is reported on stdout and skipped, so the
    slices fetched so far are not lost. Network errors raised by
    ``requests.get`` are not caught.
    """
    route_data=[]
    # Accept tuples as well as lists; anything else is treated as one route id.
    if not isinstance(route, (list, tuple)):
        route = [route]

    per_hour=int(per_hour)
    timestamp=int(datetime(*dt_params).timestamp())*1000  # epoch milliseconds
    T0 = time.time()
    for i in range(hours*per_hour):
        t0=time.time()
        # 36e5 ms = one hour. It is a float literal, so both bounds are floats
        # and are interpolated into the query like "1539586800000.0"; that
        # formatting is kept as-is because the recorded runs used it.
        start_time = timestamp + i*36e5/per_hour
        end_time   = start_time + 36e5/per_hour

        query = f"""{{
          trynState(agency: "muni", startTime: "{start_time}", endTime: "{end_time}", routes: {list_string(route)}) {{
            agency
            startTime
            routes {{
              rid
              stops {{
                sid
                name
                lat
                lon
              }}
              routeStates {{
                vtime
                vehicles {{
                  vid
                  lat
                  lon
                  heading
                  did
                }}
              }}
            }}
          }}
        }}
        """ # Braces need to be doubled for f-string

        query_url = "https://06o8rkohub.execute-api.us-west-2.amazonaws.com/dev/graphql?query="+query
        print('Getting data...',i)
        r = requests.get(query_url)
        print(f'That took {round(time.time()-t0,2)} seconds.')

        try:
            routes = json.loads(r.text)['data']['trynState']['routes']
        except (KeyError, TypeError, ValueError):
            # KeyError: an expected key is missing (e.g. an errors-only reply).
            # TypeError: "data" or "trynState" is null.
            # ValueError: the body is not JSON (json.JSONDecodeError).
            print(f"Error for time range {start_time}-{end_time}.")
            continue
        if routes:
            route_data.extend(routes)

    # Merge the per-slice results into a single entry per route id.
    r_sort = {}
    print('Final formatting...')
    for x in route_data:
        merged = r_sort.setdefault(
            x['rid'], {'rid':x['rid'],'routeStates':[],'stops':x['stops']})
        merged['routeStates'].extend(sorted(x['routeStates'], key=lambda rs: int(rs['vtime'])))
    r_sort=list(r_sort.values())
    print('Done')
    print(f'That all took {round(time.time()-T0,2)} seconds.')
    return r_sort

Testing main scraping function:

In [231]:
# Smoke test: fetch two hourly slices starting 2018-10-15 12:00 for five routes.
a=get_some_times(['1','14','N','14X','12'],dt_params=(2018,10,15,12),hours=2)

Getting data... 0
That took 5.59 seconds.
Getting data... 1
That took 4.89 seconds.
Final formatting...
Done
That all took 10.59 seconds.


List of all routes (scraped from https://github.com/trynmaps/opentransit-map/blob/master/src/res/muniRoutes2.json)

In [4]:
# Every route id to scrape; get24all() queries this list in slices because
# large route lists in a single call make the API return an error.
all_routes = ["1","2","3","5","6","7","8","9","10","12","14","18","19","21","22","23","24","25","27","28","29","30","31","33","35","36","37","38","39","41","43","44","45","47","48","49","52","54","55","56","57","66","67","88","90","91","83X","30X","7R","J","N","14R","14X","PH","E","38R","82X","81X","T-OWL","M-OWL","7X","N-OWL","L-OWL","76X","9R","C","38AX","38BX","31BX","1AX","31AX","1BX","K-OWL","F","8AX","8BX","M","L","5R","PM","K/T","NX","28R","BUS"]

Get 24 hours of data in chunks (more than ~30 routes per call gives an error)

In [207]:
def get24all(dt_params=(2018,10,15,7), routes=None, chunk_size=30):
    """Fetch 24 hours of data for many routes, a chunk of routes at a time.

    The route list is split into consecutive groups of at most ``chunk_size``
    and ``get_some_times(..., hours=24)`` is called once per group. The number
    of groups follows from the length of the list, so no route is dropped
    when the list holds more than ``3 * chunk_size`` entries.

    Parameters
    ----------
    dt_params : tuple
        Start of the 24-hour window, forwarded unchanged to ``get_some_times``.
    routes : sequence of str, optional
        Route ids to fetch. Defaults to the notebook-level ``all_routes``.
    chunk_size : int
        Maximum number of routes per ``get_some_times`` call. The default of
        30 matches the limit noted in this notebook (more than ~30 routes per
        call gives an error).

    Returns
    -------
    list of dict
        The concatenated results of every ``get_some_times`` call.
    """
    if routes is None:
        routes = all_routes
    result = []
    t0=time.time()
    starts = range(0, len(routes), chunk_size)
    for n, lo in enumerate(starts, start=1):
        print(f'Chunk {n} of {len(starts)}:')
        # Always hand over a real list so get_some_times sees the same type
        # regardless of what kind of sequence `routes` is.
        result.extend(get_some_times(list(routes[lo:lo+chunk_size]),dt_params,hours=24))
    print(f'Whole thing took {round(time.time()-t0,2)} seconds.')
    return result

Set `dt_params` to the start of the 24-hour window you want, given as `(year, month, day, hour)`.

In [216]:
# Fetch the full day starting 2018-10-15 00:00 for every route in all_routes.
# Slow: each hourly request in the recorded run below took several seconds.
allofem=get24all(dt_params=(2018,10,15,0))

Chunk 1 of 3:
Getting data... 0
That took 7.88 seconds.
Getting data... 1
That took 7.19 seconds.
Getting data... 2
That took 6.86 seconds.
Getting data... 3
That took 6.44 seconds.
Getting data... 4
That took 6.81 seconds.
Getting data... 5
That took 6.92 seconds.
Getting data... 6
That took 8.22 seconds.
Getting data... 7
That took 6.79 seconds.
Getting data... 8
That took 7.18 seconds.
Getting data... 9
That took 7.77 seconds.
Getting data... 10
That took 7.54 seconds.
Getting data... 11
That took 6.8 seconds.
Getting data... 12
That took 5.91 seconds.
Getting data... 13
That took 5.61 seconds.
Getting data... 14
That took 4.6 seconds.
Getting data... 15
That took 3.88 seconds.
Getting data... 16
That took 4.02 seconds.
Getting data... 17
That took 3.58 seconds.
Getting data... 18
That took 2.21 seconds.
Getting data... 19
That took 1.98 seconds.
Getting data... 20
That took 2.03 seconds.
Getting data... 21
That took 2.33 seconds.
Getting data... 22
That took 3.95 seconds.
Getting d

Dump data to json file. Here, the final output is 185 MB, and zips to 8 MB.

In [221]:
# Write the scraped routes to ../data/routes24hv1.json, relative to the
# notebook's working directory. open() does not create missing directories,
# so ../data must already exist.
with open(os.path.join('..','data','routes24hv1.json'),'w') as outfile:
    json.dump(allofem,outfile)