In [25]:
import json
from datetime import datetime, timedelta, timezone

import numpy as np
import pandas as pd
import requests

In [12]:
# Runs eclipses.py via IPython's %run, which loads the names that script defines into this notebook.
# NOTE(review): produce_stops, produce_buses, find_eclipses and find_nadirs are used below but are
# not defined in this notebook — presumably they come from this script; confirm.
# The path is relative to the kernel's working directory, so the cell fails when started elsewhere.
%run desktop/opentransit-metrics/mykelu/eclipses.py

In [13]:
def load_json(filename = 'routes24h20181015v2.json'):
    """Read a JSON file and return the decoded Python object.

    Parameters
    ----------
    filename : str
        Path of the JSON file to read. Defaults to the route dump used in
        this notebook.

    Returns
    -------
    object
        Whatever ``json.load`` decodes from the file (list, dict, ...).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; pin the encoding so the result does not depend on
    # the platform's locale default (e.g. cp1252 on Windows).
    with open(filename, 'r', encoding = 'utf-8') as f:
        return json.load(f)

In [14]:
# Load the default route dump once. `test` is the input later cells hand to get_all_stops.
test = load_json()

In [15]:
# Refactored from get_stops: works on records passed in by the caller instead of querying for them.
def get_all_stops(data):
    """Build one table of per-stop rows for every route present in ``data``.

    For each distinct ``rid`` in ``data`` the stop ids of that route are
    fetched from the restbus API, the route's records are turned into stop and
    bus frames (``produce_stops`` / ``produce_buses``), and for every stop the
    rows returned by ``find_nadirs(find_eclipses(buses, stop))`` are collected.
    Those helpers are not defined in this notebook; they must already be in
    the namespace.

    Parameters
    ----------
    data : iterable of dict
        Records that each carry at least an ``'rid'`` key (the route id).
        The remaining schema is whatever ``produce_stops`` and
        ``produce_buses`` expect.

    Returns
    -------
    pandas.DataFrame
        One row per nadir with the columns ``DATE``, ``DID``, ``ROUTE``,
        ``SID``, ``TIME``, ``VID`` (plus any extra columns the helper frames
        carry), sorted by column name, followed by a timezone-aware
        ``timestamp`` column. The per-stop indexes are kept as they are, so
        index labels repeat. If nothing was collected, an empty frame with the
        six base columns (and no ``timestamp``) is returned.

    Notes
    -----
    Routes whose lookup raises ``KeyError`` are skipped, as are routes for
    which ``produce_stops`` fails and stops without exactly one matching
    direction. Each skip is reported on stdout. Network errors from
    ``requests`` are not caught.
    """
    columns = ["VID", "DATE", "TIME", "SID", "DID", "ROUTE"]

    # NOTE(review): a fixed UTC-8 offset ignores daylight saving time. The sample
    # file name is dated 2018-10-15, when San Francisco is presumably on UTC-7 —
    # confirm whether the one-hour shift is intended before changing it.
    local_tz = timezone(timedelta(hours = -8))

    def log(message, **print_kwargs):
        # Prefix every progress line with the wall-clock time.
        print(f"{datetime.now().strftime('%a %b %d %I:%M:%S %p')}: {message}", **print_kwargs)

    # Group the records by route in a single pass. This uses the `data` argument
    # (not a notebook global) and visits each route once even if its rid repeats.
    records_by_route = {}
    for record in data:
        records_by_route.setdefault(record['rid'], []).append(record)

    frames = []

    for route, route_data in records_by_route.items():
        log(f"Starting with {route}.")
        try:
            stop_ids = [stop['id']
                for stop
                in requests.get(f"http://restbus.info/api/agencies/sf-muni/routes/{route}").json()['stops']]

            if not stop_ids:
                continue

            # Both frames depend only on the route's records, so build them once
            # per route rather than once per stop.
            try:
                stops = produce_stops(route_data, route)
            except Exception as err: # TODO: figure out why requests aren't working; maybe need to feed it df with only that route data?
                log(f"could not produce stops df for {stop_ids[0]} on route {route}. Skipping. ({err!r})")
                continue

            route_buses = produce_buses(route_data)

            for stop_id in stop_ids:
                # squeeze() yields a single row (Series) only when exactly one stop matches.
                stop = stops[stops['SID'] == stop_id].squeeze()

                try:
                    buses = route_buses[route_buses['DID'] == stop['DID']]
                except ValueError: # accounts for stops with no associated direction
                    log(f"no direction associated to {stop_id} on route {route}. Skipping.")
                    continue

                eclipses = find_eclipses(buses, stop)
                nadirs = find_nadirs(eclipses)

                try:
                    # TIME arrives as epoch milliseconds; split it into local date and time-of-day.
                    local_times = nadirs["TIME"].apply(lambda x: datetime.fromtimestamp(x//1000, local_tz))
                    nadirs = nadirs.assign(
                        DATE = local_times.apply(lambda x: x.date()),
                        TIME = local_times.apply(lambda x: x.time()),
                        SID = stop_id,
                        DID = stop["DID"],
                        ROUTE = route,
                    )
                    # Empty frames add no rows, so leave them out of the concat.
                    if not nadirs.empty:
                        frames.append(nadirs)
                except Exception as err:
                    log(f"could not produce stops df for {stop_id} on route {route}. Skipping. ({err!r})", end = "\r")
        except KeyError:
            log(f"KeyError at {route}!")
            continue

    if not frames:
        return pd.DataFrame(columns = columns)

    # Concatenate once at the end instead of growing a frame inside the loop.
    bus_stops = pd.concat(frames, sort = True)
    # Keep the name-sorted column layout and make sure every base column exists.
    bus_stops = bus_stops.reindex(columns = sorted(set(bus_stops.columns).union(columns)))

    # Recombine date and time-of-day into one timezone-aware timestamp per row.
    bus_stops['timestamp'] = [
        datetime.combine(day, clock, tzinfo = local_tz)
        for day, clock in zip(bus_stops['DATE'], bus_stops['TIME'])
    ]

    return bus_stops

In [16]:
# Build the stop table for every route in the loaded records.
# The log below advances by minutes per route, so expect this cell to run for a long time.
df = get_all_stops(test)

Sat Feb 23 04:51:05 AM: Starting with 1.
Sat Feb 23 04:54:43 AM: Starting with 3.
Sat Feb 23 04:55:18 AM: no direction associated to 6088 on route 3. Skipping.
Sat Feb 23 04:56:00 AM: Starting with 5.
Sat Feb 23 04:57:58 AM: no direction associated to 3183 on route 5. Skipping.
Sat Feb 23 04:57:59 AM: no direction associated to 3923 on route 5. Skipping.
Sat Feb 23 04:58:00 AM: no direction associated to 3200 on route 5. Skipping.
Sat Feb 23 04:58:01 AM: no direction associated to 34735 on route 5. Skipping.
Sat Feb 23 04:59:08 AM: no direction associated to 4850 on route 5. Skipping.
Sat Feb 23 04:59:09 AM: no direction associated to 7884 on route 5. Skipping.
Sat Feb 23 04:59:10 AM: no direction associated to 37563 on route 5. Skipping.
Sat Feb 23 04:59:37 AM: no direction associated to 3161 on route 5. Skipping.
Sat Feb 23 04:59:38 AM: no direction associated to 7775 on route 5. Skipping.
Sat Feb 23 04:59:38 AM: Starting with 6.
Sat Feb 23 05:01:05 AM: no direction associated to 740

In [17]:
# Summarise columns, dtypes and memory use; memory_usage='deep' also measures the contents of object columns.
df.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214783 entries, 1 to 28135
Data columns (total 7 columns):
DATE         214783 non-null object
DID          214783 non-null object
ROUTE        214783 non-null object
SID          214783 non-null object
TIME         214783 non-null object
VID          214783 non-null object
timestamp    214783 non-null datetime64[ns, UTC-08:00]
dtypes: datetime64[ns, UTC-08:00](1), object(6)
memory usage: 72.3 MB


In [None]:
# Count how many rows share each index label. The labels are not unique: the info() summary above
# reports 214783 entries labelled only 1 to 28135.
unique, counts = np.unique(df.index.values, return_counts = True)
dict(zip(unique, counts))

In [31]:
# Show every row that carries index label 970 to see which routes and stops share it.
df[df.index == 970]

Unnamed: 0,DATE,DID,ROUTE,SID,TIME,VID,timestamp
970,2018-10-14,9____O_N00,9,6569,23:58:15,8800,2018-10-14 23:58:15-08:00
970,2018-10-14,9____O_N00,9,6583,23:58:15,8800,2018-10-14 23:58:15-08:00
970,2018-10-14,14___I_F00,14,5693,23:31:45,7211,2018-10-14 23:31:45-08:00
970,2018-10-15,19___I_F00,19,6999,00:05:30,8422,2018-10-15 00:05:30-08:00
970,2018-10-14,24___O_F00,24,5095,23:49:15,5505,2018-10-14 23:49:15-08:00
970,2018-10-14,24___O_F00,24,5884,23:49:15,5505,2018-10-14 23:49:15-08:00
970,2018-10-15,36___O_F00,36,6228,18:20:53,8523,2018-10-15 18:20:53-08:00
970,2018-10-15,44___O_F00,44,3694,00:08:00,8840,2018-10-15 00:08:00-08:00
970,2018-10-15,55___I_F00,55,3291,05:57:53,8869,2018-10-15 05:57:53-08:00
970,2018-10-15,55___I_F00,55,33293,05:57:53,8869,2018-10-15 05:57:53-08:00


In [33]:
# Persist the stop table as JSON. reset_index() swaps the repeated index labels for a fresh 0..n-1
# index and keeps the old labels as an ordinary column.
with open('routes20181015v2stops.json', 'w') as f:
    df.reset_index().to_json(f)