In [1]:
import json
import requests
import pandas as pd
import numpy as np

In [2]:
# Execute eclipses.py inside this notebook's namespace.
# NOTE(review): later cells call produce_stops, produce_buses, find_eclipses and find_nadirs,
# and use datetime / timezone / timedelta, none of which are defined or imported in the
# visible cells -- presumably they all come from this script; confirm.
# NOTE(review): the path is relative, so it depends on the kernel's working directory.
%run desktop/opentransit-metrics/mykelu/eclipses.py

In [3]:
def load_json(filename = 'routes24h20181015v2.json'):
    """Read ``filename`` as text and return its parsed JSON content.

    Parameters
    ----------
    filename : str
        Path of the JSON file to read. Defaults to 'routes24h20181015v2.json'.

    Returns
    -------
    The Python object decoded from the file (whatever ``json`` produces for its contents).
    """
    with open(filename, 'r') as source:
        raw_text = source.read()
    return json.loads(raw_text)

In [4]:
# Parse the default data file ('routes24h20181015v2.json') into memory.
test = load_json()

In [5]:
def _print_with_timestamp(message, **print_kwargs):
    """Print ``message`` prefixed with the current local time (e.g. 'Sun Feb 24 10:56:36 PM: ...')."""
    print(f"{datetime.now().strftime('%a %b %d %I:%M:%S %p')}: {message}", **print_kwargs)


# get all stops from a given data set
def get_all_stops(data):
    """Collect the bus arrivals found at every stop of every route present in ``data``.

    For each distinct route id in ``data`` the stop list is fetched from the restbus.info
    API, and for each stop the helpers ``produce_stops``, ``produce_buses``,
    ``find_eclipses`` and ``find_nadirs`` are used to derive a per-stop frame, which is then
    tagged with the stop id, direction id and route.

    NOTE(review): those helpers, as well as ``datetime``, ``timezone`` and ``timedelta``,
    are not defined in this cell; presumably they are provided by eclipses.py -- confirm.

    Parameters
    ----------
    data : list of dict
        Records carrying at least an ``'rid'`` key (the route id). The records belonging
        to one route are passed on to ``produce_stops`` / ``produce_buses``.

    Returns
    -------
    pandas.DataFrame
        One row per detected arrival with columns VID, DATE, TIME, SID, DID and ROUTE, plus
        a timezone-aware ``timestamp`` column when at least one row was produced. Index
        labels of the per-stop frames are kept as-is, so they are generally NOT unique.

    Notes
    -----
    * Performs one HTTP GET per route and prints progress messages.
    * Stops without a usable direction are skipped; a ``KeyError`` while handling a route
      skips the remainder of that route.
    * NOTE(review): times are converted with a fixed UTC-8 offset, which ignores daylight
      saving time -- confirm this is intended for the dates being processed.
    """
    # The empty, correctly-labelled frame is kept as the first piece so that the result has
    # the expected columns even when nothing is found, and so that the final concat sees
    # the same set of column layouts the old one-by-one append did.
    pieces = [pd.DataFrame(columns = ["VID", "DATE", "TIME", "SID", "DID", "ROUTE"])]

    local_tz = timezone(timedelta(hours = -8))

    # Use the `data` argument (not a notebook global) and visit each route id only once,
    # in first-seen order, so repeated records cannot duplicate the output rows.
    route_ids = list(dict.fromkeys(ele['rid'] for ele in data))

    for route in route_ids:
        _print_with_timestamp(f"Starting with {route}.")
        try:
            stop_ids = [stop['id']
                for stop
                in requests.get(f"http://restbus.info/api/agencies/sf-muni/routes/{route}").json()['stops']]

            route_data = [ele for ele in data if ele['rid'] == route]

            for stop_id in stop_ids:
                # NOTE(review): produce_stops / produce_buses do not take stop_id, so they look
                # loop-invariant and could probably be hoisted above this loop; left in place
                # because their side effects cannot be verified from this notebook.
                try:
                    stops = produce_stops(route_data, route)
                except Exception:
                    # `Exception` (not a bare except) so Ctrl-C still interrupts the run.
                    _print_with_timestamp(f"could not produce stops df for {stop_id} on route {route}. Skipping.")
                    break  # the call does not depend on stop_id, so give up on this route

                buses = produce_buses(route_data)

                # Single matching row -> Series; zero or several rows stay a DataFrame.
                stop = stops[stops['SID'] == stop_id].squeeze()

                try:
                    buses = buses[buses['DID'] == stop['DID']]
                except ValueError: # accounts for stops with no associated direction
                    _print_with_timestamp(f"no direction associated to {stop_id} on route {route}. Skipping.")
                    continue

                eclipses = find_eclipses(buses, stop)
                nadirs = find_nadirs(eclipses)

                try:
                    # TIME arrives as epoch milliseconds; split it into local DATE and TIME.
                    nadirs["TIME"] = nadirs["TIME"].apply(lambda x: datetime.fromtimestamp(x//1000, local_tz))
                    nadirs['DATE'] = nadirs['TIME'].apply(lambda x: x.date())
                    nadirs['TIME'] = nadirs['TIME'].apply(lambda x: x.time())
                    nadirs["SID"] = stop_id
                    nadirs["DID"] = stop["DID"]
                    nadirs["ROUTE"] = route
                    # Collect now, concatenate once at the end (appending inside the loop is
                    # quadratic, and DataFrame.append no longer exists in pandas >= 2.0).
                    pieces.append(nadirs)
                except Exception:
                    _print_with_timestamp(f"could not produce stops df for {stop_id} on route {route}. Skipping.", end = "\r")
        except KeyError:
            _print_with_timestamp(f"KeyError at {route}!")
            continue

    bus_stops = pd.concat(pieces, sort = True) if len(pieces) > 1 else pieces[0]

    if len(bus_stops) > 0:
        # Recombine DATE and TIME into a single timezone-aware timestamp.
        bus_stops['timestamp'] = bus_stops[['DATE', 'TIME']].apply(
            lambda x: datetime.strptime(f"{x['DATE'].isoformat()} {x['TIME'].isoformat()} -0800",
                                        "%Y-%m-%d %H:%M:%S %z"),
            axis = 'columns')

    return bus_stops

In [6]:
# Build the arrivals table for the loaded data.
# Long-running: the captured output below shows several minutes per route
# (route 1 started at 10:56:36 PM, route 3 at 11:04:17 PM).
df = get_all_stops(test)

Sun Feb 24 10:56:36 PM: Starting with 1.
Sun Feb 24 11:04:17 PM: Starting with 3.
Sun Feb 24 11:05:25 PM: no direction associated to 6088 on route 3. Skipping.
Sun Feb 24 11:06:54 PM: Starting with 5.
Sun Feb 24 11:09:18 PM: no direction associated to 3183 on route 5. Skipping.
Sun Feb 24 11:09:20 PM: no direction associated to 3923 on route 5. Skipping.
Sun Feb 24 11:09:21 PM: no direction associated to 3200 on route 5. Skipping.
Sun Feb 24 11:09:22 PM: no direction associated to 34735 on route 5. Skipping.
Sun Feb 24 11:10:49 PM: no direction associated to 4850 on route 5. Skipping.
Sun Feb 24 11:10:51 PM: no direction associated to 7884 on route 5. Skipping.
Sun Feb 24 11:10:52 PM: no direction associated to 37563 on route 5. Skipping.
Sun Feb 24 11:11:24 PM: no direction associated to 3161 on route 5. Skipping.
Sun Feb 24 11:11:25 PM: no direction associated to 7775 on route 5. Skipping.
Sun Feb 24 11:11:25 PM: Starting with 6.
Sun Feb 24 11:14:36 PM: no direction associated to 740

In [7]:
# Summarise column dtypes and non-null counts. memory_usage='deep' asks pandas to measure
# the actual contents of object-dtype columns instead of reporting a shallow estimate.
df.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318777 entries, 1 to 28135
Data columns (total 7 columns):
DATE         318777 non-null object
DID          318777 non-null object
ROUTE        318777 non-null object
SID          318777 non-null object
TIME         318777 non-null object
VID          318777 non-null object
timestamp    318777 non-null datetime64[ns, UTC-08:00]
dtypes: datetime64[ns, UTC-08:00](1), object(6)
memory usage: 107.3 MB


In [9]:
# Index labels in df are not unique. The duplicates have two sources:
#   1. the per-stop DataFrames are combined without renumbering, so each keeps its own labels;
#   2. duplicate stops, i.e. the same bus recorded at the same time at different stop ids.
# The rows labelled 970 are shown as an example.
df[df.index == 970]

Unnamed: 0,DATE,DID,ROUTE,SID,TIME,VID,timestamp
970,2018-10-14,9____O_N00,9,6569,23:58:15,8800,2018-10-14 23:58:15-08:00
970,2018-10-14,9____O_N00,9,6583,23:58:15,8800,2018-10-14 23:58:15-08:00
970,2018-10-14,14___I_F00,14,5693,23:31:45,7211,2018-10-14 23:31:45-08:00
970,2018-10-15,19___I_F00,19,6999,00:05:30,8422,2018-10-15 00:05:30-08:00
970,2018-10-15,44___O_F00,44,3694,00:08:00,8840,2018-10-15 00:08:00-08:00
970,2018-10-15,55___I_F00,55,3291,05:57:53,8869,2018-10-15 05:57:53-08:00
970,2018-10-15,55___I_F00,55,33293,05:57:53,8869,2018-10-15 05:57:53-08:00
970,2018-10-15,66___I_F00,66,4861,06:12:53,8519,2018-10-15 06:12:53-08:00
970,2018-10-15,66___I_F00,66,4859,06:12:53,8519,2018-10-15 06:12:53-08:00
970,2018-10-15,91___I_N00,91,5792,00:03:15,8617,2018-10-15 00:03:15-08:00


In [10]:
# Write the arrivals table to JSON with a fresh 0..n-1 row index.
# reset_index(drop=True) is used rather than reindex(index=range(len(df))): reindex selects
# rows BY LABEL and raises on an index with repeated labels, whereas the goal here is only
# to renumber the rows. to_json with its default orient also needs a unique index.
with open('routes20181015v2stops.json', 'w') as f:
    df.reset_index(drop = True).to_json(f)