# Imports

In [2]:
import json
import requests

from datetime import datetime, timedelta, timezone, time, date

import pandas as pd
import numpy as np

In [4]:
# import functions from eclipses notebook
%run desktop/opentransit-metrics/mykelu/eclipses.py

# Get Stops

In [6]:
def get_stops(dates, routes, directions = [], new_stops = [], times = ("00:00", "23:59")):
    """
    get_stops
    
    Description:
        Returns every instance of a bus stopping at a given set of stops, on a given set of routes, during a given time period.

    Parameters:
        dates: an array of dates, formatted as strings in the form YYYY-MM-DD
        routes: an array of routes, each represented as a string
        directions: an array of strings representing the directions to filter
        stops: an array of strings representing the stops to filter
        times: a tuple with the start and end times (in UTC -8:00) as strings in the form HH:MM 

    Returns:
        stops: a DataFrame, filtered by the given directions and stops, with the following columns:
            VID: the vehicle ID
            Time: a datetime object representing the date/time of the stop
            Route: the route on which the stop occurred
            Stop: the stop at which the stop occurred
            Dir: the direction in which the stop occurred
    """
    bus_stops = pd.DataFrame(columns = ["VID", "DATE", "TIME", "SID", "DID", "ROUTE"])
    
    for route in routes:
        stop_ids = [stop['id']
            for stop
            in requests.get(f"http://restbus.info/api/agencies/sf-muni/routes/{route}").json()['stops']][2:4]

        for stop_id in stop_ids:
            # check if stops to filter were provided, or if the stop_id is in the list of filtered stops
            if (stop_id in new_stops) ^ (len(new_stops) == 0):
                for date in dates:
                    print(f"{datetime.now().strftime('%a %b %d %I:%M:%S %p')}: starting processing on stop {stop_id} on route {route} on {date}.")
                    start_time = int(datetime.strptime(f"{date} {timespan[0]} -0800", "%Y-%m-%d %H:%M %z").timestamp())*1000
                    end_time   = int(datetime.strptime(f"{date} {timespan[1]} -0800", "%Y-%m-%d %H:%M %z").timestamp())*1000

                    data = query_graphql(start_time, end_time, route)
                    print(f"{datetime.now().strftime('%a %b %d %I:%M:%S %p')}: performed query.")
                          
                    if data is None:  # API might refuse to cooperate
                        print("API probably timed out")
                        continue
                    elif len(data) == 0:  # some days somehow have no data
                        print(f"no data for {month}/{day}")
                        continue
                    else:
                        stops = produce_stops(data, route)
                        print(f"{datetime.now().strftime('%a %b %d %I:%M:%S %p')}: produced stops.")
                              
                        buses = produce_buses(data)
                        print(f"{datetime.now().strftime('%a %b %d %I:%M:%S %p')}: produced buses.")

                        stop = stops[stops['SID'] == stop_id].squeeze()
                        buses = buses[buses['DID'] == stop['DID']]

                        eclipses = find_eclipses(buses, stop)
                        print(f"{datetime.now().strftime('%a %b %d %I:%M:%S %p')}: found eclipses.")
                              
                        nadirs = find_nadirs(eclipses)
                        print(f"{datetime.now().strftime('%a %b %d %I:%M:%S %p')}: found nadirs.")
                            
                        nadirs["TIME"] = nadirs["TIME"].apply(lambda x: datetime.fromtimestamp(x//1000, timezone(timedelta(hours = -8))))
                        nadirs['DATE'] = nadirs['TIME'].apply(lambda x: x.date())
                        nadirs['TIME'] = nadirs['TIME'].apply(lambda x: x.time())
                        nadirs["SID"] = stop_id
                        nadirs["DID"] = stop["DID"]
                        nadirs["ROUTE"] = route
                        old_length = len(bus_stops)
                        bus_stops = bus_stops.append(nadirs, sort = True)
                        print(f"{datetime.now().strftime('%a %b %d %I:%M:%S %p')}: finished processing.")

    # filter for directions
    if len(directions) > 0:
        bus_stops = bus_stops.loc[bus_stops['DID'].apply(lambda x: x in directions)]
    
    return bus_stops

In [7]:
# get some stops
route = ["12", "14"]

timespan = ("08:00",
            "11:00")

dates = [
    "2018-11-12",
    "2018-11-13",
    "2018-11-14",
    "2018-11-15",
    "2018-11-16",
]

new_stops = get_stops(dates, route, times = timespan)

Sun Jan 27 07:15:06 PM: starting processing on stop 5851 on route 12 on 2018-11-12.
Sun Jan 27 07:15:22 PM: performed query.
Sun Jan 27 07:15:24 PM: produced stops.
Sun Jan 27 07:15:24 PM: produced buses.
Sun Jan 27 07:15:49 PM: found eclipses.
Sun Jan 27 07:15:49 PM: found nadirs.
Sun Jan 27 07:15:49 PM: finished processing.
Sun Jan 27 07:15:49 PM: starting processing on stop 5851 on route 12 on 2018-11-13.
Sun Jan 27 07:16:02 PM: performed query.
Sun Jan 27 07:16:07 PM: produced stops.
Sun Jan 27 07:16:07 PM: produced buses.
Sun Jan 27 07:16:25 PM: found eclipses.
Sun Jan 27 07:16:25 PM: found nadirs.
Sun Jan 27 07:16:25 PM: finished processing.
Sun Jan 27 07:16:25 PM: starting processing on stop 5851 on route 12 on 2018-11-14.
Sun Jan 27 07:16:35 PM: performed query.
Sun Jan 27 07:16:36 PM: produced stops.
Sun Jan 27 07:16:36 PM: produced buses.
Sun Jan 27 07:16:56 PM: found eclipses.
Sun Jan 27 07:16:56 PM: found nadirs.
Sun Jan 27 07:16:56 PM: finished processing.
Sun Jan 27 07:16

In [8]:
# TODO: parse direction as inbound/outbound? (remove route indicator)
# test directions (find unique directions) - possibly parse directions
new_stops.head()

Unnamed: 0,DATE,DID,ROUTE,SID,TIME,VID
1,2018-11-12,12___O_F00,12,5851,08:00:13,8776
5270,2018-11-12,12___O_F00,12,5851,10:28:06,8776
79,2018-11-12,12___O_F00,12,5851,08:02:13,8705
4835,2018-11-12,12___O_F00,12,5851,10:16:05,8705
979,2018-11-12,12___O_F00,12,5851,08:28:00,8912


# Prepare Timestamp Data

In [9]:
# TODO: when to use date/time columns vs timestamps?
new_stops['timestamp'] = new_stops[['DATE', 'TIME']].apply(lambda x: datetime.strptime(f"{x['DATE'].isoformat()} {x['TIME'].isoformat()} -0800", 
                                                                                       "%Y-%m-%d %H:%M:%S %z"),  
                                                           axis = 'columns')

In [10]:
new_stops['timestamp'].head()

1      2018-11-12 08:00:13-08:00
5270   2018-11-12 10:28:06-08:00
79     2018-11-12 08:02:13-08:00
4835   2018-11-12 10:16:05-08:00
979    2018-11-12 08:28:00-08:00
Name: timestamp, dtype: datetime64[ns, UTC-08:00]

# Compute Average Waiting Time

In [11]:
# find the smallest nonnegative waiting time
def absmin(series):
    return series[series >= 0].min()

In [12]:
# input: df with entries from one day
def minimum_waiting_times(df, start_time, end_time, group):
    minute_range = [start_time + timedelta(minutes = i) for i in range((end_time - start_time).seconds//60)]
    wait_times = pd.DataFrame(columns = [])
    
    for minute in minute_range:
        df['WAIT'] = df['timestamp'].apply(lambda x: (x - minute).total_seconds())
        pivot = df[group + ['WAIT']].pivot_table(values = ['WAIT'], index = group, aggfunc = absmin)
        pivot['TIME'] = minute
        pivot = pivot.reset_index()
        wait_times = wait_times.append(pivot, sort = True)
        
    return wait_times

In [14]:
# find average wait times as defined in the MVP specification
def average_wait_times(df, timespan, group, aggfuncs):
    dates = df['DATE'].unique()
    avg_over_pd = pd.DataFrame(columns = group + ['DATE', 'WAIT'])
    
    for date in new_stops['DATE'].unique():
        start_time = datetime.strptime(f"{date.isoformat()} {timespan[0]} -0800", "%Y-%m-%d %H:%M %z")
        end_time   = datetime.strptime(f"{date.isoformat()} {timespan[1]} -0800", "%Y-%m-%d %H:%M %z")
        daily_wait = minimum_waiting_times(new_stops[new_stops['DATE'] == date], start_time, end_time, group)
        daily_wait = daily_wait.pivot_table(values = ['WAIT'], index = group).reset_index()
        daily_wait['DATE'] = date
        avg_over_pd = avg_over_pd.append(daily_wait, sort = True)
    
    return avg_over_pd.pivot_table(values = ['WAIT'], index = group, aggfunc = aggfuncs)

In [15]:
average_wait_times(new_stops, ("08:00", "11:00"), ['SID'], {'WAIT': [np.mean, np.median]})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0_level_0,WAIT,WAIT
Unnamed: 0_level_1,mean,median
SID,Unnamed: 1_level_2,Unnamed: 2_level_2
5528,353.748889,357.388889
5579,343.594444,349.361111
5844,555.461213,559.958084
5851,592.512036,552.542169


In [16]:
# TODO:
# find route combinations to test (w/overlapping stops/directions)
# optimize avg_wait_times
# write other metrics