# Imports

In [499]:
import json
import requests

from datetime import datetime, timedelta, timezone

import pandas as pd
import numpy as np
from geopy.distance import distance

import math
import pprint
import time

#import matplotlib.pyplot as plt
#from matplotlib.ticker import FuncFormatter

from typing import List, Union

# Query GraphQL

In [500]:
def query_graphql(start_time: int, end_time: int, route: str) -> Union[list, None]:
    """
    Fetch stop and vehicle-state data for one muni route from the GraphQL API.

    Parameters
    ----------
    start_time, end_time : int
        Bounds of the query window; they are interpolated into the query as
        quoted strings. Callers in this file pass epoch milliseconds.
    route : str
        Route id placed in the query's `routes` list.

    Returns
    -------
    list or None
        The `routes` list of the response, or None when the response is not
        JSON or does not contain `data.trynState.routes`.
    """
    query = f"""{{
        trynState(agency: "muni",
                  startTime: "{start_time}",
                  endTime: "{end_time}",
                  routes: ["{route}"]) {{
            agency
            startTime
            routes {{
                stops {{
                    sid
                    lat
                    lon
                }}
                routeStates {{
                    vtime
                    vehicles {{
                        vid
                        lat
                        lon
                        did
                    }}
                }}
            }}
        }}
    }}
    """
    query_url = f"https://06o8rkohub.execute-api.us-west-2.amazonaws.com/dev/graphql?query={query}"

    try:
        response = requests.get(query_url).json()
    except ValueError:  # body was not JSON (e.g. a gateway error page)
        return None

    try:
        return response['data']['trynState']['routes']
    except (KeyError, TypeError):
        # KeyError: an expected key is missing (e.g. an error payload);
        # TypeError: `data` or `trynState` is null, so it cannot be subscripted
        return None

# Produce Datatables

In [501]:
def produce_stops(data: list, route: str) -> pd.DataFrame:
    """
    Build the table of stops for a route.

    Parameters
    ----------
    data : list
        Route records as returned by `query_graphql`; each record's `stops`
        entries supply `sid`, `lat` and `lon`.
    route : str
        Route id used to look up direction and stop-order metadata on restbus.

    Returns
    -------
    pd.DataFrame
        Columns SID, LAT, LON, DID (direction id) and ORD (position of the
        stop in restbus' stop list). Stops without a direction are dropped.
    """
    # `json_normalize` is a top-level pandas function since pandas 1.0, where
    # the `pd.io.json` alias was deprecated; fall back for older versions
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        from pandas.io.json import json_normalize as normalize

    stops = normalize(data, record_path=['stops']) \
        .rename(columns={'lat': 'LAT',
                         'lon': 'LON',
                         'sid': 'SID'}) \
        .reindex(['SID', 'LAT', 'LON'], axis='columns')

    # one request supplies both the directions and the stop ordering
    route_meta = requests.get(f"http://restbus.info/api/agencies/sf-muni/routes/{route}").json()

    # obtain stop directions
    stops['DID'] = stops['SID'].map({stop: direction['id']
                                     for direction in route_meta['directions']
                                     for stop in direction['stops']})

    # remove stops that don't have an associated direction; copy so the
    # column assignment below writes to an independent frame
    stops = stops.dropna(axis='index', subset=['DID']).copy()

    # obtain stop ordinals
    stops['ORD'] = stops['SID'].map({stop_meta['id']: ordinal
                                     for ordinal, stop_meta
                                     in enumerate(route_meta['stops'])})

    return stops

In [502]:
def produce_buses(data: list) -> pd.DataFrame:
    """
    Flatten the vehicle observations of every route state into one table.

    Parameters
    ----------
    data : list
        Route records as returned by `query_graphql`; each record holds
        `routeStates`, and each state holds a `vtime` and a `vehicles` list.

    Returns
    -------
    pd.DataFrame
        One row per vehicle observation with columns TIME (the state's
        `vtime`), VID, LAT, LON and DID.
    """
    # `json_normalize` is a top-level pandas function since pandas 1.0, where
    # the `pd.io.json` alias was deprecated; fall back for older versions
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        from pandas.io.json import json_normalize as normalize

    buses = normalize(data,
                      record_path=['routeStates', 'vehicles'],
                      meta=[['routeStates', 'vtime']])
    buses = buses.rename(columns={'lat': 'LAT',
                                  'lon': 'LON',
                                  'vid': 'VID',
                                  'did': 'DID',
                                  'routeStates.vtime': 'TIME'})
    return buses.reindex(['TIME', 'VID', 'LAT', 'LON', 'DID'], axis='columns')

# Eclipses

In [None]:
"""
def distance_func(dstop,dbus):
    latmid = (dstop['LON']+ dbus['LAT'])/2
    m_per_lat = 111132.954 - 559.822 * np.cos( 2 * latmid ) + 1.175 * np.cos( 4 * latmid);
    m_per_lon = 111132.954 * np.cos(latmid );
    
    delta_lat = math.fabs(dbus['LAT'] - dstop['LAT'])
    delta_lon = math.fabs(dbus['LON'] - dstop['LON'])
    
    distance = np.sqrt(pow(delta_lon * m_per_lat,2) + pow(delta_lon * m_per_lon ,2));
    
    
    return distance

"""

def distance_func(dstop, dbus):
    """
    Great-circle distance between a stop and a bus (spherical law of cosines).

    Parameters
    ----------
    dstop, dbus : mapping
        Each provides 'LAT' and 'LON' in decimal degrees.

    Returns
    -------
    float
        Distance in the unit of `eradius` (6378.137, presumably kilometres —
        callers that compare against metres must convert).
    """
    eradius = 6378.137

    # numpy's trigonometric functions expect radians, not degrees
    lat1 = np.radians(dstop['LAT'])
    lat2 = np.radians(dbus['LAT'])
    lon1 = np.radians(dstop['LON'])
    lon2 = np.radians(dbus['LON'])

    cos_angle = (np.sin(lat1) * np.sin(lat2)
                 + np.cos(lat1) * np.cos(lat2) * np.cos(lon2 - lon1))

    # rounding can push the value marginally outside [-1, 1], where arccos is NaN
    return eradius * np.arccos(np.clip(cos_angle, -1.0, 1.0))

def find_eclipses(buses, stop):
    """
    Find movement of buses relative to the stop, in distance as a function of time.

    Parameters
    ----------
    buses : pd.DataFrame
        Vehicle observations with TIME, VID, LAT and LON columns.
    stop : mapping or pd.Series
        The stop, providing 'LAT' and 'LON'.

    Returns
    -------
    list of pd.DataFrame
        One frame (columns TIME, VID, DIST) per continuous pass of a bus
        within 750 meters of the stop; DIST is in meters.
    """
    def split_eclipses(eclipses, threshold=30*60*1000) -> List[pd.DataFrame]:
        """
        Split buses' movements when they return to a stop after completing the route.
        """
        disjoint_eclipses = []
        for bus_id in eclipses['VID'].unique():
            # obtain distance data for this bus
            bus = eclipses[eclipses['VID'] == bus_id].sort_values('TIME')

            # split data into groups when there is at least a `threshold`-ms gap between data points
            group_ids = (bus['TIME'] > (bus['TIME'].shift() + threshold)).cumsum()

            # store groups
            disjoint_eclipses.extend(group for _, group in bus.groupby(group_ids))
        return disjoint_eclipses

    eclipses = buses.copy()

    print('eclipses lmbda start')
    starttime = time.time()

    stop_position = tuple(stop[['LAT', 'LON']].values)

    # geopy's `distance(...)` returns a Distance object; take `.meters` so the
    # thresholds applied to DIST really are in meters
    eclipses['DIST'] = [distance(stop_position, bus_position).meters
                        for bus_position in eclipses[['LAT', 'LON']].values]

    # with defined great circle simplified method, 10.033411979675293 seconds
    # with geopy distance, 10.88465404510498 seconds
    # reduced timespan from 3 hour to 1 hour, as expected elapsed time 3.1001999378204346 seconds

    print('eclipses lmbda end')
    print('elapsed time for eclipses lmbda is: %s seconds' % (time.time()-starttime))

    eclipses['TIME'] = eclipses['TIME'].astype(np.int64)
    eclipses = eclipses[['TIME', 'VID', 'DIST']]

    # only keep positions within 750 meters
    eclipses = eclipses[eclipses['DIST'] < 750]

    return split_eclipses(eclipses)

def find_nadirs(eclipses):
    """
    Find points where buses are considered to have encountered the stop.

    Nadir is an astronomical term that describes the lowest point reached by an orbiting body.

    Parameters
    ----------
    eclipses : list of pd.DataFrame
        Frames with VID, TIME and DIST columns, one per pass of a bus.

    Returns
    -------
    pd.DataFrame
        One row per eclipse with columns VID and TIME; empty (but with both
        columns present) when `eclipses` is empty.
    """
    def calc_nadir(eclipse: pd.DataFrame) -> Union[pd.Series, None]:
        nadir = eclipse.iloc[eclipse['DIST'].values.argmin()]
        if nadir['DIST'] < 100:  # if min dist < 100, then reasonable candidate for nadir
            return nadir

        # otherwise look for the last occurrence of the minimum as well
        rev_eclipse = eclipse.iloc[::-1]
        rev_nadir = rev_eclipse.iloc[rev_eclipse['DIST'].values.argmin()]
        if nadir['TIME'] == rev_nadir['TIME']:  # if eclipse has a single global min
            return nadir  # then it's the best candidate for nadir

        # the min occurs at several times: take the midpoint in time of the
        # earliest and latest mins (their DIST is equal by definition)
        mid_nadir = nadir.copy()
        mid_nadir['TIME'] = (nadir['TIME'] + rev_nadir['TIME']) // 2
        return mid_nadir

    nadirs = [calc_nadir(eclipse)[['VID', 'TIME']] for eclipse in eclipses]
    if not nadirs:
        # keep the expected columns so callers can still index by name
        return pd.DataFrame(columns=['VID', 'TIME'])

    return pd.DataFrame(nadirs)
            
    
def show_stop(eclipses, nadirs, stop_id=None, start_time=None, end_time=None, tz=None):
    """
    Plot each eclipse's distance-over-time curve and mark the nadir times.

    Parameters
    ----------
    eclipses : list of pd.DataFrame
        Frames with TIME (epoch milliseconds) and DIST columns.
    nadirs : pd.DataFrame
        Frame whose TIME column holds the epoch-millisecond nadir times.
    stop_id, start_time, end_time : optional
        Values shown in the plot title. When omitted, same-named module-level
        globals are used if they exist (the previous implementation read them
        from there); parts that are still unknown are left out of the title.
    tz : datetime.tzinfo, optional
        Timezone used for all time labels. None keeps the previous behaviour
        of using the machine's local timezone.
    """
    # imported locally: the module-level matplotlib imports are commented out
    import matplotlib.pyplot as plt
    from matplotlib.ticker import FuncFormatter

    module_globals = globals()
    if stop_id is None:
        stop_id = module_globals.get('stop_id')
    if start_time is None:
        start_time = module_globals.get('start_time')
    if end_time is None:
        end_time = module_globals.get('end_time')

    def label(epoch_ms, fmt):
        """Format an epoch-millisecond value in the requested timezone."""
        return datetime.fromtimestamp(int(epoch_ms)//1000, tz).strftime(fmt)

    fig = plt.figure(figsize=(18, 9))
    ax = fig.add_subplot(111)

    for eclipse in eclipses:
        plt.plot(*eclipse[['TIME', 'DIST']].values.T)

    for nadir_time in nadirs['TIME']:
        plt.axvline(nadir_time, linestyle='--', linewidth=.5)

    # format plot
    ax.get_xaxis().set_major_formatter(  # convert x-axis tick labels to time of day
        FuncFormatter(lambda x, p: label(x, '%I:%M%p')))

    title = "Eclipses at Stop" if stop_id is None else f"Eclipses at Stop {stop_id}"
    if start_time is not None:
        title += f" from {label(start_time, '%a %b %d %I:%M%p')}"
    if end_time is not None:
        title += f" to {label(end_time, '%a %b %d %I:%M%p')}"
    plt.title(title)
    plt.xlabel("Time")
    plt.ylabel("Distance from Stop (meters)")

    plt.show()

# Hardcore Data Science

In [None]:
# route ids to analyse (a list, despite the singular name)
route = ["12", "14"]

# daily query window as (start, end) clock times in HH:MM
timespan = ("08:00", "11:00")

# five consecutive days: 2018-11-12 through 2018-11-16
dates = [f"2018-11-{day}" for day in range(12, 17)]

Issue: The plot's time labels are rendered in the machine's local timezone, which may not match the timezone of the times sent to the API. To remedy this, the plotting logic would have to account for the UTC offset of the epochs we get back from the API. There may be a module that handles this, but I'm not presently familiar with one.

Issue: Currently, if a trajectory looks like `/~V`, the left edge is selected as the nadir. Based on the data, I suspect that the initial upslope is a GPS glitch that occurs while it's being initialized, so I believe the trough on the right should be selected instead.

In [None]:
class BusData:
    """
    In-memory store of stop data, keyed by route id and then stop id.

    `data` maps route_id -> stop_id -> stop record; each stop record holds an
    'eclipses' list that `append` extends when the same stop is added again.
    """

    def __init__(self):
        self.data = {}

    @property
    def routes(self):
        """List of the route ids currently stored."""
        return list(self.data.keys())

    def stops(self, route_id):
        """
        List of the stop ids stored for `route_id` (empty for an unknown route).

        This is a regular method: it needs an argument, which a property
        cannot receive.
        """
        return list(self.data.get(route_id, {}).keys())

    def append(self, other_data):
        """
        Merge `other_data` (same nested layout as `self.data`) into this store.

        Unknown routes and stops are added as-is; for a stop that is already
        present only its 'eclipses' list is extended.
        """
        for route_id, other_route in other_data.items():
            route = self.data.get(route_id)
            if not route:
                self.data[route_id] = other_route
                continue

            for stop_id, other_stop in other_route.items():
                stop = route.get(stop_id)
                if stop:
                    stop['eclipses'].extend(other_stop['eclipses'])
                else:
                    route[stop_id] = other_stop

    @classmethod
    def read_file(cls, filename):
        """Create a BusData from the JSON file at `filename`."""
        bus_data = cls()
        with open(filename, 'r') as f:
            bus_data.append(json.load(f))
        return bus_data

    def write_file(self, filename):
        """Write the stored data to `filename` as JSON."""
        with open(filename, 'w') as f:
            json.dump(self.data, f)

`BusData.data` specification:
```
{
    route_id: {  # route_id is a str
        stop_id: {  # stop_id is a str
            direction_id: str,
            order: int,
            lat: float,
            lon: float,
            eclipses: [
                {
                    bus_id: int,
                    timestamp: int,
                },
                {
                    bus_id: int,
                    timestamp: int,
                },
                ...
            ]
        },
        ...
    },
    ...
}
```

In [None]:
# grab a couple of sequential stops to look at
# get_stop_times(date, stop [time, route, dir])
# returns a df w/columns: vID, date, time, stop, route, dir
# ISSUE: really slow, any way to speed up graphQL query?

# routes = pd.DataFrame(columns = ["VID", "TIME", "SID", "DID", "ROUTE"])

# stop_ids = [stop['id']
#             for stop
#             in requests.get(f"http://restbus.info/api/agencies/sf-muni/routes/{route}").json()['stops']][2:4]

# bus_data = BusData()

# for stop_id in stop_ids:
#     for date in dates:
#         start_time = int(datetime.strptime(f"{date} {timespan[0]} -0800", "%Y-%m-%d %H:%M %z").timestamp())*1000
#         end_time   = int(datetime.strptime(f"{date} {timespan[1]} -0800", "%Y-%m-%d %H:%M %z").timestamp())*1000

#         data = query_graphql(start_time, end_time, route)

#         if data is None:  # API might refuse to cooperate
#             print("API probably timed out")
#             continue
#         elif len(data) == 0:  # some days somehow have no data
#             print(f"no data for {month}/{day}")
#             continue
#         else:
#             stops = produce_stops(data)
#             buses = produce_buses(data)

#             stop = stops[stops['SID'] == stop_id].squeeze()
#             buses = buses[buses['DID'] == stop['DID']]

#             eclipses = find_eclipses(buses, stop)
#             nadirs = find_nadirs(eclipses)
#             nadirs["TIME"] = nadirs["TIME"].apply(lambda x: datetime.fromtimestamp(x//1000, timezone(timedelta(hours = -8))).strftime('%a %b %d %I:%M%p'))
#             nadirs["SID"] = stop_id
#             nadirs["DID"] = stop["DID"]
#             nadirs["ROUTE"] = route
#             routes = routes.append(nadirs)


#             show_stop(eclipses, nadirs)

#             bus_data.append({
#                 route: {
#                     stop_id: {
#                         'direction_id': stop['DID'],
#                         'order': int(stop['ORD']),
#                         'lat': stop['LAT'],
#                         'lon': stop['LON'],
#                         'eclipses': [
#                             {
#                                 'bus_id': bus_id,
#                                 'timestamp': int(timestamp)
#                             }
#                             for bus_id, timestamp in zip(nadirs['VID'].tolist(),
#                                                          nadirs['TIME'].tolist())
#                         ]
#                     }
#                 }
#             })
        
# bus_data.write_file("bus_data.json")
                      

In [None]:
def get_stops(dates, routes, directions=None, new_stops=None, times=("00:00", "23:59")):
    """
    Collect the times at which buses reached stops on the given routes and dates.

    Parameters
    ----------
    dates : list of str
        Dates formatted as YYYY-MM-DD.
    routes : list of str
        Route ids.
    directions : list of str, optional
        Direction ids to keep in the result; empty/None keeps every direction.
    new_stops : list of str, optional
        Stop ids to process; empty/None processes every candidate stop.
    times : tuple of str
        Start and end clock times as HH:MM, interpreted at UTC-08:00.
        (Earlier versions ignored this argument and read the global `timespan`.)

    Returns
    -------
    pd.DataFrame
        One row per detected stop event with columns VID (vehicle id),
        TIME (string formatted as '%a %b %d %Y %I:%M%p' at UTC-08:00),
        SID (stop id), DID (direction id) and ROUTE.
    """
    directions = [] if directions is None else directions
    new_stops = [] if new_stops is None else new_stops

    def log(message):
        """Print `message` prefixed with the current wall-clock time."""
        print(f"{datetime.now().strftime('%a %b %d %I:%M:%S %p')}: {message}")

    def to_epoch_ms(date, clock):
        """Convert a date and an HH:MM clock time at UTC-08:00 to epoch milliseconds."""
        return int(datetime.strptime(f"{date} {clock} -0800", "%Y-%m-%d %H:%M %z").timestamp())*1000

    columns = ["VID", "TIME", "SID", "DID", "ROUTE"]
    frames = []

    for route in routes:
        # NOTE(review): the [2:4] slice limits processing to two stops of the
        # route's stop list; it looks like a leftover from exploration, but is
        # kept because removing it would multiply the number of API queries.
        stop_ids = [stop['id']
                    for stop
                    in requests.get(f"http://restbus.info/api/agencies/sf-muni/routes/{route}").json()['stops']][2:4]

        for stop_id in stop_ids:
            # skip stops that are not requested when a stop filter was provided
            if len(new_stops) > 0 and stop_id not in new_stops:
                continue

            for date in dates:
                log(f"starting processing on stop {stop_id} on route {route} on {date}.")
                start_time = to_epoch_ms(date, times[0])
                end_time = to_epoch_ms(date, times[1])

                data = query_graphql(start_time, end_time, route)
                log("performed query.")

                if data is None:  # API might refuse to cooperate
                    print("API probably timed out")
                    continue
                if len(data) == 0:  # some days somehow have no data
                    print(f"no data for {date}")
                    continue

                stops = produce_stops(data, route)
                log("produced stops.")

                buses = produce_buses(data)
                log("produced buses.")

                # select the single stop that matches stop_id
                matching = stops[stops['SID'] == stop_id]
                if matching.empty:  # e.g. the stop has no associated direction
                    print(f"stop {stop_id} not found on route {route}")
                    continue
                stop = matching.iloc[0]

                # select buses that have matching DID with the stop. note* this will not select inbound
                buses = buses[buses['DID'] == stop['DID']]

                eclipses = find_eclipses(buses, stop)
                log("found eclipses.")

                nadirs = find_nadirs(eclipses)
                log("found nadirs.")

                if not nadirs.empty:
                    nadirs["TIME"] = nadirs["TIME"].apply(lambda x: datetime.fromtimestamp(x//1000, timezone(timedelta(hours = -8))).strftime('%a %b %d %Y %I:%M%p'))
                    nadirs["SID"] = stop_id
                    nadirs["DID"] = stop["DID"]
                    nadirs["ROUTE"] = route
                    frames.append(nadirs)
                log("finished processing.")

    # combine once at the end instead of growing a frame inside the loop
    if frames:
        bus_stops = pd.concat(frames, sort=True)
    else:
        bus_stops = pd.DataFrame(columns=columns)

    # filter for directions
    if len(directions) > 0:
        bus_stops = bus_stops.loc[bus_stops['DID'].isin(directions)]

    return bus_stops

In [None]:
# run the pipeline for the configured dates and routes, restricted to
# direction '14___O_F00' and stop '5528' within the `timespan` window
new_stops = get_stops(dates, route, directions = ['14___O_F00'], new_stops = ['5528'], times = timespan)

Fri Jan 25 04:00:32 PM: starting processing on stop 5528 on route 14 on 2018-11-12.
Fri Jan 25 04:00:44 PM: performed query.
Fri Jan 25 04:00:45 PM: produced stops.
Fri Jan 25 04:00:45 PM: produced buses.
eclipses lmbda start
eclipses lmbda end
elapsed time for eclipses lmbda is: 1.4268510341644287 seconds
Fri Jan 25 04:00:46 PM: found eclipses.
Fri Jan 25 04:00:46 PM: found nadirs.
Fri Jan 25 04:00:46 PM: finished processing.
Fri Jan 25 04:00:46 PM: starting processing on stop 5528 on route 14 on 2018-11-13.
Fri Jan 25 04:00:56 PM: performed query.
Fri Jan 25 04:00:59 PM: produced stops.
Fri Jan 25 04:00:59 PM: produced buses.
eclipses lmbda start
eclipses lmbda end
elapsed time for eclipses lmbda is: 1.2461671829223633 seconds
Fri Jan 25 04:01:01 PM: found eclipses.
Fri Jan 25 04:01:01 PM: found nadirs.
Fri Jan 25 04:01:01 PM: finished processing.
Fri Jan 25 04:01:01 PM: starting processing on stop 5528 on route 14 on 2018-11-14.


In [None]:
# TODO: parse direction as inbound/outbound? (remove route indicator)
# TODO: filter by direction/stop if provided
# TODO: split date/time
# display the collected stop events
new_stops

In [None]:
# distinct direction ids present in the result
new_stops['DID'].unique()

In [None]:
# distinct stop ids present in the result
new_stops['SID'].unique()

In [None]:
def average_waiting_time(df, start_time, end_time):
    """
    Work in progress: walk over every whole minute in [start_time, end_time).

    Parameters
    ----------
    df : unused so far
    start_time, end_time : datetime
        Bounds of the interval; seconds are ignored when counting minutes.

    Currently only prints a placeholder line per minute and returns None.
    """
    # count minutes across hour (and day) boundaries, not just the minute fields
    span = (end_time.replace(second=0, microsecond=0)
            - start_time.replace(second=0, microsecond=0))
    minute_range = [start_time + timedelta(minutes=i)
                    for i in range(span // timedelta(minutes=1))]
    wait_times = pd.DataFrame(columns = ["ROUTE", "TIME", "WAIT"])  # TODO: not filled in yet

    for minute in minute_range:
        print("hi")

In [None]:
# TIME strings were rendered at UTC-08:00 by get_stops; attach that offset
# before converting so the epoch does not depend on the machine's timezone
new_stops["timestamp"] = new_stops["TIME"].apply(
    lambda x: datetime.strptime(x, '%a %b %d %Y %I:%M%p')
                      .replace(tzinfo=timezone(timedelta(hours=-8)))
                      .timestamp())

In [None]:
# calendar date of each stop event, parsed back from the formatted TIME string
new_stops['date'] = new_stops["TIME"].apply(lambda x: datetime.strptime(x, '%a %b %d %Y %I:%M%p').date())

In [None]:
# one row per date; no aggfunc is given, so pivot_table uses its default
# aggregation (the mean, per the pandas documentation) of the timestamps
pivot = new_stops[['date', 'timestamp']].pivot_table(values = ['timestamp'], index = ['date'])

In [None]:
pivot['timestamp'] = pivot['timestamp'].apply(lambda x: )