# Pathfinding Model
Objectives:
- build a pathfinding model wrapped inside a function called `find_shortest_path`
- this function should accept the user's starting and ending geocoordinates along with the current time
- data has been preprocessed in a previous notebook and filtered only include regular weekday service; ie. stop times are for any given Monday to Friday

---
## Import Modules & Data

In [8]:
import time
import pandas as pd
from datetime import timedelta as td

In [9]:
stops = pd.read_feather('data/model/stops.ftr')
schedule = pd.read_feather('data/model/schedule.ftr')
# trips = pd.read_feather('data/model/trips.ftr')
# stop_times = pd.read_feather('data/model/stop_times.ftr')

In [10]:
schedule.head(1)

Unnamed: 0,trip_id,route_short_name,route_long_name,shape_id,trip_headsign,stop_sequence,stop_time,stop_time_delta,stop_id,stop_code,stop_name,stop_lat,stop_lon
0,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,1,7:00:29,0 days 07:00:29,14155,14633,Don Mills Station,43.776222,-79.347048


In [11]:
stops.head(1)

Unnamed: 0,stop_id,stop_code,stop_name,stop_lat,stop_lon
0,262,662,Danforth Rd at Kennedy Rd,43.714379,-79.260939


---
## Define Helper Functions

In [12]:
def find_closest_stop_id(input_lat, input_lon):
    stop_distance = stops.loc[:, ['stop_id', 'stop_lat', 'stop_lon']]
    stop_distance['distance'] = ( abs(input_lat - stop_distance['stop_lat'])**2 + abs(input_lon - stop_distance['stop_lon'])**2 )**(1/2)
    closest_stop_id = stop_distance.sort_values(by = 'distance').stop_id.iloc[0]
    return closest_stop_id

In [13]:
def find_closest_stops(input_lat, input_lon):
    stop_distance = stops.loc[:, ['stop_id', 'stop_lat', 'stop_lon']]
    stop_distance['distance'] = ( abs(input_lat - stop_distance['stop_lat'])**2 + abs(input_lon - stop_distance['stop_lon'])**2 )**(1/2)
    stop_distance = stop_distance.sort_values(by = 'distance').reset_index(drop = True)
    return stop_distance

In [14]:
def walking_speed_estimate():
    stop_id_A = 917
    stop_id_B = 9946
    time_in_minutes = 20
    stop_A_lat = stops.loc[stops.stop_id == stop_id_A, 'stop_lat'].values[0]
    stop_A_lon = stops.loc[stops.stop_id == stop_id_A, 'stop_lon'].values[0]
    stop_B_lat = stops.loc[stops.stop_id == stop_id_B, 'stop_lat'].values[0]
    stop_B_lon = stops.loc[stops.stop_id == stop_id_B, 'stop_lon'].values[0]
    distance = abs(stop_B_lat - stop_A_lat) + abs(stop_B_lon - stop_A_lon)
    walking_speed_in_seconds = distance / (time_in_minutes * 60)
    return walking_speed_in_seconds

In [15]:
def build_shortest_path_table(start_stop_id, current_time_delta):
    start_stop_lat = stops.loc[stops.stop_id == start_stop_id, 'stop_lat'].values[0]
    start_stop_lon = stops.loc[stops.stop_id == start_stop_id, 'stop_lon'].values[0]
    shortest_path = stops.copy()
    walking_distance = abs(shortest_path['stop_lat'] - start_stop_lat) + abs(shortest_path['stop_lon'] - start_stop_lon)
    walking_speed = walking_speed_estimate()
    walking_time = round(walking_distance / walking_speed, 0)
    shortest_path['arrival_time_delta'] = current_time_delta + pd.to_timedelta(walking_time, 'seconds')
    shortest_path = shortest_path.sort_values(by = 'arrival_time_delta').reset_index(drop = True)
    shortest_path['previous_stop'] = start_stop_id
    shortest_path['previous_mode'] = 'W'
    shortest_path['trip_id'] = None
    shortest_path['visited'] = False
    return shortest_path

In [84]:
def build_stop_schedule(start_stop_id, current_time_delta):
    
    stop_schedule = schedule.query('stop_id == @start_stop_id') # query creates a copy and is faster than .loc
    stop_schedule = stop_schedule.sort_values(by = 'stop_time_delta') # avoid resetting index
    stop_schedule = stop_schedule.drop_duplicates(subset = 'shape_id') # perhaps I can find a better way than to have to sort through our values and then drop duplicates
    stop_schedule['next_day'] = False
    mask = stop_schedule.stop_time_delta < current_time_delta
    stop_schedule.loc[mask, 'next_day'] = True
    stop_schedule.loc[mask, 'stop_time_delta'] += td(days = 1)
    return stop_schedule

# ERROR - why is my next day not only populating when relevant or else,
# why are all the dates for the home stop for the next day?

# figure out if a full dataframe needs to be returned or if a list of index values might suffice
# how can using numpy with pandas help me out to speed up this one function?

In [64]:
def build_stop_schedule(start_stop_id, current_time_delta):
    step_start = time.perf_counter()
    
    stop_schedule = schedule.copy()
    print('1.', (time.perf_counter() - step_start) * 1000, 'ms')
    step_start = time.perf_counter()
    
    stop_schedule = stop_schedule.loc[stop_schedule.stop_id == start_stop_id]
    print('2.', (time.perf_counter() - step_start) * 1000, 'ms')
    step_start = time.perf_counter()
    
    stop_schedule['next_day'] = False
    print('3.', (time.perf_counter() - step_start) * 1000, 'ms')
    step_start = time.perf_counter()
    
    stop_schedule.loc[stop_schedule.stop_time_delta < current_time_delta, 'next_day'] = True
    print('4.', (time.perf_counter() - step_start) * 1000, 'ms')
    step_start = time.perf_counter()
    
    stop_schedule.loc[stop_schedule.stop_time_delta < current_time_delta, 'stop_time_delta'] = stop_schedule.loc[stop_schedule.stop_time_delta < current_time_delta, 'stop_time_delta'] + td(days = 1)
    print('5.', (time.perf_counter() - step_start) * 1000, 'ms')
    step_start = time.perf_counter()
    
    stop_schedule = stop_schedule.sort_values(by = 'stop_time_delta').reset_index(drop = True)
    print('6.', (time.perf_counter() - step_start) * 1000, 'ms')
    step_start = time.perf_counter()
        
    stop_schedule = stop_schedule.drop_duplicates(subset = 'shape_id', keep = 'first', ignore_index = True)
    print('7.', (time.perf_counter() - step_start) * 1000, 'ms')
    step_start = time.perf_counter()
    
    return stop_schedule

In [83]:
%%time

build_stop_schedule(917, td(hours = 19))

CPU times: user 23.2 ms, sys: 2.53 ms, total: 25.7 ms
Wall time: 19.9 ms


Unnamed: 0,trip_id,route_short_name,route_long_name,shape_id,trip_headsign,stop_sequence,stop_time,stop_time_delta,stop_id,stop_code,stop_name,stop_lat,stop_lon,next_day
533695,43005606,395,YORK MILLS NIGHT BUS,887271,WEST - 395 YORK MILLS towards YORK MILLS STATION,53,4:46:41,1 days 04:46:41,917,9083,York Mills Rd at Sandover Dr (1222 York Mills),43.759813,-79.331751,True
1182616,43027100,95,YORK MILLS,888281,WEST - 95 YORK MILLS towards YORK MILLS STATION,39,4:53:42,1 days 04:53:42,917,9083,York Mills Rd at Sandover Dr (1222 York Mills),43.759813,-79.331751,True
1182966,43027105,95,YORK MILLS,888274,WEST - 95 YORK MILLS towards YORK MILLS STATION,48,5:47:39,1 days 05:47:39,917,9083,York Mills Rd at Sandover Dr (1222 York Mills),43.759813,-79.331751,True
1183214,43027110,95,YORK MILLS,888280,WEST - 95C YORK MILLS towards YORK MILLS STATION,17,6:16:39,1 days 06:16:39,917,9083,York Mills Rd at Sandover Dr (1222 York Mills),43.759813,-79.331751,True
1254411,43029282,995,YORK MILLS EXPRESS,888390,WEST - 995 YORK MILLS EXPRESS towards YORK MIL...,21,6:32:47,1 days 06:32:47,917,9083,York Mills Rd at Sandover Dr (1222 York Mills),43.759813,-79.331751,True


In [72]:
def update_shortest_path(shortest_path, stop_schedule, current_time_delta, visiting_stop_id):
    
    step_start = time.perf_counter()
    print('start')
    print()

    upcoming_trip_stops = []
    for i in range(0, len(stop_schedule)):
        current_trip_id = stop_schedule.loc[i, 'trip_id']
        current_stop_sequence = stop_schedule.loc[i, 'stop_sequence']
        next_day = stop_schedule.loc[i, 'next_day']
        next_stops = schedule.loc[(schedule.trip_id == current_trip_id) & (schedule.stop_sequence > current_stop_sequence)].copy()
        if (next_day):
            next_stops.stop_time_delta = next_stops.stop_time_delta + td(days = 1)        
        upcoming_trip_stops.append(next_stops)
    upcoming_trip_stops = pd.concat(upcoming_trip_stops)
    upcoming_trip_stops = upcoming_trip_stops.sort_values(by = 'stop_time_delta').drop_duplicates(subset = 'stop_id', keep = 'first').reset_index(drop = True)
    
    print('step one', (time.perf_counter() - step_start) * 100, 'hundredths of a second')
    print('----------')
    step_start = time.perf_counter()
    
    for i in range(0, len(upcoming_trip_stops)):
        current_stop_id = upcoming_trip_stops.loc[i, 'stop_id']
        current_stop_time_delta = upcoming_trip_stops.loc[i, 'stop_time_delta']
        current_trip_id = upcoming_trip_stops.loc[i, 'trip_id']
        current_arrival_time = shortest_path.loc[shortest_path.stop_id == current_stop_id, 'arrival_time_delta'].values[0]
        
        if (current_arrival_time > current_stop_time_delta):
            shortest_path.loc[shortest_path.stop_id == current_stop_id, 'arrival_time_delta'] = current_stop_time_delta
            shortest_path.loc[shortest_path.stop_id == current_stop_id, 'previous_stop'] = visiting_stop_id
            shortest_path.loc[shortest_path.stop_id == current_stop_id, 'previous_mode'] = 'T'
            shortest_path.loc[shortest_path.stop_id == current_stop_id, 'trip_id'] = current_trip_id
    
    print('step two', (time.perf_counter() - step_start) * 100, 'hundredths of a second')
    print('----------')
    step_start = time.perf_counter()
            
    shortest_path = shortest_path.sort_values(by = 'arrival_time_delta').reset_index(drop = True)
    shortest_path.loc[shortest_path.stop_id == visiting_stop_id, 'visited'] = True
    
    print('step three', (time.perf_counter() - step_start) * 100, 'hundredths of a second')
    print('----------')
    step_start = time.perf_counter()
    
    print()
    print()
    
    return shortest_path

In [18]:
def update_walking_path(shortest_path, current_time_delta, visiting_stop_id):
    visiting_stop_lat = stops.loc[stops.stop_id == visiting_stop_id, 'stop_lat'].values[0]
    visiting_stop_lon = stops.loc[stops.stop_id == visiting_stop_id, 'stop_lon'].values[0]
    walking_distance = abs(shortest_path['stop_lat'] - visiting_stop_lat) + abs(shortest_path['stop_lon'] - visiting_stop_lon)
    walking_speed = walking_speed_estimate()
    walking_time = round(walking_distance / walking_speed, 0)
    shortest_path['walking_arrival_time_delta'] = current_time_delta + pd.to_timedelta(walking_time, 'seconds')
    
    mask = shortest_path.arrival_time_delta > shortest_path.walking_arrival_time_delta
    shortest_path.loc[mask, 'arrival_time_delta'] = shortest_path.loc[mask, 'walking_arrival_time_delta']
    shortest_path.loc[mask, 'previous_stop'] = visiting_stop_id
    shortest_path.loc[mask, 'previous_mode'] = 'W'
    shortest_path.loc[mask, 'trip_id'] = None
    
#     for i in range(0, len(shortest_path)):
        
#         arrival_time_delta = shortest_path.loc[i, 'arrival_time_delta']
#         walking_arrival_time_delta = shortest_path.loc[i, 'walking_arrival_time_delta']
#         if (arrival_time_delta > walking_arrival_time_delta):
#             shortest_path.loc[i, 'arrival_time_delta'] = walking_arrival_time_delta
#             shortest_path.loc[i, 'previous_stop'] = visiting_stop_id
#             shortest_path.loc[i, 'previous_mode'] = 'W'
#             shortest_path.loc[i, 'trip_id'] = None
    
    shortest_path = shortest_path.sort_values(by = 'arrival_time_delta').reset_index(drop = True)
    shortest_path = shortest_path.drop(columns = 'walking_arrival_time_delta')
    
    return shortest_path

---
## Define Master Function

In [29]:
def find_shortest_path(start_lat, start_lon, end_lat, end_lon, start_time_delta):
    
    time_start = time.perf_counter()
    #print('start', time.perf_counter() - time_start, 'seconds')
    
    start_stop_id = find_closest_stop_id(start_lat, start_lon)
    #print('find_closest_stop_id', time.perf_counter() - time_start, 'seconds')
    
    shortest_path = build_shortest_path_table(start_stop_id, start_time_delta)
#     print('build_shortest_path_table', time.perf_counter() - time_start, 'seconds')
#     print('---')
#     print()
#     print()
    
    for i in range(0, len(shortest_path)):
               
        if (i < 40):
            pass
        else:
            break
        
        time_loop = time.perf_counter()
        #print('loop_start', time.perf_counter() - time_start, 'seconds')
        
        next_stop_record = shortest_path.loc[shortest_path.visited == False].iloc[0]
        current_stop_id = next_stop_record.stop_id
        current_time_delta = next_stop_record.arrival_time_delta
        previous_mode = next_stop_record.previous_mode
        #print('loop_values', i, current_stop_id, current_time_delta, previous_mode)
        
        #print('loop_setup', time.perf_counter() - time_loop, 'seconds')
        
        if (previous_mode == 'T'):
            shortest_path = update_walking_path(shortest_path, current_time_delta, current_stop_id)
            #print('*')
            #print('update_walking_path', time.perf_counter() - time_loop, 'seconds')
            
            pass
            # insert logic for calculating walking time to other stops
            # update shortest_path if walking time < current arrival time
        stop_schedule = build_stop_schedule(current_stop_id, current_time_delta)
        #print('build_stop_schedule', time.perf_counter() - time_loop, 'seconds')

        shortest_path = update_shortest_path(shortest_path, stop_schedule, current_time_delta, current_stop_id)
        #print('update_shortest_path', time.perf_counter() - time_loop, 'seconds')
        #print()
        #print()

    #print('finish', time.perf_counter() - time_start, 'seconds')
    
    return shortest_path

---
## Test Functions

In [20]:
home = [43.76008911645013, -79.33181124795766]
longos = [43.75447805630398, -79.35689569243047]
gonoe = [43.7459232592541, -79.34612864369309]
current_time_delta = td(hours = 19, minutes = 0, seconds = 0)

In [40]:
%%time

output = find_shortest_path(home[0], home[1], longos[0], longos[1], current_time_delta)

start

step one 3.7717373999839765 hundredths of a second
----------
step two 2.793814199958433 hundredths of a second
----------
step three 0.36732970002049115 hundredths of a second
----------


start

step one 3.4971104999385716 hundredths of a second
----------
step two 5.731409500003792 hundredths of a second
----------
step three 0.49234999996770057 hundredths of a second
----------


start

step one 4.293627800052491 hundredths of a second
----------
step two 6.167611700038833 hundredths of a second
----------
step three 0.43354859999453765 hundredths of a second
----------


start

step one 2.663741400010622 hundredths of a second
----------
step two 1.9864240000060818 hundredths of a second
----------
step three 0.418367699967348 hundredths of a second
----------


start

step one 3.213504399991507 hundredths of a second
----------
step two 1.4932628999304143 hundredths of a second
----------
step three 0.4873277999649872 hundredths of a second
----------


start

step one 3.9

In [22]:
output

Unnamed: 0,stop_id,stop_code,stop_name,stop_lat,stop_lon,arrival_time_delta,previous_stop,previous_mode,trip_id,visited
0,917,9083,York Mills Rd at Sandover Dr (1222 York Mills),43.759813,-79.331751,0 days 19:00:00,917,W,,True
1,8081,9088,York Mills Rd at Valley Woods Rd East Side,43.759595,-79.332036,0 days 19:00:35,917,W,,True
2,5191,11397,1200-1202 York Mills Rd,43.759132,-79.333893,0 days 19:01:15,917,T,43027160,True
3,10256,11533,113 Valley Woods Rd,43.758960,-79.332778,0 days 19:02:12,917,W,,True
4,6913,11527,Valley Woods Rd at York Mills Rd,43.758994,-79.333070,0 days 19:02:22,5191,W,,True
...,...,...,...,...,...,...,...,...,...,...
9414,4305,1632,The West Mall at Sherway Dr North Side,43.609781,-79.560638,1 days 02:00:53,6330,W,,False
9415,7093,1631,The West Mall at Trillium Health Centre (North...,43.610381,-79.561480,1 days 02:01:10,6330,W,,False
9416,880,1633,The West Mall at Sherway Dr (Trillium Health C...,43.609464,-79.560579,1 days 02:01:11,6330,W,,False
9417,477,4435,Long Branch Loop,43.592111,-79.543700,1 days 02:01:44,6330,W,,False


In [None]:
output.to_feather('data/export/output.ftr')

In [None]:
find_closest_stop_id(gonoe[0], gonoe[1])

In [None]:
output

In [None]:
output.query('stop_id == 917')

In [None]:
schedule.query('(stop_id == 9834 | stop_id == 917) & trip_id == 43027160')

In [None]:
output.query('stop_id == 9834')

In [None]:
output.query('stop_id == 4590')

In [None]:
schedule.query('(stop_id == 4590 | stop_id == 3736) & trip_id == 43000162')

In [None]:
output.query('stop_id == 3736')

In [None]:
output.query('stop_id == 5740')

In [None]:
output