# Pathfinding Model
Objectives:
- build a pathfinding model wrapped inside a function called `find_shortest_path`
- this function should accept the user's starting and ending geocoordinates along with the current time
- data has been preprocessed in a previous notebook and filtered only include regular weekday service; ie. stop times are for any given Monday to Friday

---
## Import Modules & Data

In [1]:
import time
import pandas as pd
from datetime import timedelta as td

In [2]:
stops = pd.read_feather('data/model/stops.ftr')
schedule = pd.read_feather('data/model/schedule.ftr')
# trips = pd.read_feather('data/model/trips.ftr')
# stop_times = pd.read_feather('data/model/stop_times.ftr')

In [3]:
schedule.head(1)

Unnamed: 0,trip_id,route_short_name,route_long_name,shape_id,trip_headsign,stop_sequence,stop_time,stop_time_delta,stop_id,stop_code,stop_name,stop_lat,stop_lon
0,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,1,7:00:29,0 days 07:00:29,14155,14633,Don Mills Station,43.776222,-79.347048


In [4]:
stops.head(1)

Unnamed: 0,stop_id,stop_code,stop_name,stop_lat,stop_lon
0,262,662,Danforth Rd at Kennedy Rd,43.714379,-79.260939


---
## Define Helper Functions

In [5]:
def find_closest_stop_id(input_lat, input_lon):
    stop_distance = stops.loc[:, ['stop_id', 'stop_lat', 'stop_lon']]
    stop_distance['distance'] = ( abs(input_lat - stop_distance['stop_lat'])**2 + abs(input_lon - stop_distance['stop_lon'])**2 )**(1/2)
    closest_stop_id = stop_distance.sort_values(by = 'distance').stop_id.iloc[0]
    return closest_stop_id

In [6]:
def find_closest_stops(input_lat, input_lon):
    stop_distance = stops.loc[:, ['stop_id', 'stop_lat', 'stop_lon']]
    stop_distance['distance'] = ( abs(input_lat - stop_distance['stop_lat'])**2 + abs(input_lon - stop_distance['stop_lon'])**2 )**(1/2)
    stop_distance = stop_distance.sort_values(by = 'distance').reset_index(drop = True)
    return stop_distance

In [7]:
def walking_speed_estimate():
    stop_id_A = 917
    stop_id_B = 9946
    time_in_minutes = 20
    stop_A_lat = stops.loc[stops.stop_id == stop_id_A, 'stop_lat'].values[0]
    stop_A_lon = stops.loc[stops.stop_id == stop_id_A, 'stop_lon'].values[0]
    stop_B_lat = stops.loc[stops.stop_id == stop_id_B, 'stop_lat'].values[0]
    stop_B_lon = stops.loc[stops.stop_id == stop_id_B, 'stop_lon'].values[0]
    distance = abs(stop_B_lat - stop_A_lat) + abs(stop_B_lon - stop_A_lon)
    walking_speed_in_seconds = distance / (time_in_minutes * 60)
    return walking_speed_in_seconds

In [8]:
def build_shortest_path_table(start_stop_id, current_time_delta):
    start_stop_lat = stops.loc[stops.stop_id == start_stop_id, 'stop_lat'].values[0]
    start_stop_lon = stops.loc[stops.stop_id == start_stop_id, 'stop_lon'].values[0]
    shortest_path = stops.copy()
    walking_distance = abs(shortest_path['stop_lat'] - start_stop_lat) + abs(shortest_path['stop_lon'] - start_stop_lon)
    walking_speed = walking_speed_estimate()
    walking_time = round(walking_distance / walking_speed, 0)
    shortest_path['arrival_time_delta'] = current_time_delta + pd.to_timedelta(walking_time, 'seconds')
    shortest_path = shortest_path.sort_values(by = 'arrival_time_delta').reset_index(drop = True)
    shortest_path['previous_stop'] = start_stop_id
    shortest_path['previous_mode'] = 'W'
    shortest_path['trip_id'] = None
    shortest_path['visited'] = False
    return shortest_path

In [9]:
stop_schedule_master = schedule[['trip_id', 'shape_id', 'stop_sequence', 'stop_id', 'stop_time_delta']]
stop_schedule_master = stop_schedule_master.sort_values(by = 'stop_time_delta').reset_index(drop = True)
stop_schedule_master['next_day'] = False

In [10]:
stop_schedule_master.head(3)

Unnamed: 0,trip_id,shape_id,stop_sequence,stop_id,stop_time_delta,next_day
0,43003469,887148,1,8533,0 days 03:32:05,False
1,43003469,887148,2,14963,0 days 03:32:35,False
2,43003469,887148,3,9271,0 days 03:33:32,False


In [11]:
%load_ext line_profiler

In [24]:
import numexpr

In [None]:
import numexpr

def ne(df):
    x = df.A.values
    return df[numexpr.evaluate('(x > 5)')]
print (ne(df))


def ne(x):
    x = x.A.values
    return x[numexpr.evaluate('(x > 5)')]

In [79]:
def build_stop_schedule(start_stop_id, current_time_delta):
    i = stop_schedule_master.stop_id.values
    df = stop_schedule_master[numexpr.evaluate('(i == start_stop_id)')].copy()
    # df = df.query('stop_id == @start_stop_id') # query creates a copy and is faster than .loc
    df.loc[df.stop_time_delta < current_time_delta, 'next_day'] = True
    df.loc[df.next_day == True, 'stop_time_delta'] = df.loc[df.next_day == True, 'stop_time_delta'].copy() + td(days = 1)
    df = pd.concat([df[df.next_day == False], df[df.next_day == True]])
    # df = df.sort_values(by = 'stop_time_delta') # avoid resetting index
    df = df.drop_duplicates(subset = 'shape_id') # perhaps I can find a better way than to have to sort through our values and then drop duplicates
    return df

# ERROR - why is my next day not only populating when relevant or else,
# why are all the dates for the home stop for the next day?

# figure out if a full dataframe needs to be returned or if a list of index values might suffice
# how can using numpy with pandas help me out to speed up this one function?

In [55]:
df = build_stop_schedule(917, td(hours = 19))
df

Unnamed: 0,trip_id,shape_id,stop_sequence,stop_id,stop_time_delta,next_day
922720,43027160,888274,48,917,0 days 19:00:42,False
925255,43029300,888390,21,917,0 days 19:02:47,False
930993,43027159,888280,17,917,0 days 19:07:45,False
1005591,43027215,888281,39,917,0 days 20:20:45,False
1279910,43005604,887271,53,917,1 days 02:52:20,False


In [51]:
%lprun -f build_stop_schedule build_stop_schedule(917, td(hours = 19))

Timer unit: 1e-06 s

Total time: 0.020837 s
File: <ipython-input-50-804ec1225870>
Function: build_stop_schedule at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def build_stop_schedule(start_stop_id, current_time_delta):
     2         1         70.0     70.0      0.3      i = stop_schedule_master.stop_id.values
     3         1       4485.0   4485.0     21.5      df = stop_schedule_master[numexpr.evaluate('(i == start_stop_id)')]
     4                                               # df = df.query('stop_id == @start_stop_id') # query creates a copy and is faster than .loc
     5         1        904.0    904.0      4.3      df.loc[df.stop_time_delta < current_time_delta, 'next_day'] = True
     6         1       1836.0   1836.0      8.8      df.loc[df.next_day == True, 'stop_time_delta'] += td(days = 1)
     7         1       8391.0   8391.0     40.3      df = pd.concat([df[df.next_day == False], df[df.next_day 

In [43]:
%%time

df = build_stop_schedule(917, td(hours = 19))
df

CPU times: user 17.6 ms, sys: 1.78 ms, total: 19.4 ms
Wall time: 13.2 ms


Unnamed: 0,trip_id,shape_id,stop_sequence,stop_id,stop_time_delta,next_day
922720,43027160,888274,48,917,0 days 19:00:42,False
925255,43029300,888390,21,917,0 days 19:02:47,False
930993,43027159,888280,17,917,0 days 19:07:45,False
937917,43027200,888274,48,917,0 days 19:13:45,False
939078,43029301,888390,21,917,0 days 19:14:47,False
...,...,...,...,...,...,...
898606,43029298,888390,21,917,1 days 18:40:47,True
902229,43027244,888280,17,917,1 days 18:43:42,True
909570,43027242,888274,48,917,1 days 18:49:42,True
912103,43029299,888390,21,917,1 days 18:51:47,True


In [52]:
%%timeit

df2 = build_stop_schedule(917, td(hours = 19))
df2

10.9 ms ± 273 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
df.query('next_day == True')

In [None]:
def build_stop_schedule(start_stop_id, current_time_delta):
    stop_schedule = schedule.copy()
    stop_schedule = stop_schedule.loc[stop_schedule.stop_id == start_stop_id]
    stop_schedule['next_day'] = False
    stop_schedule.loc[stop_schedule.stop_time_delta < current_time_delta, 'next_day'] = True
    stop_schedule.loc[stop_schedule.stop_time_delta < current_time_delta, 'stop_time_delta'] = stop_schedule.loc[stop_schedule.stop_time_delta < current_time_delta, 'stop_time_delta'] + td(days = 1)
    stop_schedule = stop_schedule.sort_values(by = 'stop_time_delta').reset_index(drop = True)
    stop_schedule = stop_schedule.drop_duplicates(subset = 'shape_id', keep = 'first', ignore_index = True)
    return stop_schedule

In [None]:
def build_stop_schedule(start_stop_id, current_time_delta):
    step_start = time.perf_counter()
    
    stop_schedule = schedule.copy()
    print('1.', (time.perf_counter() - step_start) * 1000, 'ms')
    step_start = time.perf_counter()
    
    stop_schedule = stop_schedule.loc[stop_schedule.stop_id == start_stop_id]
    print('2.', (time.perf_counter() - step_start) * 1000, 'ms')
    step_start = time.perf_counter()
    
    stop_schedule['next_day'] = False
    print('3.', (time.perf_counter() - step_start) * 1000, 'ms')
    step_start = time.perf_counter()
    
    stop_schedule.loc[stop_schedule.stop_time_delta < current_time_delta, 'next_day'] = True
    print('4.', (time.perf_counter() - step_start) * 1000, 'ms')
    step_start = time.perf_counter()
    
    stop_schedule.loc[stop_schedule.stop_time_delta < current_time_delta, 'stop_time_delta'] = stop_schedule.loc[stop_schedule.stop_time_delta < current_time_delta, 'stop_time_delta'] + td(days = 1)
    print('5.', (time.perf_counter() - step_start) * 1000, 'ms')
    step_start = time.perf_counter()
    
    stop_schedule = stop_schedule.sort_values(by = 'stop_time_delta').reset_index(drop = True)
    print('6.', (time.perf_counter() - step_start) * 1000, 'ms')
    step_start = time.perf_counter()
        
    stop_schedule = stop_schedule.drop_duplicates(subset = 'shape_id', keep = 'first', ignore_index = True)
    print('7.', (time.perf_counter() - step_start) * 1000, 'ms')
    step_start = time.perf_counter()
    
    return stop_schedule

In [66]:
def update_shortest_path(shortest_path, stop_schedule, current_time_delta, visiting_stop_id):
    
#     step_start = time.perf_counter()
#     print('start')
#     print()

    upcoming_trip_stops = []
    for i in range(0, len(stop_schedule)):
        current_trip_id = stop_schedule['trip_id'].values[i]
        current_stop_sequence = stop_schedule['stop_sequence'].values[i]
        next_day = stop_schedule['next_day'].values[i]
        next_stops = schedule.loc[(schedule.trip_id == current_trip_id) & (schedule.stop_sequence > current_stop_sequence)].copy()
        if (next_day):
            next_stops.stop_time_delta = next_stops.stop_time_delta + td(days = 1)        
        upcoming_trip_stops.append(next_stops)
    upcoming_trip_stops = pd.concat(upcoming_trip_stops)
    upcoming_trip_stops = upcoming_trip_stops.sort_values(by = 'stop_time_delta').drop_duplicates(subset = 'stop_id', keep = 'first').reset_index(drop = True)
    
#     print('step one', (time.perf_counter() - step_start) * 100, 'hundredths of a second')
#     print('----------')
#     step_start = time.perf_counter()
    
    for i in range(0, len(upcoming_trip_stops)):
        current_stop_id = upcoming_trip_stops.loc[i, 'stop_id']
        current_stop_time_delta = upcoming_trip_stops.loc[i, 'stop_time_delta']
        current_trip_id = upcoming_trip_stops.loc[i, 'trip_id']
        current_arrival_time = shortest_path.loc[shortest_path.stop_id == current_stop_id, 'arrival_time_delta'].values[0]
        
        if (current_arrival_time > current_stop_time_delta):
            shortest_path.loc[shortest_path.stop_id == current_stop_id, 'arrival_time_delta'] = current_stop_time_delta
            shortest_path.loc[shortest_path.stop_id == current_stop_id, 'previous_stop'] = visiting_stop_id
            shortest_path.loc[shortest_path.stop_id == current_stop_id, 'previous_mode'] = 'T'
            shortest_path.loc[shortest_path.stop_id == current_stop_id, 'trip_id'] = current_trip_id
    
#     print('step two', (time.perf_counter() - step_start) * 100, 'hundredths of a second')
#     print('----------')
#     step_start = time.perf_counter()
            
    shortest_path = shortest_path.sort_values(by = 'arrival_time_delta').reset_index(drop = True)
    shortest_path.loc[shortest_path.stop_id == visiting_stop_id, 'visited'] = True
    
#     print('step three', (time.perf_counter() - step_start) * 100, 'hundredths of a second')
#     print('----------')
#     step_start = time.perf_counter()
    
#     print()
#     print()
    
    return shortest_path

In [46]:
def update_walking_path(shortest_path, current_time_delta, visiting_stop_id):
    visiting_stop_lat = stops.loc[stops.stop_id == visiting_stop_id, 'stop_lat'].values[0]
    visiting_stop_lon = stops.loc[stops.stop_id == visiting_stop_id, 'stop_lon'].values[0]
    walking_distance = abs(shortest_path['stop_lat'] - visiting_stop_lat) + abs(shortest_path['stop_lon'] - visiting_stop_lon)
    walking_speed = walking_speed_estimate()
    walking_time = round(walking_distance / walking_speed, 0)
    shortest_path['walking_arrival_time_delta'] = current_time_delta + pd.to_timedelta(walking_time, 'seconds')
    
    mask = shortest_path.arrival_time_delta > shortest_path.walking_arrival_time_delta
    shortest_path.loc[mask, 'arrival_time_delta'] = shortest_path.loc[mask, 'walking_arrival_time_delta']
    shortest_path.loc[mask, 'previous_stop'] = visiting_stop_id
    shortest_path.loc[mask, 'previous_mode'] = 'W'
    shortest_path.loc[mask, 'trip_id'] = None
    
#     for i in range(0, len(shortest_path)):
        
#         arrival_time_delta = shortest_path.loc[i, 'arrival_time_delta']
#         walking_arrival_time_delta = shortest_path.loc[i, 'walking_arrival_time_delta']
#         if (arrival_time_delta > walking_arrival_time_delta):
#             shortest_path.loc[i, 'arrival_time_delta'] = walking_arrival_time_delta
#             shortest_path.loc[i, 'previous_stop'] = visiting_stop_id
#             shortest_path.loc[i, 'previous_mode'] = 'W'
#             shortest_path.loc[i, 'trip_id'] = None
    
    shortest_path = shortest_path.sort_values(by = 'arrival_time_delta').reset_index(drop = True)
    shortest_path = shortest_path.drop(columns = 'walking_arrival_time_delta')
    
    return shortest_path

---
## Define Master Function

In [81]:
def find_shortest_path(start_lat, start_lon, end_lat, end_lon, start_time_delta):
    
    time_start = time.perf_counter()
    #print('start', time.perf_counter() - time_start, 'seconds')
    
    start_stop_id = find_closest_stop_id(start_lat, start_lon)
    #print('find_closest_stop_id', time.perf_counter() - time_start, 'seconds')
    
    shortest_path = build_shortest_path_table(start_stop_id, start_time_delta)
#     print('build_shortest_path_table', time.perf_counter() - time_start, 'seconds')
#     print('---')
#     print()
#     print()
    
    for i in range(0, len(shortest_path)):
               
#         if (i < 940):
#             pass
#         else:
#             break
        
        time_loop = time.perf_counter()
        #print('loop_start', time.perf_counter() - time_start, 'seconds')
        
        next_stop_record = shortest_path.loc[shortest_path.visited == False].iloc[0]
        current_stop_id = next_stop_record.stop_id
        current_time_delta = next_stop_record.arrival_time_delta
        previous_mode = next_stop_record.previous_mode
        #print('loop_values', i, current_stop_id, current_time_delta, previous_mode)
        
        #print('loop_setup', time.perf_counter() - time_loop, 'seconds')
        
        if (previous_mode == 'T'):
            shortest_path = update_walking_path(shortest_path, current_time_delta, current_stop_id)
            #print('*')
            #print('update_walking_path', time.perf_counter() - time_loop, 'seconds')
            
            pass
            # insert logic for calculating walking time to other stops
            # update shortest_path if walking time < current arrival time
        stop_schedule = build_stop_schedule(current_stop_id, current_time_delta)
        #print('build_stop_schedule', time.perf_counter() - time_loop, 'seconds')

        shortest_path = update_shortest_path(shortest_path, stop_schedule, current_time_delta, current_stop_id)
        #print('update_shortest_path', time.perf_counter() - time_loop, 'seconds')
        #print()
        #print()

    #print('finish', time.perf_counter() - time_start, 'seconds')
    
    return shortest_path

---
## Test Functions

In [48]:
home = [43.76008911645013, -79.33181124795766]
longos = [43.75447805630398, -79.35689569243047]
gonoe = [43.7459232592541, -79.34612864369309]
current_time_delta = td(hours = 19, minutes = 0, seconds = 0)

In [83]:
%%time

output = find_shortest_path(home[0], home[1], longos[0], longos[1], current_time_delta)

CPU times: user 11min 35s, sys: 18.6 s, total: 11min 53s
Wall time: 8min 6s


In [None]:
output

In [None]:
output.to_feather('data/export/output.ftr')

In [None]:
find_closest_stop_id(gonoe[0], gonoe[1])

In [None]:
output

In [None]:
output.query('stop_id == 917')

In [None]:
schedule.query('(stop_id == 9834 | stop_id == 917) & trip_id == 43027160')

In [None]:
output.query('stop_id == 9834')

In [None]:
output.query('stop_id == 4590')

In [None]:
schedule.query('(stop_id == 4590 | stop_id == 3736) & trip_id == 43000162')

In [None]:
output.query('stop_id == 3736')

In [None]:
output.query('stop_id == 5740')

In [None]:
output