# Pathfinding Logic
- Attempt to implement Dijkstra's Shortest Path Algorithm

## Setup & Data Import

In [1]:
import folium
import numpy as np
import pandas as pd
from datetime import datetime as dt
from datetime import timedelta as td
import matplotlib.pyplot as plt

In [2]:
data_tables = ['calendar_dates', 'calendar', 'routes', 'shapes', 'stop_times', 'stops', 'trips']

In [4]:
# import data preprocessed and saved in feather format

for table in data_tables:
    globals()[table] = pd.read_feather(f'data/store/{table}.ftr')

## Helper Function: Table Values

In [5]:
# print the shape of a dataframe and value counts operation on each column within

def table_values(df):
    df_shape = df.shape
    df_rows = df_shape[0]
    df_columns = df_shape[1]
    print(f'{df_rows} rows x {df_columns} columns')
    print()
    for column in df:
        print(column.upper())
        print(df[column].value_counts().sort_index(ascending = False).sort_values(ascending = False))
        print()

## Experimental Input Values

In [6]:
# Instantiate all landmarks to be used for pathfinding experimentation

home = [43.76008911645013, -79.33181124795766]
longos = [43.75447805630398, -79.35689569243047]
gonoe = [43.7459232592541, -79.34612864369309]

## Helper Functions: Find Closest Stop(s)

In [7]:
# Return a list of closest stops

def find_closest_stops(input_lat, input_lon):
    stop_distance_df = stops.loc[:, ['stop_id', 'stop_name', 'stop_lat', 'stop_lon']]
    stop_distance_df['distance'] = ( abs(input_lat - stop_distance_df['stop_lat'])**2 + abs(input_lon - stop_distance_df['stop_lon'])**2 )**(1/2)
    return stop_distance_df.sort_values(by = 'distance').reset_index(drop = True)

In [8]:
# Return just the stop_id for the closest stop

def find_closest_stop_id(input_lat, input_lon):
    stop_distance_df = stops.loc[:, ['stop_id', 'stop_lat', 'stop_lon']]
    stop_distance_df['distance'] = ( abs(input_lat - stop_distance_df['stop_lat'])**2 + abs(input_lon - stop_distance_df['stop_lon'])**2 )**(1/2)
    return stop_distance_df.sort_values(by = 'distance').reset_index(drop = True).loc[0, 'stop_id']

In [9]:
# Example of using closest stop functions

display(find_closest_stops(home[0], home[1]).head(3))
display(f'closest stop_id: {find_closest_stop_id(home[0], home[1])}')

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,distance
0,917,York Mills Rd at Sandover Dr (1222 York Mills),43.759813,-79.331751,0.000283
1,8081,York Mills Rd at Valley Woods Rd East Side,43.759595,-79.332036,0.000543
2,10256,113 Valley Woods Rd,43.75896,-79.332778,0.001486


'closest stop_id: 917'

---
## Master Function: Find Shortest Path

In [10]:
def find_shortest_path(start_lat, start_lon, end_lat, end_lon):
    start_stop_id = find_closest_stop_id(start_lat, start_lon)
    current_stop_id = start_stop_id
    current_dt = dt(1900, 1, 1, 19, 0, 0)
    print(start_stop_id, current_dt.strftime('%H:%M:%S'))

In [11]:
find_shortest_path(home[0], home[1], longos[0], longos[1])

917 19:00:00


In [12]:
stops.head(3)

Unnamed: 0,stop_id,stop_code,stop_name,stop_lat,stop_lon,wheelchair_boarding
0,262,662,Danforth Rd at Kennedy Rd,43.714379,-79.260939,2
1,263,929,Davenport Rd at Bedford Rd,43.674448,-79.399659,1
2,264,940,Davenport Rd at Dupont St,43.675511,-79.401938,2


## Manual First Attempt
*Eventually consolidate all code below into the master function above.*

Manually calculate all routes from home to Longo's and Gonoe.

---
### Create the shortest path table and calculate the distance from the home stop
*The shortest path table is where we will store all information that we discover on the way towards implementing Dijkstra's algorithm. We will update this table step by step until we discover the shortest path from the current stop.*

When populating this table for the first time, we we will update the initial distance values by taking into account the differences between the latitude and longitude coordinates of each stop to the home/current stop.

In [13]:
# store the home stop id into a dedicated variable
current_stop_id = 917

# create a new reference dataframe containing only the home stop
current_stop_df = stops[stops.stop_id == current_stop_id]

# store the home stop's geocoordinates into dedicated variables
current_stop_lat = current_stop_df.stop_lat.item()
current_stop_lon = current_stop_df.stop_lon.item()

In [14]:
# create new dataframe built off the stops table but with fewer columns
shortest_path_df = stops.loc[:, ['stop_id', 'stop_name', 'stop_lat', 'stop_lon']]

# create a new column called distance 
# which measures the distance between the home stop and all other stops
# using Pythagorean Theorem
shortest_path_df['distance'] = abs(current_stop_lat - shortest_path_df['stop_lat']) + abs(current_stop_lon - shortest_path_df['stop_lon'])

# sort table by distance values, low to high
shortest_path_df.sort_values(by = 'distance', inplace = True)

# reset the index of the sorted table
shortest_path_df.reset_index(drop = True, inplace = True )

In [15]:
shortest_path_df.head(3)

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,distance
0,917,York Mills Rd at Sandover Dr (1222 York Mills),43.759813,-79.331751,0.0
1,8081,York Mills Rd at Valley Woods Rd East Side,43.759595,-79.332036,0.000503
2,10256,113 Valley Woods Rd,43.75896,-79.332778,0.00188


---
### Estimate walking distance

In [16]:
# store the reference point stop into a dedicated variable
# walk to this stop from home stop takes me approximately 20 minutes to walk
walk_est_point = shortest_path_df[shortest_path_df.stop_id == 9946]
walk_est_point

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,distance
81,9946,Victoria Park Ave at York Mills Rd South Side,43.762734,-79.317551,0.017121


In [17]:
# distance divided by 20 minutes
walk_speed_min = walk_est_point.distance.item() / 20

# distance divided by 1200 seconds
walk_speed_sec = walk_est_point.distance.item() / (20 * 60)

In [18]:
# create a new column to store walking duration time
# while setting the default value to zero
shortest_path_df['duration'] = 0

In [19]:
# loop through every stop record on the shortest_path_df
for i in range(0, len(shortest_path_df)):
    
    # calculate walking time based on distance
    walking_time = round(shortest_path_df.loc[i, 'distance'] / walk_speed_sec, 0)
    
    # update the duration and seconds column using walking time
    shortest_path_df.loc[i, 'duration'] = td(seconds = walking_time)
    shortest_path_df.loc[i, 'seconds'] = walking_time
    
# convert the seconds column into integer type (removing all decimal points)
shortest_path_df.seconds = shortest_path_df.seconds.astype('int')

In [20]:
walk_speed_sec

1.4267500000002541e-05

---
### Further formatting of the shortest path table

In [19]:
# create two new columns while setting their default values
shortest_path_df['previous_stop'] = 0
shortest_path_df['visited'] = False

In [20]:
# set the home stop as the current stop
current_stop = shortest_path_df.loc[0, 'stop_id']

In [21]:
shortest_path_df.head(3)

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,distance,duration,seconds,previous_stop,visited
0,917,York Mills Rd at Sandover Dr (1222 York Mills),43.759813,-79.331751,0.0,0:00:00,0,0,False
1,8081,York Mills Rd at Valley Woods Rd East Side,43.759595,-79.332036,0.000503,0:00:35,35,0,False
2,10256,113 Valley Woods Rd,43.75896,-79.332778,0.00188,0:02:12,132,0,False


---
### Process the stop_times and trips tables
*Eliminate all data not related to regular weekday service and irrelevant for pathfinding logic.*

#### To-do: must update this section to also create my_stops (only those stops that are part of the regular weekday service). Use my_stops to generate the shortest_path_df.

In [22]:
# Filter trips table to only those with service_id = 1; ie. regular weekday service
# Delete unnecessary columns
my_trips = trips[trips.service_id == 1]
my_trips = my_trips.drop(columns = ['service_id', 'block_id', 'shape_id', 'bikes_allowed'])

In [23]:
# Filter stop_times table to only those trips that remain on the my_trips table; ie. those trips with regular service
# Delete unnecessary columns
my_stop_times = stop_times[stop_times.trip_id.isin(my_trips.trip_id)]
my_stop_times = my_stop_times.drop(columns = ['pickup_type', 'drop_off_type', 'shape_dist_traveled'])

---
### Consolidate data for all trips via current stop

*Sort the `stop_times` table for all trips that visit the current stop. Then pull their metadata from the `trips` table.*

In [24]:
# create a new df based of stop_times which only contains records for the home stop
current_stop_times = my_stop_times.loc[my_stop_times.stop_id == current_stop]

In [25]:
current_stop_times.shape

(254, 4)

In [26]:
current_stop_times.head(3)

Unnamed: 0,trip_id,stop_time,stop_id,stop_sequence
536368,43005604,26:52:20,917,53
536443,43005605,5:16:41,917,53
536518,43005606,4:46:41,917,53


In [27]:
# create a new df based of trips which only contains trips which visit the home stop
current_stop_trips = my_trips.loc[my_trips.trip_id.isin(current_stop_times.trip_id)]

In [28]:
current_stop_trips.head(3)

Unnamed: 0,route_id,trip_id,trip_headsign,direction_id
15477,64914,43005606,WEST - 395 YORK MILLS towards YORK MILLS STATION,1
15478,64914,43005605,WEST - 395 YORK MILLS towards YORK MILLS STATION,1
15479,64914,43005604,WEST - 395 YORK MILLS towards YORK MILLS STATION,1


In [29]:
# 1-to-1 merge on trip_id of the two df's above
current_stop_trip_times = pd.merge(current_stop_times, current_stop_trips, on = 'trip_id', how = 'inner')

# This now contains data for all trips that pass through the current stop

In [30]:
# delete irrelevant columns (only 1 value each)
current_stop_trip_times = current_stop_trip_times.drop(columns = ['stop_id', 'direction_id'])

In [31]:
current_stop_trip_times.head(3)

Unnamed: 0,trip_id,stop_time,stop_sequence,route_id,trip_headsign
0,43005604,26:52:20,53,64914,WEST - 395 YORK MILLS towards YORK MILLS STATION
1,43005605,5:16:41,53,64914,WEST - 395 YORK MILLS towards YORK MILLS STATION
2,43005606,4:46:41,53,64914,WEST - 395 YORK MILLS towards YORK MILLS STATION


---
### Testing to determine the unique sequences of stops

In [32]:
# group all trips into their unique templates (sequence of stops)
current_stop_trip_times.loc[:, ['route_id', 'trip_headsign', 'stop_sequence']].value_counts().sort_index()

# Note that the stop_sequence number is sufficient on its own to distinguish groups of trips from each other
# Must make sure that final model confirms a unique stop_sequence number for each route_id & trip_headsign combination

route_id  trip_headsign                                             stop_sequence
64914     WEST - 395 YORK MILLS towards YORK MILLS STATION          53                 6
65004     WEST - 95 YORK MILLS towards YORK MILLS STATION           39                14
                                                                    48               107
          WEST - 95C YORK MILLS towards YORK MILLS STATION          17                69
65018     WEST - 995 YORK MILLS EXPRESS towards YORK MILLS STATION  21                58
dtype: int64

---
### Manual testing to discover the nearest neighbors for the current stop
*This section may now be irrelevant to the looped logic below which populates `next_stop_id`. (Dedicated list of nearest neighbors is not needed, as nearest neighbors should simply be the unique list of all possible `next_stop_id` values.)*

In [33]:
# create a new reference df for logic below
trip_sequence = current_stop_trip_times.loc[:, ['trip_id', 'stop_sequence']]

In [34]:
trip_sequence.head(3)

Unnamed: 0,trip_id,stop_sequence
0,43005604,53
1,43005605,53
2,43005606,53


In [35]:
trip_sequence.loc[0, 'trip_id']

43005604

In [36]:
# empty list to store nearest neighbors
nearest_neighbors = []

# loop through each trip visiting current stop
for i in range(0, len(trip_sequence)):
    
    # save the values for the current record into dedicated variables
    trip_id = trip_sequence.loc[i, 'trip_id']
    stop_sequence = trip_sequence.loc[i, 'stop_sequence']
    
    # for the current trip, check to see if a stop sequence exists beyond just the current one
    if stop_times.loc[stop_times.trip_id == trip_id].loc[stop_times.stop_sequence == (stop_sequence + 1)].shape[0] == 1:
        
        # if so...
        # mark the stop_id at the next stop sequence into the list of nearest neighbors
        nearest_neighbors.append(stop_times.loc[stop_times.trip_id == trip_id].loc[stop_times.stop_sequence == (stop_sequence + 1)].stop_id.item())
        
# consider checking to see if the next stop is already in the nearest neighbors list
# in contrast to just appending each individual instance

In [37]:
pd.Series(nearest_neighbors).value_counts()

5191    196
9834     58
dtype: int64

In [38]:
# perhaps using multiple conditions in the same filter will be more efficient
# vs chain multiple .loc filters (see below)
stop_times.loc[(stop_times.trip_id == 43005604) & (stop_times.stop_sequence == (53 + 1))]

Unnamed: 0,trip_id,stop_time,stop_id,stop_sequence,pickup_type,drop_off_type,shape_dist_traveled
536369,43005604,26:52:42,5191,54,0,0,17.2497


In [39]:
# alternative method of filtering as above
# also showing how to access the specific cell value of a df
stop_times.loc[stop_times.trip_id == 43005604].loc[stop_times.stop_sequence == (53 + 1)].stop_id.item()

5191

---
### Manual test to find the next stop_id in sequence

In [40]:
current_stop_trip_times.head(3)

# Now the objective will be to update the table below with 'next_stop_id'
# (The 'stop_id' value at the next stop sequence for each 'trip_id')

Unnamed: 0,trip_id,stop_time,stop_sequence,route_id,trip_headsign
0,43005604,26:52:20,53,64914,WEST - 395 YORK MILLS towards YORK MILLS STATION
1,43005605,5:16:41,53,64914,WEST - 395 YORK MILLS towards YORK MILLS STATION
2,43005606,4:46:41,53,64914,WEST - 395 YORK MILLS towards YORK MILLS STATION


In [41]:
test = stop_times.loc[stop_times.trip_id == 43005604]

In [42]:
test[test.stop_sequence.isin([53, 54])]

Unnamed: 0,trip_id,stop_time,stop_id,stop_sequence,pickup_type,drop_off_type,shape_dist_traveled
536368,43005604,26:52:20,917,53,0,0,17.0625
536369,43005604,26:52:42,5191,54,0,0,17.2497


In [43]:
type(test.stop_sequence[536319])

numpy.int64

In [44]:
test.stop_sequence.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75])

In [45]:
53 in test.stop_sequence.unique()

True

In [46]:
pd.Series(53).isin(test.stop_sequence)

0    True
dtype: bool

In [47]:
stop_times.loc[stop_times.trip_id == 43005604].loc[stop_times.stop_sequence == 54, 'stop_id']

536369    5191
Name: stop_id, dtype: int64

---
### Manual attempt at calculating duration between two stops

In [48]:
stop_times[(stop_times.trip_id == 43005604) & (stop_times.stop_sequence.isin([53, 54]))]

Unnamed: 0,trip_id,stop_time,stop_id,stop_sequence,pickup_type,drop_off_type,shape_dist_traveled
536368,43005604,26:52:20,917,53,0,0,17.0625
536369,43005604,26:52:42,5191,54,0,0,17.2497


In [49]:
start_time = stop_times[(stop_times.trip_id == 43005604) & (stop_times.stop_sequence == 53)].stop_time.item()
end_time = stop_times[(stop_times.trip_id == 43005604) & (stop_times.stop_sequence == 54)].stop_time.item()

In [50]:
start_time, end_time

('26:52:20', '26:52:42')

In [51]:
stop_times['time_delta'] = pd.to_timedelta(stop_times.stop_time)

In [52]:
start_tdelta = stop_times[(stop_times.trip_id == 43005604) & (stop_times.stop_sequence == 53)].time_delta.item()
end_tdelta = stop_times[(stop_times.trip_id == 43005604) & (stop_times.stop_sequence == 54)].time_delta.item()

In [53]:
start_tdelta, end_tdelta

(Timedelta('1 days 02:52:20'), Timedelta('1 days 02:52:42'))

In [54]:
time_diff = end_tdelta - start_tdelta

In [55]:
time_diff.total_seconds()

22.0

In [56]:
td(seconds=0)

datetime.timedelta(0)

---
### Loop logic to build the next_stop_id column

In [57]:
# create the next_stop_id column with the default value as None
current_stop_trip_times['next_stop_id'] = None
current_stop_trip_times['duration'] = td(seconds = 0)
current_stop_trip_times['seconds'] = 0

# loop over each of the trips on the current_stop_trip_times_df
for i in range(0, len(current_stop_trip_times)):
    
    # store the trip_id value for the current record
    trip_id = current_stop_trip_times.loc[i, 'trip_id']
    
    # determine the value for the next stop in sequence
    current_stop_sequence = int(current_stop_trip_times.loc[i, 'stop_sequence'])
    next_stop_sequence = current_stop_sequence + 1
    
    # filter the stop_times df to only show stop times for the current trip in loop
    selected_trip_stops = stop_times[stop_times.trip_id == trip_id]
    
    # check to see if there is a next stop available for the current trip
    # returns True or False
    next_stop_exists = next_stop_sequence in selected_trip_stops.stop_sequence.unique()
    
    if (next_stop_exists):
        
        # store the stop_id value at the next stop sequence
        next_stop_id = stop_times.loc[((stop_times.trip_id == trip_id) & (stop_times.stop_sequence == next_stop_sequence)), 'stop_id'].item()
        
        # start & end times
        start_time = stop_times.loc[((stop_times.trip_id == trip_id) & (stop_times.stop_sequence == current_stop_sequence)), 'time_delta'].item()
        end_time = stop_times.loc[((stop_times.trip_id == trip_id) & (stop_times.stop_sequence == next_stop_sequence)), 'time_delta'].item()
        
        # duration & seconds
        duration = end_time - start_time
        seconds = int(duration.total_seconds())
        
        # update the next_stop_id value for the current record
        current_stop_trip_times.loc[i, 'next_stop_id'] = next_stop_id
        current_stop_trip_times.loc[i, 'duration'] = duration
        current_stop_trip_times.loc[i, 'seconds'] = seconds

        
# possible better alternatives: 
#   consolidate all unique combinations of trip stops & sequence
#   and only search through that much smaller set
# or
#   develop pre-generated nearest neighbors & the time delta for each stop_id
# or
#   consolidate and pre-generate a unique combo of route_id, stop_id, stop_sequence, next_stop_id, next_stop_sequence

In [58]:
current_stop_trip_times

Unnamed: 0,trip_id,stop_time,stop_sequence,route_id,trip_headsign,next_stop_id,duration,seconds
0,43005604,26:52:20,53,64914,WEST - 395 YORK MILLS towards YORK MILLS STATION,5191,0 days 00:00:22,22
1,43005605,5:16:41,53,64914,WEST - 395 YORK MILLS towards YORK MILLS STATION,5191,0 days 00:00:27,27
2,43005606,4:46:41,53,64914,WEST - 395 YORK MILLS towards YORK MILLS STATION,5191,0 days 00:00:27,27
3,43005600,27:52:20,53,64914,WEST - 395 YORK MILLS towards YORK MILLS STATION,5191,0 days 00:00:22,22
4,43005602,27:22:20,53,64914,WEST - 395 YORK MILLS towards YORK MILLS STATION,5191,0 days 00:00:22,22
...,...,...,...,...,...,...,...,...
249,43029329,17:56:47,21,65018,WEST - 995 YORK MILLS EXPRESS towards YORK MIL...,9834,0 days 00:04:19,259
250,43029330,17:45:47,21,65018,WEST - 995 YORK MILLS EXPRESS towards YORK MIL...,9834,0 days 00:04:19,259
251,43029331,17:34:47,21,65018,WEST - 995 YORK MILLS EXPRESS towards YORK MIL...,9834,0 days 00:04:19,259
252,43029332,17:23:47,21,65018,WEST - 995 YORK MILLS EXPRESS towards YORK MIL...,9834,0 days 00:04:19,259


In [59]:
# confirm the next stop_id in sequence for the test trip
stop_times.loc[((stop_times.trip_id == 43005604) & (stop_times.stop_sequence == 54)), 'stop_id'].item()

5191

In [60]:
# unique values of next_stop_id which also acts as the list of nearest neighbors (by transit)
current_stop_trip_times.next_stop_id.value_counts()

5191    196
9834     58
Name: next_stop_id, dtype: int64

---
### Experiment with converting stop times over 24 hours into the correct 24 hour time

In [61]:
t_series = pd.to_timedelta(current_stop_trip_times.stop_time).sort_values()

In [62]:
day_plus = t_series > td(days = 1)

In [63]:
t_series[day_plus] = t_series[day_plus] - td(days = 1)

In [64]:
t_series[day_plus]

189   0 days 00:04:48
188   0 days 00:14:48
187   0 days 00:24:48
186   0 days 00:34:48
185   0 days 00:44:48
184   0 days 00:54:48
183   0 days 01:04:48
182   0 days 01:14:48
181   0 days 01:24:48
195   0 days 01:34:48
194   0 days 01:44:48
193   0 days 01:54:48
192   0 days 02:04:48
191   0 days 02:14:48
190   0 days 02:24:48
0     0 days 02:52:20
4     0 days 03:22:20
3     0 days 03:52:20
5     0 days 04:22:20
Name: stop_time, dtype: timedelta64[ns]

---
### Question: Is every stop available on the list of trips filtered to only those with regular weekday service? Are there any stops that don't have a stop_time?

In [65]:
stop_times.shape, my_stop_times.shape, stops.shape

((4311631, 8), (1304782, 4), (9476, 6))

In [66]:
stops_in_service = stops[stops.stop_id.isin(my_stop_times.stop_id)]
stops_out_of_service = stops[~stops.stop_id.isin(my_stop_times.stop_id)]

In [67]:
trips[trips.trip_id.isin(stop_times[stop_times.stop_id.isin(stops_out_of_service.stop_id)].trip_id.to_list())].service_id.value_counts()

# distribution of service_id for all stops marked as out of regular weekday service
# confirms the lack of service_id = 1

4       520
2       436
3       416
401      44
4501     16
4401     14
Name: service_id, dtype: int64

---
### Update shortest_path_df to disregard stops that are out of service

In [68]:
shortest_path_df = shortest_path_df[shortest_path_df.stop_id.isin(stops_in_service.stop_id)].reset_index(drop = True)

In [69]:
shortest_path_df.head(3)

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,distance,duration,seconds,previous_stop,visited
0,917,York Mills Rd at Sandover Dr (1222 York Mills),43.759813,-79.331751,0.0,0:00:00,0,0,False
1,8081,York Mills Rd at Valley Woods Rd East Side,43.759595,-79.332036,0.000503,0:00:35,35,0,False
2,10256,113 Valley Woods Rd,43.75896,-79.332778,0.00188,0:02:12,132,0,False


---
### Search for all times on current_stop_trips that are closest to current_time

#### Create and sort by new time delta column

In [70]:
# convert to time delta format
current_stop_trip_times['start_td'] = pd.to_timedelta(current_stop_trip_times.stop_time)

In [71]:
current_stop_trip_times['end_td'] = current_stop_trip_times.start_td + current_stop_trip_times.duration

In [72]:
# sort values by time delta, low to high
current_stop_trip_times = current_stop_trip_times.sort_values(by = 'end_td')
# reset index
current_stop_trip_times = current_stop_trip_times.reset_index(drop = True)

In [73]:
next_trips = current_stop_trip_times.loc[current_stop_trip_times.start_td > td(hours = 19)]

In [74]:
next_trips.head(3)

Unnamed: 0,trip_id,stop_time,stop_sequence,route_id,trip_headsign,next_stop_id,duration,seconds,start_td,end_td
195,43027160,19:00:42,48,65004,WEST - 95 YORK MILLS towards YORK MILLS STATION,5191,0 days 00:00:33,33,0 days 19:00:42,0 days 19:01:15
196,43029300,19:02:47,21,65018,WEST - 995 YORK MILLS EXPRESS towards YORK MIL...,9834,0 days 00:04:19,259,0 days 19:02:47,0 days 19:07:06
197,43027159,19:07:45,17,65004,WEST - 95C YORK MILLS towards YORK MILLS STATION,5191,0 days 00:00:30,30,0 days 19:07:45,0 days 19:08:15


#### Create list of nearest neighbors

In [75]:
nearest_neighbors = list(current_stop_trip_times.next_stop_id.unique())

In [76]:
nearest_neighbors

[5191, 9834]

#### Instantiate Current Time

In [77]:
current_datetime = dt(1900, 1, 1, 19, 0, 0)

In [78]:
current_timedelta = td(hours = 19)

#### Update shortest_path_df to include timedelta from current_time

In [79]:
current_hour = int(str(current_datetime.time())[0:2])

In [80]:
shortest_path_df['arrival_time'] = shortest_path_df.duration + td(hours = current_hour)

#### Loop over list of nearest neighbors, and find the next trip time for each

In [81]:
# alternative 1 to build selected_next_trips

next_trips_to_neighbors = []
for current_neighbor in nearest_neighbors:
    next_trip = next_trips[next_trips.next_stop_id == current_neighbor].head(1)
    next_trips_to_neighbors.append(next_trip)
selected_next_trips = pd.concat(next_trips_to_neighbors)

In [82]:
# alternative 2 to build selected_next_trips

next_trips_to_neighbors = []
for i in range(0, len(nearest_neighbors)):
    for j in range(0, len(next_trips)):
        if (next_trips.iloc[j, 5] == nearest_neighbors[i]):
            next_trips_to_neighbors.append(j)
            break
selected_next_trips = next_trips.iloc[next_trips_to_neighbors]

In [83]:
selected_next_trips

Unnamed: 0,trip_id,stop_time,stop_sequence,route_id,trip_headsign,next_stop_id,duration,seconds,start_td,end_td
195,43027160,19:00:42,48,65004,WEST - 95 YORK MILLS towards YORK MILLS STATION,5191,0 days 00:00:33,33,0 days 19:00:42,0 days 19:01:15
196,43029300,19:02:47,21,65018,WEST - 995 YORK MILLS EXPRESS towards YORK MIL...,9834,0 days 00:04:19,259,0 days 19:02:47,0 days 19:07:06


#### Use selected_next_trips to update shortest_path_df and hence close the loop on having 'visited' the current stop

1. update the shortest_path_df.arrival_time with selected_next_trips.end_td
1. update the shortest_path_df.duration with difference between current_timedelta & shortest_path_df.arrival_time
1. update the seconds by converting the duration time delta into seconds
1. update previous stop as the current stop id

In [84]:
print(current_stop_id)
print(current_timedelta)

917
19:00:00


In [85]:
selected_next_trips.next_stop_id

195    5191
196    9834
Name: next_stop_id, dtype: object

In [86]:
for i in range(0, len(selected_next_trips)):
    n_stop = selected_next_trips.next_stop_id.iloc[i]
    n_arrival = selected_next_trips.end_td.iloc[i]
    print(n_stop, n_arrival)
    match_stop = shortest_path_df.stop_id == n_stop
    shortest_path_df.loc[match_stop, 'arrival_time'] = n_arrival
    time_diff = n_arrival - current_timedelta
    shortest_path_df.loc[match_stop, 'duration'] = time_diff
    shortest_path_df.loc[match_stop, 'seconds'] = int(time_diff.total_seconds())
    shortest_path_df.loc[match_stop, 'previous_stop'] = current_stop_id

5191 0 days 19:01:15
9834 0 days 19:07:06


#### Mark current stop as having been visited and reset the shortest_path_df by sorting by arrival time

In [87]:
shortest_path_df.loc[shortest_path_df.stop_id == current_stop_id, 'visited'] = True

In [88]:
shortest_path_df = shortest_path_df.sort_values(by = 'arrival_time').reset_index(drop = True)

# Manually work through the 2nd loop

---
### Switch to the next stop in loop

- Update the current stop to the next closest stop on the shortest path df
- Update the current time to the arrival time at the new current stop

In [89]:
shortest_path_df[shortest_path_df.visited == False].stop_id.iloc[0]

8081

In [90]:
current_stop_id = shortest_path_df[shortest_path_df.visited == False].stop_id.iloc[0]

current_stop = current_stop_id # this is redundant and will have to be fixed later

In [91]:
current_timedelta = shortest_path_df[shortest_path_df.visited == False].arrival_time.iloc[0]

In [92]:
print(current_stop_id)
print(current_timedelta)

8081
0 days 19:00:35


---
### Calculate the walking distance from the new current stop to all other unvisited stops

In [93]:
current_stop_id, walk_speed_sec, current_timedelta

(8081, 1.4267500000002541e-05, Timedelta('0 days 19:00:35'))

In [94]:
start_timedelta = td(hours = 19)

In [95]:
current_stop_lat = stops.loc[stops.stop_id == current_stop_id, 'stop_lat'].item()
current_stop_lon = stops.loc[stops.stop_id == current_stop_id, 'stop_lon'].item()

In [96]:
current_stop_lat, current_stop_lon

(43.759595000000004, -79.332036)

In [97]:
# build walking distance reference table based of shortest path pf but only considering univisted neighbors
walking_distance = shortest_path_df.loc[shortest_path_df.visited == False]
walking_distance = walking_distance.loc[:, ['stop_id', 'stop_name', 'stop_lat', 'stop_lon']]

In [98]:
walking_distance['distance'] = abs(current_stop_lat - walking_distance['stop_lat']) + abs(current_stop_lon - walking_distance['stop_lon'])

In [99]:
walking_distance = walking_distance.sort_values(by = 'distance').reset_index(drop = True)

In [100]:
# loop through every stop record on the walking_distance
for i in range(0, len(walking_distance)):
    
    # calculate walking time based on distance
    walking_time = round(walking_distance.loc[i, 'distance'] / walk_speed_sec, 0)
    
    arrival_time = current_timedelta + td(seconds = walking_time)
    
    # update the duration and seconds column using walking time
    walking_distance.loc[i, 'arrival_time'] = arrival_time
    walking_distance.loc[i, 'seconds'] = (arrival_time - start_timedelta).total_seconds()
    
# convert the seconds column into integer type (removing all decimal points)
# walking_distance.seconds = walking_distance.seconds.astype('int')

In [101]:
walking_distance

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,distance,arrival_time,seconds
0,8081,York Mills Rd at Valley Woods Rd East Side,43.759595,-79.332036,0.000000,0 days 19:00:35,35.0
1,10256,113 Valley Woods Rd,43.758960,-79.332778,0.001377,0 days 19:02:12,132.0
2,6913,Valley Woods Rd at York Mills Rd,43.758994,-79.333070,0.001635,0 days 19:02:30,150.0
3,5191,1200-1202 York Mills Rd,43.759132,-79.333893,0.002320,0 days 19:03:18,198.0
4,4880,York Mills Rd at Fenside Dr,43.760561,-79.327819,0.005183,0 days 19:06:38,398.0
...,...,...,...,...,...,...,...
9413,4305,The West Mall at Sherway Dr North Side,43.609781,-79.560638,0.378416,1 days 02:22:38,26558.0
9414,7093,The West Mall at Trillium Health Centre (North...,43.610381,-79.561480,0.378658,1 days 02:22:55,26575.0
9415,880,The West Mall at Sherway Dr (Trillium Health C...,43.609464,-79.560579,0.378674,1 days 02:22:56,26576.0
9416,477,Long Branch Loop,43.592111,-79.543700,0.379148,1 days 02:23:29,26609.0


In [102]:
stops_to_update = []
for i in range(0, len(walking_distance)):
    n_stop = walking_distance.stop_id.loc[i]
    new_arrival_time = walking_distance.arrival_time.loc[i]
    old_arrival_time = shortest_path_df[shortest_path_df.stop_id == n_stop].arrival_time.item()
    if new_arrival_time < old_arrival_time:
        stops_to_update.append(i)

In [103]:
stops_to_update_df = walking_distance.loc[stops_to_update].reset_index(drop = True)

---
### Compare the walking time from current stop vs shortest path table

In [104]:
stops_to_update_df.head(3)

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,distance,arrival_time,seconds
0,5056,44 Valley Woods Rd,43.755137,-79.333446,0.005868,0 days 19:07:26,446.0
1,2549,29 Valley Woods Rd,43.753958,-79.33291,0.006511,0 days 19:08:11,491.0
2,5738,York Mills Rd at Valentine Dr,43.757207,-79.341181,0.011533,0 days 19:14:03,843.0


In [105]:
for i in range(0, len(stops_to_update_df)):
    i_stop_id = stops_to_update_df.stop_id.iloc[i]
    i_arrival_time = stops_to_update_df.arrival_time.iloc[i]
    
    shortest_path_df.loc[shortest_path_df.stop_id == i_stop_id, 'arrival_time'] = i_arrival_time
    shortest_path_df.loc[shortest_path_df.stop_id == i_stop_id, 'previous_stop'] = current_stop_id
    
    time_diff = i_arrival_time - start_timedelta
    shortest_path_df.loc[shortest_path_df.stop_id == i_stop_id, 'duration'] = time_diff
    shortest_path_df.loc[shortest_path_df.stop_id == i_stop_id, 'seconds'] = int(time_diff.total_seconds())

---
### Update shortest path table with formatting and mostly to create a column which indicates whether the journey from the previous stop was by transit or walking

In [106]:
shortest_path_df.drop(columns = ['distance', 'duration', 'seconds'], inplace = True)

In [107]:
shortest_path_df['previous_mode'] = 'W'

In [108]:
shortest_path_df.previous_stop.value_counts()

0       8038
8081    1379
917        2
Name: previous_stop, dtype: int64

In [109]:
shortest_path_df.loc[shortest_path_df.previous_stop == 917, 'previous_mode'] = 'T'

In [110]:
shortest_path_df.loc[shortest_path_df.previous_stop == 917, 'previous_mode']

2    T
8    T
Name: previous_mode, dtype: object

---
### Scan all available trips from current stop to select the next available trip to each nearest neighbor

In [111]:
my_stop_times.head(3)

Unnamed: 0,trip_id,stop_time,stop_id,stop_sequence
0,42990004,7:00:29,14155,1
1,42990004,7:01:19,3807,2
2,42990004,7:02:12,6904,3


In [112]:
my_trips.head(3)

Unnamed: 0,route_id,trip_id,trip_headsign,direction_id
0,64815,42990014,EAST - 10 VAN HORNE towards VICTORIA PARK,0
1,64815,42990036,EAST - 10 VAN HORNE towards VICTORIA PARK,0
2,64815,42990015,EAST - 10 VAN HORNE towards VICTORIA PARK,0


In [113]:
my_schedule = pd.merge(my_stop_times, my_trips, on = 'trip_id', how = 'inner')

In [114]:
my_schedule['time_delta'] = pd.to_timedelta(my_schedule.stop_time)

In [115]:
current_stop_trips = my_schedule.loc[my_schedule.stop_id == current_stop_id]

In [116]:
current_stop_trips = current_stop_trips.drop(columns = ['stop_id', 'direction_id'])

In [117]:
current_stop_trips = current_stop_trips.reset_index(drop = True)

#### Start here > current objective > keep building current_stop_trips to mimic how we built current_stop_trip_times for the first manual round using the home stop. Next step is to build the 'next_stop_id' and 'next_stop_arrival_time' instead of 'end_td' (full use of delta from start time vs just measuring the duration). Once that's done: for each next_stop_id find the earliest available trip and use this info to update the shortest path table. Consider at what point to include the 1 minute buffer time for transfers

In [118]:
current_stop_trips['next_stop_id'] = None
current_stop_trips['arrival_time'] = td(seconds = 0)

In [119]:
my_schedule.head(3)

Unnamed: 0,trip_id,stop_time,stop_id,stop_sequence,route_id,trip_headsign,direction_id,time_delta
0,42990004,7:00:29,14155,1,64815,EAST - 10 VAN HORNE towards VICTORIA PARK,0,0 days 07:00:29
1,42990004,7:01:19,3807,2,64815,EAST - 10 VAN HORNE towards VICTORIA PARK,0,0 days 07:01:19
2,42990004,7:02:12,6904,3,64815,EAST - 10 VAN HORNE towards VICTORIA PARK,0,0 days 07:02:12


In [120]:
for i in range(0, len(current_stop_trips)):
    
    i_trip_id = current_stop_trips.loc[i, 'trip_id']
    i_stop_sequence = int(current_stop_trips.loc[i, 'stop_sequence'])
    next_stop_sequence = i_stop_sequence + 1
    
    # pull a list of all stops on the current trip
    i_trip_stops = my_schedule[my_schedule.trip_id == trip_id]
    
    # returns true if there is a next stop available for the current trip
    next_stop_exists = next_stop_sequence in i_trip_stops.stop_sequence.unique()
    
    if (next_stop_exists):
        # grab data from my_schedule
        next_stop_id = my_schedule.loc[((my_schedule.trip_id == i_trip_id) & (my_schedule.stop_sequence == next_stop_sequence)), 'stop_id'].item()
        arrival_time = my_schedule.loc[((my_schedule.trip_id == i_trip_id) & (my_schedule.stop_sequence == next_stop_sequence)), 'time_delta'].item()
        # and then transfer to current_stop_trips
        current_stop_trips.loc[i, 'next_stop_id'] = next_stop_id
        current_stop_trips.loc[i, 'arrival_time'] = arrival_time
    else:
        # if no data, mark data as blank
        current_stop_trips.loc[i, 'next_stop_id'] = 0
        current_stop_trips.loc[i, 'arrival_time'] = td(seconds = 0)

In [121]:
current_stop_trips.next_stop_id.value_counts()

4880    182
3177     56
Name: next_stop_id, dtype: int64

In [122]:
current_stop_id, current_timedelta

(8081, Timedelta('0 days 19:00:35'))

In [123]:
transfer_timedelta = current_timedelta + td(minutes = 1)

In [124]:
transfer_timedelta

Timedelta('0 days 19:01:35')

In [125]:
next_trips = current_stop_trips.loc[current_stop_trips.time_delta > transfer_timedelta].sort_values(by = 'arrival_time')

In [126]:
next_trips.shape

(56, 8)

In [127]:
next_trips.head(1)

Unnamed: 0,trip_id,stop_time,stop_sequence,route_id,trip_headsign,time_delta,next_stop_id,arrival_time
134,43027071,19:05:09,20,65004,EAST - 95C YORK MILLS towards ELLESMERE STATION,0 days 19:05:09,4880,0 days 19:06:16


In [128]:
nearest_neighbors = next_trips.next_stop_id.value_counts().index.to_list()

In [129]:
nearest_neighbors

[4880, 3177]

In [130]:
next_trips_to_neighbors = []
for i in range(0, len(nearest_neighbors)):
    for j in range(0, len(next_trips)):
        if (next_trips.iloc[j, 6] == nearest_neighbors[i]):
            next_trips_to_neighbors.append(j)
            break
selected_next_trips = next_trips.iloc[next_trips_to_neighbors]

In [131]:
save_record = shortest_path_df[shortest_path_df.stop_id.isin(nearest_neighbors)].copy()

In [132]:
save_record

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,previous_stop,visited,arrival_time,previous_mode
5,4880,York Mills Rd at Fenside Dr,43.760561,-79.327819,0,False,0 days 19:05:28,W
26,3177,Parkwoods Village Dr at Brookbanks Dr,43.76093,-79.323736,0,False,0 days 19:10:40,W


In [133]:
selected_next_trips

Unnamed: 0,trip_id,stop_time,stop_sequence,route_id,trip_headsign,time_delta,next_stop_id,arrival_time
134,43027071,19:05:09,20,65004,EAST - 95C YORK MILLS towards ELLESMERE STATION,0 days 19:05:09,4880,0 days 19:06:16
234,43029266,19:12:11,5,65018,EAST - 995 YORK MILLS EXPRESS towards U OF T S...,0 days 19:12:11,3177,0 days 19:14:52


In [134]:
stop_times.query('trip_id == 43027071 and stop_sequence > 19')

Unnamed: 0,trip_id,stop_time,stop_id,stop_sequence,pickup_type,drop_off_type,shape_dist_traveled,time_delta
1185474,43027071,19:05:09,8081,20,0,0,6.565,0 days 19:05:09
1185475,43027071,19:06:16,4880,21,0,0,6.9199,0 days 19:06:16
1185476,43027071,19:07:23,3177,22,0,0,7.2766,0 days 19:07:23
1185477,43027071,19:08:15,8969,23,0,0,7.5523,0 days 19:08:15
1185478,43027071,19:08:53,8855,24,0,0,7.7533,0 days 19:08:53
1185479,43027071,19:10:09,9291,25,0,0,8.1448,0 days 19:10:09
1185480,43027071,19:11:22,352,26,0,0,8.4965,0 days 19:11:22
1185481,43027071,19:12:04,5921,27,0,0,8.7018,0 days 19:12:04
1185482,43027071,19:13:04,9155,28,0,0,8.9926,0 days 19:13:04
1185483,43027071,19:14:05,4465,29,0,0,9.2852,0 days 19:14:05


In [135]:
for i in range(0, len(selected_next_trips)):
    n_stop = selected_next_trips.next_stop_id.iloc[i]
    n_arrival = selected_next_trips.arrival_time.iloc[i]
    print(n_stop, n_arrival)
    match_stop = shortest_path_df.stop_id == n_stop
    update_condition = shortest_path_df.loc[match_stop, 'arrival_time'].iloc[0] > n_arrival
    if (update_condition):
        shortest_path_df.loc[match_stop, 'arrival_time'] = n_arrival
        shortest_path_df.loc[match_stop, 'previous_stop'] = current_stop_id
        shortest_path_df.loc[match_stop, 'previous_mode'] = 'T'

4880 0 days 19:06:16
3177 0 days 19:14:52


In [136]:
shortest_path_df[shortest_path_df.stop_id.isin(nearest_neighbors)]

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,previous_stop,visited,arrival_time,previous_mode
5,4880,York Mills Rd at Fenside Dr,43.760561,-79.327819,0,False,0 days 19:05:28,W
26,3177,Parkwoods Village Dr at Brookbanks Dr,43.76093,-79.323736,0,False,0 days 19:10:40,W


### Finish the 2nd loop and switch over to the next unvisited stop

In [137]:
shortest_path_df.loc[shortest_path_df.stop_id == current_stop_id, 'visited'] = True

In [138]:
shortest_path_df = shortest_path_df.sort_values(by = 'arrival_time').reset_index(drop = True)

In [139]:
current_stop_id = shortest_path_df[shortest_path_df.visited == False].stop_id.iloc[0]

In [140]:
current_stop_id

5191

In [141]:
current_timedelta = shortest_path_df[shortest_path_df.stop_id == current_stop_id].arrival_time.iloc[0]
transfer_timedelta = current_timedelta + td(minutes = 1)

In [142]:
current_timedelta, transfer_timedelta

(Timedelta('0 days 19:01:15'), Timedelta('0 days 19:02:15'))

##### Insight: Transfers should only matter when exiting the current trip/route; otherwise I will be forcing users to inefficiently get the off the bus and get on the next bus just to get to the next stop on the same route. Perhaps it would be a good idea to remove transfer time completely from the MVP, and optimize for it later. If anything, when transferring to another route there will most likely be a delay before the next bus comes for most times anyways. When implementing transfer time, consider to only add transfer time when the previous_mode is W or when the route/trip from the previous stop changes onto the next stop

In [143]:
shortest_path_df.head(5)

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,previous_stop,visited,arrival_time,previous_mode
0,917,York Mills Rd at Sandover Dr (1222 York Mills),43.759813,-79.331751,0,True,0 days 19:00:00,W
1,8081,York Mills Rd at Valley Woods Rd East Side,43.759595,-79.332036,0,True,0 days 19:00:35,W
2,5191,1200-1202 York Mills Rd,43.759132,-79.333893,917,False,0 days 19:01:15,T
3,10256,113 Valley Woods Rd,43.75896,-79.332778,0,False,0 days 19:02:12,W
4,6913,Valley Woods Rd at York Mills Rd,43.758994,-79.33307,0,False,0 days 19:02:30,W


### Question: consider what would happen if there was infact no next stop or next stop sequence for the current trip in question? How would that impact the shortest path table?

Since the earliest possible trip ever (including those outside regular weekday service) starts around 3 AM, there is no trip that starts on the schedule between 12 AM and 3 AM.

Therefore, when no next stop id or stop sequence is found for a given trip that stops at the current stop id in question, the values for next stop id will be None (empty) and the value for arrival_time will be blank or 0 or 00:00:00.

At the point when current stop trips is converted to next trips to figure out the next applicable valid trip as per the current time, all trips that are at 0 seconds will automatically be filtered out and never be considered for an update to the shortest path table. To make this more airtight, we could add a filter that confirms that the next_stop_id is a valid stop_id i.e. anything integer number that is not zero.

In [144]:
stop_times.sort_values(by = 'time_delta').head(3)

Unnamed: 0,trip_id,stop_time,stop_id,stop_sequence,pickup_type,drop_off_type,shape_dist_traveled,time_delta
1645997,43039136,3:07:19,7363,1,0,0,0.0984,0 days 03:07:19
1645998,43039136,3:08:50,903,2,0,0,0.5683,0 days 03:08:50
1645999,43039136,3:09:04,902,3,0,0,0.6401,0 days 03:09:04


##### Question: should I avoid using a reduced stop list (by only fitlering on those trips in service) as I can still walk over to each stop and have them be included in the model. Perhaps the stops out of service might be easily walkable from another stop in service?

# Manually work through loop 3 
Reuse and refine code from loop 2; then consolidate all loop 3 work into master function

In [145]:
# rename variables for refinement
current_time = current_timedelta
start_time = start_timedelta
walking_speed = walk_speed_sec
shortest_path = shortest_path_df

In [146]:
# global variables in use
print(current_stop_id, current_time, start_time, walking_speed)

5191 0 days 19:01:15 19:00:00 1.4267500000002541e-05


In [147]:
# dataframes in use
display(shortest_path.head(1))

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,previous_stop,visited,arrival_time,previous_mode
0,917,York Mills Rd at Sandover Dr (1222 York Mills),43.759813,-79.331751,0,True,0 days 19:00:00,W


#### Calculate walking distance to all other stops from current stop

In [148]:
# grab the current stop geocoordinates to calculate walking distance
current_stop_lat = shortest_path.loc[shortest_path.stop_id == current_stop_id, 'stop_lat'].iloc[0]
current_stop_lon = shortest_path.loc[shortest_path.stop_id == current_stop_id, 'stop_lon'].iloc[0]

In [149]:
print(current_stop_lat, current_stop_lon)

43.759132 -79.333893


In [150]:
# create a table of unvisited stops to calculate walking distance
walking_distance = shortest_path.loc[shortest_path.visited == False]
walking_distance = walking_distance.loc[:, ['stop_id', 'stop_name', 'stop_lat', 'stop_lon']]
walking_distance['distance'] = abs(current_stop_lat - walking_distance['stop_lat']) + abs(current_stop_lon - walking_distance['stop_lon'])
walking_distance['arrival_time'] = current_time + pd.to_timedelta((walking_distance.distance / walking_speed).round(0).astype(int), 'seconds')
walking_distance = walking_distance.sort_values(by = 'arrival_time').reset_index(drop = True)

In [151]:
stops_to_update = []
for i in range(0, len(walking_distance)):
    n_stop = walking_distance.stop_id.loc[i]
    new_arrival_time = walking_distance.arrival_time.loc[i]
    old_arrival_time = shortest_path_df[shortest_path_df.stop_id == n_stop].arrival_time.item()
    if new_arrival_time < old_arrival_time:
        stops_to_update.append(i)

---

In [152]:
stops_to_update_df

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,distance,arrival_time,seconds
0,5056,44 Valley Woods Rd,43.755137,-79.333446,0.005868,0 days 19:07:26,446.0
1,2549,29 Valley Woods Rd,43.753958,-79.332910,0.006511,0 days 19:08:11,491.0
2,5738,York Mills Rd at Valentine Dr,43.757207,-79.341181,0.011533,0 days 19:14:03,843.0
3,4022,Silverdale Cres at York Mills Rd,43.756875,-79.341192,0.011876,0 days 19:14:27,867.0
4,3167,Laurentide Dr at Tetbury Cres,43.753215,-79.338987,0.013331,0 days 19:16:09,969.0
...,...,...,...,...,...,...,...
1374,7836,Evans Ave at Sherway Gate West Side,43.609022,-79.556933,0.375470,1 days 02:19:11,26351.0
1375,542,Sherway Gardens Rd at Sherway Gate (Mall),43.609785,-79.558022,0.375796,1 days 02:19:34,26374.0
1376,540,Sherway Gardens Rd at Sherway Gate (Mall),43.609729,-79.558438,0.376268,1 days 02:20:07,26407.0
1377,15260,Lake Shore Blvd West at Fortieth St,43.592468,-79.541863,0.376954,1 days 02:20:55,26455.0


In [153]:
stops_to_update_df = walking_distance.loc[stops_to_update].reset_index(drop = True)

In [154]:
walking_distance.head(5)

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,distance,arrival_time
0,5191,1200-1202 York Mills Rd,43.759132,-79.333893,0.0,0 days 19:01:15
1,6913,Valley Woods Rd at York Mills Rd,43.758994,-79.33307,0.000961,0 days 19:02:22
2,10256,113 Valley Woods Rd,43.75896,-79.332778,0.001287,0 days 19:02:45
3,5056,44 Valley Woods Rd,43.755137,-79.333446,0.004442,0 days 19:06:26
4,10406,Opposite 44 Valley Woods Rd,43.755229,-79.333336,0.00446,0 days 19:06:28


In [155]:
shortest_path.head(10)

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,previous_stop,visited,arrival_time,previous_mode
0,917,York Mills Rd at Sandover Dr (1222 York Mills),43.759813,-79.331751,0,True,0 days 19:00:00,W
1,8081,York Mills Rd at Valley Woods Rd East Side,43.759595,-79.332036,0,True,0 days 19:00:35,W
2,5191,1200-1202 York Mills Rd,43.759132,-79.333893,917,False,0 days 19:01:15,T
3,10256,113 Valley Woods Rd,43.75896,-79.332778,0,False,0 days 19:02:12,W
4,6913,Valley Woods Rd at York Mills Rd,43.758994,-79.33307,0,False,0 days 19:02:30,W
5,4880,York Mills Rd at Fenside Dr,43.760561,-79.327819,0,False,0 days 19:05:28,W
6,6061,York Mills Rd at Fenside Dr,43.760782,-79.327546,0,False,0 days 19:06:03,W
7,6223,Lynedock Cres at Fenside Dr East Side,43.761865,-79.328158,0,False,0 days 19:06:36,W
8,9834,York Mills Rd at Don Mills Rd,43.756053,-79.3466,917,False,0 days 19:07:06,T
9,10406,Opposite 44 Valley Woods Rd,43.755229,-79.333336,0,False,0 days 19:07:12,W


### Perhaps previous stop by default should be updated to the starting spot 917 since the initiating shortest path durations are based on walking distance from that stop!!!

In [156]:
shortest_path.loc[:, ['previous_stop', 'previous_mode']].value_counts()

previous_stop  previous_mode
0              W                8038
8081           W                1379
917            T                   2
dtype: int64

In [157]:
shortest_path.loc[shortest_path.previous_stop == 0, 'previous_stop'] = 917

In [158]:
shortest_path.query('previous_mode == "T"')

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,previous_stop,visited,arrival_time,previous_mode
2,5191,1200-1202 York Mills Rd,43.759132,-79.333893,917,False,0 days 19:01:15,T
8,9834,York Mills Rd at Don Mills Rd,43.756053,-79.3466,917,False,0 days 19:07:06,T


In [159]:
shortest_path

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,previous_stop,visited,arrival_time,previous_mode
0,917,York Mills Rd at Sandover Dr (1222 York Mills),43.759813,-79.331751,917,True,0 days 19:00:00,W
1,8081,York Mills Rd at Valley Woods Rd East Side,43.759595,-79.332036,917,True,0 days 19:00:35,W
2,5191,1200-1202 York Mills Rd,43.759132,-79.333893,917,False,0 days 19:01:15,T
3,10256,113 Valley Woods Rd,43.758960,-79.332778,917,False,0 days 19:02:12,W
4,6913,Valley Woods Rd at York Mills Rd,43.758994,-79.333070,917,False,0 days 19:02:30,W
...,...,...,...,...,...,...,...,...
9414,4305,The West Mall at Sherway Dr North Side,43.609781,-79.560638,917,False,1 days 02:22:38,W
9415,7093,The West Mall at Trillium Health Centre (North...,43.610381,-79.561480,917,False,1 days 02:22:55,W
9416,880,The West Mall at Sherway Dr (Trillium Health C...,43.609464,-79.560579,917,False,1 days 02:22:56,W
9417,477,Long Branch Loop,43.592111,-79.543700,917,False,1 days 02:23:29,W


#### Insight

##### Problem: as it stands, the algo will suggest that you to walk to stop B from A, and then thru B walk to C as the fastest way to get from A to C, when there is a faster and direct transit route from A to C.

##### Solution: If the previous_mode is W then a different walk distance should not be updated? There should never be more than 2 walk to walk nodes? (The next walk node after the starting stop walking baseline must follow a transit mode first). Alternatively, once a trip has been accessed, update the duration to all other stops and update the connections accordingly. 

---
# ARCHIVE

---

In [160]:
stops_to_update_df = walking_distance.loc[stops_to_update].reset_index(drop = True)

In [161]:
stops_to_update_df.head(3)

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,distance,arrival_time
0,6913,Valley Woods Rd at York Mills Rd,43.758994,-79.33307,0.000961,0 days 19:02:22
1,5056,44 Valley Woods Rd,43.755137,-79.333446,0.004442,0 days 19:06:26
2,10406,Opposite 44 Valley Woods Rd,43.755229,-79.333336,0.00446,0 days 19:06:28


In [162]:
for i in range(0, len(stops_to_update_df)):
    i_stop_id = stops_to_update_df.stop_id.iloc[i]
    i_arrival_time = stops_to_update_df.arrival_time.iloc[i]
    
    shortest_path_df.loc[shortest_path_df.stop_id == i_stop_id, 'arrival_time'] = i_arrival_time
    shortest_path_df.loc[shortest_path_df.stop_id == i_stop_id, 'previous_stop'] = current_stop_id
    
    time_diff = i_arrival_time - start_timedelta
    shortest_path_df.loc[shortest_path_df.stop_id == i_stop_id, 'duration'] = time_diff
    shortest_path_df.loc[shortest_path_df.stop_id == i_stop_id, 'seconds'] = int(time_diff.total_seconds())

---
### Update shortest path table with formatting and mostly to create a column which indicates whether the journey from the previous stop was by transit or walking

In [163]:
shortest_path_df.drop(columns = ['distance', 'duration', 'seconds'], inplace = True)

KeyError: "['distance'] not found in axis"

In [None]:
shortest_path_df['previous_mode'] = 'W'

In [None]:
shortest_path_df.previous_stop.value_counts()

In [None]:
shortest_path_df.loc[shortest_path_df.previous_stop == 917, 'previous_mode'] = 'T'

In [None]:
shortest_path_df.loc[shortest_path_df.previous_stop == 917, 'previous_mode']

---
### Scan all available trips from current stop to select the next available trip to each nearest neighbor

In [None]:
my_stop_times.head(3)

In [None]:
my_trips.head(3)

In [None]:
my_schedule = pd.merge(my_stop_times, my_trips, on = 'trip_id', how = 'inner')

In [None]:
my_schedule['time_delta'] = pd.to_timedelta(my_schedule.stop_time)

In [None]:
current_stop_trips = my_schedule.loc[my_schedule.stop_id == current_stop_id]

In [None]:
current_stop_trips = current_stop_trips.drop(columns = ['stop_id', 'direction_id'])

In [None]:
current_stop_trips = current_stop_trips.reset_index(drop = True)

#### Start here > current objective > keep building current_stop_trips to mimic how we built current_stop_trip_times for the first manual round using the home stop. Next step is to build the 'next_stop_id' and 'next_stop_arrival_time' instead of 'end_td' (full use of delta from start time vs just measuring the duration). Once that's done: for each next_stop_id find the earliest available trip and use this info to update the shortest path table. Consider at what point to include the 1 minute buffer time for transfers

In [None]:
current_stop_trips['next_stop_id'] = None
current_stop_trips['arrival_time'] = td(seconds = 0)

In [None]:
my_schedule.head(3)

In [None]:
for i in range(0, len(current_stop_trips)):
    
    i_trip_id = current_stop_trips.loc[i, 'trip_id']
    i_stop_sequence = int(current_stop_trips.loc[i, 'stop_sequence'])
    next_stop_sequence = i_stop_sequence + 1
    
    # pull a list of all stops on the current trip
    i_trip_stops = my_schedule[my_schedule.trip_id == trip_id]
    
    # returns true if there is a next stop available for the current trip
    next_stop_exists = next_stop_sequence in i_trip_stops.stop_sequence.unique()
    
    if (next_stop_exists):
        # grab data from my_schedule
        next_stop_id = my_schedule.loc[((my_schedule.trip_id == i_trip_id) & (my_schedule.stop_sequence == next_stop_sequence)), 'stop_id'].item()
        arrival_time = my_schedule.loc[((my_schedule.trip_id == i_trip_id) & (my_schedule.stop_sequence == next_stop_sequence)), 'time_delta'].item()
        # and then transfer to current_stop_trips
        current_stop_trips.loc[i, 'next_stop_id'] = next_stop_id
        current_stop_trips.loc[i, 'arrival_time'] = arrival_time
    else:
        # if no data, mark data as blank
        current_stop_trips.loc[i, 'next_stop_id'] = 0
        current_stop_trips.loc[i, 'arrival_time'] = td(seconds = 0)

In [None]:
current_stop_trips.next_stop_id.value_counts()

In [None]:
current_stop_id, current_timedelta

In [None]:
transfer_timedelta = current_timedelta + td(minutes = 1)

In [None]:
transfer_timedelta

In [None]:
next_trips = current_stop_trips.loc[current_stop_trips.time_delta > transfer_timedelta].sort_values(by = 'arrival_time')

In [None]:
next_trips.shape

In [None]:
next_trips.head(1)

In [None]:
nearest_neighbors = next_trips.next_stop_id.value_counts().index.to_list()

In [None]:
nearest_neighbors

In [None]:
next_trips_to_neighbors = []
for i in range(0, len(nearest_neighbors)):
    for j in range(0, len(next_trips)):
        if (next_trips.iloc[j, 6] == nearest_neighbors[i]):
            next_trips_to_neighbors.append(j)
            break
selected_next_trips = next_trips.iloc[next_trips_to_neighbors]

In [None]:
save_record = shortest_path_df[shortest_path_df.stop_id.isin(nearest_neighbors)].copy()

In [None]:
save_record

In [None]:
selected_next_trips

In [None]:
for i in range(0, len(selected_next_trips)):
    n_stop = selected_next_trips.next_stop_id.iloc[i]
    n_arrival = selected_next_trips.arrival_time.iloc[i]
    print(n_stop, n_arrival)
    match_stop = shortest_path_df.stop_id == n_stop
    update_condition = shortest_path_df.loc[match_stop, 'arrival_time'].iloc[0] > n_arrival
    if (update_condition):
        shortest_path_df.loc[match_stop, 'arrival_time'] = n_arrival
        shortest_path_df.loc[match_stop, 'previous_stop'] = current_stop_id
        shortest_path_df.loc[match_stop, 'previous_mode'] = 'T'

In [None]:
shortest_path_df[shortest_path_df.stop_id.isin(nearest_neighbors)]

### Finish the 2nd loop and switch over to the next unvisited stop

In [None]:
shortest_path_df.loc[shortest_path_df.stop_id == current_stop_id, 'visited'] = True

In [None]:
shortest_path_df = shortest_path_df.sort_values(by = 'arrival_time').reset_index(drop = True)

In [None]:
current_stop_id = shortest_path_df[shortest_path_df.visited == False].stop_id.iloc[0]

In [None]:
current_stop_id

In [None]:
current_timedelta = shortest_path_df[shortest_path_df.stop_id == current_stop_id].arrival_time.iloc[0]
transfer_timedelta = current_timedelta + td(minutes = 1)

In [None]:
current_timedelta, transfer_timedelta

##### Insight: Transfers should only matter when exiting the current trip/route; otherwise I will be forcing users to inefficiently get the off the bus and get on the next bus just to get to the next stop on the same route. Perhaps it would be a good idea to remove transfer time completely from the MVP, and optimize for it later. If anything, when transferring to another route there will most likely be a delay before the next bus comes for most times anyways. When implementing transfer time, consider to only add transfer time when the previous_mode is W or when the route/trip from the previous stop changes onto the next stop

In [None]:
shortest_path_df.head(5)

### Question: consider what would happen if there was infact no next stop or next stop sequence for the current trip in question? How would that impact the shortest path table?

Since the earliest possible trip ever (including those outside regular weekday service) starts around 3 AM, there is no trip that starts on the schedule between 12 AM and 3 AM.

Therefore, when no next stop id or stop sequence is found for a given trip that stops at the current stop id in question, the values for next stop id will be None (empty) and the value for arrival_time will be blank or 0 or 00:00:00.

At the point when current stop trips is converted to next trips to figure out the next applicable valid trip as per the current time, all trips that are at 0 seconds will automatically be filtered out and never be considered for an update to the shortest path table. To make this more airtight, we could add a filter that confirms that the next_stop_id is a valid stop_id i.e. anything integer number that is not zero.

In [None]:
stop_times.sort_values(by = 'time_delta').head(3)