In [1]:
import pandas as pd

# Reduce schedule to the # of unique shape_id values

In [2]:
df = pd.read_feather('data/model/schedule.ftr')

In [3]:
df.head(1)

Unnamed: 0,trip_id,route_short_name,route_long_name,shape_id,trip_headsign,stop_sequence,stop_time,stop_time_delta,stop_id,stop_code,stop_name,stop_lat,stop_lon
0,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,1,7:00:29,0 days 07:00:29,14155,14633,Don Mills Station,43.776222,-79.347048


In [4]:
# number of records
len(df)

1304782

In [5]:
# number of unique shape_id values
df.shape_id.value_counts().count()

1082

In [6]:
# number of unique trip_id values
df.trip_id.value_counts().count()

41016

In [7]:
# confirm values are sorted as needed (this may be redundant from preprocessing step)
df = df.sort_values(['shape_id', 'trip_id', 'stop_sequence'])

In [8]:
df

Unnamed: 0,trip_id,route_short_name,route_long_name,shape_id,trip_headsign,stop_sequence,stop_time,stop_time_delta,stop_id,stop_code,stop_name,stop_lat,stop_lon
0,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,1,7:00:29,0 days 07:00:29,14155,14633,Don Mills Station,43.776222,-79.347048
1,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,2,7:01:19,0 days 07:01:19,3807,1949,Don Mills Rd at Leith Hill Rd North Side,43.777534,-79.347811
2,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,3,7:02:12,0 days 07:02:12,6904,1929,Don Mills Rd at Fairview Mall Dr North Side,43.779530,-79.348701
3,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,4,7:03:35,0 days 07:03:35,1163,1938,Don Mills Rd at Godstone Rd,43.782682,-79.348922
4,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,5,7:04:48,0 days 07:04:48,7723,1919,Don Mills Rd at Deerford Rd,43.785281,-79.350570
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304777,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,1,5:31:00,0 days 05:31:00,14535,14109,Don Mills Station - Westbound Platform,43.775248,-79.346189
1304778,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,2,5:33:05,0 days 05:33:05,14536,13847,Leslie Station - Westbound Platform,43.771248,-79.366790
1304779,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,3,5:34:50,0 days 05:34:50,14537,13846,Bessarion Station - Westbound Platform,43.769296,-79.376345
1304780,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,4,5:36:43,0 days 05:36:43,14538,13843,Bayview Station - Westbound Platform,43.766951,-79.386730


## Objectives
- Update the next stop sequence, next stop ID, and next stop time for each stop record
- Calculate the time difference between the current and next stop (if available) for each record
- Calculate the average duration between each two stop steps for each unique shape ID

In [9]:
# saving the original df seperately as a backup
master = df.copy()

In [10]:
# start fresh from here
df = master.copy()

In [11]:
df['next_stop_sequence'] = df.stop_sequence.shift(-1).astype('Int64')
df['next_stop_id'] = df.stop_id.shift(-1).astype('Int64')
df['next_stop_time'] = df.stop_time_delta.shift(-1)
df['next_stop_duration'] = df.next_stop_time - df.stop_time_delta

In [12]:
# number of unique duration values for any given shape_id + stop_sequence combo
df[(df.shape_id == 886387) & (df.stop_sequence == 1)].next_stop_duration.nunique()

1

In [13]:
# number of stop sequences for a given shape_id
df[df.shape_id == 886387].stop_sequence.nunique()

19

In [14]:
# note that the last stop_sequence for each shape_id has a unique value; however this value is invalid and must be cleared
# this is because the duration has been calculated between the current record and the next record which belongs to another trip_id
df[(df.shape_id == 886387) & (df.stop_sequence == 19)].next_stop_duration.nunique()

30

In [15]:
# trying to show that all stop_sequences for the same shape_id always have the same duration (at least at first glance)
df.sort_values(['shape_id', 'stop_sequence', 'trip_id']).iloc[27:37]

Unnamed: 0,trip_id,route_short_name,route_long_name,shape_id,trip_headsign,stop_sequence,stop_time,stop_time_delta,stop_id,stop_code,stop_name,stop_lat,stop_lon,next_stop_sequence,next_stop_id,next_stop_time,next_stop_duration
525,42990033,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,1,17:40:29,0 days 17:40:29,14155,14633,Don Mills Station,43.776222,-79.347048,2,3807,0 days 17:41:19,0 days 00:00:50
544,42990034,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,1,17:30:29,0 days 17:30:29,14155,14633,Don Mills Station,43.776222,-79.347048,2,3807,0 days 17:31:19,0 days 00:00:50
563,42990035,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,1,15:15:29,0 days 15:15:29,14155,14633,Don Mills Station,43.776222,-79.347048,2,3807,0 days 15:16:19,0 days 00:00:50
588,42990037,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,1,17:20:29,0 days 17:20:29,14155,14633,Don Mills Station,43.776222,-79.347048,2,3807,0 days 17:21:19,0 days 00:00:50
607,42990038,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,1,6:00:29,0 days 06:00:29,14155,14633,Don Mills Station,43.776222,-79.347048,2,3807,0 days 06:01:19,0 days 00:00:50
1,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,2,7:01:19,0 days 07:01:19,3807,1949,Don Mills Rd at Leith Hill Rd North Side,43.777534,-79.347811,3,6904,0 days 07:02:12,0 days 00:00:53
20,42990005,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,2,8:01:19,0 days 08:01:19,3807,1949,Don Mills Rd at Leith Hill Rd North Side,43.777534,-79.347811,3,6904,0 days 08:02:12,0 days 00:00:53
39,42990006,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,2,16:41:19,0 days 16:41:19,3807,1949,Don Mills Rd at Leith Hill Rd North Side,43.777534,-79.347811,3,6904,0 days 16:42:12,0 days 00:00:53
58,42990007,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,2,18:11:19,0 days 18:11:19,3807,1949,Don Mills Rd at Leith Hill Rd North Side,43.777534,-79.347811,3,6904,0 days 18:12:12,0 days 00:00:53
77,42990008,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,2,7:46:19,0 days 07:46:19,3807,1949,Don Mills Rd at Leith Hill Rd North Side,43.777534,-79.347811,3,6904,0 days 07:47:12,0 days 00:00:53


Looks like the duration between two stops with the same shape_id is always the same.

## Test: The duration between each stop for each shape_id is always the same

In [16]:
df

Unnamed: 0,trip_id,route_short_name,route_long_name,shape_id,trip_headsign,stop_sequence,stop_time,stop_time_delta,stop_id,stop_code,stop_name,stop_lat,stop_lon,next_stop_sequence,next_stop_id,next_stop_time,next_stop_duration
0,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,1,7:00:29,0 days 07:00:29,14155,14633,Don Mills Station,43.776222,-79.347048,2,3807,0 days 07:01:19,0 days 00:00:50
1,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,2,7:01:19,0 days 07:01:19,3807,1949,Don Mills Rd at Leith Hill Rd North Side,43.777534,-79.347811,3,6904,0 days 07:02:12,0 days 00:00:53
2,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,3,7:02:12,0 days 07:02:12,6904,1929,Don Mills Rd at Fairview Mall Dr North Side,43.779530,-79.348701,4,1163,0 days 07:03:35,0 days 00:01:23
3,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,4,7:03:35,0 days 07:03:35,1163,1938,Don Mills Rd at Godstone Rd,43.782682,-79.348922,5,7723,0 days 07:04:48,0 days 00:01:13
4,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,5,7:04:48,0 days 07:04:48,7723,1919,Don Mills Rd at Deerford Rd,43.785281,-79.350570,6,2498,0 days 07:06:08,0 days 00:01:20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304777,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,1,5:31:00,0 days 05:31:00,14535,14109,Don Mills Station - Westbound Platform,43.775248,-79.346189,2,14536,0 days 05:33:05,0 days 00:02:05
1304778,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,2,5:33:05,0 days 05:33:05,14536,13847,Leslie Station - Westbound Platform,43.771248,-79.366790,3,14537,0 days 05:34:50,0 days 00:01:45
1304779,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,3,5:34:50,0 days 05:34:50,14537,13846,Bessarion Station - Westbound Platform,43.769296,-79.376345,4,14538,0 days 05:36:43,0 days 00:01:53
1304780,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,4,5:36:43,0 days 05:36:43,14538,13843,Bayview Station - Westbound Platform,43.766951,-79.386730,5,14539,0 days 05:39:24,0 days 00:02:41


### Clear out 'next' group of columns for each last stop in every shape_id sequence (since they are invalid)

In [17]:
shape_seq = df.loc[:, ['shape_id', 'stop_sequence']]

In [18]:
shape_seq = shape_seq.drop_duplicates()

In [19]:
shape_seq = shape_seq.sort_values(['shape_id', 'stop_sequence'])

In [20]:
shape_seq = shape_seq.drop_duplicates(subset = 'shape_id', keep = 'last')

In [21]:
# this should be a list of the last stop_sequence for each unique shape_id value
shape_seq.head(3)

Unnamed: 0,shape_id,stop_sequence
18,886387,19
195,886388,6
636,886389,11


In [22]:
df = pd.merge(df, shape_seq, on = 'shape_id')

In [23]:
df = df.rename(columns = {'stop_sequence_x': 'stop_sequence', 'stop_sequence_y': 'final_stop_sequence'})

In [24]:
df['last_stop_sequence'] = df.final_stop_sequence == df.stop_sequence

In [25]:
df.loc[:, ['next_stop_sequence', 'next_stop_id', 'next_stop_time', 'next_stop_duration']] = df.loc[:, ['next_stop_sequence', 'next_stop_id', 'next_stop_time', 'next_stop_duration']].where(~df.last_stop_sequence)

### Calculate the number of unique values (or delete all duplicates to determine uniqueness) for each shape_id + stop_sequence combo to determine if each combo has only one unique value

In [26]:
test = df[~df.last_stop_sequence]

In [27]:
test.last_stop_sequence.value_counts()

False    1263766
Name: last_stop_sequence, dtype: int64

In [28]:
test = test[['shape_id', 'stop_sequence', 'next_stop_duration']].drop_duplicates()

In [29]:
test

Unnamed: 0,shape_id,stop_sequence,next_stop_duration
0,886387,1,0 days 00:00:50
1,886387,2,0 days 00:00:53
2,886387,3,0 days 00:01:23
3,886387,4,0 days 00:01:13
4,886387,5,0 days 00:01:20
...,...,...,...
1303635,890356,3,0 days 00:02:39
1303637,890357,1,0 days 00:02:05
1303638,890357,2,0 days 00:01:45
1303639,890357,3,0 days 00:01:53


In [30]:
test['key'] = test.shape_id.astype(str) + test.stop_sequence.astype(str)

In [31]:
test.key.value_counts().count()

33920

In [43]:
test.key.value_counts()

88719519    15
88719513    14
88794716    13
88793143    13
88794726    13
            ..
88710031     1
88710030     1
88710029     1
88710028     1
8903574      1
Name: key, Length: 33920, dtype: int64

In [32]:
# number of key values that are unique vs those that are duplicated
(test.key.value_counts() > 1).value_counts()

False    17731
True     16189
Name: key, dtype: int64

### Hence we need to calculate the average duration from each stop sequence to the next

In [34]:
temp = df.copy()

In [38]:
temp = temp[['shape_id', 'stop_sequence', 'next_stop_duration']].drop_duplicates()

In [41]:
temp = temp.groupby(['shape_id', 'stop_sequence']).mean()

In [51]:
temp

Unnamed: 0_level_0,Unnamed: 1_level_0,next_stop_duration
shape_id,stop_sequence,Unnamed: 2_level_1
886387,1,0 days 00:00:50
886387,2,0 days 00:00:53
886387,3,0 days 00:01:23
886387,4,0 days 00:01:13
886387,5,0 days 00:01:19
...,...,...
890357,1,0 days 00:02:05
890357,2,0 days 00:01:45
890357,3,0 days 00:01:53
890357,4,0 days 00:02:41


In [65]:
df.shape_id.dtype

dtype('int64')

In [66]:
df.stop_sequence.dtype

dtype('int64')

In [84]:
temp.query('shape_id == 886387 & stop_sequence == 9')

Unnamed: 0_level_0,Unnamed: 1_level_0,next_stop_duration
shape_id,stop_sequence,Unnamed: 2_level_1
886387,9,0 days 00:01:10


In [91]:
temp['next_stop_duration'][886387][9]

Timedelta('0 days 00:01:10')

In [95]:
temp.next_stop_duration[886387][9]

Timedelta('0 days 00:01:10')

In [96]:
temp.loc[(886387, 9), 'next_stop_duration']

Timedelta('0 days 00:01:10')

In [100]:
temp.loc[(886387, 9)][0]

Timedelta('0 days 00:01:10')

In [102]:
df['avg_duration'] = temp.loc[(df.shape_id, df.stop_sequence), 'next_stop_duration']

KeyboardInterrupt: 

In [None]:
df

## Next: Remove all reference to stop time and drop all dups such that only one unique stop_sequence for each shape_id; then transpose the average duration as previously calculated