In [1]:
import pandas as pd

In [2]:
from datetime import timedelta as td

# Reduce schedule to the # of unique shape_id values

In [3]:
df = pd.read_feather('data/model/schedule.ftr')

In [4]:
df.head(1)

Unnamed: 0,trip_id,route_short_name,route_long_name,shape_id,trip_headsign,stop_sequence,stop_time,stop_time_delta,stop_id,stop_code,stop_name,stop_lat,stop_lon
0,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,1,7:00:29,0 days 07:00:29,14155,14633,Don Mills Station,43.776222,-79.347048


In [5]:
# number of records
len(df)

1304782

In [6]:
# number of unique shape_id values
df.shape_id.value_counts().count()

1082

In [7]:
# number of unique trip_id values
df.trip_id.value_counts().count()

41016

In [8]:
# confirm values are sorted as needed (this may be redundant from preprocessing step)
df = df.sort_values(['shape_id', 'trip_id', 'stop_sequence'])
df = df.reset_index(drop = True)

In [9]:
df

Unnamed: 0,trip_id,route_short_name,route_long_name,shape_id,trip_headsign,stop_sequence,stop_time,stop_time_delta,stop_id,stop_code,stop_name,stop_lat,stop_lon
0,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,1,7:00:29,0 days 07:00:29,14155,14633,Don Mills Station,43.776222,-79.347048
1,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,2,7:01:19,0 days 07:01:19,3807,1949,Don Mills Rd at Leith Hill Rd North Side,43.777534,-79.347811
2,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,3,7:02:12,0 days 07:02:12,6904,1929,Don Mills Rd at Fairview Mall Dr North Side,43.779530,-79.348701
3,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,4,7:03:35,0 days 07:03:35,1163,1938,Don Mills Rd at Godstone Rd,43.782682,-79.348922
4,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,5,7:04:48,0 days 07:04:48,7723,1919,Don Mills Rd at Deerford Rd,43.785281,-79.350570
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304777,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,1,5:31:00,0 days 05:31:00,14535,14109,Don Mills Station - Westbound Platform,43.775248,-79.346189
1304778,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,2,5:33:05,0 days 05:33:05,14536,13847,Leslie Station - Westbound Platform,43.771248,-79.366790
1304779,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,3,5:34:50,0 days 05:34:50,14537,13846,Bessarion Station - Westbound Platform,43.769296,-79.376345
1304780,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,4,5:36:43,0 days 05:36:43,14538,13843,Bayview Station - Westbound Platform,43.766951,-79.386730


## Objectives
- Update the next stop sequence, next stop ID, and next stop time for each stop record
- Calculate the time difference between the current and next stop (if available) for each record
- Calculate the average duration between each two stop steps for each unique shape ID

In [10]:
# saving the original df seperately as a backup
master = df.copy()

In [11]:
# start fresh from here
df = master.copy()

In [12]:
df['next_stop_sequence'] = df.stop_sequence.shift(-1).astype('Int64')
df['next_stop_id'] = df.stop_id.shift(-1).astype('Int64')
df['next_stop_time'] = df.stop_time_delta.shift(-1)
df['next_stop_duration'] = df.next_stop_time - df.stop_time_delta

In [13]:
# number of unique duration values for any given shape_id + stop_sequence combo
df[(df.shape_id == 886387) & (df.stop_sequence == 1)].next_stop_duration.nunique()

1

In [14]:
# number of stop sequences for a given shape_id
df[df.shape_id == 886387].stop_sequence.nunique()

19

In [15]:
# note that the last stop_sequence for each shape_id has a unique value; however this value is invalid and must be cleared
# this is because the duration has been calculated between the current record and the next record which belongs to another trip_id
df[(df.shape_id == 886387) & (df.stop_sequence == 19)].next_stop_duration.nunique()

30

In [16]:
# trying to show that all stop_sequences for the same shape_id always have the same duration (at least at first glance)
df.sort_values(['shape_id', 'stop_sequence', 'trip_id']).iloc[27:37]

Unnamed: 0,trip_id,route_short_name,route_long_name,shape_id,trip_headsign,stop_sequence,stop_time,stop_time_delta,stop_id,stop_code,stop_name,stop_lat,stop_lon,next_stop_sequence,next_stop_id,next_stop_time,next_stop_duration
513,42990033,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,1,17:40:29,0 days 17:40:29,14155,14633,Don Mills Station,43.776222,-79.347048,2,3807,0 days 17:41:19,0 days 00:00:50
532,42990034,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,1,17:30:29,0 days 17:30:29,14155,14633,Don Mills Station,43.776222,-79.347048,2,3807,0 days 17:31:19,0 days 00:00:50
551,42990035,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,1,15:15:29,0 days 15:15:29,14155,14633,Don Mills Station,43.776222,-79.347048,2,3807,0 days 15:16:19,0 days 00:00:50
570,42990037,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,1,17:20:29,0 days 17:20:29,14155,14633,Don Mills Station,43.776222,-79.347048,2,3807,0 days 17:21:19,0 days 00:00:50
589,42990038,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,1,6:00:29,0 days 06:00:29,14155,14633,Don Mills Station,43.776222,-79.347048,2,3807,0 days 06:01:19,0 days 00:00:50
1,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,2,7:01:19,0 days 07:01:19,3807,1949,Don Mills Rd at Leith Hill Rd North Side,43.777534,-79.347811,3,6904,0 days 07:02:12,0 days 00:00:53
20,42990005,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,2,8:01:19,0 days 08:01:19,3807,1949,Don Mills Rd at Leith Hill Rd North Side,43.777534,-79.347811,3,6904,0 days 08:02:12,0 days 00:00:53
39,42990006,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,2,16:41:19,0 days 16:41:19,3807,1949,Don Mills Rd at Leith Hill Rd North Side,43.777534,-79.347811,3,6904,0 days 16:42:12,0 days 00:00:53
58,42990007,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,2,18:11:19,0 days 18:11:19,3807,1949,Don Mills Rd at Leith Hill Rd North Side,43.777534,-79.347811,3,6904,0 days 18:12:12,0 days 00:00:53
77,42990008,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,2,7:46:19,0 days 07:46:19,3807,1949,Don Mills Rd at Leith Hill Rd North Side,43.777534,-79.347811,3,6904,0 days 07:47:12,0 days 00:00:53


Looks like the duration between two stops with the same shape_id is always the same.

## Test: The duration between each stop for each shape_id is always the same

In [17]:
df

Unnamed: 0,trip_id,route_short_name,route_long_name,shape_id,trip_headsign,stop_sequence,stop_time,stop_time_delta,stop_id,stop_code,stop_name,stop_lat,stop_lon,next_stop_sequence,next_stop_id,next_stop_time,next_stop_duration
0,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,1,7:00:29,0 days 07:00:29,14155,14633,Don Mills Station,43.776222,-79.347048,2,3807,0 days 07:01:19,0 days 00:00:50
1,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,2,7:01:19,0 days 07:01:19,3807,1949,Don Mills Rd at Leith Hill Rd North Side,43.777534,-79.347811,3,6904,0 days 07:02:12,0 days 00:00:53
2,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,3,7:02:12,0 days 07:02:12,6904,1929,Don Mills Rd at Fairview Mall Dr North Side,43.779530,-79.348701,4,1163,0 days 07:03:35,0 days 00:01:23
3,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,4,7:03:35,0 days 07:03:35,1163,1938,Don Mills Rd at Godstone Rd,43.782682,-79.348922,5,7723,0 days 07:04:48,0 days 00:01:13
4,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,5,7:04:48,0 days 07:04:48,7723,1919,Don Mills Rd at Deerford Rd,43.785281,-79.350570,6,2498,0 days 07:06:08,0 days 00:01:20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304777,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,1,5:31:00,0 days 05:31:00,14535,14109,Don Mills Station - Westbound Platform,43.775248,-79.346189,2,14536,0 days 05:33:05,0 days 00:02:05
1304778,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,2,5:33:05,0 days 05:33:05,14536,13847,Leslie Station - Westbound Platform,43.771248,-79.366790,3,14537,0 days 05:34:50,0 days 00:01:45
1304779,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,3,5:34:50,0 days 05:34:50,14537,13846,Bessarion Station - Westbound Platform,43.769296,-79.376345,4,14538,0 days 05:36:43,0 days 00:01:53
1304780,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,4,5:36:43,0 days 05:36:43,14538,13843,Bayview Station - Westbound Platform,43.766951,-79.386730,5,14539,0 days 05:39:24,0 days 00:02:41


### Clear out 'next' group of columns for each last stop in every shape_id sequence (since they are invalid)

In [18]:
shape_seq = df.loc[:, ['shape_id', 'stop_sequence']]

In [19]:
shape_seq = shape_seq.drop_duplicates()

In [20]:
shape_seq = shape_seq.sort_values(['shape_id', 'stop_sequence'])

In [21]:
shape_seq = shape_seq.drop_duplicates(subset = 'shape_id', keep = 'last')

In [22]:
# this should be a list of the last stop_sequence for each unique shape_id value
shape_seq.head(3)

Unnamed: 0,shape_id,stop_sequence
18,886387,19
613,886388,6
636,886389,11


In [23]:
df = pd.merge(df, shape_seq, on = 'shape_id')

In [24]:
df = df.rename(columns = {'stop_sequence_x': 'stop_sequence', 'stop_sequence_y': 'final_stop_sequence'})

In [25]:
df['last_stop_sequence'] = df.final_stop_sequence == df.stop_sequence

In [26]:
df.loc[:, ['next_stop_sequence', 'next_stop_id', 'next_stop_time', 'next_stop_duration']] = df.loc[:, ['next_stop_sequence', 'next_stop_id', 'next_stop_time', 'next_stop_duration']].where(~df.last_stop_sequence)

### Calculate the number of unique values (or delete all duplicates to determine uniqueness) for each shape_id + stop_sequence combo to determine if each combo has only one unique value

In [27]:
test = df[~df.last_stop_sequence]

In [28]:
test.last_stop_sequence.value_counts()

False    1263766
Name: last_stop_sequence, dtype: int64

In [29]:
test = test[['shape_id', 'stop_sequence', 'next_stop_duration']].drop_duplicates()

In [30]:
test

Unnamed: 0,shape_id,stop_sequence,next_stop_duration
0,886387,1,0 days 00:00:50
1,886387,2,0 days 00:00:53
2,886387,3,0 days 00:01:23
3,886387,4,0 days 00:01:13
4,886387,5,0 days 00:01:20
...,...,...,...
1303635,890356,3,0 days 00:02:39
1303637,890357,1,0 days 00:02:05
1303638,890357,2,0 days 00:01:45
1303639,890357,3,0 days 00:01:53


In [31]:
test['key'] = test.shape_id.astype(str) + test.stop_sequence.astype(str)

In [32]:
test.key.value_counts().count()

33920

In [33]:
test.key.value_counts()

88719519    15
88719513    14
88794716    13
88793143    13
88794726    13
            ..
88710031     1
88710030     1
88710029     1
88710028     1
8903574      1
Name: key, Length: 33920, dtype: int64

In [34]:
# number of key values that are unique vs those that are duplicated
(test.key.value_counts() > 1).value_counts()

False    17731
True     16189
Name: key, dtype: int64

### Hence we need to calculate the average duration from each stop sequence to the next

In [35]:
temp = df.copy()

In [36]:
temp = temp[['shape_id', 'stop_sequence', 'next_stop_duration']].drop_duplicates()

In [37]:
temp = temp.groupby(['shape_id', 'stop_sequence']).mean()

In [38]:
temp

Unnamed: 0_level_0,Unnamed: 1_level_0,next_stop_duration
shape_id,stop_sequence,Unnamed: 2_level_1
886387,1,0 days 00:00:50
886387,2,0 days 00:00:53
886387,3,0 days 00:01:23
886387,4,0 days 00:01:13
886387,5,0 days 00:01:19
...,...,...
890357,1,0 days 00:02:05
890357,2,0 days 00:01:45
890357,3,0 days 00:01:53
890357,4,0 days 00:02:41


In [39]:
df.shape_id.dtype

dtype('int64')

In [40]:
df.stop_sequence.dtype

dtype('int64')

In [41]:
temp.query('shape_id == 886387 & stop_sequence == 9')

Unnamed: 0_level_0,Unnamed: 1_level_0,next_stop_duration
shape_id,stop_sequence,Unnamed: 2_level_1
886387,9,0 days 00:01:10


In [42]:
temp['next_stop_duration'][886387][9]

Timedelta('0 days 00:01:10')

In [43]:
temp.next_stop_duration[886387][9]

Timedelta('0 days 00:01:10')

In [44]:
temp.loc[(886387, 9), 'next_stop_duration']

Timedelta('0 days 00:01:10')

In [45]:
temp.loc[(886387, 9)][0]

Timedelta('0 days 00:01:10')

In [46]:
temp

Unnamed: 0_level_0,Unnamed: 1_level_0,next_stop_duration
shape_id,stop_sequence,Unnamed: 2_level_1
886387,1,0 days 00:00:50
886387,2,0 days 00:00:53
886387,3,0 days 00:01:23
886387,4,0 days 00:01:13
886387,5,0 days 00:01:19
...,...,...
890357,1,0 days 00:02:05
890357,2,0 days 00:01:45
890357,3,0 days 00:01:53
890357,4,0 days 00:02:41


In [47]:
temp.loc[(886387, 1)][0]

Timedelta('0 days 00:00:50')

In [48]:
temp

Unnamed: 0_level_0,Unnamed: 1_level_0,next_stop_duration
shape_id,stop_sequence,Unnamed: 2_level_1
886387,1,0 days 00:00:50
886387,2,0 days 00:00:53
886387,3,0 days 00:01:23
886387,4,0 days 00:01:13
886387,5,0 days 00:01:19
...,...,...
890357,1,0 days 00:02:05
890357,2,0 days 00:01:45
890357,3,0 days 00:01:53
890357,4,0 days 00:02:41


In [49]:
df['avg_duration'] = td(seconds = 0)

In [50]:
len(df)

1304782

In [51]:
20000 % 10000

0

In [52]:
df

Unnamed: 0,trip_id,route_short_name,route_long_name,shape_id,trip_headsign,stop_sequence,stop_time,stop_time_delta,stop_id,stop_code,stop_name,stop_lat,stop_lon,next_stop_sequence,next_stop_id,next_stop_time,next_stop_duration,final_stop_sequence,last_stop_sequence,avg_duration
0,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,1,7:00:29,0 days 07:00:29,14155,14633,Don Mills Station,43.776222,-79.347048,2,3807,0 days 07:01:19,0 days 00:00:50,19,False,0 days
1,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,2,7:01:19,0 days 07:01:19,3807,1949,Don Mills Rd at Leith Hill Rd North Side,43.777534,-79.347811,3,6904,0 days 07:02:12,0 days 00:00:53,19,False,0 days
2,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,3,7:02:12,0 days 07:02:12,6904,1929,Don Mills Rd at Fairview Mall Dr North Side,43.779530,-79.348701,4,1163,0 days 07:03:35,0 days 00:01:23,19,False,0 days
3,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,4,7:03:35,0 days 07:03:35,1163,1938,Don Mills Rd at Godstone Rd,43.782682,-79.348922,5,7723,0 days 07:04:48,0 days 00:01:13,19,False,0 days
4,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,5,7:04:48,0 days 07:04:48,7723,1919,Don Mills Rd at Deerford Rd,43.785281,-79.350570,6,2498,0 days 07:06:08,0 days 00:01:20,19,False,0 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304777,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,1,5:31:00,0 days 05:31:00,14535,14109,Don Mills Station - Westbound Platform,43.775248,-79.346189,2,14536,0 days 05:33:05,0 days 00:02:05,5,False,0 days
1304778,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,2,5:33:05,0 days 05:33:05,14536,13847,Leslie Station - Westbound Platform,43.771248,-79.366790,3,14537,0 days 05:34:50,0 days 00:01:45,5,False,0 days
1304779,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,3,5:34:50,0 days 05:34:50,14537,13846,Bessarion Station - Westbound Platform,43.769296,-79.376345,4,14538,0 days 05:36:43,0 days 00:01:53,5,False,0 days
1304780,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,4,5:36:43,0 days 05:36:43,14538,13843,Bayview Station - Westbound Platform,43.766951,-79.386730,5,14539,0 days 05:39:24,0 days 00:02:41,5,False,0 days


In [53]:
%%time

for i in range(0, len(df)):
    df.loc[i, 'avg_duration'] = temp.loc[(df.loc[i, 'shape_id'], df.loc[i, 'stop_sequence'])][0]
    if (i % 10000 == 0):
        print(i)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000
1000000
1010000
1020000
1030000
1040000
1050000
1060000
1070000
1080000
1090000
1100000
1110000
1120000
1130000
1140000
1150000
1160000
1170000
1180000
1190000
1200000
1210000
1220000
1230000
1240000
1250000
1260000
1270000
1280000
1290000
1300000
CPU times: user 6min 45s, sys: 13.5 s, total: 6min 59s
Wall time: 

In [55]:
df.avg_duration

0         0 days 00:00:50
1         0 days 00:00:53
2         0 days 00:01:23
3         0 days 00:01:13
4         0 days 00:01:19
                ...      
1304777   0 days 00:02:05
1304778   0 days 00:01:45
1304779   0 days 00:01:53
1304780   0 days 00:02:41
1304781               NaT
Name: avg_duration, Length: 1304782, dtype: timedelta64[ns]

In [57]:
# confirms that its working
df[['shape_id', 'stop_sequence', 'avg_duration']].drop_duplicates()

Unnamed: 0,shape_id,stop_sequence,avg_duration
0,886387,1,0 days 00:00:50
1,886387,2,0 days 00:00:53
2,886387,3,0 days 00:01:23
3,886387,4,0 days 00:01:13
4,886387,5,0 days 00:01:19
...,...,...,...
1303637,890357,1,0 days 00:02:05
1303638,890357,2,0 days 00:01:45
1303639,890357,3,0 days 00:01:53
1303640,890357,4,0 days 00:02:41


In [58]:
df

Unnamed: 0,trip_id,route_short_name,route_long_name,shape_id,trip_headsign,stop_sequence,stop_time,stop_time_delta,stop_id,stop_code,stop_name,stop_lat,stop_lon,next_stop_sequence,next_stop_id,next_stop_time,next_stop_duration,final_stop_sequence,last_stop_sequence,avg_duration
0,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,1,7:00:29,0 days 07:00:29,14155,14633,Don Mills Station,43.776222,-79.347048,2,3807,0 days 07:01:19,0 days 00:00:50,19,False,0 days 00:00:50
1,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,2,7:01:19,0 days 07:01:19,3807,1949,Don Mills Rd at Leith Hill Rd North Side,43.777534,-79.347811,3,6904,0 days 07:02:12,0 days 00:00:53,19,False,0 days 00:00:53
2,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,3,7:02:12,0 days 07:02:12,6904,1929,Don Mills Rd at Fairview Mall Dr North Side,43.779530,-79.348701,4,1163,0 days 07:03:35,0 days 00:01:23,19,False,0 days 00:01:23
3,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,4,7:03:35,0 days 07:03:35,1163,1938,Don Mills Rd at Godstone Rd,43.782682,-79.348922,5,7723,0 days 07:04:48,0 days 00:01:13,19,False,0 days 00:01:13
4,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,5,7:04:48,0 days 07:04:48,7723,1919,Don Mills Rd at Deerford Rd,43.785281,-79.350570,6,2498,0 days 07:06:08,0 days 00:01:20,19,False,0 days 00:01:19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304777,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,1,5:31:00,0 days 05:31:00,14535,14109,Don Mills Station - Westbound Platform,43.775248,-79.346189,2,14536,0 days 05:33:05,0 days 00:02:05,5,False,0 days 00:02:05
1304778,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,2,5:33:05,0 days 05:33:05,14536,13847,Leslie Station - Westbound Platform,43.771248,-79.366790,3,14537,0 days 05:34:50,0 days 00:01:45,5,False,0 days 00:01:45
1304779,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,3,5:34:50,0 days 05:34:50,14537,13846,Bessarion Station - Westbound Platform,43.769296,-79.376345,4,14538,0 days 05:36:43,0 days 00:01:53,5,False,0 days 00:01:53
1304780,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,4,5:36:43,0 days 05:36:43,14538,13843,Bayview Station - Westbound Platform,43.766951,-79.386730,5,14539,0 days 05:39:24,0 days 00:02:41,5,False,0 days 00:02:41


## Next: Remove all reference to stop time and drop all dups such that only one unique stop_sequence for each shape_id; then transpose the average duration as previously calculated

In [61]:
df[df.shape_id == 890357].trip_id.value_counts()

43128569    5
43128713    5
43128715    5
43128716    5
43128717    5
           ..
43128649    5
43128650    5
43128651    5
43128652    5
43128797    5
Name: trip_id, Length: 229, dtype: int64

In [67]:
df

Unnamed: 0,trip_id,route_short_name,route_long_name,shape_id,trip_headsign,stop_sequence,stop_time,stop_time_delta,stop_id,stop_code,stop_name,stop_lat,stop_lon,next_stop_sequence,next_stop_id,next_stop_time,next_stop_duration,final_stop_sequence,last_stop_sequence,avg_duration
0,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,1,7:00:29,0 days 07:00:29,14155,14633,Don Mills Station,43.776222,-79.347048,2,3807,0 days 07:01:19,0 days 00:00:50,19,False,0 days 00:00:50
1,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,2,7:01:19,0 days 07:01:19,3807,1949,Don Mills Rd at Leith Hill Rd North Side,43.777534,-79.347811,3,6904,0 days 07:02:12,0 days 00:00:53,19,False,0 days 00:00:53
2,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,3,7:02:12,0 days 07:02:12,6904,1929,Don Mills Rd at Fairview Mall Dr North Side,43.779530,-79.348701,4,1163,0 days 07:03:35,0 days 00:01:23,19,False,0 days 00:01:23
3,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,4,7:03:35,0 days 07:03:35,1163,1938,Don Mills Rd at Godstone Rd,43.782682,-79.348922,5,7723,0 days 07:04:48,0 days 00:01:13,19,False,0 days 00:01:13
4,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,5,7:04:48,0 days 07:04:48,7723,1919,Don Mills Rd at Deerford Rd,43.785281,-79.350570,6,2498,0 days 07:06:08,0 days 00:01:20,19,False,0 days 00:01:19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304777,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,1,5:31:00,0 days 05:31:00,14535,14109,Don Mills Station - Westbound Platform,43.775248,-79.346189,2,14536,0 days 05:33:05,0 days 00:02:05,5,False,0 days 00:02:05
1304778,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,2,5:33:05,0 days 05:33:05,14536,13847,Leslie Station - Westbound Platform,43.771248,-79.366790,3,14537,0 days 05:34:50,0 days 00:01:45,5,False,0 days 00:01:45
1304779,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,3,5:34:50,0 days 05:34:50,14537,13846,Bessarion Station - Westbound Platform,43.769296,-79.376345,4,14538,0 days 05:36:43,0 days 00:01:53,5,False,0 days 00:01:53
1304780,43128797,4,LINE 4 (SHEPPARD),890357,LINE 4 (SHEPPARD) towards SHEPPARD-YONGE STATION,4,5:36:43,0 days 05:36:43,14538,13843,Bayview Station - Westbound Platform,43.766951,-79.386730,5,14539,0 days 05:39:24,0 days 00:02:41,5,False,0 days 00:02:41


In [75]:
df.to_feather('data/model/schedule_2.ftr')

In [77]:
stop_seq_durations = temp.copy()

In [78]:
x = stop_seq_durations

In [84]:
x['next_stop_seconds'] = x.next_stop_duration.dt.total_seconds()

In [94]:
x.next_stop_seconds

shape_id  stop_sequence
886387    1                 50.0
          2                 53.0
          3                 83.0
          4                 73.0
          5                 79.0
                           ...  
890357    1                125.0
          2                105.0
          3                113.0
          4                161.0
          5                  NaN
Name: next_stop_seconds, Length: 35002, dtype: float64

In [85]:
x

Unnamed: 0_level_0,Unnamed: 1_level_0,next_stop_duration,next_stop_seconds
shape_id,stop_sequence,Unnamed: 2_level_1,Unnamed: 3_level_1
886387,1,0 days 00:00:50,50.0
886387,2,0 days 00:00:53,53.0
886387,3,0 days 00:01:23,83.0
886387,4,0 days 00:01:13,73.0
886387,5,0 days 00:01:19,79.0
...,...,...,...
890357,1,0 days 00:02:05,125.0
890357,2,0 days 00:01:45,105.0
890357,3,0 days 00:01:53,113.0
890357,4,0 days 00:02:41,161.0


In [99]:
x = x.reset_index()

In [101]:
x.to_feather('data/model/shape_sequence_durations.ftr')

In [102]:
x

Unnamed: 0,shape_id,stop_sequence,next_stop_duration,next_stop_seconds
0,886387,1,0 days 00:00:50,50.0
1,886387,2,0 days 00:00:53,53.0
2,886387,3,0 days 00:01:23,83.0
3,886387,4,0 days 00:01:13,73.0
4,886387,5,0 days 00:01:19,79.0
...,...,...,...,...
34997,890357,1,0 days 00:02:05,125.0
34998,890357,2,0 days 00:01:45,105.0
34999,890357,3,0 days 00:01:53,113.0
35000,890357,4,0 days 00:02:41,161.0
