# Build the reference table containing the weights between all connected stops

In [1]:
import numpy as np
import pandas as pd

In [2]:
from datetime import timedelta as td

In [3]:
# Load the stop reference table and the stop-level timetable from the
# feather files under data/model.
stops = pd.read_feather('data/model/stops.ftr')
schedule = pd.read_feather('data/model/schedule.ftr')

In [4]:
# Load the per-shape stop-sequence durations.
# NOTE(review): in the cells visible here `seq` is only previewed, never used
# in the weight computation — confirm whether it is still needed.
seq = pd.read_feather('data/model/shape_sequence_durations.ftr')

In [5]:
# Keep only regular service: drop every row whose route name mentions
# "night" or "express" (case-insensitive via lower()).
# Both keywords are matched in a single pass. na=False makes a missing
# route_long_name count as "no match" (the row is kept) instead of putting
# NaN into the mask, which would break the ~ negation.
s = schedule[
    ~schedule.route_long_name.str.lower().str.contains('night|express', na=False)
].copy()

---

In [6]:
# Preview the first three rows of the stops table.
stops.head(3)

Unnamed: 0,stop_id,stop_code,stop_name,stop_lat,stop_lon
0,262,662,Danforth Rd at Kennedy Rd,43.714379,-79.260939
1,263,929,Davenport Rd at Bedford Rd,43.674448,-79.399659
2,264,940,Davenport Rd at Dupont St,43.675511,-79.401938


In [7]:
# Preview the filtered schedule (night and express routes removed).
s.head(3)

Unnamed: 0,trip_id,route_short_name,route_long_name,shape_id,trip_headsign,stop_sequence,stop_time,stop_time_delta,stop_id,stop_code,stop_name,stop_lat,stop_lon
0,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,1,7:00:29,0 days 07:00:29,14155,14633,Don Mills Station,43.776222,-79.347048
1,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,2,7:01:19,0 days 07:01:19,3807,1949,Don Mills Rd at Leith Hill Rd North Side,43.777534,-79.347811
2,42990004,10,VAN HORNE,886387,EAST - 10 VAN HORNE towards VICTORIA PARK,3,7:02:12,0 days 07:02:12,6904,1929,Don Mills Rd at Fairview Mall Dr North Side,43.77953,-79.348701


In [8]:
# Preview the per-shape sequence durations table.
seq.head(3)

Unnamed: 0,shape_id,stop_sequence,next_stop_duration,next_stop_seconds
0,886387,1,0 days 00:00:50,50.0
1,886387,2,0 days 00:00:53,53.0
2,886387,3,0 days 00:01:23,83.0


---

In [9]:
# Number of distinct shape_id values left in the filtered schedule.
s.shape_id.nunique()

859

In [10]:
# NOTE(review): the next cell reassigns x directly from s, so this copy is
# never used.
x = s.copy()

In [11]:
# First-stop rows (stop_sequence 1) of trips that depart strictly between
# 17:00 and 19:00.
x = s.loc[
    s.stop_sequence.eq(1)
    & s.stop_time_delta.gt(td(hours=17))
    & s.stop_time_delta.lt(td(hours=19))
]

In [12]:
# Order the candidate departures per shape, earliest first.
x = x.sort_values(by=['shape_id', 'stop_time_delta'], ascending=True)

In [13]:
# Keep a single row per shape: the first one in the current row order.
x = x.drop_duplicates(subset='shape_id', keep='first')

In [14]:
# Spot check: the trips selected for route 95.
x[x.route_short_name == 95]

Unnamed: 0,trip_id,route_short_name,route_long_name,shape_id,trip_headsign,stop_sequence,stop_time,stop_time_delta,stop_id,stop_code,stop_name,stop_lat,stop_lon
1178760,43027037,95,YORK MILLS,888270,EAST - 95C YORK MILLS towards ELLESMERE STATION,1,17:01:58,0 days 17:01:58,24357,15974,York Mills Temporary Bus Loop,43.744303,-79.407324
1180436,43027064,95,YORK MILLS,888272,EAST - 95A YORK MILLS towards PORT UNION,1,17:06:58,0 days 17:06:58,24357,15974,York Mills Temporary Bus Loop,43.744303,-79.407324
1191951,43027266,95,YORK MILLS,888274,WEST - 95 YORK MILLS towards YORK MILLS STATION,1,17:09:40,0 days 17:09:40,10122,10521,Cul De Sac (Loop) at Ellesmere Rd,43.794687,-79.154799
1191382,43027253,95,YORK MILLS,888280,WEST - 95C YORK MILLS towards YORK MILLS STATION,1,17:00:09,0 days 17:00:09,15015,15189,Ellesmere Station,43.766958,-79.277486


In [15]:
# Trip ids of the selected representative trips, as a NumPy array.
t = x['trip_id'].values

In [16]:
# Number of selected trips.
len(t)

465

In [17]:
# All stop rows that belong to the selected trips, copied so that new
# columns can be added without touching s.
y = s[s['trip_id'].isin(t)].copy()

In [18]:
# Put each trip's stops in travel order so consecutive rows are consecutive stops.
y = y.sort_values(by=['shape_id', 'trip_id', 'stop_sequence'], ascending=True)

In [19]:
# Pull the following row's stop, time and trip onto each row. The shift is
# not trip-aware, so the last stop of a trip receives values from the next
# trip; trip_id_test is kept so those rows can be identified.
y = y.assign(
    next_stop_id=y['stop_id'].shift(-1),
    next_stop_time_delta=y['stop_time_delta'].shift(-1),
    trip_id_test=y['trip_id'].shift(-1),
)

In [20]:
# Float version of trip_id, for comparison against the shifted trip_id_test column.
y['trip_id_compare'] = y['trip_id'].astype('float64')

In [21]:
# True where the following row is a different trip (or missing), i.e. this
# row is the final stop of its trip.
y['last_stop_sequence'] = y['trip_id_test'].ne(y['trip_id_compare'])

In [22]:
# Scheduled time from this stop to the following row's stop.
y['duration_td'] = y['next_stop_time_delta'].sub(y['stop_time_delta'])

In [23]:
# Discard each trip's final stop: it has no real successor within the trip.
z = y.loc[~y['last_stop_sequence']].copy()

In [24]:
# Cast next_stop_id (produced by the shift) to integer.
z['next_stop_id'] = z['next_stop_id'].astype(int)

In [25]:
# Edge duration expressed as an integer number of seconds.
z['duration'] = z['duration_td'].dt.total_seconds().astype(int)

In [26]:
# Spot check: edges leaving stop 917 in the selected trips.
z[z.stop_id == 917]

Unnamed: 0,trip_id,route_short_name,route_long_name,shape_id,trip_headsign,stop_sequence,stop_time,stop_time_delta,stop_id,stop_code,stop_name,stop_lat,stop_lon,next_stop_id,next_stop_time_delta,trip_id_test,trip_id_compare,last_stop_sequence,duration_td,duration
1191998,43027266,95,YORK MILLS,888274,WEST - 95 YORK MILLS towards YORK MILLS STATION,48,17:54:42,0 days 17:54:42,917,9083,York Mills Rd at Sandover Dr (1222 York Mills),43.759813,-79.331751,5191,0 days 17:55:15,43027266.0,43027266.0,False,0 days 00:00:33,33
1191398,43027253,95,YORK MILLS,888280,WEST - 95C YORK MILLS towards YORK MILLS STATION,17,17:15:42,0 days 17:15:42,917,9083,York Mills Rd at Sandover Dr (1222 York Mills),43.759813,-79.331751,5191,0 days 17:16:15,43027253.0,43027253.0,False,0 days 00:00:33,33


In [27]:
# Average the duration over all selected trips that share the same
# (stop_id, next_stop_id) edge.
a = z.groupby(['stop_id', 'next_stop_id'])[['duration']].mean()

In [28]:
# Store the mean as whole seconds; astype(int) drops the fractional part
# rather than rounding.
a['duration'] = a['duration'].astype(int)

In [29]:
# Spot check: averaged duration for each edge leaving stop 917.
a.loc[917]

Unnamed: 0_level_0,duration
next_stop_id,Unnamed: 1_level_1
5191,33


In [30]:
# Inspect the edge-weight table indexed by (stop_id, next_stop_id).
a

Unnamed: 0_level_0,Unnamed: 1_level_0,duration
stop_id,next_stop_id,Unnamed: 2_level_1
263,264,76
264,4165,69
265,10375,63
266,7773,47
267,4040,91
...,...,...
24418,24416,154
24419,24420,73
24420,24421,154
24421,24418,176


In [31]:
# Flat version of the weight table: stop_id and next_stop_id become ordinary
# columns. reset_index returns a new frame, so `a` is left untouched.
b = a.reset_index()

In [32]:
# Tally stops by whether they have more than one outgoing edge.
# Result: only 462 stops connect to more than one stop;
# the remaining 8439 stops lead to a single other stop by transit.
b['stop_id'].value_counts().gt(1).value_counts()

False    8439
True      462
Name: stop_id, dtype: int64

In [33]:
# Sorted array of every distinct stop_id that has at least one outgoing edge
# (np.unique both de-duplicates and sorts).
stop_index = np.unique(a.index.get_level_values(0).values)

In [34]:
# Confirm stop_index is a plain NumPy array.
type(stop_index)

numpy.ndarray

In [35]:
# Start the neighbors table with one row per origin stop.
df = pd.DataFrame({'stop_id': stop_index})

In [36]:
# Placeholder columns: `neighbors` will hold a tuple of reachable stops and
# `num` how many there are.
df = df.assign(neighbors=None, num=0)

In [37]:
# Index the table by stop_id so rows can be addressed by stop.
df = df.set_index(keys='stop_id', drop=True)

In [38]:
# Inspect the neighbors table before it is filled in.
df

Unnamed: 0_level_0,neighbors,num
stop_id,Unnamed: 1_level_1,Unnamed: 2_level_1
263,,0
264,,0
265,,0
266,,0
267,,0
...,...,...
24418,,0
24419,,0
24420,,0
24421,,0


In [39]:
# Next-stop ids reachable from the first stop in stop_index.
a.loc[stop_index[0]].index.to_list()

[264]

In [41]:
# Fill in, for every stop, the tuple of stops directly reachable from it and
# how many there are. One grouped pass over the (stop_id, next_stop_id) pairs
# replaces a Python loop that performed a MultiIndex lookup per stop.
outgoing = a.reset_index().groupby('stop_id')['next_stop_id']
# Both assignments align on df's stop_id index. Within each tuple the ids
# keep the row order of `a`.
df['neighbors'] = outgoing.apply(lambda ids: tuple(ids.values))
df['num'] = outgoing.size()

In [42]:
# Stops with more than four outgoing connections.
df[df.num > 4]

Unnamed: 0_level_0,neighbors,num
stop_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3355,"(14173, 14176, 14177, 14178, 14181)",5
5626,"(14306, 14307, 14308, 14310, 14312)",5
9949,"(14227, 14228, 14229, 14231, 14232)",5


In [43]:
# Distribution of the number of outgoing connections per stop.
df.num.value_counts()

1    8439
2     414
3      36
4       9
5       3
Name: num, dtype: int64

In [44]:
# Inspect the filled-in neighbors table.
df

Unnamed: 0_level_0,neighbors,num
stop_id,Unnamed: 1_level_1,Unnamed: 2_level_1
263,"(264,)",1
264,"(4165,)",1
265,"(10375,)",1
266,"(7773,)",1
267,"(4040,)",1
...,...,...
24418,"(24416,)",1
24419,"(24420,)",1
24420,"(24421,)",1
24421,"(24418,)",1


In [45]:
# Final look at the weights table before it is saved.
a

Unnamed: 0_level_0,Unnamed: 1_level_0,duration
stop_id,next_stop_id,Unnamed: 2_level_1
263,264,76
264,4165,69
265,10375,63
266,7773,47
267,4040,91
...,...,...
24418,24416,154
24419,24420,73
24420,24421,154
24421,24418,176


In [46]:
# Persist the (stop_id, next_stop_id) -> duration weight table.
a.to_pickle('data/model/weights_transit.pickle')

In [47]:
# Persist the per-stop neighbors lookup table.
df.to_pickle('data/model/neighbors_transit.pickle')