# **1-on_time.csv**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data Import
1. On-time performance
2. Daily weather data (pending)
3. Traffic/construction data (pending)

In [None]:
DF = pd.read_csv("../../datasets/on_time.csv",low_memory=False)
DF

Unnamed: 0,Row ID,Stop Number,Route Number,Route Name,Route Destination,Day Type,Scheduled Time,Deviation,Location
0,838791619,30893,71,Arlington,Portage via Sinclair,Weekday,2021-08-03T12:57:00,-83,POINT (-97.1487003222228 49.9583003868781)
1,838791621,30894,71,Arlington,Portage via Sinclair,Weekday,2021-08-03T12:57:47,-131,POINT (-97.14473223771209 49.957067630945396)
2,838791623,30884,71,Arlington,Portage via Sinclair,Weekday,2021-08-03T12:58:20,-112,POINT (-97.14591991330629 49.9555484932656)
3,838791626,30377,71,Arlington,Portage via Sinclair,Weekday,2021-08-03T13:00:00,-128,POINT (-97.1452473007769 49.9523715211324)
4,838791628,30378,71,Arlington,Portage via Sinclair,Weekday,2021-08-03T13:01:02,-277,POINT (-97.1409297615775 49.9511780159218)
...,...,...,...,...,...,...,...,...,...
4264663,856899198,40043,47,Transcona - Pembina,Transcona via Regent,Weekday,2021-08-17T10:26:36,-252,POINT (-96.99292649366559 49.89504583500769)
4264664,856899200,40037,47,Transcona - Pembina,Transcona via Regent,Weekday,2021-08-17T10:27:13,-248,POINT (-96.9891662035126 49.8950616196603)
4264665,856899202,40031,47,Transcona - Pembina,Transcona via Regent,Weekday,2021-08-17T10:27:42,-248,POINT (-96.9867783593948 49.895892701349496)
4264666,856899204,40029,47,Transcona - Pembina,Transcona via Regent,Weekday,2021-08-17T10:28:03,-240,POINT (-96.9846475305137 49.8959006485947)


In [None]:
for col in DF.columns:
    print(col,DF[col].dtype)

Row ID int64
Stop Number int64
Route Number object
Route Name object
Route Destination object
Day Type object
Scheduled Time object
Deviation int64
Location object


# Data Preprocessing
1. Remove Row ID
2. Convert Scheduled Time column to DateTime format
3. Convert Location column to tuple
4. Group Route Name and Route Destination into a single column 
5. Add Delay Type (Early, On-time, Short, Medium, Long, Severe, Crippling)

In [None]:
# 1
DF = DF.drop(columns="Row ID")
DF

Unnamed: 0,Stop Number,Route Number,Route Name,Route Destination,Day Type,Scheduled Time,Deviation,Location
0,30893,71,Arlington,Portage via Sinclair,Weekday,2021-08-03T12:57:00,-83,POINT (-97.1487003222228 49.9583003868781)
1,30894,71,Arlington,Portage via Sinclair,Weekday,2021-08-03T12:57:47,-131,POINT (-97.14473223771209 49.957067630945396)
2,30884,71,Arlington,Portage via Sinclair,Weekday,2021-08-03T12:58:20,-112,POINT (-97.14591991330629 49.9555484932656)
3,30377,71,Arlington,Portage via Sinclair,Weekday,2021-08-03T13:00:00,-128,POINT (-97.1452473007769 49.9523715211324)
4,30378,71,Arlington,Portage via Sinclair,Weekday,2021-08-03T13:01:02,-277,POINT (-97.1409297615775 49.9511780159218)
...,...,...,...,...,...,...,...,...
4264663,40043,47,Transcona - Pembina,Transcona via Regent,Weekday,2021-08-17T10:26:36,-252,POINT (-96.99292649366559 49.89504583500769)
4264664,40037,47,Transcona - Pembina,Transcona via Regent,Weekday,2021-08-17T10:27:13,-248,POINT (-96.9891662035126 49.8950616196603)
4264665,40031,47,Transcona - Pembina,Transcona via Regent,Weekday,2021-08-17T10:27:42,-248,POINT (-96.9867783593948 49.895892701349496)
4264666,40029,47,Transcona - Pembina,Transcona via Regent,Weekday,2021-08-17T10:28:03,-240,POINT (-96.9846475305137 49.8959006485947)


In [None]:
# 2
DF['Scheduled Time'] = pd.to_datetime(DF['Scheduled Time'].str.replace('T',' '))
print(DF['Scheduled Time'].dtype)
DF.head()

datetime64[ns]


Unnamed: 0,Stop Number,Route Number,Route Name,Route Destination,Day Type,Scheduled Time,Deviation,Location
0,30893,71,Arlington,Portage via Sinclair,Weekday,2021-08-03 12:57:00,-83,POINT (-97.1487003222228 49.9583003868781)
1,30894,71,Arlington,Portage via Sinclair,Weekday,2021-08-03 12:57:47,-131,POINT (-97.14473223771209 49.957067630945396)
2,30884,71,Arlington,Portage via Sinclair,Weekday,2021-08-03 12:58:20,-112,POINT (-97.14591991330629 49.9555484932656)
3,30377,71,Arlington,Portage via Sinclair,Weekday,2021-08-03 13:00:00,-128,POINT (-97.1452473007769 49.9523715211324)
4,30378,71,Arlington,Portage via Sinclair,Weekday,2021-08-03 13:01:02,-277,POINT (-97.1409297615775 49.9511780159218)


In [None]:
# 3
def __clean_string(df):
    to_repl = ''
    for pat in ["POINT (",")"]:
        df = df.str.replace(pat,to_repl,regex=False)
    df = df.str.split()
    return df

def as_dtype(lst,dtype):
    if isinstance(lst,(list,tuple)):
        return type(lst)([dtype(i) for i in lst])
    else: return lst

DF['Location'] = __clean_string(DF['Location'])
DF.loc[:,['Long','Lat']] = [as_dtype(i,float) for i in DF['Location'].values]
DF = DF.drop(columns="Location")
DF

Unnamed: 0,Stop Number,Route Number,Route Name,Route Destination,Day Type,Scheduled Time,Deviation,Long,Lat
0,30893,71,Arlington,Portage via Sinclair,Weekday,2021-08-03 12:57:00,-83,-97.148700,49.958300
1,30894,71,Arlington,Portage via Sinclair,Weekday,2021-08-03 12:57:47,-131,-97.144732,49.957068
2,30884,71,Arlington,Portage via Sinclair,Weekday,2021-08-03 12:58:20,-112,-97.145920,49.955548
3,30377,71,Arlington,Portage via Sinclair,Weekday,2021-08-03 13:00:00,-128,-97.145247,49.952372
4,30378,71,Arlington,Portage via Sinclair,Weekday,2021-08-03 13:01:02,-277,-97.140930,49.951178
...,...,...,...,...,...,...,...,...,...
4264663,40043,47,Transcona - Pembina,Transcona via Regent,Weekday,2021-08-17 10:26:36,-252,-96.992926,49.895046
4264664,40037,47,Transcona - Pembina,Transcona via Regent,Weekday,2021-08-17 10:27:13,-248,-96.989166,49.895062
4264665,40031,47,Transcona - Pembina,Transcona via Regent,Weekday,2021-08-17 10:27:42,-248,-96.986778,49.895893
4264666,40029,47,Transcona - Pembina,Transcona via Regent,Weekday,2021-08-17 10:28:03,-240,-96.984648,49.895901


In [None]:
# 4
DF['Route'] = tuple([tuple(val) for val in DF[['Route Number','Route Name','Route Destination']].values])
DF = DF.drop(columns=['Route Name','Route Destination'])
DF

Unnamed: 0,Stop Number,Route Number,Day Type,Scheduled Time,Deviation,Long,Lat,Route
0,30893,71,Weekday,2021-08-03 12:57:00,-83,-97.148700,49.958300,"(71, Arlington, Portage via Sinclair)"
1,30894,71,Weekday,2021-08-03 12:57:47,-131,-97.144732,49.957068,"(71, Arlington, Portage via Sinclair)"
2,30884,71,Weekday,2021-08-03 12:58:20,-112,-97.145920,49.955548,"(71, Arlington, Portage via Sinclair)"
3,30377,71,Weekday,2021-08-03 13:00:00,-128,-97.145247,49.952372,"(71, Arlington, Portage via Sinclair)"
4,30378,71,Weekday,2021-08-03 13:01:02,-277,-97.140930,49.951178,"(71, Arlington, Portage via Sinclair)"
...,...,...,...,...,...,...,...,...
4264663,40043,47,Weekday,2021-08-17 10:26:36,-252,-96.992926,49.895046,"(47, Transcona - Pembina, Transcona via Regent)"
4264664,40037,47,Weekday,2021-08-17 10:27:13,-248,-96.989166,49.895062,"(47, Transcona - Pembina, Transcona via Regent)"
4264665,40031,47,Weekday,2021-08-17 10:27:42,-248,-96.986778,49.895893,"(47, Transcona - Pembina, Transcona via Regent)"
4264666,40029,47,Weekday,2021-08-17 10:28:03,-240,-96.984648,49.895901,"(47, Transcona - Pembina, Transcona via Regent)"


Delay types:
- Early: < -2 min
- On-time: Within -2 min - <2 min
- Short delay: 2 min - <10 min,
- Medium delay: 10 min - <30 min,
- Long delay: 30 min - <60 min,
- Severe delay: 60+ min

In [None]:
# 5 
MIN = 60
dev = DF['Deviation'].copy()
delay_types = {
    'early':    (dev < -2*MIN),
    'on-time':  (-2*MIN <= dev) & (dev < 2*MIN),
    'short' :   (2*MIN <= dev)  & (dev < 10*MIN),
    'medium' :  (10*MIN <= dev) & (dev < 30*MIN),
    'long' :    (30*MIN <= dev) & (dev < 60*MIN),
    'severe' :  (dev >= 60*MIN),
}

for delay_type,cond in delay_types.items():
    type_index = dev[cond].dropna().index
    DF.loc[type_index,'Delay Type'] = delay_type
DF


Unnamed: 0,Stop Number,Route Number,Day Type,Scheduled Time,Deviation,Long,Lat,Route,Delay Type
0,30893,71,Weekday,2021-08-03 12:57:00,-83,-97.148700,49.958300,"(71, Arlington, Portage via Sinclair)",on-time
1,30894,71,Weekday,2021-08-03 12:57:47,-131,-97.144732,49.957068,"(71, Arlington, Portage via Sinclair)",early
2,30884,71,Weekday,2021-08-03 12:58:20,-112,-97.145920,49.955548,"(71, Arlington, Portage via Sinclair)",on-time
3,30377,71,Weekday,2021-08-03 13:00:00,-128,-97.145247,49.952372,"(71, Arlington, Portage via Sinclair)",early
4,30378,71,Weekday,2021-08-03 13:01:02,-277,-97.140930,49.951178,"(71, Arlington, Portage via Sinclair)",early
...,...,...,...,...,...,...,...,...,...
4264663,40043,47,Weekday,2021-08-17 10:26:36,-252,-96.992926,49.895046,"(47, Transcona - Pembina, Transcona via Regent)",early
4264664,40037,47,Weekday,2021-08-17 10:27:13,-248,-96.989166,49.895062,"(47, Transcona - Pembina, Transcona via Regent)",early
4264665,40031,47,Weekday,2021-08-17 10:27:42,-248,-96.986778,49.895893,"(47, Transcona - Pembina, Transcona via Regent)",early
4264666,40029,47,Weekday,2021-08-17 10:28:03,-240,-96.984648,49.895901,"(47, Transcona - Pembina, Transcona via Regent)",early


In [None]:
DF.to_csv("clean_datasets/ON_TIME.csv")

# **2-traffic_counts.csv**

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
DF = pd.read_csv("../../datasets/traffic_counts.csv",low_memory=False)
DF

Unnamed: 0,Timestamp,Site,Right,Left,Northbound,Southbound,Eastbound,Westbound,Total,Latitude,Longitude,Location
0,26/06/2020 11:45:00 PM,Pembina And 280 N Of Adamar,118,107,107.0,118.0,,,225,49.826952,-97.152312,POINT (-97.152312 49.826952)
1,26/06/2020 11:45:00 PM,Henderson And 55M S Of Frasers Grove,110,71,110.0,71.0,,,181,49.934985,-97.096166,POINT (-97.096166 49.934985)
2,26/06/2020 11:00:00 PM,McPhillips And 190m South Of Leila,139,149,139.0,149.0,,,288,49.951733,-97.149032,POINT (-97.149032 49.951733)
3,26/06/2020 11:15:00 PM,Nichol And St.Marys,86,83,83.0,86.0,,,169,49.850579,-97.112202,POINT (-97.112202 49.850579)
4,26/06/2020 11:15:00 PM,Inkster And 130W Of Wyatt,126,96,,,126.0,96.0,222,49.945949,-97.187758,POINT (-97.187758 49.945949)
...,...,...,...,...,...,...,...,...,...,...,...,...
502391,07/10/2021 02:00:00 PM,Marion And 260M E Of Dupuy,170,174,,,170.0,174.0,344,49.881900,-97.089342,POINT (-97.089342 49.8819)
502392,07/10/2021 11:15:00 AM,Lagimodiere And 80M N Of Burmac,277,235,235.0,277.0,,,512,49.848695,-97.049665,POINT (-97.049665 49.848695)
502393,07/10/2021 10:00:00 PM,Lagimodiere And 80M N Of Burmac,122,120,120.0,122.0,,,242,49.848695,-97.049665,POINT (-97.049665 49.848695)
502394,07/10/2021 07:15:00 AM,Pembina And 280 N Of Adamar,179,382,382.0,179.0,,,561,49.826952,-97.152312,POINT (-97.152312 49.826952)


In [13]:
for col in DF.columns:
    print(col,DF[col].dtype)

Timestamp object
Site object
Right int64
Left int64
Northbound float64
Southbound float64
Eastbound float64
Westbound float64
Total int64
Latitude float64
Longitude float64
Location object


# Data Preprocessing
1. Convert Timestamp, selecting only periods of interest
2. Analyze relationship between Site x Location and {Right,Left} x {N,S,W,E}
3. Clean columns if possible

In [14]:
#1
DF.loc[:,'Timestamp'] = pd.to_datetime(DF['Timestamp'])
print(DF['Timestamp'].dtype)
START_DATE = pd.to_datetime('Aug 1 2021'); END_DATE = pd.to_datetime('Sep 1 2021')
cond = (START_DATE < DF["Timestamp"]) & (DF["Timestamp"] < END_DATE)
DF = DF[cond].sort_values('Timestamp')
DF

datetime64[ns]


Unnamed: 0,Timestamp,Site,Right,Left,Northbound,Southbound,Eastbound,Westbound,Total,Latitude,Longitude,Location
308118,2021-08-01 20:00:00,McPhillips And 190m South Of Leila,170,152,170.0,152.0,,,322,49.951733,-97.149032,POINT (-97.149032 49.951733)
308119,2021-08-01 02:00:00,McPhillips And 190m South Of Leila,14,15,14.0,15.0,,,29,49.951733,-97.149032,POINT (-97.149032 49.951733)
308120,2021-08-01 19:30:00,McPhillips And 190m South Of Leila,181,161,181.0,161.0,,,342,49.951733,-97.149032,POINT (-97.149032 49.951733)
308121,2021-08-01 11:15:00,McPhillips And 190m South Of Leila,178,262,178.0,262.0,,,440,49.951733,-97.149032,POINT (-97.149032 49.951733)
308122,2021-08-01 04:15:00,Henderson And 55M S Of Frasers Grove,12,12,12.0,12.0,,,24,49.934985,-97.096166,POINT (-97.096166 49.934985)
...,...,...,...,...,...,...,...,...,...,...,...,...
490102,2021-08-24 14:00:00,Marion And 260M E Of Dupuy,180,198,,,180.0,198.0,378,49.881900,-97.089342,POINT (-97.089342 49.8819)
490103,2021-08-24 11:15:00,Lagimodiere And 80M N Of Burmac,282,249,249.0,282.0,,,531,49.848695,-97.049665,POINT (-97.049665 49.848695)
490104,2021-08-24 22:00:00,Lagimodiere And 80M N Of Burmac,171,127,127.0,171.0,,,298,49.848695,-97.049665,POINT (-97.049665 49.848695)
490105,2021-08-24 07:15:00,Pembina And 280 N Of Adamar,131,326,326.0,131.0,,,457,49.826952,-97.152312,POINT (-97.152312 49.826952)


In [15]:
sites = DF.loc[:,"Site"].str.split().values
DF.loc[:,["Street","Near"]] = [[l[0],l[-1]] for l in sites]
DF

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


Unnamed: 0,Timestamp,Site,Right,Left,Northbound,Southbound,Eastbound,Westbound,Total,Latitude,Longitude,Location,Street,Near
308118,2021-08-01 20:00:00,McPhillips And 190m South Of Leila,170,152,170.0,152.0,,,322,49.951733,-97.149032,POINT (-97.149032 49.951733),McPhillips,Leila
308119,2021-08-01 02:00:00,McPhillips And 190m South Of Leila,14,15,14.0,15.0,,,29,49.951733,-97.149032,POINT (-97.149032 49.951733),McPhillips,Leila
308120,2021-08-01 19:30:00,McPhillips And 190m South Of Leila,181,161,181.0,161.0,,,342,49.951733,-97.149032,POINT (-97.149032 49.951733),McPhillips,Leila
308121,2021-08-01 11:15:00,McPhillips And 190m South Of Leila,178,262,178.0,262.0,,,440,49.951733,-97.149032,POINT (-97.149032 49.951733),McPhillips,Leila
308122,2021-08-01 04:15:00,Henderson And 55M S Of Frasers Grove,12,12,12.0,12.0,,,24,49.934985,-97.096166,POINT (-97.096166 49.934985),Henderson,Grove
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490102,2021-08-24 14:00:00,Marion And 260M E Of Dupuy,180,198,,,180.0,198.0,378,49.881900,-97.089342,POINT (-97.089342 49.8819),Marion,Dupuy
490103,2021-08-24 11:15:00,Lagimodiere And 80M N Of Burmac,282,249,249.0,282.0,,,531,49.848695,-97.049665,POINT (-97.049665 49.848695),Lagimodiere,Burmac
490104,2021-08-24 22:00:00,Lagimodiere And 80M N Of Burmac,171,127,127.0,171.0,,,298,49.848695,-97.049665,POINT (-97.049665 49.848695),Lagimodiere,Burmac
490105,2021-08-24 07:15:00,Pembina And 280 N Of Adamar,131,326,326.0,131.0,,,457,49.826952,-97.152312,POINT (-97.152312 49.826952),Pembina,Adamar


In [16]:
DF = DF.drop(columns=["Right","Left","Location"]).rename(columns={"Latitude":"Lat","Longitude":"Long"})
DF.to_csv("clean_datasets/TRAFFIC_COUNTS.csv")
DF

Unnamed: 0,Timestamp,Site,Northbound,Southbound,Eastbound,Westbound,Total,Lat,Long,Street,Near
308118,2021-08-01 20:00:00,McPhillips And 190m South Of Leila,170.0,152.0,,,322,49.951733,-97.149032,McPhillips,Leila
308119,2021-08-01 02:00:00,McPhillips And 190m South Of Leila,14.0,15.0,,,29,49.951733,-97.149032,McPhillips,Leila
308120,2021-08-01 19:30:00,McPhillips And 190m South Of Leila,181.0,161.0,,,342,49.951733,-97.149032,McPhillips,Leila
308121,2021-08-01 11:15:00,McPhillips And 190m South Of Leila,178.0,262.0,,,440,49.951733,-97.149032,McPhillips,Leila
308122,2021-08-01 04:15:00,Henderson And 55M S Of Frasers Grove,12.0,12.0,,,24,49.934985,-97.096166,Henderson,Grove
...,...,...,...,...,...,...,...,...,...,...,...
490102,2021-08-24 14:00:00,Marion And 260M E Of Dupuy,,,180.0,198.0,378,49.881900,-97.089342,Marion,Dupuy
490103,2021-08-24 11:15:00,Lagimodiere And 80M N Of Burmac,249.0,282.0,,,531,49.848695,-97.049665,Lagimodiere,Burmac
490104,2021-08-24 22:00:00,Lagimodiere And 80M N Of Burmac,127.0,171.0,,,298,49.848695,-97.049665,Lagimodiere,Burmac
490105,2021-08-24 07:15:00,Pembina And 280 N Of Adamar,326.0,131.0,,,457,49.826952,-97.152312,Pembina,Adamar


In [17]:
#2
DF_SITES = DF[["Site","Street","Near","Lat","Long"]].drop_duplicates().set_index("Site")
DF_SITES.to_csv("clean_datasets/SITES.csv")
DF_SITES

Unnamed: 0_level_0,Street,Near,Lat,Long
Site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
McPhillips And 190m South Of Leila,McPhillips,Leila,49.951733,-97.149032
Henderson And 55M S Of Frasers Grove,Henderson,Grove,49.934985,-97.096166
Pembina And 280 N Of Adamar,Pembina,Adamar,49.826952,-97.152312
Inkster And 130W Of Wyatt,Inkster,Wyatt,49.945949,-97.187758
Nichol And St.Marys,Nichol,St.Marys,49.850579,-97.112202
Lagimodiere And 80M N Of Burmac,Lagimodiere,Burmac,49.848695,-97.049665
Disraeli Bridge,Disraeli,Bridge,49.906744,-97.123028
Marion And 260M E Of Dupuy,Marion,Dupuy,49.8819,-97.089342


# **3-stops.csv**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
DF = pd.read_csv("../../datasets/stops.csv",low_memory=False)
DF

Unnamed: 0,stop_id,stop_code,stop_name,stop_lat,stop_lon,stop_url
0,10001,10001,Southbound Osborne at Mulvey,49.871261,-97.139518,http://www.winnipegtransit.com/stops/10001
1,10002,10002,Southbound Osborne at Woodward,49.868819,-97.137553,http://www.winnipegtransit.com/stops/10002
2,10003,10003,Southbound Osborne at Brandon,49.867880,-97.136795,http://www.winnipegtransit.com/stops/10003
3,10004,10004,Southbound Osborne at Hethrington,49.866522,-97.135707,http://www.winnipegtransit.com/stops/10004
4,10005,10005,Southbound Osborne at Morley,49.865164,-97.134604,http://www.winnipegtransit.com/stops/10005
...,...,...,...,...,...,...
5151,62018,62018,Eastbound Parker at Beaumont,49.847806,-97.164596,http://www.winnipegtransit.com/stops/62018
5152,62021,62021,Northbound Eaglewood at Longspur,49.780139,-97.196271,http://www.winnipegtransit.com/stops/62021
5153,62022,62022,Southbound Eaglewood at Longspur,49.780498,-97.196785,http://www.winnipegtransit.com/stops/62022
5154,62023,62023,Westbound Bison at Appleford,49.791002,-97.209359,http://www.winnipegtransit.com/stops/62023


In [None]:
for col in DF.columns:
    print(col,DF[col].dtype)

stop_id int64
stop_code int64
stop_name object
stop_lat float64
stop_lon float64
stop_url object


In [None]:
df = DF[DF["stop_name"].str.contains("Station")][["stop_id","stop_name"]]
df.head(25)

Unnamed: 0,stop_id,stop_name
33,10038,Northbound Daly at Morley (95 to Riverview via...
557,10625,Northbound Main at Broadway (Union Station)
572,10641,Southbound Main at Broadway (Union Station)
719,10831,Northbound Balmoral at Balmoral Station (BLUE)
729,10844,"Northbound Balmoral at Balmoral Station (48, 49)"
730,10845,Northbound Balmoral at Balmoral Station (46)
731,10846,Northbound Balmoral at Balmoral Station (42)
745,10910,Northbound Osborne at Osborne Station
746,10911,Southbound Osborne at Osborne Station
754,11027,Southbound Southwest Transitway at Harkness St...


In [None]:
[[l[0].split()[0]," ".join(l[0].split()[1:]),l[1]] for l in df["stop_name"].str.split(" at ")]

[['Northbound', 'Daly', 'Morley (95 to Riverview via Ft Rouge Station)'],
 ['Northbound', 'Main', 'Broadway (Union Station)'],
 ['Southbound', 'Main', 'Broadway (Union Station)'],
 ['Northbound', 'Balmoral', 'Balmoral Station (BLUE)'],
 ['Northbound', 'Balmoral', 'Balmoral Station (48, 49)'],
 ['Northbound', 'Balmoral', 'Balmoral Station (46)'],
 ['Northbound', 'Balmoral', 'Balmoral Station (42)'],
 ['Northbound', 'Osborne', 'Osborne Station'],
 ['Southbound', 'Osborne', 'Osborne Station'],
 ['Southbound', 'Southwest Transitway', 'Harkness Station'],
 ['Northbound', 'Southwest Transitway', 'Harkness Station'],
 ['Southbound', 'Southwest Transitway', 'Osborne Station'],
 ['Northbound', 'Southwest Transitway', 'Osborne Station'],
 ['Southbound', 'Southwest Transitway', 'Fort Rouge Station'],
 ['Northbound', 'Southwest Transitway', 'Fort Rouge Station'],
 ['Southbound', 'Southwest Transitway', 'Jubilee Station'],
 ['Northbound', 'Southwest Transitway', 'Jubilee Station'],
 ['Southbound', 

In [None]:
DF.loc[:,["Direction","Street","At"]] = [[l[0].split()[0]," ".join(l[0].split()[1:]),l[1]] for l in DF["stop_name"].str.split(" at ")]
DF = DF.rename(columns={"stop_id":"Stop Number","stop_lat":"Lat","stop_lon":"Long","stop_name":"Stop Name"})
DF = DF.drop(columns=["stop_code","stop_url"])
DF

Unnamed: 0,Stop Number,Stop Name,Lat,Long,Direction,Street,At
0,10001,Southbound Osborne at Mulvey,49.871261,-97.139518,Southbound,Osborne,Mulvey
1,10002,Southbound Osborne at Woodward,49.868819,-97.137553,Southbound,Osborne,Woodward
2,10003,Southbound Osborne at Brandon,49.867880,-97.136795,Southbound,Osborne,Brandon
3,10004,Southbound Osborne at Hethrington,49.866522,-97.135707,Southbound,Osborne,Hethrington
4,10005,Southbound Osborne at Morley,49.865164,-97.134604,Southbound,Osborne,Morley
...,...,...,...,...,...,...,...
5151,62018,Eastbound Parker at Beaumont,49.847806,-97.164596,Eastbound,Parker,Beaumont
5152,62021,Northbound Eaglewood at Longspur,49.780139,-97.196271,Northbound,Eaglewood,Longspur
5153,62022,Southbound Eaglewood at Longspur,49.780498,-97.196785,Southbound,Eaglewood,Longspur
5154,62023,Westbound Bison at Appleford,49.791002,-97.209359,Westbound,Bison,Appleford


In [None]:
DF.to_csv("clean_datasets/STOPS.csv")

# **4-closure.csv**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
DF = pd.read_csv("../../datasets/lane_closure.csv",low_memory=False)
DF.head()

Unnamed: 0,Primary Street,Cross Street,Boundaries,Direction,Date Closed - From,Date Closed - To,Traffic Effect,Organization,Time Closed - From,Time Closed - To,...,Inserted Date,Geometry Id,Status,Complete Closure,KML,Latitude,Longitude,X,Y,Geometry
0,Bryce St,River Av,River Av to End,Eastbound & Westbound,August 16 2021,October 16 2021,"Eastbound curb lane, closed for building const...",Zelko,12:00 AM,12:00 AM,...,08/11/2021 06:37:43 PM,552963,Current,No,<LineString><extrude>0</extrude><tessellate>0<...,49.879699,-97.142781,633429.365245,5526909.0,MULTILINESTRING ((-97.142387643857 49.87915381...
1,Lilac St,Carter Av,Carter Av to Weatherdon Av,Southbound,June 14 2021,October 29 2021,"East side sidewalk, closed for building constr...",Globeland Construction Ltd,12:00 AM,12:00 AM,...,06/09/2021 10:35:09 AM,550863,Current,No,<LineString><extrude>0</extrude><tessellate>0<...,49.862454,-97.151445,632854.263781,5524977.0,MULTILINESTRING ((-97.151173092203 49.86211919...
2,Princess St,Pacific Av,Pacific Av to Rupert Av,Southbound,August 14 2020,October 16 2021,"West side sidewalk, covered by pedestrian scaf...",Hofer Const. Ltd.,12:00 AM,12:00 AM,...,05/21/2021 12:12:59 PM,550250,Current,No,<LineString><extrude>0</extrude><tessellate>0<...,49.901786,-97.139766,633584.982267,5529370.0,MULTILINESTRING ((-97.139915854251 49.90158994...
3,Henry Av,Austin St,Austin St to End,Westbound,May 10 2021,November 30 2021,"Westbound right turnaround lane, closed for st...",Bockstael Construction Limited,12:00 AM,11:00 PM,...,05/20/2021 09:32:11 AM,550156,Current,No,<LineString><extrude>0</extrude><tessellate>0<...,49.90334,-97.134152,633983.759759,5529553.0,MULTILINESTRING ((-97.13466284795 49.903503694...
4,Grandin St,Tache Av,Tache Av to St Joseph St,Eastbound,May 31 2021,January 13 2022,"South side sidewalk, closed for building const...",Qualico,12:00 AM,12:00 AM,...,05/26/2021 03:38:27 PM,550361,Current,No,<LineString><extrude>0</extrude><tessellate>0<...,49.896013,-97.126315,634566.885434,5528752.0,MULTILINESTRING ((-97.127616387474 49.89567885...


In [None]:
for col in DF.columns:
    print(col,DF[col].dtype)

Primary Street object
Cross Street object
Boundaries object
Direction object
Date Closed - From object
Date Closed - To object
Traffic Effect object
Organization object
Time Closed - From object
Time Closed - To object
Lane Closure ID int64
Modified Date object
Inserted Date object
Geometry Id int64
Status object
Complete Closure object
KML object
Latitude float64
Longitude float64
X float64
Y float64
Geometry object


In [23]:
DF = pd.read_csv("../../datasets/lane_closure.csv",low_memory=False)
DF = DF.rename(columns={"Primary Street":"Street","Cross Street":"At","Latitude":"Lat","Longitude":"Long"})
DF.loc[:,"Street"] = [" ".join(l[:-1]) for l in DF["Street"].str.split()]
DF.loc[:,"At"] = [" ".join(l[:-1]) for l in DF["At"].str.split()]
DF.loc[:,"Boundaries"] = DF["Boundaries"].str.split(" to ")
DF.loc[:,"Direction"] = DF["Direction"].str.split(" & ")
__clean_string = lambda l: [[[(float(j.split()[1]),float(j.split()[0])) for j in s.split(", ")] for s in m] for m in l.values]
DF.loc[:,"Geometry"] = __clean_string(DF["Geometry"].str.replace("MULTILINESTRING \(\(|\)\)","",regex=True).replace().str.split("\), \("))
# DF["Geometry"] = DF["Geometry"].str.strip("MULTILINESTRING \(|\)").str.split("\), \(")
# DF["Geometry"] = [[to_tuple_l(s,rev=True,dtype=float,strip_l="(|)",split_l=", ",split=" ") for s in m] for m in DF["Geometry"].values]
DF["Date Closed - From"] = pd.to_datetime(DF["Date Closed - From"]) 
DF["Date Closed - To"] = pd.to_datetime(DF["Date Closed - To"]) 

def _num_points(m): 
    set_i = set()
    for l in m: set_i |= set(l)
    return set_i 
DF["Num_Segments"] = [len(i) for i in DF["Geometry"].values]
DF["Num_Points"] = [len(_num_points(i)) for i in DF["Geometry"].values]

DF = DF.drop(columns=["Traffic Effect","Organization","Lane Closure ID","Modified Date","Inserted Date","Geometry Id","Status","KML","X","Y"])
DF

  return asarray(a).ndim


Unnamed: 0,Street,At,Boundaries,Direction,Date Closed - From,Date Closed - To,Time Closed - From,Time Closed - To,Complete Closure,Lat,Long,Geometry,Num_Segments,Num_Points
0,Bryce,River,"[River Av, End]","[Eastbound, Westbound]",2021-08-16,2021-10-16,12:00 AM,12:00 AM,No,49.879699,-97.142781,"[[(49.879153816388, -97.142387643857), (49.880...",1,3
1,Lilac,Carter,"[Carter Av, Weatherdon Av]",[Southbound],2021-06-14,2021-10-29,12:00 AM,12:00 AM,No,49.862454,-97.151445,"[[(49.862119196838, -97.151173092203), (49.862...",1,5
2,Princess,Pacific,"[Pacific Av, Rupert Av]",[Southbound],2020-08-14,2021-10-16,12:00 AM,12:00 AM,No,49.901786,-97.139766,"[[(49.90158994935, -97.139915854251), (49.9019...",1,3
3,Henry,Austin,"[Austin St, End]",[Westbound],2021-05-10,2021-11-30,12:00 AM,11:00 PM,No,49.903340,-97.134152,"[[(49.903503694415, -97.13466284795), (49.9033...",1,4
4,Grandin,Tache,"[Tache Av, St Joseph St]",[Eastbound],2021-05-31,2022-01-13,12:00 AM,12:00 AM,No,49.896013,-97.126315,"[[(49.895678857718, -97.127616387474), (49.896...",1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425,Sherbrook,Logan,"[Logan Av, William Av]",[Southbound],2021-09-20,2021-11-01,12:00 AM,12:00 AM,No,49.906584,-97.153705,"[[(49.904179261579, -97.155653416614), (49.904...",1,30
426,Gull Lake,Markham,"[Markham Rd, East Lake Dr]","[Northbound, Southbound]",2021-09-17,2021-10-08,12:00 AM,12:00 AM,No,49.809311,-97.158976,"[[(49.80730904568, -97.158914003228), (49.8081...",1,39
427,Wardlaw,Osborne,"[Osborne St, Scott St]",[Eastbound],2021-09-20,2021-10-15,12:00 AM,12:00 AM,No,49.877534,-97.142232,"[[(49.876961712509, -97.143938755153), (49.877...",1,5
428,Scotsborough,Beckinsale,"[Beckinsale By, Novavista Dr]",[Southbound],2021-09-23,2021-10-13,12:00 AM,12:00 AM,No,49.820905,-97.107388,"[[(49.820524519173, -97.107184511392), (49.820...",1,4


In [None]:
START_DATE = pd.to_datetime('Aug 1 2021'); END_DATE = pd.to_datetime('Sep 1 2021')
cond = (START_DATE > DF["Date Closed - From"]) & (DF["Date Closed - To"] >= END_DATE)
DF_AUG = DF[cond]
DF_AUG

Unnamed: 0,Street,At,Boundaries,Direction,Date Closed - From,Date Closed - To,Time Closed - From,Time Closed - To,Complete Closure,Lat,Long,Geometry,Num_Segments,Num_Points
1,Lilac,Carter,"[Carter Av, Weatherdon Av]",[Southbound],2021-06-14,2021-10-29,12:00 AM,12:00 AM,No,49.862454,-97.151445,"[[(49.862119196838, -97.151173092203), (49.862...",1,5
2,Princess,Pacific,"[Pacific Av, Rupert Av]",[Southbound],2020-08-14,2021-10-16,12:00 AM,12:00 AM,No,49.901786,-97.139766,"[[(49.90158994935, -97.139915854251), (49.9019...",1,3
3,Henry,Austin,"[Austin St, End]",[Westbound],2021-05-10,2021-11-30,12:00 AM,11:00 PM,No,49.903340,-97.134152,"[[(49.903503694415, -97.13466284795), (49.9033...",1,4
4,Grandin,Tache,"[Tache Av, St Joseph St]",[Eastbound],2021-05-31,2022-01-13,12:00 AM,12:00 AM,No,49.896013,-97.126315,"[[(49.895678857718, -97.127616387474), (49.896...",1,2
5,Edmonton,St Mary,"[St Mary Av, Graham Av]",[Northbound],2021-03-07,2021-10-22,11:00 PM,11:59 PM,No,49.890486,-97.146188,"[[(49.889751350198, -97.145788366233), (49.889...",1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367,University,Dafoe,"[Dafoe Rd, Markham Rd]","[Northbound, Southbound]",2021-07-23,2021-10-29,12:00 AM,12:00 AM,Yes,49.808929,-97.142052,"[[(49.805997754019, -97.138668665141), (49.807...",1,28
370,Charlotte,,[At End],"[Eastbound, Westbound]",2021-06-28,2021-11-01,12:00 AM,12:00 AM,Yes,49.882473,-97.096175,"[[(49.882472182984, -97.096611651835), (49.882...",1,2
385,Keewatin,Logan,"[Logan Av, Selkirk Av]","[Northbound, Southbound]",2021-07-27,2021-10-22,12:00 AM,12:00 AM,No,49.926892,-97.195248,"[[(49.921824144979, -97.195915153849), (49.921...",3,61
397,Munroe,Henderson,"[Henderson Hw, Golspie St]",[Westbound],2021-05-03,2021-10-08,12:00 AM,12:00 AM,Yes,49.919450,-97.103062,"[[(49.92159361799, -97.109515392943), (49.9214...",1,7


In [None]:
# DF.to_csv("clean_datasets/LANE_CLOSURE.csv")
DF_AUG.to_csv("clean_datasets/LANE_CLOSURE_AUG_2021.csv")

# **5.road_network**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
DF = pd.read_csv("../../datasets/road_network.csv",low_memory=False)
for col in DF.columns:
    print(col,DF[col].dtype)
DF

ID int64
Block ID float64
Street Name object
Street Type object
Street Direction object
Street Class object
Street Type Secondary object
Street Qualifier object
Street Qualifier Secondary object
Address From Left float64
Address To Left float64
Address From Right float64
Address To Right float64
Number of Lanes float64
Oneway int64
Has Reversed Geometry float64
Speed Limit int64
Speed Limit Description object
Speed Limit Jurisdiction object
Location object


Unnamed: 0,ID,Block ID,Street Name,Street Type,Street Direction,Street Class,Street Type Secondary,Street Qualifier,Street Qualifier Secondary,Address From Left,Address To Left,Address From Right,Address To Right,Number of Lanes,Oneway,Has Reversed Geometry,Speed Limit,Speed Limit Description,Speed Limit Jurisdiction,Location
0,23480,1873.0,Sifton,Rd,,Arterial,Street,A,,,,,,2.0,0,0.0,50,Speed Limits are less than or equal to 50 kph,City_Transportation,MULTILINESTRING ((-97.143794801994 49.81055738...
1,6648,23974.0,Valde,Ave,,Local,Street,A,,19.0,19.0,22.0,50.0,2.0,0,0.0,50,Speed Limits are less than or equal to 50 kph,City_Transportation,MULTILINESTRING ((-97.003620088172 49.91094071...
2,15347,23779.0,Lagimodiere,Blvd,,Arterial,Street,A,,,,,,2.0,1,0.0,80,Posted 80 kph speed limit,City,MULTILINESTRING ((-97.066599484474 49.86715578...
3,30035,70342.0,Sage Creek,Blvd,,Collector,Street,B,,,,0.0,0.0,2.0,2,0.0,50,Speed Limits are less than or equal to 50 kph,City_Transportation,MULTILINESTRING ((-97.049568380871 49.83374749...
4,19001,24031.0,Point,Rd,,Collector,Street,A,,,,,,2.0,0,0.0,50,Speed Limits are less than or equal to 50 kph,City_Transportation,MULTILINESTRING ((-97.145356293446 49.84697663...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28957,29646,17669.0,Newton,Ave,,Local,Street,A,,258.0,328.0,255.0,325.0,2.0,0,0.0,50,Speed Limits are less than or equal to 50 kph,City_Transportation,MULTILINESTRING ((-97.11556803066 49.943945001...
28958,8062,23040.0,Main,St,,Arterial,Street,A,,782.0,782.0,,,3.0,1,0.0,50,Speed Limits are less than or equal to 50 kph,City_Transportation,MULTILINESTRING ((-97.134404708765 49.90567402...
28959,6592,18560.0,Digby,Ave,,Local,Street,A,,15.0,27.0,,,2.0,0,0.0,50,Speed Limits are less than or equal to 50 kph,City_Transportation,MULTILINESTRING ((-97.125132844509 49.91383316...
28960,8584,24835.0,Disraeli,Fwy,,Arterial,Street,B,,,,,,3.0,2,0.0,60,Posted 60 kph speed limit,City,MULTILINESTRING ((-97.128799417643 49.90276360...


In [3]:
DF = DF.loc[:,["Block ID","Street Name","Street Type","Number of Lanes","Oneway","Speed Limit","Location"]].rename(columns={"Street Name":"Street"})
__clean_string = lambda l: np.array([[(float(j.split()[1]),float(j.split()[0])) for j in m] for m in l],dtype=object)
DF.loc[:,"Location"] = __clean_string(DF["Location"].str.replace("MULTILINESTRING \(\(|\)\)","",regex=True).str.split(", "))
DF.loc[:,"Num_Points"] = [len(i) for i in DF["Location"].values]
DF

Unnamed: 0,Block ID,Street,Street Type,Number of Lanes,Oneway,Speed Limit,Location,Num_Points
0,1873.0,Sifton,Rd,2.0,0,50,"[(49.810557383041, -97.143794801994), (49.8105...",2
1,23974.0,Valde,Ave,2.0,0,50,"[(49.910940712709, -97.003620088172), (49.9109...",2
2,23779.0,Lagimodiere,Blvd,2.0,1,80,"[(49.8671557845, -97.066599484474), (49.869726...",2
3,70342.0,Sage Creek,Blvd,2.0,2,50,"[(49.833747499855, -97.049568380871), (49.8337...",2
4,24031.0,Point,Rd,2.0,0,50,"[(49.846976634414, -97.145356293446), (49.8472...",2
...,...,...,...,...,...,...,...,...
28957,17669.0,Newton,Ave,2.0,0,50,"[(49.943945001092, -97.11556803066), (49.94406...",21
28958,23040.0,Main,St,3.0,1,50,"[(49.905674025615, -97.134404708765), (49.9062...",5
28959,18560.0,Digby,Ave,2.0,0,50,"[(49.913833165245, -97.125132844509), (49.9138...",34
28960,24835.0,Disraeli,Fwy,3.0,2,60,"[(49.902763600349, -97.128799417643), (49.9028...",14


In [4]:
DF.to_csv("clean_datasets/ROAD_NETWORK.csv")

In [5]:
# [" ".join(map(str,val)) for val in DF[["Street Name","Street Type","Street Direction"]].fillna("").values]