In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import myutils as u
import math

Outline
1. Datasets Used
    - Bus OTP data & Bus stops information
        - Bus stop OTP: {Time, Exact Location, Street, Direction, Bus Lateness}
    - Traffic count data & Traffic count sites information
        - Features: {Time*, Exact Location, Street, Directional traffic volume, Aggregrate traffic volume}

2. Goal:
    - Link Traffic volume to Bus Lateness

3. Specifics:
- Time Period: 1 month data (Aug/Sep/Oct 2021)
- Affected Bus stops:
    - Direct zone: Bus stops on the same street and direction, within 100m
    - Indirect zone: Bus stops not categorized as in Direct Zone but within 100m
- Bus lateness entries:
    - All bus time entries within time interval [T-15min,T] 

4. Hypothesis:
-  Traffic count affects bus lateness
    - Main Hypothesis: Bus on the same street and direction will be directly affected
    - Traffic spillover effects: Other streets in the indirect zone will also experience similar traffic volume and will be impacted too  
- Other potentially relevant factors:
    - Distance might matter: The farther the bus stop is away from a count, the weaker the relationship might be (People turning to other streets)?
    - Number of lanes: (Not accounted for in the original data) More lanes per traffic might mean less lateness?
    - Directions: Bus stops towards the site vs bus stops away from the site

5. Approach
- Data Preparation:
    - Match: Street, Direction
    - Approximate: Time, Location (within a radius)
    - Direct factors: Traffic volume - Bus Lateness
    - Other factors: Distance (stop - count site), Aggregate traffic volume



In [2]:
# SITES = u.import_data("SITES")
# pt1 = tuple(SITES.iloc[0][["Lat","Long"]])
# pt2 = tuple(SITES.iloc[1][["Lat","Long"]])
# d = u.distance(pt1,pt2)
# print(pt1,pt2,d)

<!-- Column	How to prepare	Source
Bus stops	Select bus stops in the radius of the traffic count (radius=500)	STOPS.csv
Distance	Use lat, long to calculate distance to the site	STOPS.csv
Same Street	Compare street name of stop vs traffic count site	STOPS.csv
Same Direction	Traffic volume in the direction of the stop	TRAFFIC COUNT.csv
Opposite	Traffic volume in the other direction	TRAFFIC COUNT.csv
Total	Total volume	TRAFFIC COUNT.csv
Mean OTP	Collect Bus OTP in the time frame, calculate the mean	ON_TIME.csv
Interval	Time -> converted to float hh.mm	ON_TIME.csv
Num Lanes	Match  number of lanes based on street name (optional). Might be a bit complicated since different part of the same streets may have different num lanes	ROAD NETWORKS.csv -->

Location: Disraeli Bridge
Time: 4:45 - 5 pm Aug 20 2021
Bus stops within Radius = 500m

In [2]:
ALL_SITES = u.import_data("SITES")
ALL_STOPS = u.import_data("STOPS")
ALL_TRAFFIC_COUNTS = u.import_data("TRAFFIC_COUNTS")
ALL_LANE_CLOSURE = u.import_data("LANE_CLOSURE")
ALL_ON_TIME = u.import_data("ON_TIME")
ALL_ROAD_NETWORK = u.import_data("ROAD_NETWORK")

In [3]:
TC_SITE = ALL_SITES[ALL_SITES["Street"] == "McPhillips"]
MIN = pd.to_timedelta("1 min")
time = pd.to_datetime("2021-08-20 17:00:00")
TRAFFIC_COUNTS = ALL_TRAFFIC_COUNTS[(ALL_TRAFFIC_COUNTS["Timestamp"]==time) & (ALL_TRAFFIC_COUNTS["Street"]==TC_SITE["Street"].item())]
TRAFFIC_COUNTS.loc[:,"fmt_Timestamp"] = [u.fmt_timestamp(i) for i in TRAFFIC_COUNTS["Timestamp"]]

radius = 500 # meters
site_coords = TC_SITE[["Lat","Long"]].values[0]
stop_coords = ALL_STOPS[["Stop Number","Lat","Long"]].set_index("Stop Number").values
distances = u.distance_within(site_coords,stop_coords,radius,key=None)
AFF_STOPS = ALL_STOPS.iloc[distances["index"]]  
AFF_STOPS

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


Unnamed: 0,Stop Number,Stop Name,Lat,Long,Direction,Street,At
1483,30199,Westbound Leila at Watson,49.95498,-97.152048,Westbound,Leila,Watson
1628,30353,Northbound McPhillips at Leila North,49.954245,-97.146611,Northbound,McPhillips,Leila North
1637,30362,Westbound Leila at Garden Park West,49.95232,-97.143785,Westbound,Leila,Garden Park West
1638,30363,Westbound Leila at McPhillips East,49.95302,-97.145947,Westbound,Leila,McPhillips East
1639,30364,Southbound Garden City Terminal at Garden City...,49.952219,-97.145322,Southbound,Garden City Terminal,Garden City Centre (71 via McPhillips)
1640,30365,Westbound Leila at McPhillips West,49.953873,-97.148619,Westbound,Leila,McPhillips West
1641,30366,Westbound Leila at Watson East (Seven Oaks Hos...,49.954478,-97.150478,Westbound,Leila,Watson East (Seven Oaks Hospital)
1646,30371,Eastbound Leila at Leila Loop,49.954954,-97.15322,Eastbound,Leila,Leila Loop
1647,30372,Eastbound Leila at Watson East,49.953885,-97.149413,Eastbound,Leila,Watson East
1649,30374,Southbound McPhillips at Leila,49.952922,-97.148237,Southbound,McPhillips,Leila


In [4]:
ON_TIME = ALL_ON_TIME[(ALL_ON_TIME["Scheduled Time"] <= time) & (ALL_ON_TIME["Scheduled Time"] > time -15*MIN)]

DF = AFF_STOPS.loc[:,["Stop Number"]]
DF.loc[:,"Distance"] = distances["distance"]
DF.loc[:,"Street"] = (AFF_STOPS["Street"] == TRAFFIC_COUNTS["Street"].item()).replace({True:1,False:0})
DF.loc[(AFF_STOPS["Street"] != TC_SITE["Street"].item()),"Directional"] = 0
for dir in ["Northbound","Southbound","Eastbound","Westbound"]:
    DF.loc[(AFF_STOPS["Street"] == TC_SITE["Street"].item()) & (AFF_STOPS["Direction"] == dir),"Directional"] = TRAFFIC_COUNTS[dir].item()
DF.loc[:,"Total"] = TRAFFIC_COUNTS["Total"].item()
DF.loc[:,["Arrivals","Average OTP"]] = [(len(ON_TIME.loc[ON_TIME["Stop Number"]==i,"Deviation"].values),ON_TIME.loc[ON_TIME["Stop Number"]==i,"Deviation"].values.mean()) for i in AFF_STOPS["Stop Number"]]

DF.sort_values("Distance")

  DF.loc[:,["Arrivals","Average OTP"]] = [(len(ON_TIME.loc[ON_TIME["Stop Number"]==i,"Deviation"].values),ON_TIME.loc[ON_TIME["Stop Number"]==i,"Deviation"].values.mean()) for i in AFF_STOPS["Stop Number"]]
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,Stop Number,Distance,Street,Directional,Total,Arrivals,Average OTP
1661,30386,62.842201,1,375.0,687,3,98.333333
1650,30375,130.469948,1,375.0,687,3,69.666667
1649,30374,144.105377,1,312.0,687,2,-152.5
1668,30393,169.37511,1,312.0,687,2,-137.0
1640,30365,240.027105,0,0.0,687,1,-33.0
1647,30372,241.157455,0,0.0,687,1,-101.0
1654,30379,246.263428,0,0.0,687,4,-22.75
2025,30779,252.923932,0,0.0,687,1,-114.0
1638,30363,263.378003,0,0.0,687,3,-80.333333
2123,30890,263.809125,0,0.0,687,2,-45.0


Aggregating data for 1hr or more instead of just every 15min

In [19]:
TC_SITE = ALL_SITES[ALL_SITES["Street"] == "McPhillips"]
MIN = pd.to_timedelta("1 min")
DAY = pd.to_timedelta("1 day")
time = pd.to_datetime("2021-08-20")
TRAFFIC_COUNTS = ALL_TRAFFIC_COUNTS[(ALL_TRAFFIC_COUNTS["Timestamp"]>=time) & (ALL_TRAFFIC_COUNTS["Timestamp"]<time+1*DAY) & (ALL_TRAFFIC_COUNTS["Street"]==TC_SITE["Street"].item())]
TRAFFIC_COUNTS.loc[:,"Time Interval"] = [u.fmt_timestamp(i) for i in TRAFFIC_COUNTS["Timestamp"]]
TRAFFIC_COUNTS.sort_values("Time Interval")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


Unnamed: 0,Timestamp,Site,Northbound,Southbound,Eastbound,Westbound,Total,Lat,Long,Street,Near,Time Interval
17688,2021-08-20 00:00:00,McPhillips And 190m South Of Leila,105.0,45.0,0.0,0.0,150,49.951733,-97.149032,McPhillips,Leila,0.00
17960,2021-08-20 00:15:00,McPhillips And 190m South Of Leila,78.0,64.0,0.0,0.0,142,49.951733,-97.149032,McPhillips,Leila,0.25
17872,2021-08-20 00:30:00,McPhillips And 190m South Of Leila,50.0,43.0,0.0,0.0,93,49.951733,-97.149032,McPhillips,Leila,0.50
17773,2021-08-20 00:45:00,McPhillips And 190m South Of Leila,45.0,56.0,0.0,0.0,101,49.951733,-97.149032,McPhillips,Leila,0.75
17729,2021-08-20 01:00:00,McPhillips And 190m South Of Leila,42.0,32.0,0.0,0.0,74,49.951733,-97.149032,McPhillips,Leila,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...
18016,2021-08-20 22:45:00,McPhillips And 190m South Of Leila,138.0,115.0,0.0,0.0,253,49.951733,-97.149032,McPhillips,Leila,22.75
17687,2021-08-20 23:00:00,McPhillips And 190m South Of Leila,122.0,119.0,0.0,0.0,241,49.951733,-97.149032,McPhillips,Leila,23.00
18173,2021-08-20 23:15:00,McPhillips And 190m South Of Leila,147.0,89.0,0.0,0.0,236,49.951733,-97.149032,McPhillips,Leila,23.25
17895,2021-08-20 23:30:00,McPhillips And 190m South Of Leila,122.0,101.0,0.0,0.0,223,49.951733,-97.149032,McPhillips,Leila,23.50


In [43]:
freq = "1h"
day = pd.to_datetime("2021-08-20")
cols = ["Northbound","Southbound","Eastbound","Westbound","Total"]
time_range = day + pd.timedelta_range(start="0:00:00",end="24:00:00",freq=freq)
df = pd.DataFrame()
for i in range(len(time_range)-1):
    lower_lim = time_range[i]
    upper_lim = time_range[i+1]
    res = TRAFFIC_COUNTS[TRAFFIC_COUNTS["Timestamp"] == upper_lim]
    res.loc[:,cols] = TRAFFIC_COUNTS.loc[(TRAFFIC_COUNTS["Timestamp"] <= upper_lim) & (TRAFFIC_COUNTS["Timestamp"] > lower_lim),cols].sum(axis=0).values
    df = pd.concat([df,res])
df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, v, pi)


Unnamed: 0,Timestamp,Site,Northbound,Southbound,Eastbound,Westbound,Total,Lat,Long,Street,Near,Time Interval
17729,2021-08-20 01:00:00,McPhillips And 190m South Of Leila,215.0,195.0,0.0,0.0,410.0,49.951733,-97.149032,McPhillips,Leila,1.0
17656,2021-08-20 02:00:00,McPhillips And 190m South Of Leila,120.0,99.0,0.0,0.0,219.0,49.951733,-97.149032,McPhillips,Leila,2.0
17661,2021-08-20 03:00:00,McPhillips And 190m South Of Leila,90.0,70.0,0.0,0.0,160.0,49.951733,-97.149032,McPhillips,Leila,3.0
17814,2021-08-20 04:00:00,McPhillips And 190m South Of Leila,121.0,86.0,0.0,0.0,207.0,49.951733,-97.149032,McPhillips,Leila,4.0
17874,2021-08-20 05:00:00,McPhillips And 190m South Of Leila,125.0,155.0,0.0,0.0,280.0,49.951733,-97.149032,McPhillips,Leila,5.0
17883,2021-08-20 06:00:00,McPhillips And 190m South Of Leila,237.0,369.0,0.0,0.0,606.0,49.951733,-97.149032,McPhillips,Leila,6.0
17879,2021-08-20 07:00:00,McPhillips And 190m South Of Leila,498.0,1102.0,0.0,0.0,1600.0,49.951733,-97.149032,McPhillips,Leila,7.0
17772,2021-08-20 08:00:00,McPhillips And 190m South Of Leila,790.0,1413.0,0.0,0.0,2203.0,49.951733,-97.149032,McPhillips,Leila,8.0
17809,2021-08-20 09:00:00,McPhillips And 190m South Of Leila,893.0,1217.0,0.0,0.0,2110.0,49.951733,-97.149032,McPhillips,Leila,9.0
17764,2021-08-20 10:00:00,McPhillips And 190m South Of Leila,993.0,1213.0,0.0,0.0,2206.0,49.951733,-97.149032,McPhillips,Leila,10.0


In [62]:
DF1 = pd.DataFrame()
for timestamp in df["Timestamp"]:
    ON_TIME = ALL_ON_TIME[(ALL_ON_TIME["Scheduled Time"] <= time) & (ALL_ON_TIME["Scheduled Time"] > time - pd.to_timedelta(freq))]
    df_ = df[df["Timestamp"] == timestamp]

    df1 = AFF_STOPS.loc[:,["Stop Number"]]
    df1.loc[:,"Distance"] = distances["distance"]
    df1.loc[:,"Street"] = (AFF_STOPS["Street"] == df_["Street"].item()).replace({True:1,False:0})
    df1.loc[(AFF_STOPS["Street"] != TC_SITE["Street"].item()),"Directional"] = 0
    for dir in ["Northbound","Southbound","Eastbound","Westbound"]:
        df1.loc[(AFF_STOPS["Street"] == TC_SITE["Street"].item()) & (AFF_STOPS["Direction"] == dir),"Directional"] = df_[dir].item()
    df1.loc[:,"Total"] = df_["Total"].item()
    df1.loc[:,["Arrivals","Average OTP"]] = [(len(ON_TIME.loc[ON_TIME["Stop Number"]==i,"Deviation"].values),ON_TIME.loc[ON_TIME["Stop Number"]==i,"Deviation"].values.mean()) for i in AFF_STOPS["Stop Number"]]
    df1.loc[:,"Time interval"] = u.fmt_timestamp(timestamp)
    DF1 = pd.concat([DF1,df1])

DF1

  df1.loc[:,["Arrivals","Average OTP"]] = [(len(ON_TIME.loc[ON_TIME["Stop Number"]==i,"Deviation"].values),ON_TIME.loc[ON_TIME["Stop Number"]==i,"Deviation"].values.mean()) for i in AFF_STOPS["Stop Number"]]
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,Stop Number,Distance,Street,Directional,Total,Arrivals,Average OTP,Time interval
1483,30199,421.070384,0,0.0,410.0,1,-135.0,1.0
1628,30353,329.076830,1,215.0,410.0,1,249.0,1.0
1637,30362,381.455906,0,0.0,410.0,2,85.5,1.0
1638,30363,263.378003,0,0.0,410.0,2,35.0,1.0
1639,30364,271.147088,0,0.0,410.0,1,251.0,1.0
...,...,...,...,...,...,...,...,...
2150,30918,337.529056,0,0.0,1058.0,0,,23.0
2151,30919,342.831977,0,0.0,1058.0,0,,23.0
2157,30925,468.780146,0,0.0,1058.0,0,,23.0
2158,30926,350.956296,0,0.0,1058.0,0,,23.0


In [63]:
DF1.dropna()


Unnamed: 0,Stop Number,Distance,Street,Directional,Total,Arrivals,Average OTP,Time interval
1483,30199,421.070384,0,0.0,410.0,1,-135.0,1.0
1628,30353,329.076830,1,215.0,410.0,1,249.0,1.0
1637,30362,381.455906,0,0.0,410.0,2,85.5,1.0
1638,30363,263.378003,0,0.0,410.0,2,35.0,1.0
1639,30364,271.147088,0,0.0,410.0,1,251.0,1.0
...,...,...,...,...,...,...,...,...
1651,30376,264.104329,0,0.0,1058.0,1,-454.0,23.0
1974,30721,340.404675,0,0.0,1058.0,1,-467.0,23.0
2025,30779,252.923932,0,0.0,1058.0,1,-496.0,23.0
2123,30890,263.809125,0,0.0,1058.0,1,-78.0,23.0


In [76]:
DF1[DF1["Time interval"] == 19]

Unnamed: 0,Stop Number,Distance,Street,Directional,Total,Arrivals,Average OTP,Time interval
1483,30199,421.070384,0,0.0,2629.0,1,-135.0,19.0
1628,30353,329.07683,1,1420.0,2629.0,1,249.0,19.0
1637,30362,381.455906,0,0.0,2629.0,2,85.5,19.0
1638,30363,263.378003,0,0.0,2629.0,2,35.0,19.0
1639,30364,271.147088,0,0.0,2629.0,1,251.0,19.0
1640,30365,240.027105,0,0.0,2629.0,1,-152.0,19.0
1641,30366,322.652257,0,0.0,2629.0,1,-143.0,19.0
1646,30371,467.44978,0,0.0,2629.0,2,-36.5,19.0
1647,30372,241.157455,0,0.0,2629.0,1,-451.0,19.0
1649,30374,144.105377,1,1209.0,2629.0,0,,19.0
