In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import myutils as u
import math
import datetime
import warnings
import pandas.errors
# Silence every warning category (Warning is the base class of all warnings),
# which also hides pandas' chained-assignment warnings; narrow this filter when debugging.
warnings.simplefilter(action='ignore', category=Warning)

# Import Data

In [None]:
# Raw datasets, each loaded once through the project helper module `u`.
ALL_TC_SITES = u.import_data("SITES")          # read below via "Street", "Lat", "Long"
ALL_TC = u.import_data("TRAFFIC_COUNTS")       # read below via "Timestamp", "Street", direction columns, "Total"
ALL_OTP = u.import_data("ON_TIME")             # read below via "Scheduled Time", "Stop Number", "Deviation"
ALL_STOPS = u.import_data("STOPS")             # read below via "Lat", "Long", "Stop Number", "Street", "Direction"
ALL_CONSTR = u.import_data("LANE_CLOSURE")
ALL_ROADS = u.import_data("ROAD_NETWORK")

# Constants
MIN = pd.Timedelta(minutes=1)
DAY = pd.Timedelta(days=1)
# Site name -> integer id, numbered in the order listed.
SITES_NO = {
    site: number
    for number, site in enumerate(
        ["McPhillips", "Henderson", "Pembina", "Inkster", "Nichol", "Lagimodiere", "Disraeli", "Marion"]
    )
}

In [None]:
def get_stops_distances(tc_site, distance=500,key=None):
    """
    Query `u.distance_within` for the bus stops around a traffic-count site.

    Arguments:
        - tc_site: value matched against the "Street" column of ALL_TC_SITES;
          the first matching row supplies the origin coordinates
        - distance: search radius forwarded to `u.distance_within`
          (units are defined by that helper -- presumably metres, TODO confirm)
        - key: forwarded unchanged to `u.distance_within`; this notebook uses
          "index" and "distance"

    Returns:
        Whatever `u.distance_within` returns for the given key.

    Raises:
        IndexError if no row of ALL_TC_SITES matches `tc_site`.
    """
    site_latlong = ALL_TC_SITES.loc[ALL_TC_SITES["Street"] == tc_site, ["Lat", "Long"]]
    origin = site_latlong.values[0]
    candidates = ALL_STOPS[["Lat", "Long"]].values
    return u.distance_within(origin, candidates, distance, key=key)

def get_stops_nearby(tc_site, distance=500):
    """
    Select the rows of ALL_STOPS that lie around a traffic-count site.

    The "index" result of `get_stops_distances` is used as a positional
    indexer (`iloc`) into ALL_STOPS.

    Arguments:
        - tc_site: site name, as accepted by `get_stops_distances`
        - distance: search radius, as accepted by `get_stops_distances`

    Returns:
        The selected rows of ALL_STOPS.
    """
    positions = get_stops_distances(tc_site, distance, key="index")
    nearby_stops = ALL_STOPS.iloc[positions]
    return nearby_stops

def get_traffic_counts(tc_site,date,freq=None):
    """
    Return the traffic counts of one site for the day following `date`.

    Arguments:
        - tc_site: value matched against the "Street" column of ALL_TC_SITES;
          exactly one row must match (`.item()` raises ValueError otherwise)
        - date: start of the window; rows with `date < Timestamp <= date + 1 day`
          are selected
        - freq: optional frequency string passed to `pd.timedelta_range`
          (e.g. "2h"). When given, the directional and total counts are summed
          over consecutive windows of that length.

    Returns:
        A DataFrame sorted by "Timestamp" with an added "Time Interval" column.
        With `freq`, there is one row per window: the row stamped exactly at the
        window's upper bound, with its count columns replaced by the window sums.
        Windows without a row at their upper bound are dropped.
    """
    site = ALL_TC_SITES[ALL_TC_SITES["Street"] == tc_site]
    street = site["Street"].item()
    in_day = (ALL_TC["Timestamp"] > date) & (ALL_TC["Timestamp"] <= date + 1 * DAY)
    # Work on an explicit copy so the column assignment below never targets a
    # slice of ALL_TC.
    TC = ALL_TC[in_day & (ALL_TC["Street"] == street)].copy()
    TC.loc[:, "Time Interval"] = [u.fmt_timestamp(stamp) for stamp in TC["Timestamp"]]

    if freq is not None:
        # Window boundaries: date, date + freq, ..., date + 24h.
        time_range = date + pd.timedelta_range(start="0:00:00", end="24:00:00", freq=freq)
        cols = ["Northbound", "Southbound", "Eastbound", "Westbound", "Total"]
        windows = []
        for lower_lim, upper_lim in zip(time_range[:-1], time_range[1:]):
            # The row at the upper bound carries the non-count columns of the window.
            res = TC[TC["Timestamp"] == upper_lim].copy()
            in_window = (TC["Timestamp"] > lower_lim) & (TC["Timestamp"] <= upper_lim)
            res.loc[:, cols] = TC.loc[in_window, cols].sum(axis=0).values
            windows.append(res)
        # Concatenate once; keep the column layout even when there is no window
        # so that the sort below still finds "Timestamp".
        TC = pd.concat(windows) if windows else TC.iloc[0:0]
    return TC.sort_values("Timestamp")

def get_otp(start,end):
    """
    Return the rows of ALL_OTP whose "Scheduled Time" falls in (start, end].

    Arguments:
        - start: exclusive lower bound of the scheduled time
        - end: inclusive upper bound of the scheduled time
    """
    scheduled = ALL_OTP["Scheduled Time"]
    in_window = (scheduled > start) & (scheduled <= end)
    return ALL_OTP[in_window]

def prepare_data(tc_site,date,distance=500,freq=None):
    """
    Build one row per (traffic-count timestamp, nearby stop) for a site and day.

    Arguments:
        - tc_site: site name, matched against the "Street" columns
        - date: start of the day; traffic counts and on-time records are taken
          from the window (date, date + 1 day]
        - distance: search radius for nearby stops (see `get_stops_distances`)
        - freq: optional aggregation frequency forwarded to `get_traffic_counts`

    Returns:
        A DataFrame with the columns "Stop Number", "Site", "Distance",
        "Same Street", "Directional", "Total", "Arrivals", "Average OTP",
        "Time interval" and "Date"; an empty DataFrame when there are no
        traffic-count rows.
    """
    AFF_STOPS = get_stops_nearby(tc_site, distance)
    distances = get_stops_distances(tc_site, distance, key="distance")
    TC = get_traffic_counts(tc_site, date, freq)
    OTP = get_otp(date, date + 1 * DAY)

    # Per-stop arrival count and mean deviation do not depend on the timestamp,
    # so they are computed once instead of once per traffic-count row.
    # NOTE(review): OTP covers the whole day, so every time interval receives
    # the same "Arrivals" / "Average OTP" values -- confirm this is intended.
    stop_stats = []
    for stop_no in AFF_STOPS["Stop Number"]:
        deviations = OTP.loc[OTP["Stop Number"] == stop_no, "Deviation"].values
        stop_stats.append((len(deviations), deviations.mean()))

    on_site_street = AFF_STOPS["Street"] == tc_site
    directions = ["Northbound", "Southbound", "Eastbound", "Westbound"]

    frames = []
    for timestamp in TC["Timestamp"]:
        TC_i = TC[TC["Timestamp"] == timestamp]
        # Explicit copy: the assignments below must not target a slice of AFF_STOPS.
        df1 = AFF_STOPS.loc[:, ["Stop Number"]].copy()
        df1.loc[:, "Site"] = tc_site
        df1.loc[:, "Distance"] = distances
        df1.loc[:, "Same Street"] = (AFF_STOPS["Street"] == TC_i["Street"].item()).astype(int)
        # Directional & total traffic count: stops off the site's street get 0,
        # stops on it get the count of their own direction.
        df1.loc[~on_site_street, "Directional"] = 0
        for direction in directions:
            df1.loc[on_site_street & (AFF_STOPS["Direction"] == direction), "Directional"] = TC_i[direction].item()
        df1.loc[:, "Total"] = TC_i["Total"].item()
        df1.loc[:, ["Arrivals", "Average OTP"]] = stop_stats
        df1.loc[:, "Time interval"] = u.fmt_timestamp(timestamp)
        df1.loc[:, "Date"] = TC_i["Timestamp"].item().date()
        frames.append(df1)
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames) if frames else pd.DataFrame()

# Example run: one site, one day, traffic counts aggregated with freq "2h".
date = pd.to_datetime("2021-08-23")
tc_site = "McPhillips"
distance=500
freq = "2h"

# prepare_data recomputes both of these internally; the module-level copies are
# presumably kept for interactive inspection.
AFF_STOPS = get_stops_nearby(tc_site, distance)
TC = get_traffic_counts(tc_site,date,freq)
# Last expression of the cell, so the resulting frame is rendered as the cell output.
prepare_data(tc_site,date,distance,freq)
# AFF_STOPS

# Remove Irrelevant Bus Stops

In [None]:
# Inspect the table loaded from the "STOPS" dataset.
ALL_STOPS

In [None]:
# Inspect the table loaded from the "SITES" dataset.
ALL_TC_SITES

In [None]:
# Split the stops into those found by u.distance_within (d=5000) around at
# least one traffic-count site and the remaining ones.
# NOTE(review): all_stops holds index labels while key="index" results are used
# positionally elsewhere (iloc) -- the two only agree for a default 0..n-1 index; confirm.
stop_coords = ALL_STOPS[["Lat","Long"]].to_numpy()
all_stops = set(ALL_STOPS.index)
valid_stops = set()
for org in ALL_TC_SITES[["Lat","Long"]].to_numpy():
    valid_stop_i = set(u.distance_within(org, stop_coords, d=5000, key="index"))
    valid_stops.update(valid_stop_i)
invalid_stops = all_stops.difference(valid_stops)


In [None]:
# NOTE: the value of this bare expression is discarded -- only the last
# expression of a cell is displayed.
ALL_STOPS.loc[pd.Index(invalid_stops)]
# Spot-check one excluded stop: the second row of the selection
# (raises IndexError when fewer than two stops were excluded).
stop = ALL_STOPS.loc[pd.Index(invalid_stops),["Lat","Long"]].values[1]
print(ALL_STOPS.loc[pd.Index(invalid_stops)].values[1])
tc_stations = ALL_TC_SITES[["Lat","Long"]].values
# d=1e9 is presumably large enough that no station is filtered out, so this
# shows the stop's distance to the traffic-count stations.
u.distance_within(stop,tc_stations,d=1e9,key="distance")


In [None]:
# Keep only the stops collected in valid_stops.
# NOTE(review): valid_stops is a set, so the row order of VALID_STOPS follows
# set iteration order -- sort the labels first if a stable order matters.
VALID_STOPS = ALL_STOPS.loc[pd.Index(valid_stops)]
VALID_STOPS

# Calculate Weights

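## Data shapes:
1. stop: one bus stop, given as
    (Lat, Long, Stop number)
2. times: a list of timestamp strings, e.g.
    ['2021-08-01 20:00:00', '2021-08-01 20:15:00']
3. traffic counts: one row per traffic count station, given as
    (Lat, Long, Total count, Site name)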

In [None]:
def calculateAggregateWeight(stop: tuple, times: list) -> float:
    """
    Calculate the aggregate weight of a bus stop with the traffic count stations at given times.

    The "Total" counts of every station are summed over `times`, and each
    station's sum is weighted by the inverse of its distance to the stop.

    Arguments:
        - stop: a sequence whose first two elements are (Lat, Long); any further
          elements (e.g. the stop number) are ignored
        - times: a list of strings containing the dates and times to select from ALL_TC

    Returns:
        The aggregate weight of the bus stop with the traffic count stations at
        the given times; 0.0 when `times` is empty.
    """
    station_cols = ["Lat", "Long", "Total", "Site"]

    total_counts = None
    stations = None
    for time in times:
        # Sorting by latitude gives every timestamp the same station order, so
        # the counts can be added position by position.
        stations = ALL_TC.loc[ALL_TC["Timestamp"] == time, station_cols].sort_values(by="Lat")
        counts = stations["Total"].to_numpy()
        # Size the accumulator from the data rather than assuming 8 stations.
        total_counts = counts if total_counts is None else total_counts + counts

    if stations is None:
        # No timestamps selected: there are no counts to weight.
        return 0.0

    # Pass only the coordinates, as every other distance_within call in this
    # notebook does.
    # NOTE(review): assumes distance_within returns one distance per station in
    # input order, and that no station coincides with the stop (distance 0) -- confirm.
    distances = u.distance_within(stop[:2], stations[["Lat", "Long"]].values, d=1e9, key="distance")
    coefficients = [1 / dist for dist in distances]

    return np.dot(coefficients, total_counts)

In [None]:
def enumerateTimes(start: str, end: str, num_interval: int) -> list:
    """
    Enumerate 15-minute time slots between a start and an end time, grouped into chunks.

    Arguments:
        - start: a string in the format defined in DATE_TIME_FORMAT
        - end: a string in the format defined in DATE_TIME_FORMAT (exclusive)
        - num_interval: number of 15-minute slots per chunk; must be at least 1

    Returns:
        A 2-layer nested list containing the enumerated times.
        Each inner list holds `num_interval` datetime strings that are 15 minutes apart.
        A new chunk is started while the current time is before `end`; every chunk
        is always filled completely, so the last chunk may run past `end` when the
        range is not a multiple of the chunk length.

    Raises:
        ValueError if `num_interval` is smaller than 1 (the enumeration would
        otherwise never advance), or if `start` / `end` do not match DATE_TIME_FORMAT.

    Example:
        `enumerateTimes("2021-08-01 02:00:00", "2021-08-01 04:00:00", 2)` gives the following result:

        ```
        [['2021-08-01 02:00:00', '2021-08-01 02:15:00'],
        ['2021-08-01 02:30:00', '2021-08-01 02:45:00'],
        ['2021-08-01 03:00:00', '2021-08-01 03:15:00'],
        ['2021-08-01 03:30:00', '2021-08-01 03:45:00']]
        ```
    """
    if num_interval < 1:
        raise ValueError("num_interval must be at least 1")

    DATE_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"
    interval = datetime.timedelta(minutes=15)

    currentTime = datetime.datetime.strptime(start, DATE_TIME_FORMAT)
    endTime = datetime.datetime.strptime(end, DATE_TIME_FORMAT)

    ret = []
    while currentTime < endTime:
        chunk = []
        for _ in range(num_interval):
            chunk.append(currentTime.strftime(DATE_TIME_FORMAT))
            currentTime += interval
        ret.append(chunk)
    return ret

In [None]:
# Print the aggregate weight of every valid stop for each chunk of two
# 15-minute slots between 20:00 and 21:00.
# NOTE: the loop variable `time` holds a whole chunk (a list of timestamp
# strings), not a single time.
for time in enumerateTimes("2021-08-01 20:00:00", "2021-08-01 21:00:00", 2):
    for stop in VALID_STOPS[["Lat","Long", "Stop Number"]].values:
        hook_this_to_somewhere = calculateAggregateWeight(stop, time) # TODO
        print(f"for time == {time}, stop == {stop[2]}, aggregate weight is {hook_this_to_somewhere}")