# Categorize `on_shn`, `parallel` (affected by SHN), and `other`

* Since `on_shn` is the primary category, and it is drawn with a 50 ft buffer around the highway centerline, we no longer need a threshold on `pct_highway` (just require `pct_highway > 0`)
* Is 25% too high of a threshold? 
* `pct_route` thresholds of 20% and 25% both fall between the 70th and 75th percentiles of routes, so either cutoff keeps roughly the top 25%-30% of routes
* Settle on 20%: a route counts as on the highway if at least 20% of its length runs within 50 ft of the highway centerline

In [1]:
import geopandas as gpd
import pandas as pd

from update_vars import (ANALYSIS_DATE, 
                         BUS_SERVICE_GCS, COMPILED_CACHED_GCS)



In [2]:
# Load the routes-on-SHN table for the analysis date
routes_on_shn_path = f"{BUS_SERVICE_GCS}routes_on_shn_{ANALYSIS_DATE}.parquet"
df = gpd.read_parquet(routes_on_shn_path)

In [3]:
# Compare the total row count against the number of distinct (itp_id, route_id) pairs
n_rows = len(df)
n_route_ids = len(df[["itp_id", "route_id"]].drop_duplicates())

print(f"# rows (route_id-Route pairs): {n_rows}")
print(f"# route_id: {n_route_ids}")

# rows (route_id-Route pairs): 6675
# route_id: 2304


In [4]:
# Keep one row per (itp_id, route_id): sorting pct_route in descending order
# within each route means the first row kept is the one with the largest pct_route.
route_keys = ["itp_id", "route_id"]

unique_routes = (
    df.sort_values(route_keys + ["pct_route"], ascending=[True, True, False])
    .drop_duplicates(subset=route_keys)
    .reset_index(drop=True)
)

In [5]:
# Percentiles from 5% to 95% in steps of 5, expressed as fractions
ptile = [pct / 100 for pct in range(5, 100, 5)]

unique_routes.pct_route.describe(percentiles=ptile)

count    2304.000000
mean        0.151419
std         0.208442
min         0.000000
5%          0.003000
10%         0.004000
15%         0.005000
20%         0.006000
25%         0.007000
30%         0.008000
35%         0.010000
40%         0.013000
45%         0.021000
50%         0.039000
55%         0.063000
60%         0.087000
65%         0.121950
70%         0.175100
75%         0.258000
80%         0.314000
85%         0.385000
90%         0.464000
95%         0.632000
max         0.922000
Name: pct_route, dtype: float64

In [6]:
# Count how many routes meet each candidate pct_route cutoff (20%, 25%, 30%)
for pct in range(20, 35, 5):
    cutoff = pct / 100
    n_routes = (unique_routes.pct_route >= cutoff).sum()

    print(f"route threshold: {cutoff} - {n_routes}")

route threshold: 0.2 - 657
route threshold: 0.25 - 592
route threshold: 0.3 - 493


In [7]:
# Routes meeting the 20% and 25% pct_route cutoffs, respectively
twenty = unique_routes.loc[unique_routes.pct_route.ge(0.20)]
twentyfive = unique_routes.loc[unique_routes.pct_route.ge(0.25)]

In [8]:
def make_map(gdf: gpd.GeoDataFrame): 
    """
    Map the routes in `gdf` and display the result in the notebook.

    The map is built with `GeoDataFrame.explore` on the de-duplicated
    `itp_id` / `route_id` / `geometry` columns, using `itp_id` as a
    categorical column. The smallest `pct_route` value in `gdf` is printed
    before the map, under the label "route threshold".
    """
    route_shapes = gdf[["itp_id", "route_id", "geometry"]].drop_duplicates()
    route_map = route_shapes.explore(
        "itp_id",
        categorical=True,
        tiles="CartoDB Positron",
    )

    # Lowest pct_route actually present in this subset
    lowest_pct_route = gdf.pct_route.min()
    print(f"route threshold: {lowest_pct_route}")
    display(route_map)

In [9]:
# Uncomment to map the routes of every operator at each threshold
#make_map(twenty)
#make_map(twentyfive)

In [10]:
# Compare the 20% and 25% cutoffs for a single operator
itp_id = 182

operator_twenty = twenty.loc[twenty.itp_id == itp_id]
operator_twentyfive = twentyfive.loc[twentyfive.itp_id == itp_id]

# Routes that appear under one cutoff but not the other
route_ids_at_25 = set(operator_twentyfive.route_id)
route_ids_at_20 = set(operator_twenty.route_id)
difference_routes = list(route_ids_at_25 ^ route_ids_at_20)

for operator_routes in (operator_twenty, operator_twentyfive):
    make_map(operator_routes)

print("Routes Included if Threshold is 20%")
in_difference = operator_twenty.route_id.isin(difference_routes)
make_map(operator_twenty[in_difference])

route threshold: 0.203


route threshold: 0.356


Routes Included if Threshold is 20%
route threshold: 0.203


In [11]:
# Compare the 20% and 25% cutoffs for a single operator
itp_id = 4

operator_twenty = twenty.loc[twenty.itp_id == itp_id]
operator_twentyfive = twentyfive.loc[twentyfive.itp_id == itp_id]

# Routes that appear under one cutoff but not the other
route_ids_at_25 = set(operator_twentyfive.route_id)
route_ids_at_20 = set(operator_twenty.route_id)
difference_routes = list(route_ids_at_25 ^ route_ids_at_20)

for operator_routes in (operator_twenty, operator_twentyfive):
    make_map(operator_routes)

print("Routes Included if Threshold is 20%")
in_difference = operator_twenty.route_id.isin(difference_routes)
make_map(operator_twenty[in_difference])

route threshold: 0.201


route threshold: 0.251


Routes Included if Threshold is 20%
route threshold: 0.201
