In [1]:
import pandas as pd
import geopandas as gpd

#### task description
- summarize transit info for light rail, heavy rail, commuter rail
  - create a summary table with the following info: route, time_period, distance, runtime, pattern_number
- data source: `ccta/31000190/Model_runs/2035/12_April15/2035_BaseY_BCM2035`
- note: add 2 extra blank lines at the end of `transit.lin` so that the last route is parsed successfully (see scripts below)

#### modes (specified in `TransitSkims_2035_and_beyond.job`)
- light rail: 100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119
- heavy rail: 120,121,122,123,124,125,126,127,128,129
- commuter rail: 130,131,132,133,134,135,136,137,138,139

In [2]:
# Load the network link table, then lower-case the DISTANCE column name so the
# later merge/aggregation steps can refer to it as "distance".
link_dist = pd.read_csv("../data/complete_network_with_externals.csv")
link_dist = link_dist.rename(columns={"DISTANCE": "distance"})
link_dist.head()

Unnamed: 0,A,B,distance
0,1,1001671,0.17232
1,1,1021297,0.08112
2,1,1024255,0.07073
3,2,1006730,0.12418
4,2,1012189,0.0723


#### gather mode & runtime info for each transit route

In [3]:
# Scan transit.lin and record one row (route, long_name, mode, runtime) for
# every RUNTIME entry that belongs to a rail line (mode 100-139).
rail_runtime_records = {"route": [], "long_name": [], "mode": [], "runtime": []}

# Read everything up front so the file handle is released before parsing.
with open("../data/transit.lin", "r") as f:
    lines = f.readlines()

curr_line = None
curr_longname = None
curr_mode = None
curr_runtime = None

for txt in lines:
    if txt.startswith("LINE NAME="):
        # store the current line name
        curr_line = txt.split("\"")[1]
        # Forget the attributes of the previous line: without this, a line that
        # has no LONGNAME / MODE entry of its own would silently inherit them.
        curr_longname = ""
        curr_mode = None
        curr_runtime = None

    elif txt.startswith(" LONGNAME="):
        # Split on the first "=" only, so a long name that itself contains "="
        # is not cut short.
        curr_longname = txt.split("=", 1)[1].replace(",", "").replace('"', "").replace("\n", "")
        if curr_longname == "nan":
            curr_longname = ""

    elif txt.startswith(" MODE="):
        curr_mode = int(txt.split("=")[1].replace(",", ""))

    elif txt.startswith(" RUNTIME="):
        # Only collect info for rail. The None check avoids a TypeError when a
        # RUNTIME entry shows up before any MODE entry of the current line.
        if curr_mode is not None and 100 <= curr_mode <= 139:
            curr_runtime = float(txt.split("=")[1].replace(",", ""))
            rail_runtime_records["route"].append(curr_line)
            rail_runtime_records["long_name"].append(curr_longname)
            rail_runtime_records["mode"].append(curr_mode)
            rail_runtime_records["runtime"].append(curr_runtime)

    elif txt.startswith(" N="):
        # reset curr_runtime (it is only read right after being parsed above)
        curr_runtime = None

rail_route_runtime = pd.DataFrame(rail_runtime_records)
rail_route_runtime.head()

Unnamed: 0,route,long_name,mode,runtime
0,5_1_EA_d0_s1,,133,132.0
1,5_1_AM_d0_s1,,133,132.0
2,5_1_PM_d0_s2,,133,132.0
3,26_154_EA_d0_s404,Pittsburg/Bay Point - SFIA/Millbrae,120,86.0
4,26_154_AM_d0_s404,Pittsburg/Bay Point - SFIA/Millbrae,120,40.0


#### gather node sequence and distance info for each transit route

In [4]:
# Build the link table (route, A, B) of every route in transit.lin, attach the
# link distances, and sum them per route.
with open("../data/transit.lin", "r") as f:
    lines = f.readlines()

# Parallel column lists of the link table. They are filled route by route and
# turned into a DataFrame once at the end, instead of concatenating a frame per
# route inside the loop (which re-copies the growing table every time).
link_route_col = []
link_a_col = []
link_b_col = []


def _store_route_links(route, node_seq):
    """Append every consecutive node pair of `node_seq` as a link of `route`.

    Does nothing when `route` is None (no LINE NAME seen yet) or when the
    sequence has fewer than two nodes.
    """
    if route is None:
        return
    for node_a, node_b in zip(node_seq[:-1], node_seq[1:]):
        link_route_col.append(route)
        link_a_col.append(node_a)
        link_b_col.append(node_b)


curr_route = None
route_node_seq = []

for txt in lines:
    if txt.startswith("LINE NAME="):
        # A new line starts: store the links of the route collected so far.
        # Storing here (and once more after the loop) instead of on blank lines
        # means transit.lin needs no extra trailing blank lines, and repeated
        # blank lines can no longer store the same route twice.
        _store_route_links(curr_route, route_node_seq)
        # store the current line name
        curr_route = txt.split("\"")[1]
        # reset route_node_seq as an empty list
        route_node_seq = []
        continue

    # add to node sequence if the first comma-separated item of the stripped
    # text, with any "-" removed, is all digits
    first_item = txt.strip().split(",")[0].replace("-", "")
    if curr_route is not None and first_item.isdigit():
        route_node_seq.append(int(first_item))

# the last route in the file has no following LINE NAME, so store it here
_store_route_links(curr_route, route_node_seq)

all_route_links = pd.DataFrame({"route": link_route_col, "A": link_a_col, "B": link_b_col})
all_route_links["A"] = all_route_links["A"].astype(int)
all_route_links["B"] = all_route_links["B"].astype(int)

# add link distance; links that are not found in link_dist get a NaN distance
all_route_links = pd.merge(all_route_links, link_dist, how="left", on=["A", "B"])

# aggregate by route to get route distance (NaN distances are skipped by sum)
route_dist = all_route_links[["route", "distance"]].groupby("route").sum().reset_index()

route_dist.head()

Unnamed: 0,route,distance
0,100_5871_AM_d0_s26095,7.11056
1,100_5871_MD_d0_s26095,7.11056
2,100_5871_PM_d0_s26095,7.11056
3,100_5872_AM_d0_s26098,8.66201
4,100_5872_MD_d0_s26098,8.66201


In [5]:
# Spot check: show the links (with distances) stored for route "5_1_AM_d0_s1".
temp2 = all_route_links.loc[all_route_links["route"] == "5_1_AM_d0_s1"].copy()
temp2

Unnamed: 0,route,A,B,distance
9,5_1_AM_d0_s1,3097285,3097286,11.8673
10,5_1_AM_d0_s1,3097286,2625972,12.57288
11,5_1_AM_d0_s1,2625972,2625973,16.90528
12,5_1_AM_d0_s1,2625973,2625974,3.09864
13,5_1_AM_d0_s1,2625974,2625975,7.14929
14,5_1_AM_d0_s1,2625975,2625976,10.46114
15,5_1_AM_d0_s1,2625976,2192946,11.62475
16,5_1_AM_d0_s1,2192946,2192947,4.3869
17,5_1_AM_d0_s1,2192947,2192948,2.63548


In [6]:
# Spot check: show the links (with distances) stored for route "5_991_EV_d1_s101208".
temp = all_route_links.loc[all_route_links["route"] == "5_991_EV_d1_s101208"].copy()
temp

Unnamed: 0,route,A,B,distance
967881,5_991_EV_d1_s101208,9550777,9550776,4.157
967882,5_991_EV_d1_s101208,9550776,2625959,5.90532


#### combine gathered info

In [7]:
# Attach the summed route distance to every rail runtime record.
rail_route_info = rail_route_runtime.merge(route_dist, how="left", on="route")

# Break the "route" name into its five "_"-separated parts, one column each.
route_name_parts = rail_route_info["route"].str.split("_", expand=True)
rail_route_info[["operator", "route_id", "time_period", "direction", "shape_id"]] = route_name_parts

# Drop the "d" from "direction", then make operator, route_id and direction integers.
rail_route_info["direction"] = rail_route_info["direction"].str.replace("d", "")
rail_route_info = rail_route_info.astype({"operator": int, "route_id": int, "direction": int})

# Sort by operator, route and direction, with time periods in their listed
# order (EA, AM, MD, PM, EV) rather than alphabetically; the helper rank column
# is dropped again right after sorting.
time_period_index = pd.DataFrame({"time_period": ["EA", "AM", "MD", "PM", "EV"],
                                  "time_period_index": range(1, 6)})
rail_route_info = rail_route_info.merge(time_period_index, how="left", on="time_period")
rail_route_info = rail_route_info.sort_values(
    ["operator", "route_id", "direction", "time_period_index"]
)
rail_route_info = rail_route_info.drop(columns="time_period_index")

# Number the distinct shapes within each operator - route_id - direction
# combination (1, 2, ...) in their sorted order of appearance.
pattern_group_cols = ["operator", "route_id", "direction"]
pattern_key_cols = pattern_group_cols + ["shape_id"]
rail_pattern_num = rail_route_info[pattern_key_cols].drop_duplicates().copy()
rail_pattern_num["pattern_number"] = rail_pattern_num.groupby(pattern_group_cols).cumcount() + 1

rail_route_info = rail_route_info.merge(rail_pattern_num, how="left", on=pattern_key_cols)

# Label the rail type from the mode number; modes outside every range keep "".
rail_route_info["type"] = ""
for rail_type, (mode_low, mode_high) in {
    "light rail": (100, 119),
    "heavy rail": (120, 129),
    "commuter rail": (130, 139),
}.items():
    in_mode_range = rail_route_info["mode"].between(mode_low, mode_high)
    rail_route_info.loc[in_mode_range, "type"] = rail_type

# Final column order of the summary table.
rail_route_info = rail_route_info[[
    "route", "mode", "type", "long_name",
    "operator", "route_id", "direction", "time_period",
    "distance", "runtime", "shape_id", "pattern_number",
]]

rail_route_info.head()

Unnamed: 0,route,mode,type,long_name,operator,route_id,direction,time_period,distance,runtime,shape_id,pattern_number
0,3_160_AM_d0_s421,103,light rail,San Francisco - Angel Island Ferry,3,160,0,AM,4.63903,25.0,s421,1
1,3_160_MD_d0_s421,103,light rail,San Francisco - Angel Island Ferry,3,160,0,MD,4.63903,30.0,s421,1
2,3_160_PM_d0_s419,103,light rail,San Francisco - Angel Island Ferry,3,160,0,PM,4.63903,50.0,s419,2
3,3_160_MD_d1_s419,103,light rail,San Francisco - Angel Island Ferry,3,160,1,MD,4.63903,45.0,s419,1
4,3_161_MD_d0_s416,103,light rail,San Francisco - Sausalito Ferry,3,161,0,MD,5.22208,30.0,s416,1


#### write out result

In [8]:
# Save the summary table as CSV, leaving out the DataFrame index.
rail_route_info_csv = "../outputs/rail_route_info.csv"
rail_route_info.to_csv(rail_route_info_csv, index=False)