- This notebook is the second step of conflating the old network with the new ranch network.
- It loads the old-network links that failed the shst match in step 1.
- It matches each of those old links to the nearest link of the new network.
- First, it tries a nearest match with a tight tolerance: same county and same roadway type.
- Then, for the links that are still unmatched, it relaxes the nearest match to require the same county only.

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import os
import pickle

import ranch

In [2]:
# Root of the project data tree. Written as a raw string so the backslash of
# the Windows path is kept literally instead of being parsed as the invalid
# escape sequence "\m" (a SyntaxWarning on recent Python versions).
root_dir = r"D:\metcouncil_network_rebuild"

# Sub-directories for externally supplied inputs and for intermediate outputs.
external_dir = os.path.join(root_dir, 'data', 'external')
interim_dir = os.path.join(root_dir, 'data', 'interim')

corrected_network_dir = os.path.join(external_dir, 'rachel_corrected_version')

## Links that failed shst match

In [3]:
# Read the links that failed the shst match, as written out by step 1
# (roadway-conflation-post-processing.ipynb), from the interim data directory.

shst_failed_links_gdf = gpd.read_file(os.path.join(interim_dir, "old_model_links_failed_step1_shst_match.geojson"))

In [4]:
# Check the CRS of the failed links; later cells reproject the other layers to it.
shst_failed_links_gdf.crs

<Projected CRS: EPSG:26915>
Name: NAD83 / UTM zone 15N
Axis Info [cartesian]:
- E[east]: Easting (metre)
- N[north]: Northing (metre)
Area of Use:
- name: North America - between 96°W and 90°W - onshore and offshore. Canada - Manitoba; Nunavut; Ontario. United States (USA) - Arkansas; Illinois; Iowa; Kansas; Louisiana; Michigan; Minnesota; Mississippi; Missouri; Nebraska; Oklahoma; Tennessee; Texas; Wisconsin.
- bounds: (-96.0, 25.61, -90.0, 84.0)
Coordinate Operation:
- name: UTM zone 15N
- method: Transverse Mercator
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

In [5]:
# Column inventory of the failed links: names, non-null counts and dtypes.
shst_failed_links_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 15556 entries, 0 to 15555
Data columns (total 42 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   link_id     15556 non-null  int64   
 1   geoId       13693 non-null  object  
 2   shape_id    15556 non-null  object  
 3   A           15556 non-null  int64   
 4   B           15556 non-null  int64   
 5   distance    15556 non-null  float64 
 6   roadway     15556 non-null  object  
 7   name        14137 non-null  object  
 8   drive       15556 non-null  int64   
 9   walk        15556 non-null  int64   
 10  bike        15556 non-null  int64   
 11  truck       15556 non-null  int64   
 12  centroid    15556 non-null  int64   
 13  mrcc_id     15556 non-null  int64   
 14  ROUTE_SYS   5079 non-null   object  
 15  assgngrp    15556 non-null  int64   
 16  rdclass     15556 non-null  int64   
 17  area_type   15556 non-null  int64   
 18  county      15556 non-null  int64   
 

## New Network object from Ranch

In [6]:
# Path to the county polygon GeoJSON that covers the area.
input_polygon_file = os.path.join(external_dir, "county", "cb_2018_us_county_500k_withinMETC.geojson")

In [7]:
# Load the county polygons into a GeoDataFrame.
input_polygon_gdf = gpd.read_file(input_polygon_file)

In [8]:
# Display the county polygons that were loaded.
input_polygon_gdf

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,geometry
0,27,53,659472,0500000US27053,27053,Hennepin,6,1434458964,136553032,"MULTIPOLYGON (((-93.76763 44.94877, -93.76653 ..."
1,27,139,659514,0500000US27139,27139,Scott,6,922683745,31384772,"MULTIPOLYGON (((-93.91140 44.54564, -93.91024 ..."
2,27,163,659526,0500000US27163,27163,Washington,6,995612473,99462943,"MULTIPOLYGON (((-93.02139 44.78971, -93.02035 ..."
3,27,171,659530,0500000US27171,27171,Wright,6,1712292171,137609477,"MULTIPOLYGON (((-94.26152 45.29769, -94.26018 ..."
4,27,141,659515,0500000US27141,27141,Sherburne,6,1121230376,46966084,"MULTIPOLYGON (((-94.15051 45.52913, -94.14899 ..."
5,27,131,659511,0500000US27131,27131,Rice,6,1284215164,52066242,"MULTIPOLYGON (((-93.52522 44.27594, -93.52520 ..."
6,27,123,659507,0500000US27123,27123,Ramsey,6,394246995,46264475,"MULTIPOLYGON (((-93.22793 45.11632, -93.22779 ..."
7,27,19,659455,0500000US27019,27019,Carver,6,917370948,56746217,"MULTIPOLYGON (((-94.01236 44.94957, -94.01224 ..."
8,27,59,659475,0500000US27059,27059,Isanti,6,1128574114,41160302,"MULTIPOLYGON (((-93.51368 45.73432, -93.47322 ..."
9,55,93,1581107,0500000US55093,55093,Pierce,6,1486272236,46679668,"MULTIPOLYGON (((-92.80736 44.75891, -92.80612 ..."


In [9]:
# network object
# NOTE: unpickling executes code stored in the file, so only load pickles that
# were produced by this project's own earlier steps.
working_network_filename = os.path.join(interim_dir, "step8_network.pickle")

# Use a context manager so the file handle is closed as soon as loading is done.
with open(working_network_filename, 'rb') as network_file:
    roadway_network = pickle.load(network_file)

In [10]:
# Print, one per line, the number of unique shstReferenceIds and then the
# number of unique shstGeometryIds found on the network links.
print(
    roadway_network.links_df["shstReferenceId"].nunique(),
    roadway_network.links_df["shstGeometryId"].nunique(),
    sep="\n",
)

1061566
556550


In [11]:
# List the attributes available on the new network's links.
roadway_network.links_df.columns

Index(['shstReferenceId', 'id', 'shstGeometryId', 'fromIntersectionId',
       'toIntersectionId', 'geometry', 'u', 'v', 'nodeIds', 'wayId',
       'roadClass', 'oneWay', 'roundabout', 'link', 'oneway', 'lanes', 'ref',
       'name', 'highway', 'service', 'width', 'maxspeed', 'access', 'junction',
       'bridge', 'tunnel', 'landuse', 'area', 'key', 'forward',
       'backReferenceId', 'metadata', 'source', 'roadway', 'drive_access',
       'walk_access', 'bike_access', 'county', 'length', 'A', 'B',
       'model_link_id', 'locationReferences', 'rail_only'],
      dtype='object')

In [12]:
# Number of new-network links per county.
roadway_network.links_df['county'].value_counts()

Hennepin      284405
Ramsey        159952
Dakota        109928
Washington     89742
Anoka          81587
Wright         42468
Carver         42238
Scott          32515
St. Croix      32325
Isanti         26320
McLeod         25333
Goodhue        25215
Rice           22781
Polk           22290
Sherburne      17415
Chisago        14965
Pierce         12440
Le Sueur       10091
Sibley          9556
Name: county, dtype: int64

## Unmatched links by county

In [13]:
# Reproject the county polygons to the CRS of the failed links so that the
# spatial join below compares geometries in the same coordinate system.
input_polygon_gdf = input_polygon_gdf.to_crs(shst_failed_links_gdf.crs)

In [14]:
# add county name to the unmatched links
# (left join: links that intersect no county polygon are kept, with NAME = NaN)
# NOTE(review): this cell overwrites shst_failed_links_gdf with the joined
# result, so it is not idempotent - re-running it without re-loading the links
# joins the already-joined frame again.
# NOTE(review): with predicate 'intersects', a link that touches several county
# polygons presumably yields one row per county - confirm that duplicated
# link_ids are acceptable downstream.

shst_failed_links_gdf = gpd.sjoin(
    shst_failed_links_gdf,
    input_polygon_gdf[['NAME','geometry']],
    how = 'left',
    predicate = 'intersects'
)

In [15]:
# County names attached by the spatial join; NaN marks links that intersect no county polygon.
shst_failed_links_gdf['NAME'].unique()

array(['Carver', 'Dakota', 'Goodhue', nan, 'Polk', 'Wright', 'Washington',
       'Rice', 'Scott', 'Hennepin', 'Sherburne', 'Anoka', 'Isanti',
       'St. Croix', 'Ramsey', 'Chisago', 'Sibley', 'McLeod', 'Le Sueur',
       'Pierce'], dtype=object)

In [16]:
# Preview the key columns of the failed links together with the joined county name.
shst_failed_links_gdf[['A','B','link_id', 'geometry','NAME','geoId', 'roadway']]

Unnamed: 0,A,B,link_id,geometry,NAME,geoId,roadway
0,3181,136718,126,"LINESTRING (457432.560 4966007.165, 457432.963...",Carver,,tertiary
1,3208,3218,179,"LINESTRING (486780.485 4944095.268, 486780.881...",Dakota,cbd9b2da3888ca5ed49bf24379602b22,residential
2,3208,3213,180,"LINESTRING (486780.485 4944095.268, 486783.315...",Dakota,f20f27b848b5de5e335aaa1a3f22988a,residential
3,3208,3238,181,"LINESTRING (486780.485 4944095.268, 486771.304...",Dakota,822ed14e4356e4e46886b9dd4ed828ca,residential
4,3213,3238,191,"LINESTRING (486772.989 4944295.285, 486762.738...",Dakota,aa4faa2a8bc878ed5b3522ce216588a5,residential
...,...,...,...,...,...,...,...
15551,170173,81044,420982,"LINESTRING (440044.295 5015376.045, 440076.158...",Wright,342c3320b613ee978c9d2fa7b01f0dff,motorway
15552,170533,155559,421056,"LINESTRING (478579.400 4967651.199, 478502.625...",Hennepin,c955d90085855c3b78fa16a8bebf70a2,motorway
15553,170689,3182,421084,"LINESTRING (442026.980 5014003.096, 441666.718...",Wright,4d0815a7cb52ded0dfed8d750b56c067,motorway
15554,170938,30575,421141,"LINESTRING (482867.200 4967694.028, 482775.853...",Hennepin,1bf79b100ad1fd5ae7003e43f693e9ab,motorway


In [17]:
# Roadway types present among the failed links (lower-case labels).
shst_failed_links_gdf.roadway.unique()

array(['tertiary', 'residential', 'road', 'planned', 'rest', 'secondary',
       'motorway', 'primary', 'trunk', 'unclassified'], dtype=object)

In [18]:
# roadClass labels on the ranch shapes (capitalised); the tight match below
# compares them with the old links' `roadway` values case-insensitively.
roadway_network.shapes_df.roadClass.value_counts()

Residential     213862
Other           139931
Service          95014
Tertiary         68621
Secondary        18801
Primary           7277
Motorway          5820
Unclassified      3965
Trunk             3259
Name: roadClass, dtype: int64

## Match using nearest link

In [19]:
# List the attributes available on the ranch shapes.
roadway_network.shapes_df.columns

Index(['id', 'fromIntersectionId', 'toIntersectionId', 'forwardReferenceId',
       'backReferenceId', 'roadClass', 'metadata', 'geometry', 'source'],
      dtype='object')

In [20]:
# Give every ranch shape a county: shapes (id renamed to shstGeometryId) are
# left-joined to the distinct (shstGeometryId, county) pairs found on the links.
ranch_shape_gdf = pd.merge(
    left=roadway_network.shapes_df[['id', 'roadClass', 'geometry']].rename(columns=dict(id='shstGeometryId')),
    right=roadway_network.links_df[['shstGeometryId', 'county']].drop_duplicates(),
    on='shstGeometryId',
    how='left',
)

In [21]:
# Display the ranch shapes with their road class and county.
ranch_shape_gdf

Unnamed: 0,shstGeometryId,roadClass,geometry,county
0,72bf2bff013aa4e226045ec58c9d7b6c,Other,"LINESTRING (-93.18911 44.93969, -93.18958 44.9...",Ramsey
1,142f13a0c93978786b6021b934499302,Service,"LINESTRING (-93.18808 44.93956, -93.18958 44.9...",Ramsey
2,3c484f858736b95176b99b2add8bf128,Residential,"LINESTRING (-93.18806 44.93703, -93.18993 44.9...",Ramsey
3,5e885f48739a926e1feac24992e4cd9b,Residential,"LINESTRING (-93.18993 44.93747, -93.18993 44.9...",Ramsey
4,d64c53ac46a122059ae55f3453087ac4,Service,"LINESTRING (-93.18806 44.93747, -93.18993 44.9...",Ramsey
...,...,...,...,...
556545,959dc9ae8b104ed11c2fddbc6e1b37a7,Residential,"LINESTRING (-92.72710 44.71595, -92.72774 44.7...",Pierce
556546,e99d23aed94ca66e52c63d8bbd727bc4,Residential,"LINESTRING (-92.72421 44.71995, -92.72413 44.7...",Pierce
556547,8a4e0f59a0e8ebd97d67740d27913e08,Residential,"LINESTRING (-92.73143 44.74704, -92.72857 44.7...",Pierce
556548,6e208aed19d84f18ce2c09af6e31d6bc,Residential,"LINESTRING (-92.73143 44.74704, -92.73142 44.7...",Pierce


In [22]:
# Reproject the ranch shapes to the CRS of the failed links (metre-based,
# EPSG:26915 as shown earlier) so the snap distances computed below are in metres.

ranch_shape_gdf = ranch_shape_gdf.to_crs(shst_failed_links_gdf.crs)

#### first use tight tolerance

In [41]:
# Tight nearest match: for every (county, roadway type) group of unmatched old
# links, snap each link's representative point to the closest ranch shape that
# lies in the same county and whose roadClass contains the roadway type.
# If the county has no shape of that roadClass, all shapes of the county are
# used as candidates instead.

# half-width of the search box around each point, in CRS units
offset = 20

# Per-group results are collected here and concatenated once after the loop
# (growing a frame with pd.concat inside the loop copies it on every pass).
# The empty GeoDataFrame seed keeps the result defined when nothing matches.
tight_match_parts = [gpd.GeoDataFrame()]

# Links that fall outside every county polygon have NAME = NaN; they can never
# equal a county name, so they are skipped explicitly.
for county in shst_failed_links_gdf.NAME.dropna().unique():
    unmatched_subset_gdf = shst_failed_links_gdf[shst_failed_links_gdf.NAME == county].copy()

    for roadway in unmatched_subset_gdf.roadway.unique():

        print("county {}, roadway {}".format(county, roadway))

        # old links of this county / roadway type, reduced to one point each
        subset_gdf = unmatched_subset_gdf[(unmatched_subset_gdf.roadway == roadway)].copy()
        subset_gdf['geometry'] = subset_gdf['geometry'].representative_point()

        # search box around every point: (minx, miny, maxx, maxy) widened by offset
        bbox = subset_gdf.bounds + [-offset, -offset, offset, offset]

        # candidate ranch shapes: same county and matching road class
        shape = ranch_shape_gdf[
            (ranch_shape_gdf.roadClass.str.contains(roadway, case=False)) &
            (ranch_shape_gdf.county == county)
        ].copy()

        # fall back to every shape of the county when the road class is absent
        if len(shape) == 0:
            shape = ranch_shape_gdf[
                (ranch_shape_gdf.county == county)
            ].copy()

        if len(shape) == 0:
            print("\t ranch does not county {}, roadway {}".format(county, roadway))
            continue

        # ordinal positions (within `shape`) of the shapes whose bounding box
        # intersects each point's search box
        shape_sindex = shape.sindex
        hits = bbox.apply(lambda row: list(shape_sindex.intersection(row)),
                          axis = 1)

        candidates = (
            pd.DataFrame({
                # index of the point (old link) in subset_gdf
                "pt_idx": np.repeat(hits.index, hits.apply(len)),
                # ordinal position of the candidate shape in `shape`
                "shape_i": np.concatenate(hits.values)
            })
            # attach the old link attributes; its point geometry becomes "point"
            .set_index(["pt_idx"])
            .join(subset_gdf.rename(columns = {"geometry" : "point"}), how = "left")
            # attach the candidate shape; reset_index makes the shape index positional
            .set_index(["shape_i"])
            .join(shape.drop(columns = ['county']).reset_index(drop = True), how = "left")
        )

        # distance from each point to each of its candidate shapes
        candidates = gpd.GeoDataFrame(candidates, geometry = candidates["geometry"], crs = subset_gdf.crs)

        candidates["snap_distance"] = candidates.geometry.distance(gpd.GeoSeries(candidates.point))

        # keep, for every old link, the candidate with the smallest distance
        closest = (
            candidates
            .sort_values(by = ["snap_distance"])
            .groupby(["link_id"])
            .first()
            .reset_index()
        )

        tight_match_parts.append(closest)

tight_match_gdf = pd.concat(tight_match_parts,
                            sort = False,
                            ignore_index = True)

county Carver, roadway tertiary
county Carver, roadway residential
county Carver, roadway secondary
county Carver, roadway primary
county Carver, roadway motorway
county Carver, roadway unclassified
county Carver, roadway trunk
county Dakota, roadway residential
county Dakota, roadway road
county Dakota, roadway tertiary
county Dakota, roadway secondary
county Dakota, roadway trunk
county Dakota, roadway motorway
county Dakota, roadway unclassified
county Dakota, roadway primary
county Goodhue, roadway residential
county Goodhue, roadway tertiary
county Goodhue, roadway primary
county Goodhue, roadway trunk
county Goodhue, roadway secondary
county Goodhue, roadway unclassified
county Polk, roadway residential
county Polk, roadway tertiary
county Polk, roadway road
county Polk, roadway secondary
county Polk, roadway unclassified
county Polk, roadway primary
county Wright, roadway residential
county Wright, roadway tertiary
county Wright, roadway secondary
county Wright, roadway unclassi

In [42]:
# Spot check: the tight-match result for old link 383240.
tight_match_gdf[tight_match_gdf.link_id == 383240]

Unnamed: 0,link_id,geoId,shape_id,A,B,distance,roadway,name,drive,walk,...,access_MD,access_PM,access_NT,point,index_right,NAME,shstGeometryId,roadClass,geometry,snap_distance
7045,383240,a4c5c4b6cab484ac8c6c92dcb9844bda,abfbf2005a3045910fa475448a258ae7,10916,3800,0.05734,motorway,,1,0,...,no,no,no,POINT (468766.296 4967284.047),0.0,Hennepin,0664e8becfb36ee9afb2ebea3918e32d,Motorway,"LINESTRING (468827.678 4967352.156, 468826.068...",7.910785


#### then use relaxed match for failed ones

In [43]:
# Relaxed nearest match for the links the tight match left unmatched:
# candidate ranch shapes are restricted by county only (the roadway type is
# ignored), and each link's representative point is snapped to the closest one.

# half-width of the search box around each point, in CRS units
offset = 20

# Per-group results are collected here and concatenated once after the loop
# (growing a frame with pd.concat inside the loop copies it on every pass).
# The empty GeoDataFrame seed keeps the result defined when nothing matches.
relaxed_match_parts = [gpd.GeoDataFrame()]

# old links that did not receive a tight match
reduced_shst_failed_links_gdf = shst_failed_links_gdf[~shst_failed_links_gdf.link_id.isin(tight_match_gdf.link_id)]

# Links that fall outside every county polygon have NAME = NaN; they can never
# equal a county name, so they are skipped explicitly.
for county in reduced_shst_failed_links_gdf.NAME.dropna().unique():
    unmatched_subset_gdf = reduced_shst_failed_links_gdf[reduced_shst_failed_links_gdf.NAME == county].copy()

    # The candidate shapes depend on the county only, so they (and their
    # spatial index and join table) are prepared once per county rather than
    # once per roadway type.
    shape = ranch_shape_gdf[
        (ranch_shape_gdf.county == county)
    ].copy()
    shape_sindex = shape.sindex if len(shape) > 0 else None
    shape_attributes = shape.drop(columns = ['county']).reset_index(drop = True)

    for roadway in unmatched_subset_gdf.roadway.unique():

        print("county {}, roadway {}".format(county, roadway))

        # old links of this county / roadway type, reduced to one point each
        subset_gdf = unmatched_subset_gdf[(unmatched_subset_gdf.roadway == roadway)].copy()
        subset_gdf['geometry'] = subset_gdf['geometry'].representative_point()

        # search box around every point: (minx, miny, maxx, maxy) widened by offset
        bbox = subset_gdf.bounds + [-offset, -offset, offset, offset]

        if len(shape) == 0:
            print("\t ranch does not county {}, roadway {}".format(county, roadway))
            continue

        # ordinal positions (within `shape`) of the shapes whose bounding box
        # intersects each point's search box
        hits = bbox.apply(lambda row: list(shape_sindex.intersection(row)),
                          axis = 1)

        candidates = (
            pd.DataFrame({
                # index of the point (old link) in subset_gdf
                "pt_idx": np.repeat(hits.index, hits.apply(len)),
                # ordinal position of the candidate shape in `shape`
                "shape_i": np.concatenate(hits.values)
            })
            # attach the old link attributes; its point geometry becomes "point"
            .set_index(["pt_idx"])
            .join(subset_gdf.rename(columns = {"geometry" : "point"}), how = "left")
            # attach the candidate shape via its positional index
            .set_index(["shape_i"])
            .join(shape_attributes, how = "left")
        )

        # distance from each point to each of its candidate shapes
        candidates = gpd.GeoDataFrame(candidates, geometry = candidates["geometry"], crs = subset_gdf.crs)

        candidates["snap_distance"] = candidates.geometry.distance(gpd.GeoSeries(candidates.point))

        # keep, for every old link, the candidate with the smallest distance
        closest = (
            candidates
            .sort_values(by = ["snap_distance"])
            .groupby(["link_id"])
            .first()
            .reset_index()
        )

        relaxed_match_parts.append(closest)

relaxed_match_gdf = pd.concat(relaxed_match_parts,
                              sort = False,
                              ignore_index = True)

county Carver, roadway tertiary
county Carver, roadway residential
county Carver, roadway unclassified
county Goodhue, roadway residential
county Goodhue, roadway tertiary
county Goodhue, roadway trunk
county Goodhue, roadway unclassified
county Goodhue, roadway secondary
county Polk, roadway residential
county Polk, roadway secondary
county Polk, roadway unclassified
county Wright, roadway residential
county Wright, roadway tertiary
county Wright, roadway secondary
county Wright, roadway primary
county Rice, roadway tertiary
county Rice, roadway residential
county Rice, roadway secondary
county Hennepin, roadway tertiary
county Hennepin, roadway residential
county Hennepin, roadway secondary
county Hennepin, roadway unclassified
county Hennepin, roadway trunk
county Hennepin, roadway motorway
county Anoka, roadway residential
county Anoka, roadway tertiary
county Anoka, roadway primary
county Anoka, roadway unclassified
county Anoka, roadway secondary
county Isanti, roadway tertiary
c

In [44]:
# Display the relaxed-match results, including the snap_distance of each match.
relaxed_match_gdf

Unnamed: 0,link_id,geoId,shape_id,A,B,distance,roadway,name,drive,walk,...,access_MD,access_PM,access_NT,point,index_right,NAME,shstGeometryId,roadClass,geometry,snap_distance
0,126,,e4ae6ab8197f164d11fc4d548ed3652b,3181,136718,0.219109,tertiary,Great Plains Boulevard,1,1,...,,,,POINT (457472.450 4966185.588),7.0,Carver,46a97c1d3dcff493d6f3294ee93a1024,Other,"LINESTRING (457432.560 4966007.165, 457432.963...",1.408613e-09
1,125395,,55efddb4ec25f98b203d5c9594698afd,58313,79402,0.227081,tertiary,Orchard Road,1,1,...,,,,POINT (434855.772 4964522.992),7.0,Carver,de68f20ad9550b8b30f491cc56366994,Residential,"LINESTRING (434870.201 4964755.502, 434868.357...",3.506193e+00
2,164110,,4e38f99469a207d408a19aedc940bd30,75440,31993,0.270078,tertiary,Orchard Road,1,1,...,,,,POINT (434848.964 4964564.199),7.0,Carver,de68f20ad9550b8b30f491cc56366994,Residential,"LINESTRING (434870.201 4964755.502, 434868.357...",1.234421e+01
3,173157,35f4a4fae2f6b4f20e57b4920c4cb129,255a334f0550bb1fd45825846986803d,79402,58317,0.036706,tertiary,Orchard Road,1,1,...,,,,POINT (434861.561 4964755.556),7.0,Carver,02275e3b2bf790d9495ad0b8be12db3c,Residential,"LINESTRING (434492.906 4964757.450, 434601.117...",2.713648e-02
4,173158,35f4a4fae2f6b4f20e57b4920c4cb129,dfd9ddb174e4b04f620cd60462bf422c,79402,75440,0.037592,tertiary,94th Street,1,1,...,,,,POINT (434857.607 4964734.377),7.0,Carver,de68f20ad9550b8b30f491cc56366994,Residential,"LINESTRING (434870.201 4964755.502, 434868.357...",1.179554e+01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,213326,e1dbde729df7ddbde3630d7b64bd6ac1,601b763977f5fb0a3ecaff05eb6b4900,96568,96574,0.163466,residential,530th Avenue,1,1,...,,,,POINT (535208.911 4953263.746),9.0,Pierce,fe02808a07e6979b7c2434500b4a5182,Primary,"LINESTRING (535086.745 4953173.767, 535270.941...",4.048765e+01
665,213341,e1dbde729df7ddbde3630d7b64bd6ac1,998684513a5f8d36d21cbd553ef19d3e,96574,96568,0.163466,residential,530th Avenue,1,1,...,,,,POINT (535208.911 4953263.746),9.0,Pierce,fe02808a07e6979b7c2434500b4a5182,Primary,"LINESTRING (535086.745 4953173.767, 535270.941...",4.048765e+01
666,218806,,61bb3ebc3904db1808a32e29e1de4a99,98848,73443,0.539567,residential,"190th Street,130th Avenue",1,1,...,,,,POINT (561783.112 4936943.011),9.0,Pierce,08ac5c90c02ba66c0f3eaef4c0c0d067,Other,"LINESTRING (562229.625 4936950.061, 562110.159...",1.862157e-09
667,269962,5acda57a8d502486216468ad53e9b8ec,caeea9ce94be407b8ed97a4e61599f93,120833,120876,0.152991,residential,301st Avenue,1,1,...,,,,POINT (529622.991 4944123.686),9.0,Pierce,3ad9693fa97df6652ac0d80d900c1b13,Primary,"LINESTRING (529708.741 4944030.547, 529631.816...",4.802174e+01


## organize match outcome

In [45]:
# Stack the tight-match and relaxed-match results into a single table.
match_gdf = pd.concat(objs=[tight_match_gdf, relaxed_match_gdf], ignore_index=True, sort=False)

In [47]:
# Translate every matched shape into new-network link ids. The old-link side
# gets "_old" names, the new-link side "_new" names, and the two are left-joined
# on shstGeometryId, so an old link gets one row per new link on that geometry.
out_match_gdf = pd.merge(
    left=match_gdf[
        ['link_id', 'A', 'B', 'NAME', 'geoId', 'roadway', 'geometry', 'snap_distance', 'shstGeometryId']
    ].rename(columns=dict(link_id='model_link_id_old', NAME='county', A='A_old', B='B_old')),
    right=roadway_network.links_df[
        ['model_link_id', 'A', 'B', 'shstGeometryId', 'shstReferenceId']
    ].rename(columns=dict(model_link_id='model_link_id_new', A='A_new', B='B_new')),
    on="shstGeometryId",
    how='left',
)

In [52]:
# Number of distinct old links that received a match in this step.
out_match_gdf.model_link_id_old.nunique()

13466

In [48]:
# Display the matches; an old link appears once per new link sharing the matched shstGeometryId.
out_match_gdf

Unnamed: 0,model_link_id_old,A_old,B_old,county,geoId,roadway,geometry,snap_distance,shstGeometryId,model_link_id_new,A_new,B_new,shstReferenceId
0,190365,87088,131048,Carver,5d71e5bdddb801d7808004bcebfa3530,tertiary,"LINESTRING (420558.656 4980247.723, 420579.704...",9.129683e-10,037de3fe8189159ca2b94a98eb539d98,6439,63305,65512,02fc891611aaac18aef3652640908e71
1,190365,87088,131048,Carver,5d71e5bdddb801d7808004bcebfa3530,tertiary,"LINESTRING (420558.656 4980247.723, 420579.704...",9.129683e-10,037de3fe8189159ca2b94a98eb539d98,608375,65512,63305,1bdf5e679736f0a422530f4f57c18daa
2,222373,100245,136963,Carver,,tertiary,"LINESTRING (422572.941 4980999.285, 422571.916...",0.000000e+00,f8d56ca3cb68a17f30ddd8283c3cd4a4,229391,65398,69104,69e433096409bdda0b4b0ba6a44ed952
3,222373,100245,136963,Carver,,tertiary,"LINESTRING (422572.941 4980999.285, 422571.916...",0.000000e+00,f8d56ca3cb68a17f30ddd8283c3cd4a4,1017691,69104,65398,ec3756c4b75867a8560c277fe3779ff6
4,224395,101068,101072,Carver,9cdeaa2f98802337f6820fca84fe13cc,tertiary,"LINESTRING (427836.843 4976901.053, 427092.451...",0.000000e+00,d32278c27cfe8dc8ca33db1695d4a5f2,374013,61156,61157,acff5fd53c379cbfa5081bde220abd8a
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26134,218806,98848,73443,Pierce,,residential,"LINESTRING (562229.625 4936950.061, 562110.159...",1.862157e-09,08ac5c90c02ba66c0f3eaef4c0c0d067,629219,187467,183713,2666874e6a25e7997678a9d350637cf1
26135,269962,120833,120876,Pierce,5acda57a8d502486216468ad53e9b8ec,residential,"LINESTRING (529708.741 4944030.547, 529631.816...",4.802174e+01,3ad9693fa97df6652ac0d80d900c1b13,39318,189346,189354,1205275ab78ade71d950127d499acd10
26136,269962,120833,120876,Pierce,5acda57a8d502486216468ad53e9b8ec,residential,"LINESTRING (529708.741 4944030.547, 529631.816...",4.802174e+01,3ad9693fa97df6652ac0d80d900c1b13,900646,189354,189346,b056de4bda72c0d7670ad8ed2d1cd26d
26137,270058,120876,120833,Pierce,5acda57a8d502486216468ad53e9b8ec,residential,"LINESTRING (529708.741 4944030.547, 529631.816...",4.802174e+01,3ad9693fa97df6652ac0d80d900c1b13,39318,189346,189354,1205275ab78ade71d950127d499acd10


#### write out matched links using step 2

In [49]:
# Persist the step 2 matches as GeoJSON in the interim data directory.
out_match_gdf.to_file(os.path.join(interim_dir, "step2_matched.geojson"), driver = "GeoJSON")

#### write out unmatched links after step 2

In [50]:
# Failed links whose link_id does not appear among the matched old link ids
# are still unmatched; write them out as GeoJSON.
shst_failed_links_gdf[~(shst_failed_links_gdf.link_id.isin(out_match_gdf["model_link_id_old"].unique()))].to_file(
    os.path.join(interim_dir, 'old_model_links_failed_step2_nearest_match.geojson'), driver = "GeoJSON"
)

In [51]:
# (rows, columns) of the links that are still unmatched after step 2.
shst_failed_links_gdf[~(shst_failed_links_gdf.link_id.isin(out_match_gdf["model_link_id_old"].unique()))].shape

(2104, 44)

## add to the crosswalk

In [53]:
# existing crosswalk from step 1 (shst match), read from the interim data directory
crosswalk_df = pd.read_csv(os.path.join(interim_dir, 'model_network_id_crosswalk_step1_from_shst.csv'))

In [54]:
# Display the step 1 crosswalk between old and new link / node ids.
crosswalk_df

Unnamed: 0,shstReferenceId,A_old,B_old,model_link_id_old,shstGeometryId,model_link_id_new,A_new,B_new
0,c2f57cd96062b9cc5696feed9739f4b7,3153,10010,82,4182fb024a75046c971ec6b44529a336,937127.0,200714.0,36302.0
1,e9162d6a8e1d33d5a7a5cbdce7f23266,3153,3859,83,5ed091cb952ad1a07b0cc96d85242ece,504352.0,200714.0,200718.0
2,d592a065763a320947a181cb28f603ea,3153,165440,84,710762f57a26720c4076aec65a3953ae,462155.0,200714.0,289953.0
3,0c93d7c2521821337f48297b7f243a5e,3153,165440,84,aac137ff123bd4b1467ea3a532120764,27498.0,289953.0,34433.0
4,c75d2fc0c7de11abc01d9deb763f720e,3165,118345,102,b5e7ddb3e598d4966700af7a3721c72c,431150.0,259650.0,10191.0
...,...,...,...,...,...,...,...,...
463658,1b60035f47c097aeefea115da7eda642,170183,170181,420988,2a24b9bb9a9659bcc8c4cf2d27a6f53a,59543.0,257090.0,257088.0
463659,ece3f1efb5b15b12d3c5392554a85ea1,170183,74513,420989,4014d1bb3d7d527efdc99f266addf245,512486.0,257090.0,183970.0
463660,9ef969963ce79d46547a562b5b5165ad,170184,77840,420990,cbf18ef01ebfb998c121cd711d5917da,343666.0,257089.0,339119.0
463661,144014db4526a260bd4e116e0c5394c6,171166,58874,421172,a1d78cf6cb5d4ebedb0083a2f894e864,44128.0,361327.0,181049.0


In [55]:
# Append the step 2 matches to the step 1 crosswalk. Only the columns of
# out_match_gdf that also exist in the crosswalk are kept, so match-only
# columns (e.g. geometry, snap_distance, county) are dropped here.
augmented_crosswalk_df = pd.concat(
    [
        crosswalk_df,
        out_match_gdf[[c for c in out_match_gdf.columns if c in crosswalk_df.columns]]
    ],
    sort = False,
    ignore_index = True
)

In [56]:
# Cast the new-network id columns to integers (they arrive as floats from the
# step 1 CSV), then show the resulting column summary.
for new_id_column in ('model_link_id_new', 'A_new', 'B_new'):
    augmented_crosswalk_df[new_id_column] = augmented_crosswalk_df[new_id_column].astype(int)
augmented_crosswalk_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 489802 entries, 0 to 489801
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   shstReferenceId    489802 non-null  object
 1   A_old              489802 non-null  int64 
 2   B_old              489802 non-null  int64 
 3   model_link_id_old  489802 non-null  int64 
 4   shstGeometryId     489802 non-null  object
 5   model_link_id_new  489802 non-null  int32 
 6   A_new              489802 non-null  int32 
 7   B_new              489802 non-null  int32 
dtypes: int32(3), int64(3), object(2)
memory usage: 24.3+ MB


In [57]:
# Display the augmented crosswalk (step 1 rows followed by the step 2 matches).
augmented_crosswalk_df

Unnamed: 0,shstReferenceId,A_old,B_old,model_link_id_old,shstGeometryId,model_link_id_new,A_new,B_new
0,c2f57cd96062b9cc5696feed9739f4b7,3153,10010,82,4182fb024a75046c971ec6b44529a336,937127,200714,36302
1,e9162d6a8e1d33d5a7a5cbdce7f23266,3153,3859,83,5ed091cb952ad1a07b0cc96d85242ece,504352,200714,200718
2,d592a065763a320947a181cb28f603ea,3153,165440,84,710762f57a26720c4076aec65a3953ae,462155,200714,289953
3,0c93d7c2521821337f48297b7f243a5e,3153,165440,84,aac137ff123bd4b1467ea3a532120764,27498,289953,34433
4,c75d2fc0c7de11abc01d9deb763f720e,3165,118345,102,b5e7ddb3e598d4966700af7a3721c72c,431150,259650,10191
...,...,...,...,...,...,...,...,...
489797,2666874e6a25e7997678a9d350637cf1,98848,73443,218806,08ac5c90c02ba66c0f3eaef4c0c0d067,629219,187467,183713
489798,1205275ab78ade71d950127d499acd10,120833,120876,269962,3ad9693fa97df6652ac0d80d900c1b13,39318,189346,189354
489799,b056de4bda72c0d7670ad8ed2d1cd26d,120833,120876,269962,3ad9693fa97df6652ac0d80d900c1b13,900646,189354,189346
489800,1205275ab78ade71d950127d499acd10,120876,120833,270058,3ad9693fa97df6652ac0d80d900c1b13,39318,189346,189354


In [58]:
# new crosswalk: the step 1 crosswalk plus the step 2 nearest matches,
# written to the interim data directory without the index column
augmented_crosswalk_df.to_csv(os.path.join(interim_dir, 'model_network_id_crosswalk_step2_augmented.csv'),index=False)

In [59]:
# Spot check: crosswalk rows for old link 383240 (the link inspected after the tight match).
augmented_crosswalk_df[augmented_crosswalk_df.model_link_id_old == 383240]

Unnamed: 0,shstReferenceId,A_old,B_old,model_link_id_old,shstGeometryId,model_link_id_new,A_new,B_new
477422,97e7aa2b0b940c791f3cdcf0011abb27,10916,3800,383240,0664e8becfb36ee9afb2ebea3918e32d,328353,369952,201278
