# lion_01_data_wrangling_with_shst

This notebook is for creating a lookup table for connecting Sharedstreets segments and LION segments

## 0. Making small batches

Because the processed lion dataset is too big to apply Sharedstreet API in the local machine, we will divide the dataset into small batches to solve the problem. Each batch will contain less than 60000 lion segments.

In [1]:
import geopandas as gpd
import pandas as pd

In [2]:
# make sure that you run the 'lion_00_data_wrangling_&_filtration' notebook to get the 'lion_filtered.shp'
# import the sharedstreet segments
gdf_lion = gpd.read_file('../data/cleaned_data/lion_filtered/lion_filtered.shp')

In [3]:
# drop unnecessary columns
gdf_lion_simplified = gdf_lion[['objectid','segment_id','geometry']]

In [4]:
# change datatype of 'objectid'
gdf_lion_simplified['objectid'] = gdf_lion_simplified['objectid'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdf_lion_simplified['objectid'] = gdf_lion_simplified['objectid'].astype(int)


In [5]:
gdf_lion_simplified.head()

Unnamed: 0,objectid,segment_id,geometry
0,1,78126,"LINESTRING (-73.90347 40.83035, -73.90238 40.8..."
1,2,79796,"LINESTRING (-73.90120 40.86661, -73.90207 40.8..."
2,3,77356,"LINESTRING (-73.90118 40.82439, -73.90051 40.8..."
3,5,73490,"LINESTRING (-73.90696 40.89361, -73.90696 40.8..."
4,6,174633,"LINESTRING (-73.90707 40.89927, -73.90712 40.8..."


In [6]:
# split the dataset into small batches
gdf_lion_simplified_0_60000 = gdf_lion_simplified.iloc[:60000]
gdf_lion_simplified_60000_120000 = gdf_lion_simplified.iloc[60000:120000]
gdf_lion_simplified_120000_ = gdf_lion_simplified.iloc[120000:]

In [7]:
# export the small batches
gdf_lion_simplified_0_60000.to_file('../data/sharedstreets_results/lion/before_applying/lion_street_filtered_0_60000.geojson', driver='GeoJSON')
gdf_lion_simplified_60000_120000.to_file('../data/sharedstreets_results/lion/before_applying/lion_street_filtered_60000_120000.geojson', driver='GeoJSON')
gdf_lion_simplified_120000_.to_file('../data/sharedstreets_results/lion/before_applying/lion_street_filtered_120000_.geojson', driver='GeoJSON')

## 1. Processing with SharedStreet API

Please check 'how_to_use_sharedstreets_api' document. We will use search-radius=10m to map lion segments on the Sharedstreet geometry.

## 2. Create Sharedstreets - LION lookup table 

In [8]:
# import matched lion geojson files
gdf_lion_matched_1 = gpd.read_file('../data/sharedstreets_results/lion/radius10/lion_street_filtered_0_60000.matched.geojson')
gdf_lion_matched_2 = gpd.read_file('../data/sharedstreets_results/lion/radius10/lion_street_filtered_60000_120000.matched.geojson')
gdf_lion_matched_3 = gpd.read_file('../data/sharedstreets_results/lion/radius10/lion_street_filtered_120000_.matched.geojson')

In [9]:
# merge the matched lion geodataframes
gdf_lion_matched = pd.concat([gdf_lion_matched_1,gdf_lion_matched_2,gdf_lion_matched_3], axis=0, ignore_index=True)

In [10]:
gdf_lion_matched.head(5)

Unnamed: 0,shstReferenceId,shstGeometryId,shstFromIntersectionId,shstToIntersectionId,referenceLength,gisReferenceId,gisGeometryId,gisTotalSegments,gisSegmentIndex,gisFromIntersectionId,gisToIntersectionId,startSideOfStreet,endSideOfStreet,sideOfStreet,score,matchType,pp_objectid,pp_segment_id,geometry
0,d4e6f420ffc4636cf17051f0e9f6120a,a5c5f21851e81c507b141aecc2c6235e,bd6c80b61b92580a0312999f9ccba3bc,1b0e08342f1185f0aef62f19564a5101,132.47,7109a952974a536de71e0f1de86e2789,b3f402083c371b3e696e860ae241c169,1,1,2884568468e16e1660364ee1d3258e96,6045bb5dac9fc1493586bb9b79ba0948,left,right,unknown,3.94,hmm,1,78126,"LINESTRING (-73.90345 40.83034, -73.90233 40.8..."
1,f3b73d2b6332e52694779f4f3b14e300,a5c5f21851e81c507b141aecc2c6235e,1b0e08342f1185f0aef62f19564a5101,bd6c80b61b92580a0312999f9ccba3bc,132.47,df65652fc6fd576a1857c5596d205321,b3f402083c371b3e696e860ae241c169,1,1,6045bb5dac9fc1493586bb9b79ba0948,2884568468e16e1660364ee1d3258e96,left,right,unknown,3.94,hmm,1,78126,"LINESTRING (-73.90233 40.82970, -73.90345 40.8..."
2,15e9092333bd832a0cd0a4ae937b208c,8486febe149b7fbeba3b499188f5bc2a,e55a0b052bd17c485e92e3df182ec0ae,368c367d565d1c6c7c9a740c8194e0d4,84.94,d257eaabfe83d5e8749f8ca380b77da3,f8c936cc5c7b68f79d749a515c86ad72,1,1,d85303d7629a8f4c470648585a2c4449,4d306ac0bdf0417e26008b863be2eb0d,left,right,unknown,0.49,hmm,2,79796,"LINESTRING (-73.90120 40.86662, -73.90208 40.8..."
3,af11ffc0c5f396bafdb166cd537ab5d6,8486febe149b7fbeba3b499188f5bc2a,368c367d565d1c6c7c9a740c8194e0d4,e55a0b052bd17c485e92e3df182ec0ae,84.94,4e8501efddb4fbd676f0cd22f7b6fc99,f8c936cc5c7b68f79d749a515c86ad72,1,1,4d306ac0bdf0417e26008b863be2eb0d,d85303d7629a8f4c470648585a2c4449,left,right,unknown,0.49,hmm,2,79796,"LINESTRING (-73.90208 40.86700, -73.90120 40.8..."
4,13d62bc345a7eafeac31e3543049afca,300959ea23f9820fbe692a4a89a67396,f84ed4a470a558d3cda29402640076fa,7fc9495699f9d85b0c433544768b08e0,188.83,f77f4a297c4a5fd4523568a390eb4833,300959ea23f9820fbe692a4a89a67396,1,1,6b6630c99af2f236cba8fb24c4e7ef0c,2d09df71d6a739536b7cb8e20f73be50,left,right,unknown,0.52,hmm,3,77356,"LINESTRING (-73.90118 40.82439, -73.90051 40.8..."


In [11]:
# drop unnecessary columns
gdf_lion_matched = gdf_lion_matched[['shstGeometryId','pp_objectid','pp_segment_id','geometry']]
gdf_lion_matched_keys = gdf_lion_matched[['shstGeometryId','pp_objectid','pp_segment_id']]

In [12]:
# drop duplicates
gdf_lion_matched_keys = gdf_lion_matched_keys.drop_duplicates()

In [13]:
gdf_lion_matched_keys.head()

Unnamed: 0,shstGeometryId,pp_objectid,pp_segment_id
0,a5c5f21851e81c507b141aecc2c6235e,1,78126
2,8486febe149b7fbeba3b499188f5bc2a,2,79796
4,300959ea23f9820fbe692a4a89a67396,3,77356
5,d39717c7db5c64930d589e45414b839e,5,73490
7,8af91954050b3b92b77dccfa206cfc19,6,174633


In [14]:
# export the table
gdf_lion_matched_keys.to_csv('shst_lion_lookuptable.csv', index=False)