## Filtering Attributes in the LION Data Set 
### This notebook prepares attributes such as roadway type, speed and number of lanes to be joined later to the work zones data to enhace it and prepare it for clustering. 

In [1]:
import numpy
import pandas as pd
import geopandas as gpd
import numpy as np

from fiona.crs import from_epsg
import shapely
from shapely.geometry import Point, Polygon

import re 

import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

pylab.rcParams['figure.figsize'] = 30,30
pylab.rcParams['figure.dpi'] = 100
pd.set_option("display.max_columns", 100)

### Importing LION - LION Release 20-A was used

In [2]:
# data can be dowloaded from: https://www1.nyc.gov/site/planning/data-maps/open-data/bytes-archive.page?sorts[year]=0
# select lion from the drop down data menu
# select LION 20A
# the geodatabase was then saved as a shapefile using GIS software
lion_s = gpd.GeoDataFrame.from_file('../data/NYC LION/LION_street/Lion_20A/lion_20A.shp')

In [3]:
type(lion_s.SegmentID.iloc[0])

str

In [4]:
lion_s[lion_s.SegmentID.astype(int) == 65969]

Unnamed: 0,OBJECTID,Street,SAFStreetN,FeatureTyp,SegmentTyp,IncExFlag,RB_Layer,NonPed,TrafDir,TrafSrc,SpecAddr,FaceCode,SeqNum,StreetCode,SAFStreetC,LGC1,LGC2,LGC3,LGC4,LGC5,LGC6,LGC7,LGC8,LGC9,BOE_LGC,SegmentID,SegCount,LocStatus,LZip,RZip,LBoro,RBoro,L_CD,R_CD,LATOMICPOL,RATOMICPOL,LCT2010,LCT2010Suf,RCT2010,RCT2010Suf,LCB2010,LCB2010Suf,RCB2010,RCB2010Suf,LCT2000,LCT2000Suf,RCT2000,RCT2000Suf,LCB2000,LCB2000Suf,...,YFrom,XTo,YTo,ArcCenterX,ArcCenterY,CurveFlag,Radius,NodeIDFrom,NodeIDTo,NodeLevelF,NodeLevelT,ConParity,Twisted,RW_TYPE,PhysicalID,GenericID,NYPDID,FDNYID,LBlockFace,RBlockFace,LegacyID,Status,StreetWidt,StreetWi_1,StreetWi_2,BikeLane,BIKE_TRAFD,ACTIVE_FLA,POSTED_SPE,Snow_Prior,Number_Tra,Number_Par,Number_Tot,Carto_Disp,FCC,ROW_Type,LLo_Hyphen,LHi_Hyphen,RLo_Hyphen,RHi_Hyphen,FromLeft,ToLeft,FromRight,ToRight,Join_ID,L_PD_Servi,R_PD_Servi,TRUCK_ROUT,SHAPE_Leng,geometry


In [5]:
lion_s.head()

Unnamed: 0,OBJECTID,Street,SAFStreetN,FeatureTyp,SegmentTyp,IncExFlag,RB_Layer,NonPed,TrafDir,TrafSrc,SpecAddr,FaceCode,SeqNum,StreetCode,SAFStreetC,LGC1,LGC2,LGC3,LGC4,LGC5,LGC6,LGC7,LGC8,LGC9,BOE_LGC,SegmentID,SegCount,LocStatus,LZip,RZip,LBoro,RBoro,L_CD,R_CD,LATOMICPOL,RATOMICPOL,LCT2010,LCT2010Suf,RCT2010,RCT2010Suf,LCB2010,LCB2010Suf,RCB2010,RCB2010Suf,LCT2000,LCT2000Suf,RCT2000,RCT2000Suf,LCB2000,LCB2000Suf,...,YFrom,XTo,YTo,ArcCenterX,ArcCenterY,CurveFlag,Radius,NodeIDFrom,NodeIDTo,NodeLevelF,NodeLevelT,ConParity,Twisted,RW_TYPE,PhysicalID,GenericID,NYPDID,FDNYID,LBlockFace,RBlockFace,LegacyID,Status,StreetWidt,StreetWi_1,StreetWi_2,BikeLane,BIKE_TRAFD,ACTIVE_FLA,POSTED_SPE,Snow_Prior,Number_Tra,Number_Par,Number_Tot,Carto_Disp,FCC,ROW_Type,LLo_Hyphen,LHi_Hyphen,RLo_Hyphen,RHi_Hyphen,FromLeft,ToLeft,FromRight,ToRight,Join_ID,L_PD_Servi,R_PD_Servi,TRUCK_ROUT,SHAPE_Leng,geometry
0,1.0,EAST 168 STREET,,0,U,,B,,T,DOT,,2510,3070,226700,,1,,,,,,,,,1,78126,1,X,10456,10456,2.0,2.0,203,203,402,101,149,,185,,3001,,2000,,149,,137,,4000,,...,241812,1011265,241555,0,0,,0,47740,9045677,M,M,,,1,35231.0,30694.0,,,1422600653,1422602017,78126,2,34.0,34.0,,,,,25.0,S,2,2,4,,,,599.0,699.0,596.0,716.0,599,699,596,716,2251001000000,,,,396.030947,"LINESTRING (-73.90347 40.83035, -73.90238 40.8..."
1,2.0,WEST 192 STREET,,0,U,,B,,A,DOT,,7984,40,274810,,1,,,,,,,,,1,79796,1,,10468,10468,2.0,2.0,207,207,302,104,265,,265,,2000,,1004,,265,,265,,3001,,...,255024,1011335,255164,0,0,,0,48679,48678,M,M,,,1,35248.0,30711.0,,,1522607129,1522607721,79796,2,30.0,30.0,,,,,25.0,S,1,2,3,,,,58.0,98.0,63.0,99.0,58,98,63,99,2798401000000,,,,279.360514,"LINESTRING (-73.90120 40.86661, -73.90207 40.8..."
2,3.0,UNION AVENUE,,0,U,,B,,W,DOT,,7280,130,270420,,1,,,,,,,,,1,77356,2,X,10459,10459,2.0,2.0,203,203,402,401,135,,131,,2000,,3006,,135,,131,,4000,,...,239640,1011786,240230,0,0,,0,47288,47822,M,M,,,1,35252.0,30715.0,,,1422603726,1422604132,77356,2,34.0,34.0,,,,,25.0,S,1,2,3,,,,1017.0,1079.0,1016.0,1084.0,1017,1079,1016,1084,2728001000000,,,,618.327133,"LINESTRING (-73.90118 40.82439, -73.90051 40.8..."
3,4.0,UNION AVENUE,BEHAGEN PLAYGROUND,0,U,,B,,W,DOT,N,7280,130,270420,212795.0,1,,,,,,,,,1,77356,2,X,10459,10459,2.0,2.0,203,203,402,401,135,,131,,2000,,3006,,135,,131,,4000,,...,239640,1011786,240230,0,0,,0,47288,47822,M,M,,,1,35252.0,30715.0,,,1422603726,1422604132,77356,2,34.0,34.0,,,,,25.0,S,1,2,3,,,,,,,,0,0,0,0,21279501000000N,,,,618.327133,"LINESTRING (-73.90118 40.82439, -73.90051 40.8..."
4,5.0,DELAFIELD AVENUE,,6,U,,B,,T,DOT,,1876,1020,224120,,1,,,,,,,,,1,73490,1,,10471,10471,2.0,2.0,208,208,119,123,335,,335,,1004,,1008,,333,,333,,1009,,...,264857,1009974,265527,0,0,,0,45034,45038,M,M,,,1,35275.0,30738.0,,,1522605283,1522610703,73490,2,30.0,30.0,,,,,,V,1,2,3,,,,4601.0,4645.0,4600.0,4664.0,4601,4645,4600,4664,2187601000000,,,,670.281037,"LINESTRING (-73.90696 40.89361, -73.90696 40.8..."


In [6]:
lion_s.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 228248 entries, 0 to 228247
Columns: 122 entries, OBJECTID to geometry
dtypes: float64(8), geometry(1), int64(11), object(102)
memory usage: 212.5+ MB


In [7]:
lion_s.shape

(228248, 122)

### Data Cleaning

In [8]:
print("lion_s Shape: {}\n\nNull Values in each column\n".format(lion_s.shape))
for col in lion_s.columns:
    print(col, '\t\t', lion_s[col].isnull().sum())

lion_s Shape: (228248, 122)

Null Values in each column

OBJECTID 		 0
Street 		 0
SAFStreetN 		 211093
FeatureTyp 		 0
SegmentTyp 		 0
IncExFlag 		 223389
RB_Layer 		 0
NonPed 		 196061
TrafDir 		 34693
TrafSrc 		 44543
SpecAddr 		 211093
FaceCode 		 0
SeqNum 		 0
StreetCode 		 0
SAFStreetC 		 211093
LGC1 		 0
LGC2 		 162139
LGC3 		 216197
LGC4 		 222435
LGC5 		 227751
LGC6 		 227915
LGC7 		 228243
LGC8 		 228245
LGC9 		 228248
BOE_LGC 		 0
SegmentID 		 0
SegCount 		 0
LocStatus 		 174675
LZip 		 9536
RZip 		 9036
LBoro 		 1408
RBoro 		 883
L_CD 		 1409
R_CD 		 884
LATOMICPOL 		 1409
RATOMICPOL 		 884
LCT2010 		 1409
LCT2010Suf 		 177011
RCT2010 		 884
RCT2010Suf 		 176871
LCB2010 		 1409
LCB2010Suf 		 228248
RCB2010 		 884
RCB2010Suf 		 228248
LCT2000 		 1409
LCT2000Suf 		 183749
RCT2000 		 884
RCT2000Suf 		 183745
LCB2000 		 1409
LCB2000Suf 		 205444
RCB2000 		 884
RCB2000Suf 		 205520
LCT1990 		 1409
LCT1990Suf 		 180756
RCT1990 		 884
RCT1990Suf 		 180630
LAssmDist 		 1429
LElectD

In [9]:
# define a function that puts underscore before capital letters and convert to lower case
def convert(name):
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()

# removing present underscores in some columns
tmp = lion_s.columns.str.replace('_', '')

# mapping the convert function on column names
tmp = list(map(convert, lion_s.columns))

tmp = [name.replace('__', '_') for name in tmp]
tmp = [name.replace('saf_street_n', 'saf_street_name') for name in tmp]
tmp = [name.replace('saf_street_c', 'saf_street_code') for name in tmp]

for name in tmp:
    print(name)

objectid
street
saf_street_name
feature_typ
segment_typ
inc_ex_flag
rb_layer
non_ped
traf_dir
traf_src
spec_addr
face_code
seq_num
street_code
saf_street_code
lgc1
lgc2
lgc3
lgc4
lgc5
lgc6
lgc7
lgc8
lgc9
boe_lgc
segment_id
seg_count
loc_status
l_zip
r_zip
l_boro
r_boro
l_cd
r_cd
latomicpol
ratomicpol
lct2010
lct2010_suf
rct2010
rct2010_suf
lcb2010
lcb2010_suf
rcb2010
rcb2010_suf
lct2000
lct2000_suf
rct2000
rct2000_suf
lcb2000
lcb2000_suf
rcb2000
rcb2000_suf
lct1990
lct1990_suf
rct1990
rct1990_suf
l_assm_dist
l_elect_dist
r_assm_dist
r_elect_dist
split_elect
l_schl_dist
r_schl_dist
split_schl
l_sub_sect
r_sub_sect
san_dist_ind
map_from
map_to
boro_bndry
mh_ri_flag
x_from
y_from
x_to
y_to
arc_center_x
arc_center_y
curve_flag
radius
node_id_from
node_id_to
node_level_f
node_level_t
con_parity
twisted
rw_type
physical_id
generic_id
nypdid
fdnyid
l_block_face
r_block_face
legacy_id
status
street_widt
street_wi_1
street_wi_2
bike_lane
bike_trafd
active_fla
posted_spe
snow_prior
number_tra
nu

In [10]:
lion_s.columns = tmp

In [11]:
lion_s.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [12]:
# lion_s.to_file('../data/cleaned_data/lion_segments.shp')

### Filtering out attributes that we do not need for our analysis.
### This was based on recommendations from previous work our sponsor has done

In [13]:
lion_s.feature_typ.value_counts().sort_index()

0    178187
1      8331
2      9237
3     11164
5       286
6      9381
7      3008
8      1642
9      1025
A      4005
C        50
F       934
W       998
Name: feature_typ, dtype: int64

In [14]:
# selecting feature type 0: street other than vehicle only street & 6: Private Street
feature_type = ['0', '6']
lion_f = lion_s[lion_s.feature_typ.isin(feature_type)]
print('before:', lion_s.shape, '\nafter:', lion_f.shape, '\nlost:', lion_s.shape[0] - lion_f.shape[0])

before: (228248, 122) 
after: (187568, 122) 
lost: 40680


In [15]:
lion_s.segment_typ.value_counts().sort_index()

B       596
C       836
E      5554
F      3356
G     12972
R     27845
S       976
T      1851
U    174262
Name: segment_typ, dtype: int64

In [17]:
# selecting segment type
# B: both (generic and roadbed (odd number of roadways))
# C: connector (connect adjacent roadbeds of a divided street)
# E: entrance/exit ramp (connects a highway to a different street or highway)
# R: roadbed segment (depicts physically separated carriageway segments of a particular street)
# T: terminator (a divided section of a street terminates, but the street itself continues)
# U: undivided street (all other LION segments that do not fall into any of the above categories)
# S: suppressed (undivided segment to be suppressed in a generic view of LION)
segment_type = ['B', 'C', 'E', 'R', 'T', 'U', 'S']
lion_g = lion_f[lion_f.segment_typ.isin(segment_type)]
print('before:', lion_f.shape, '\nafter:', lion_g.shape, '\nlost:', lion_f.shape[0] - lion_g.shape[0])

before: (187568, 122) 
after: (171319, 122) 
lost: 16249


In [18]:
lion_s.rw_type.value_counts().sort_index()

1     152393
10      4007
12      4135
13       552
14       934
2      14354
3       3253
4        359
5        198
6       7404
7        275
8       1165
9       5837
Name: rw_type, dtype: int64

In [22]:
# selecting roadway types
# 1: street
# 2: Highway
# 3: Bridge
# 4: Tunnel
# 8: Driveway
# 9: Ramp
# 10: Alley
# 11: Unknown
# 13: U-turn

roadway_type = ['1', '2', '3', '4', '8', '9', '10', '11', '13']
lion_r = lion_g[lion_g.rw_type.isin(roadway_type)]
print('before:', lion_g.shape, '\nafter:', lion_r.shape, '\nlost:', lion_g.shape[0] - lion_r.shape[0])

before: (171319, 122) 
after: (164386, 122) 
lost: 6933


In [24]:
lion_r.rw_type.value_counts().sort_index()

1     143935
10         2
13       552
2       9865
3       2877
4        261
8       1131
9       5763
Name: rw_type, dtype: int64

In [30]:
lion_r.rw_type.isnull().sum()

0

In [31]:
lion_s.traf_dir.value_counts().sort_index()

A    40288
P    11529
T    96655
W    45083
Name: traf_dir, dtype: int64

In [32]:
lion_s.traf_dir.unique()

array(['T', 'A', 'W', 'P', None], dtype=object)

In [33]:
len(lion_s.traf_dir.isnull())

228248

In [34]:
# selecting traffic
# 'W': With: One-way street, traffic flows with the segment's directionality
# 'A': Against: One-way street, traffic flows from against the segment's directionality
# 'T': Two-Way: Traffic flows in both directions

traffic_direction = ['W', 'A', 'T']
lion_t = lion_r[lion_r.traf_dir.isin(traffic_direction)]
print('before:', lion_r.shape, '\nafter:', lion_t.shape, '\nlost:', lion_r.shape[0] - lion_t.shape[0])

before: (164386, 122) 
after: (161942, 122) 
lost: 2444


In [35]:
print('rows lost from all the filtration:', lion_s.shape[0] - lion_t.shape[0])

rows lost from all the filtration: 66306


In [36]:
print('number of duplicate segment ids: ', lion_t.shape[0] - lion_t.drop_duplicates(subset='segment_id').shape[0])

number of duplicate segment ids:  17474


In [37]:
lion_t.saf_street_name.unique()

array([None, 'BEHAGEN PLAYGROUND', 'DREW PLAYGROUND', ...,
       'DEBBIE STREET', 'NORBERT LEESEBERG PERENNIAL GDN',
       'GRANITEVILLE SWAMP PARK'], dtype=object)

In [38]:
# rows that have a special address place name
lion_t[~lion_t.saf_street_name.isnull()].shape

(15913, 122)

In [39]:
# selecting rows that do not have a special address place name
lion_d = lion_t[lion_t.saf_street_name.isnull()]
print('before:', lion_t.shape, '\nafter:', lion_d.shape, '\nlost:', lion_t.shape[0] - lion_d.shape[0])

before: (161942, 122) 
after: (146029, 122) 
lost: 15913


In [40]:
print('number of duplicate segment ids: ', lion_d.shape[0] - lion_d.drop_duplicates(subset='segment_id').shape[0])

number of duplicate segment ids:  1561


In [41]:
# dropping identical line segments i.e. having the same 2d position and elevation 
lion_i = lion_d.drop_duplicates(subset=['x_from', 'y_from', 'x_to', 'y_to', 'node_level_f', 'node_level_t'])
print('before:', lion_d.shape, '\nafter:', lion_i.shape, '\nlost:', lion_d.shape[0] - lion_i.shape[0])

before: (146029, 122) 
after: (144441, 122) 
lost: 1588


In [42]:
print('number of duplicate segment ids: ', lion_i[lion_i.duplicated(subset='segment_id')].shape[0])

number of duplicate segment ids:  14


In [43]:
lion_i = lion_i.drop_duplicates(subset='segment_id')
lion_i.shape

(144427, 122)

In [44]:
print('number of duplicate segment ids: ', lion_i[lion_i.duplicated(subset='segment_id')].shape[0])

number of duplicate segment ids:  0


In [31]:
# lion_i.to_file('../data/cleaned_data/lion_filtered.geojson', driver='GeoJSON')

In [46]:
lion_i.to_file('../data/cleaned_data/lion_filtered.shp')

In [47]:
# lion_i.to_file('../data/cleaned_data/lion_filtered.csv', index=False)

In [33]:
# lion_i.plot()

In [48]:
# subsetting interesting columns
lion_sub = lion_i[['segment_id', 'feature_typ', 'segment_typ', 'rw_type', 'traf_dir', 'loc_status', 'curve_flag', 'radius', 'street_widt', 'street_wi_1', 'bike_lane', 'bike_trafd', 'number_tra', 'number_par', 'number_tot', 'posted_spe', 'snow_prior', 'geometry']]
lion_sub.head(3)

Unnamed: 0,segment_id,feature_typ,segment_typ,rw_type,traf_dir,loc_status,curve_flag,radius,street_widt,street_wi_1,bike_lane,bike_trafd,number_tra,number_par,number_tot,posted_spe,snow_prior,geometry
0,78126,0,U,1,T,X,,0,34.0,34.0,,,2,2,4,25,S,"LINESTRING (-73.90347 40.83035, -73.90238 40.8..."
1,79796,0,U,1,A,,,0,30.0,30.0,,,1,2,3,25,S,"LINESTRING (-73.90120 40.86661, -73.90207 40.8..."
2,77356,0,U,1,W,X,,0,34.0,34.0,,,1,2,3,25,S,"LINESTRING (-73.90118 40.82439, -73.90051 40.8..."


In [49]:
# renaming columns
lion_sub.columns = ['segment_id', 'feat_typ', 'seg_typ', 'rw_type', 'traf_dir', 'loc_status', 'curve_flag', 'radius', 'st_wid_min', 'st_wid_max', 'bike_lane', 'b_traf_dir', 'n_trvl_ln', 'n_park_ln', 'n_total_ln', 'posted_spd', 'snow_prio', 'geometry']
lion_sub.head(2)

Unnamed: 0,segment_id,feat_typ,seg_typ,rw_type,traf_dir,loc_status,curve_flag,radius,st_wid_min,st_wid_max,bike_lane,b_traf_dir,n_trvl_ln,n_park_ln,n_total_ln,posted_spd,snow_prio,geometry
0,78126,0,U,1,T,X,,0,34.0,34.0,,,2,2,4,25,S,"LINESTRING (-73.90347 40.83035, -73.90238 40.8..."
1,79796,0,U,1,A,,,0,30.0,30.0,,,1,2,3,25,S,"LINESTRING (-73.90120 40.86661, -73.90207 40.8..."


In [50]:
# lion_sub.to_file('../data/cleaned_data/lion_filtered_sub.csv', index=False)

In [46]:
lion_sub.to_file('../data/cleaned_data/lion_filtered_sub.shp')

In [52]:
lion_s[lion_s['segment_id'] == '9017006']

Unnamed: 0,objectid,street,saf_street_name,feature_typ,segment_typ,inc_ex_flag,rb_layer,non_ped,traf_dir,traf_src,spec_addr,face_code,seq_num,street_code,saf_street_code,lgc1,lgc2,lgc3,lgc4,lgc5,lgc6,lgc7,lgc8,lgc9,boe_lgc,segment_id,seg_count,loc_status,l_zip,r_zip,l_boro,r_boro,l_cd,r_cd,latomicpol,ratomicpol,lct2010,lct2010_suf,rct2010,rct2010_suf,lcb2010,lcb2010_suf,rcb2010,rcb2010_suf,lct2000,lct2000_suf,rct2000,rct2000_suf,lcb2000,lcb2000_suf,...,y_from,x_to,y_to,arc_center_x,arc_center_y,curve_flag,radius,node_id_from,node_id_to,node_level_f,node_level_t,con_parity,twisted,rw_type,physical_id,generic_id,nypdid,fdnyid,l_block_face,r_block_face,legacy_id,status,street_widt,street_wi_1,street_wi_2,bike_lane,bike_trafd,active_fla,posted_spe,snow_prior,number_tra,number_par,number_tot,carto_disp,fcc,row_type,l_lo_hyphen,l_hi_hyphen,r_lo_hyphen,r_hi_hyphen,from_left,to_left,from_right,to_right,join_id,l_pd_servi,r_pd_servi,truck_rout,shape_leng,geometry
105192,105193.0,EAST 25 STREET,,W,U,,B,,P,DOT,,1470,38,117490,,1,2,,,,,,,,1,9017006,1,,10010,10010,1.0,1.0,106,106,103,104,68,,68,,5000,,5001,,68,,68,,1003,,...,209035,989179,208921,0,0,,0,9011613,21001,M,M,,,1,16775.0,14191.0,,,,,33183,2,,,,,,,,V,,,,,,,155,199,158,198,155,199,158,198,1147001020000,,,,235.140348,"LINESTRING (-73.98295 40.74043, -73.98221 40.7..."


In [53]:
lion_sub.shape

(144427, 18)

In [55]:
lion_sub[lion_sub.duplicated(subset='segment_id')]

Unnamed: 0,segment_id,feat_typ,seg_typ,rw_type,traf_dir,loc_status,curve_flag,radius,st_wid_min,st_wid_max,bike_lane,b_traf_dir,n_trvl_ln,n_park_ln,n_total_ln,posted_spd,snow_prio,geometry
