In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import json
import pickle # For various data processing checkpoints

import permit_functions # Regex and general cleaning functions
import geo_functions # Geographic matching/intersection/segmentation functions
from geo_functions import BORO_DICT

# Retrieving/Cleaning NYC Film Permit and Street Geometry Data
### Data Sources
- Street Geometry: [NYC Planning - NYC Digital City Map](https://www1.nyc.gov/site/planning/data-maps/open-data/dwn-digital-city-map.page)
- Zip Code Geometry: [NYC OpenData - Zip Code Boundaries](https://data.cityofnewyork.us/Business/Zip-Code-Boundaries/i8iw-xf4u)
- Film Permit Data: [NYC OpenData - Film Permits](https://data.cityofnewyork.us/City-Government/Film-Permits/tg4x-b46p)

### Retrieve Film Permit Data
- Film permit data was retrieved from Socrata via SodaPy and saved as a JSON file
- The strings indicating streets where parking was held for filming had to be split and cleaned (i.e. ['water street between dock st and main st', ...] -> [('water street', 'dock street', 'main street'), ...])
- Since a single film permit may hold multiple streets for filming, each permit was split into one row per held street

In [2]:
# Load the raw film permit events from the JSON file on disk
with open('./temp/film_events.json', 'r') as f:
    film = json.load(f)

# Run every raw permit record through the cleaning routine
cleaned_film = [permit_functions.clean_data(permit) for permit in film]

# Build the DataFrame of film permits from the cleaned records
df = permit_functions.create_film_df(cleaned_film)

In [3]:
# Preview the first two rows of the film permit DataFrame
df.head(2)

Unnamed: 0,id,borough,zipcode,startdate,enddate,enteredon,category,subcategory,origin,main_st,cross_st_1,cross_st_2
0,605327,Brooklyn,[11201],2021-10-06 12:00:00,2021-10-07 01:00:00,2021-10-01 18:03:08,Film,Feature,United States of America,water street,dock street,main street
1,605327,Brooklyn,[11201],2021-10-06 12:00:00,2021-10-07 01:00:00,2021-10-01 18:03:08,Film,Feature,United States of America,cadman plaza west,prospect street,clark street


### Geographic Data
- NYC zip code and street geometries were retrieved and reprojected to a longitude/latitude coordinate system (EPSG:4326)
- Street geometry was merged from distinct sections (i.e. each street having multiple rows of line geometry) into a single Shapely MultiLineString
- In NYC, the same street name can refer to separate streets (i.e. '100th street' in Brooklyn is distinct from '100th street' in Queens), so geometry was grouped by street name and borough

In [4]:
# Load the zip code boundaries and reproject them to EPSG:4326 (MapBox readable)
zipcodes = gpd.read_file('./geodata/zipcodes/ZIP_CODE_040114.shp')
zipcodes['geometry'] = zipcodes['geometry'].to_crs('EPSG:4326')

# Keep only the needed columns as a plain DataFrame, look up each COUNTY value
# in BORO_DICT, and switch to the lowercase column names used downstream
zipcodes = pd.DataFrame(zipcodes[['ZIPCODE', 'COUNTY', 'geometry']])
zipcodes['COUNTY'] = [BORO_DICT[county] for county in zipcodes['COUNTY']]
zipcodes = zipcodes.rename(columns={'ZIPCODE': 'zipcode', 'COUNTY': 'borough'})

In [34]:
# Checkpoint: write the zip code DataFrame from the cell above to a pickle file,
# then read it straight back in
with open('zip_codes.p', 'wb') as pickle_out:
    pickle_out.write(pickle.dumps(zipcodes))

with open('zip_codes.p', 'rb') as pickle_in:
    zipcodes = pickle.loads(pickle_in.read())

In [None]:
# Read file and reproject to MapBox readable coordinate system
nyc = gpd.read_file('./geodata/dcm_scl/DCM_StreetCenterLine.shp')
nyc['geometry'] = nyc['geometry'].to_crs('EPSG:4326')

# Merge street segments
# The dissolve key combines street name and borough so that same-named streets
# in different boroughs are merged separately
nyc['street'] = nyc['Street_NM'] + ', ' + nyc['Borough']
nyc = nyc.dissolve('street').reset_index()

# Determine zipcodes of streets (long format)
# NOTE(review): seg_in_zipcode presumably returns a list of zip codes for each
# street geometry (the column is expanded via .to_list() below) -- confirm in geo_functions
nyc['zipcodes'] = nyc['geometry'].map(lambda x: geo_functions.seg_in_zipcode(x, zipcodes))
# Keep the original name and borough columns; the combined 'street' dissolve key is discarded here
nyc = pd.DataFrame(nyc[['Street_NM', 'Borough', 'zipcodes', 'geometry']])
nyc.columns = ['street', 'borough', 'zipcodes', 'geometry']
nyc['street'] = nyc['street'].map(lambda x: permit_functions.clean_street(x))
# Expand the per-street zip code lists into one row per (street, zip code):
# stack() yields one row per non-missing list element, with the originating row
# index in 'level_0', which is then used to join the street attributes back on
temp = gpd.GeoDataFrame(nyc['zipcodes'].to_list(), index=nyc.index).stack().reset_index()
nyc = temp.merge(nyc, how='left', left_on='level_0', right_on=nyc.index)
# Remove the helper columns and name the stacked value column 'zipcode'
nyc.drop(columns=['level_0', 'level_1', 'zipcodes'], inplace=True)
nyc.rename(columns={0: 'zipcode'}, inplace=True)

In [5]:
# Restore the street geometry checkpoint instead of recomputing the cell above.
# The checkpoint was written with:
#     with open('nyc.p', 'wb') as f:
#         pickle.dump(nyc, f)
with open('nyc.p', 'rb') as f:
    nyc = pickle.load(f)

# Wrap the loaded object as a GeoDataFrame for type hints
nyc = gpd.GeoDataFrame(nyc)

In [6]:
# Preview the first two rows of the street geometry table
nyc.head(2)

Unnamed: 0,zipcode,street,borough,geometry
0,10465,1st avenue,Bronx,MULTILINESTRING Z ((-73.80721 40.82418 0.00000...
1,11232,1st avenue,Brooklyn,MULTILINESTRING Z ((-74.01272 40.65661 0.00000...


### Matching Streets in Film Permits
- The geometry of the main street and cross streets had to be matched to the street names
- Since filming may take place in multiple boroughs, each street had to be matched on name and zip code from the listed film shoot (i.e. 'north 4th street' in film permit had to be matched on a street name of 'north 4th street' and zip code of '11211')

#### Future Improvements
- Further improvements must be made for creating false intersections where 2 streets do not have an official NYC Planning Street Center Line intersection
- Further improvements must be made for permits whose list of zip codes contains multiple streets with the same name (i.e. filming taking place in downtown Brooklyn, which has a 'West Street', and westside Manhattan, which also has a 'West Street')

In [None]:
### DO NOT RUN CELL ###
# Match street name to street geometry: the same lookup (street name plus the
# permit's zip codes against the nyc table) is applied to the main street and
# to both cross streets
for geom_col, street_col in (
    ('ms_geom', 'main_st'),
    ('cs1_geom', 'cross_st_1'),
    ('cs2_geom', 'cross_st_2'),
):
    df[geom_col] = df.apply(
        lambda row: geo_functions.match_street_geo(row[street_col], row['zipcode'], nyc),
        axis=1,
    )
### DO NOT RUN CELL ###

In [7]:
# Restore the permits-with-street-geometry checkpoint instead of re-running the
# matching cell above. The checkpoint was written with:
#     with open('df.p', 'wb') as f:
#         pickle.dump(df, f)
with open('df.p', 'rb') as f:
    df = pickle.load(f)

# Wrap the loaded object as a GeoDataFrame for type hints
df = gpd.GeoDataFrame(df)

In [8]:
# Preview the first two permit rows, now including the matched street geometry columns
df.head(2)

Unnamed: 0,id,borough,zipcode,startdate,enddate,enteredon,category,subcategory,origin,main_st,cross_st_1,cross_st_2,ms_geom,cs1_geom,cs2_geom
0,605327,Brooklyn,[11201],2021-10-06 12:00:00,2021-10-07 01:00:00,2021-10-01 18:03:08,Film,Feature,United States of America,water street,dock street,main street,(LINESTRING Z (-73.98788994171684 40.703141574...,(LINESTRING Z (-73.99255059003967 40.703299935...,(LINESTRING Z (-73.99066328463533 40.703907459...
1,605327,Brooklyn,[11201],2021-10-06 12:00:00,2021-10-07 01:00:00,2021-10-01 18:03:08,Film,Feature,United States of America,cadman plaza west,prospect street,clark street,(LINESTRING Z (-73.99102943975372 40.699846702...,(LINESTRING Z (-73.9897364359837 40.7007053107...,(LINESTRING Z (-73.99301808265935 40.697461904...


In [9]:
# Summarise columns, dtypes and non-null counts (shows how many street geometries went unmatched)
df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 105545 entries, 0 to 105544
Data columns (total 15 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   id           105545 non-null  object        
 1   borough      105545 non-null  object        
 2   zipcode      105545 non-null  object        
 3   startdate    105545 non-null  datetime64[ns]
 4   enddate      105545 non-null  datetime64[ns]
 5   enteredon    105545 non-null  datetime64[ns]
 6   category     105545 non-null  object        
 7   subcategory  105545 non-null  object        
 8   origin       105545 non-null  object        
 9   main_st      105545 non-null  object        
 10  cross_st_1   105545 non-null  object        
 11  cross_st_2   105545 non-null  object        
 12  ms_geom      103797 non-null  object        
 13  cs1_geom     103147 non-null  object        
 14  cs2_geom     102279 non-null  object        
dtypes: datetime64[ns](3), object(12)

### Intersections and Street Segments
- The geometry of the streets blocked for filming was derived from the intersection of the main street and cross street geometries
- ~18% of the rows had to be dropped, either because street names in the film permit could not be matched to official street names or because no intersection existed between the matched street geometries
- The `get_held_geometry` function was able to create false intersections between many streets but not all streets where an intersection between a main street and cross street did not exist
- Note: Deprecation warning from setting GeoSeries objects to MultiLineString geometries, function utilizes `geoms` property

In [30]:
### DO NOT RUN CELL ###
# Only rows where all three street geometries were matched can be processed
df.dropna(subset=['ms_geom', 'cs1_geom', 'cs2_geom'], axis=0, inplace=True)

# Derive the geometry of the parking held for filming and discard rows with no result
df['ph_geom'] = df.apply(geo_functions.get_held_geometry, axis=1)
df.dropna(subset=['ph_geom'], axis=0, inplace=True)

# Treat empty geometries as missing, remove them, and expose the result as 'geometry'
df['ph_geom'] = df['ph_geom'].map(lambda geom: geom if not geom.is_empty else np.nan)
df.dropna(subset=['ph_geom'], axis=0, inplace=True)
df.rename(columns={'ph_geom': 'geometry'}, inplace=True)
### DO NOT RUN CELL ###

  arr = construct_1d_object_array_from_listlike(values)
  result[:] = values


In [31]:
# Restore the final film permit checkpoint instead of re-running the geometry
# cell above. The checkpoint was written with:
#     with open('film_df.p', 'wb') as f:
#         pickle.dump(df, f)
with open('film_df.p', 'rb') as f:
    df = pickle.load(f)

# Wrap the loaded object as a GeoDataFrame for type hints
df = gpd.GeoDataFrame(df)

In [32]:
# Preview the first two rows, now including the held-parking 'geometry' column
df.head(2)

Unnamed: 0,id,borough,zipcode,startdate,enddate,enteredon,category,subcategory,origin,main_st,cross_st_1,cross_st_2,ms_geom,cs1_geom,cs2_geom,geometry
0,605327,Brooklyn,[11201],2021-10-06 12:00:00,2021-10-07 01:00:00,2021-10-01 18:03:08,Film,Feature,United States of America,water street,dock street,main street,(LINESTRING Z (-73.98788994171684 40.703141574...,(LINESTRING Z (-73.99255059003967 40.703299935...,(LINESTRING Z (-73.99066328463533 40.703907459...,"LINESTRING Z (-73.99255 40.70330 0.00000, -73...."
2,605327,Brooklyn,[11201],2021-10-06 12:00:00,2021-10-07 01:00:00,2021-10-01 18:03:08,Film,Feature,United States of America,tillary street,cadman plaza west,adams street - brooklyn bridge boulevard,(LINESTRING Z (-73.98035914408307 40.696190742...,(LINESTRING Z (-73.99102943975372 40.699846702...,(LINESTRING Z (-73.98876375219565 40.696197512...,MULTILINESTRING Z ((-73.99118 40.69629 0.00000...


In [33]:
# Summarise columns, dtypes and non-null counts after the unmatched/empty rows were dropped
df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 86993 entries, 0 to 105544
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           86993 non-null  object        
 1   borough      86993 non-null  object        
 2   zipcode      86993 non-null  object        
 3   startdate    86993 non-null  datetime64[ns]
 4   enddate      86993 non-null  datetime64[ns]
 5   enteredon    86993 non-null  datetime64[ns]
 6   category     86993 non-null  object        
 7   subcategory  86993 non-null  object        
 8   origin       86993 non-null  object        
 9   main_st      86993 non-null  object        
 10  cross_st_1   86993 non-null  object        
 11  cross_st_2   86993 non-null  object        
 12  ms_geom      86993 non-null  object        
 13  cs1_geom     86993 non-null  object        
 14  cs2_geom     86993 non-null  object        
 15  geometry     86993 non-null  object        
