In [93]:
##################################################
##################################################
######## IMPORT THE RELEVANT LIBRARIES
##################################################
##################################################
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
import folium
from folium.plugins import MarkerCluster, FastMarkerCluster

from google.cloud import datastore
import googlemaps
from datetime import datetime

import concurrent.futures

# setup a style to view ipython notebook graphs
sns.set_style('whitegrid')
sns.set_context('notebook')

%matplotlib inline
import datetime

import itertools

### library to get data from nyc_open_data
from sodapy import Socrata

### import libraries for folium tiles
import os
import folium
from folium.features import CustomIcon

In [29]:
####### grabbing the data regarding NYC filming permits 

client = Socrata("data.cityofnewyork.us", None)
results = client.get("tg4x-b46p", limit=50000)
film_permits = pd.DataFrame.from_records(results)

###### filtering the dataset to 2017+ only
film_permits_since_2017 = film_permits[film_permits['startdatetime'] >= '2017-01-01']



In [46]:
###### parsing the "parking held" variable to find the non-comma separated values
def get_locations(row):
    locations = row['parkingheld']
    borough = row['borough']
    
    parsed_locations = locations.split(",")  
    parsed_locations = [ f"{l.strip()}, {borough}"  for l in parsed_locations ] 
    return pd.Series([parsed_locations])

In [47]:
film_permits_since_2017['locations'] = film_permits_since_2017.apply(get_locations,axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [6]:
#### setting up the geocode_result API

API_KEY=''
project=''
gmaps = googlemaps.Client(key = API_KEY)

In [8]:
##############################################################
### split our dataset into an individual row for each location
### this implies that our dataset will have duplicate rows 
##############################################################
def splitDataFrameList(df,target_column,separator):
    ''' df = dataframe to split,
    target_column = the column containing the values to split
    separator = the symbol used to perform the split
    returns: a dataframe with each entry for the target column separated, with each element moved into a new row. 
    The values in the other columns are duplicated across the newly divided rows.
    '''
    def splitListToRows(row,row_accumulator,target_column,separator):
        split_row = row[target_column].split(separator)
        for s in split_row:
            new_row = row.to_dict()
            new_row[target_column] = s
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(splitListToRows,axis=1,args = (new_rows,target_column,separator))
    new_df = pd.DataFrame(new_rows)
    return new_df

In [11]:
#### find all of the distinct locations
distinct_locations = sorted(list(set(list(itertools.chain(*film_permits_since_2017['locations'])))))

##### how many distinct locations are there?
#len(distinct_locations)


In [23]:

# Instantiates a client
datastore_client = datastore.Client(project=project)
gmaps = googlemaps.Client(key=API_KEY)
 
def geocode_helper(location, kind='Location'):
    task_key = datastore_client.key(kind, location)
 
    # Prepares the new entity
    task = datastore_client.get(task_key)
    if task is None :
        task = datastore.Entity(task_key)
       
    payload = task.get('payload',None)
   
    if "geocodable" not in task:
        if "payload" not in task:
            payload = gmaps.geocode(location)
            task['payload'] = payload
            if len(payload) > 0:
                geometry = payload[0]['geometry']
                task['geometry']= geometry
                task['location'] =geometry['location']
                task['geocodable'] = True
            else:
                task['geocodable'] = False
               
        else:
            task['geocodable'] = True
        datastore_client.put(task)
       
       
 
    return payload

    
def geocode(location, kind='Location'):
    location_variants =[]
    location_variants.append(location)
    
    location_variants.append(location.replace(", Brooklyn",",Brooklyn")\
                             .replace(", Manhattan",",Manhattan")\
                             .replace(", Queens",",Queens")
                            )
    for loc in location_variants:
        res = geocode_helper(loc,kind)
        if res is not None:
            return res
    return None


 
raw_address_mapping = {}
 
# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # Start the load operations and mark each future with its URL
    future_to_geocode = {executor.submit(geocode, location): location for location in distinct_locations}
    for future in tqdm(concurrent.futures.as_completed(future_to_geocode)):
        location = future_to_geocode[future]
        try:
            data = future.result()
            raw_address_mapping[location] = data
            #geocoded_list.append({"location":location, "lat" :data["lat"],  "long": data["long"]})
        except Exception as exc:
            print (exc)
            print (f"Failed for {location}")
            

#######################################################################
############ attaching the lat and long information to our main dataset
#######################################################################
s = film_permits_since_2017.apply(lambda x: pd.Series(x['locations']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'location'
film_permits_since_2017 = film_permits_since_2017.join(s)

13118it [02:01, 108.30it/s]


In [55]:
############################################
#### 
############################################
def parse_lat_long(entry):
    location = entry['location']

    try:
        payload_coordinates = raw_address_mapping[location][0]["geometry"]['location']
    
        entry["latitude"] = payload_coordinates["lat"]
        entry["longitude"] = payload_coordinates["lng"]
    except:
        print(f"Failed to find {location}")
        entry["latitude"]=None
        entry["longitude"]=None
    return entry

film_permits_since_2017 = film_permits_since_2017.apply(parse_lat_long, axis=1)

In [87]:
#### dropping instances where we could not find the address
film_permits_since_2017 = film_permits_since_2017[~pd.isnull(film_permits_since_2017['latitude'])]

In [121]:
#############################################################
### creating a folium graph with a tile for each category!
#############################################################

m = Map(
    location=starting_location,
    zoom_start=12,
    tiles='cartodbpositron'
)

callback = ('function (row) {' 
                'var circle = L.circle(new L.LatLng(row[0], row[1]), {color: "red",  radius: 10});'
                'return circle};')

for x in film_permits_since_2017['category'].unique():
    feature_group = FeatureGroup(name=x)
    feature_group.add_child(FastMarkerCluster(film_permits_since_2017[film_permits_since_2017['category'] == x][['latitude', 'longitude']].values.tolist(), callback=callback))
    feature_group.add_to(m)
    
LayerControl().add_to(m)

m

In [137]:
#############################################################
### this code creates an interactive graph for each subcategory
#############################################################

##m = Map(
##    location=starting_location,
##    zoom_start=12,
##    tiles='cartodbpositron'
##)
##
##callback = ('function (row) {' 
##                'var circle = L.circle(new L.LatLng(row[0], row[1]), {color: "red",  radius: 10});'
##                'return circle};')
##
##for x in film_permits_since_2017['subcategoryname'].unique():
##    feature_group = FeatureGroup(name=x)
##    feature_group.add_child(FastMarkerCluster(film_permits_since_2017[film_permits_since_2017['subcategoryname'] == x][['latitude', 'longitude']].values.tolist(), callback=callback))
##    feature_group.add_to(m)
##    
##LayerControl().add_to(m)
##
##m