In this notebook I experimented with reducing the data size to see if it would help our folium code run more quickly. I reduced the size of the DataFrame from 55 MB to 32 MB using the methods outlined on this website: https://www.dataquest.io/blog/pandas-big-data/ (recommended by classmates and our instructor). However, this did not make our folium code run any faster: the map rendered in about 8 seconds either way.

In [57]:
import csv
import timeit
from datetime import datetime
from time import process_time

import folium
import numpy as np
import pandas as pd
from folium.plugins import FastMarkerCluster


In [28]:
# Read the permits CSV. Bytes that cannot be decoded as ASCII are dropped
# (errors='ignore') rather than raising a decode error.
with open('../data/Building_Permits___Current.csv', encoding='ascii', errors='ignore') as csvfile:
    permits = pd.read_csv(csvfile)

# Change data types to reflect actual data types: every date column is stored
# as text in month/day/two-digit-year form and is parsed the same way.
for date_column in ('Issue Date', 'Final Date', 'Expiration Date', 'Application Date'):
    permits[date_column] = pd.to_datetime(permits[date_column], format='%m/%d/%y')

# Change permit value from type object to type float, so we can choose permits
# above a specified value. The .str accessor passes missing values through as
# NaN, whereas calling x.lstrip('$') on a float NaN raises AttributeError.
permits['Value'] = (
    permits['Value']
    .str.lstrip('$')
    .str.replace(',', '')
    .astype(float)
)

In [29]:
# `gl` is a second name for the same frame as `permits` (no copy is made).
gl = permits
gl.info(memory_usage='deep')

# For each dtype family, print the mean of the deep memory usage figures
# reported for the columns of that dtype, converted from bytes to megabytes.
for dtype in ('float', 'datetime', 'object'):
    columns_of_dtype = gl.select_dtypes(include=[dtype])
    mean_bytes = columns_of_dtype.memory_usage(deep=True).mean()
    print("Average memory usage for {} columns: {:03.2f} MB".format(dtype, mean_bytes / 1024 ** 2))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59166 entries, 0 to 59165
Data columns (total 20 columns):
Application/Permit Number          59166 non-null int64
Permit Type                        59166 non-null object
Address                            58940 non-null object
Description                        58895 non-null object
Category                           55903 non-null object
Action Type                        55963 non-null object
Work Type                          59166 non-null object
Value                              59166 non-null float64
Applicant Name                     58663 non-null object
Application Date                   45313 non-null datetime64[ns]
Issue Date                         38328 non-null datetime64[ns]
Final Date                         26381 non-null datetime64[ns]
Expiration Date                    38427 non-null datetime64[ns]
Status                             54614 non-null object
Contractor                         10631 non-null object
Perm

In [33]:
# We're going to be calculating memory usage a lot,
# so we'll create a function to save us some time!

def mem_usage(pandas_obj):
    """Return the deep memory footprint of a pandas object as a string.

    Parameters
    ----------
    pandas_obj : pandas.DataFrame or pandas.Series
        Anything that is not a DataFrame is treated as a Series.

    Returns
    -------
    str
        The size in megabytes (bytes / 1024**2) with two decimals,
        e.g. ``"1.00 MB"``.
    """
    total_bytes = pandas_obj.memory_usage(deep=True)
    if isinstance(pandas_obj, pd.DataFrame):
        # A DataFrame reports one figure per column (plus the index); add them up.
        total_bytes = total_bytes.sum()
    return "{:03.2f} MB".format(total_bytes / 1024 ** 2)

# Split out the numeric columns by dtype family.
gl_int = gl.select_dtypes(include=['int'])
gl_float = gl.select_dtypes(include=['float'])

# Downcast each column to the smallest subtype pandas considers safe:
# unsigned integers for the int columns, smaller floats for the float columns.
converted_int = gl_int.apply(lambda column: pd.to_numeric(column, downcast='unsigned'))
converted_float = gl_float.apply(lambda column: pd.to_numeric(column, downcast='float'))

# Float memory footprint before and after downcasting.
for float_frame in (gl_float, converted_float):
    print(mem_usage(float_frame))

# Count how many float columns have each dtype before vs. after.
compare_floats = pd.concat([gl_float.dtypes, converted_float.dtypes], axis=1)
compare_floats.columns = ['before', 'after']
compare_floats.apply(pd.Series.value_counts)

1.81 MB
0.90 MB


Unnamed: 0,before,after
float32,,4.0
float64,4.0,


In [34]:
# Work on copies so the original frame stays available for comparison.
optimized_gl = gl.copy()
optimized_permits = permits.copy()

# Swap the downcast integer and float columns into the optimized copy.
for converted in (converted_int, converted_float):
    optimized_gl[converted.columns] = converted

# Total footprint before and after the numeric downcast.
for frame in (gl, optimized_gl):
    print(mem_usage(frame))

55.90 MB
55.00 MB


In [35]:
gl_obj = gl.select_dtypes(include=['object']).copy()

# Convert an object column to `category` only when fewer than half of its
# values are unique; otherwise keep the column as it is.
converted_obj = pd.DataFrame(index=gl_obj.index)

for col in gl_obj.columns:
    num_unique_values = len(gl_obj[col].unique())
    num_total_values = len(gl_obj[col])
    # The length check keeps an empty frame from raising ZeroDivisionError.
    if num_total_values and num_unique_values / num_total_values < 0.5:
        converted_obj[col] = gl_obj[col].astype('category')
    else:
        converted_obj[col] = gl_obj[col]

# Object-column footprint before and after the conversion.
print(mem_usage(gl_obj))
print(mem_usage(converted_obj))

# Count how many columns have each dtype before vs. after.
compare_obj = pd.concat([gl_obj.dtypes, converted_obj.dtypes], axis=1)
compare_obj.columns = ['before', 'after']
compare_obj.apply(pd.Series.value_counts)

51.84 MB
28.76 MB


Unnamed: 0,before,after
object,11.0,4
category,,7


In [36]:
# Overwrite each object column of the optimized frame with its converted
# counterpart, one column at a time.
for col in converted_obj.columns:
    optimized_gl[col] = converted_obj[col]

# Total footprint of the fully optimized frame.
mem_usage(optimized_gl)

'31.92 MB'

In [37]:
# Rebind `permits` to a copy of the memory-optimized frame so that the cells
# below operate on the optimized data.
permits = optimized_gl.copy()

In [38]:
# Columns kept for mapping; `filter` keeps only those that exist in `permits`.
core_columns = [
    'Application/Permit Number', 'Address', 'Description', 'Category',
    'Action Type', 'Value', 'Application Date', 'Issue Date', 'Final Date',
    'Expiration Date', 'Status', 'Latitude', 'Longitude', 'Location',
]
core_permits = permits.filter(core_columns)

# Keep permits that have a Final Date, whose action type is NEW, and whose
# category is anything other than single family / duplex.
has_final_date = core_permits['Final Date'].notnull()
is_new = core_permits['Action Type'] == 'NEW'
not_single_family = core_permits['Category'] != 'SINGLE FAMILY / DUPLEX'
core_permits = core_permits[has_final_date & is_new & not_single_family]

# Number of permits that survive the filters.
length = len(core_permits)
length


1249

In [39]:
# Give every permit the same value of 1.0; `add_marker` reads this column as
# the marker radius.
# `assign` returns a new frame instead of writing a column into a filtered
# slice of `permits` (avoids a potential SettingWithCopyWarning), and sizing
# the array from len(core_permits) does not depend on `length` being current.
core_permits = core_permits.assign(Accident_Count=np.ones(len(core_permits)))

In [40]:
# Create a folium map centred on latitude 47.6062, longitude -122.3321 at
# zoom level 11.
map_osm = folium.Map(location=[47.6062, -122.3321], zoom_start = 11)

In [41]:
# Display the map as the cell output (no markers have been added to this
# map instance at this point).
map_osm 


In [52]:
def add_marker(data, map_object):
    """Add one circle marker per row of ``data`` to ``map_object``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'Latitude'``, ``'Longitude'`` and
        ``'Accident_Count'`` (used as the marker radius).
    map_object
        The object each marker is attached to via ``add_to`` (a folium map
        in this notebook).

    Notes
    -----
    Rows with a missing latitude or longitude are skipped, because a marker
    cannot be placed without a location.
    """
    located = data.dropna(subset=['Latitude', 'Longitude'])
    # Iterating over the three columns directly avoids building a Series per
    # row, which is what makes DataFrame.iterrows slow.
    for latitude, longitude, radius in zip(
            located['Latitude'], located['Longitude'], located['Accident_Count']):
        folium.CircleMarker(
            location=[float(latitude), float(longitude)],
            radius=float(radius),
            fill_color='#132b5e',
        ).add_to(map_object)


In [53]:
# Start the CPU-time clock; a later cell computes the elapsed time as
# process_time() - t.
t = process_time()

# Build a fresh map, add one marker per permit, and display it as the cell
# output. (The bare `%timeit` that used to sit here had no statement to time,
# so it measured nothing.)
map_osm = folium.Map(location=[47.6062, -122.3321], zoom_start=11)
add_marker(core_permits, map_osm)
map_osm


In [54]:
# CPU time (time.process_time) used by this process since `t` was captured at
# the start of the map-building cell. This is not wall-clock time.
elapsed_time = process_time() - t


In [55]:
# Keep the timing of the memory-optimized run under its own name so it can be
# compared with the timing of the unoptimized run.
optimized_process_time = elapsed_time

In [56]:
# Show the optimized timing (printed) next to the original timing (cell output).
print(optimized_process_time)
# NOTE(review): `original_process_time` is not assigned anywhere in the visible
# notebook; presumably it was left in the kernel by an earlier run without the
# memory optimization (confirm). Unless it is defined elsewhere, a fresh
# Restart & Run All will raise NameError on this line.
original_process_time

8.625


8.6875