# Quickstart - Porygon

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from shapely.geometry import shape
from geopandas import GeoDataFrame

from IPython.display import IFrame

from porygon import PorygonDataFrame 
from porygon.data import load_chicago_traffic_accidents, load_chicago_census_tract_boundaries

## Load data
Load the 2019 traffic accident data downloaded from the [Chicago Data Portal](https://data.cityofchicago.org/Transportation/Traffic-Crashes-Crashes/85ca-t3if),
then turn it into a `PorygonDataFrame`.

In [3]:
# Load the traffic-accident records provided by porygon.data and preview the
# first rows. NOTE(review): the preview shows several rows sharing one rd_no,
# so rows look like one per involved unit rather than one per crash — confirm.
df = load_chicago_traffic_accidents()
df.head()

Unnamed: 0,rd_no,crash_date,latitude,longitude,unit_type,make
0,JD100044,2019-12-31,41.871798,-87.69619,DRIVER,UNKNOWN
1,JD100044,2019-12-31,41.871798,-87.69619,PARKED,CHEVROLET
2,JD100044,2019-12-31,41.871798,-87.69619,PARKED,JEEP
3,JD100010,2019-12-31,41.812437,-87.743434,DRIVER,VOLKSWAGEN
4,JD100008,2019-12-31,41.887805,-87.765268,DRIVERLESS,CHEVROLET


In [4]:
# One indicator column per vehicle make, aligned row-for-row with `df`.
vehicle_brand_dummies = pd.get_dummies(df.make)

# The 20 most frequent makes, excluding the 'UNKNOWN' placeholder. Dropping the
# placeholder *before* taking the top 20 gives the same list as slicing 21 and
# removing it afterwards, but does not raise ValueError when 'UNKNOWN' is not
# among the most frequent makes (or is absent from the data altogether).
top_brands = (
    df.make.value_counts()
    .drop('UNKNOWN', errors='ignore')
    .head(20)
    .index.tolist()
)

# Constant column: aggregating it with a sum yields the number of records per area.
df['count'] = 1
df_brands = pd.concat(
    [df[['latitude', 'longitude', 'count']], vehicle_brand_dummies[top_brands]],
    axis=1,
)

# Bin the records into H3 cells at level 8, passing np.sum as the aggregation
# function for the value columns, then preview the result.
h3df = PorygonDataFrame().from_h3(df_brands, h3_level=8, aggfunc=np.sum)
h3df.head()

  super(GeoDataFrame, self).__setattr__(attr, val)


Unnamed: 0_level_0,count,CHEVROLET,"TOYOTA MOTOR COMPANY, LTD.",FORD,NISSAN,HONDA,DODGE,HYUNDAI,JEEP,KIA MOTORS CORP,...,GENERAL MOTORS CORP.,BUICK,LEXUS,MERCEDES-BENZ,MAZDA,BMW,INFINITI,CADILLAC,SUBARU,geometry
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8826641903fffff,60,3,2,7,10,0,3,1,2,3,...,0,1,1,0,0,1,2,3,0,"POLYGON ((-87.54767 41.67405, -87.55369 41.672..."
8826641905fffff,9,0,1,0,2,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,"POLYGON ((-87.56377 41.68259, -87.56978 41.680..."
8826641907fffff,44,7,1,13,4,2,1,0,2,1,...,1,1,0,0,0,0,0,0,0,"POLYGON ((-87.55905 41.67510, -87.56507 41.673..."
8826641909fffff,8,3,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,"POLYGON ((-87.54571 41.68798, -87.55173 41.686..."
882664190bfffff,6,1,0,1,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"POLYGON ((-87.54100 41.68049, -87.54702 41.678..."


## Plot the aggregated data

In [5]:
# Build a choropleth map from the 'count' column, write it to an HTML file in
# the working directory, and embed that file in the notebook via an IFrame.
m = h3df.to_choropleth('count')
m.save('accidents_by_h3_choropleth.html')
IFrame(src='./accidents_by_h3_choropleth.html', width=700, height=600)

You can pass any of the standard [folium.Choropleth](https://python-visualization.github.io/folium/modules.html#folium.folium.Map.choropleth) keyword args (which correspond to those of [Leaflet](https://leafletjs.com/reference-1.5.0.html#map)).

In [6]:
# Same choropleth on 'count', this time with extra keyword arguments for the
# number of bins, the colour scheme and the legend label. Ending the cell with
# `m` displays the map object directly instead of going through a saved file.
m = h3df.to_choropleth('count', bins=7, fill_color='Purples', legend_name='Traffic Accidents')
m

In [7]:
def zscore(x):
    """Standardise ``x``: subtract its mean and divide by its standard deviation."""
    return (x - x.mean()) / (x.std())


# Standardise each brand column across all H3 cells. `apply` hands `zscore` one
# column at a time, so every brand is scaled against its own mean and spread.
zscores = h3df[top_brands].apply(zscore)

# Per H3 cell: the largest brand z-score and the name of the brand it belongs to.
h3df['top_brand_val'] = zscores.max(axis=1)
h3df['top_brand_name'] = zscores.idxmax(axis=1)

# Draw the categorical map from those two columns, save it as HTML and embed it.
m = h3df.to_categorical_map('top_brand_val', 'top_brand_name', legend_title='Accidents by Vehicle Make')
m.save('accidents_by_h3_categorical.html')
IFrame(src='./accidents_by_h3_categorical.html', width=700, height=600)

## Okay, now how about using census tract boundaries? 
Census shapefiles downloaded from [Chicago Data Portal](https://data.cityofchicago.org/Facilities-Geographic-Boundaries/Boundaries-Census-Tracts-2010/5jrd-6zik).  

To use `PorygonDataFrame().from_boundaries()`, you must provide the geometries as a `geopandas.GeoDataFrame` that has a valid polygon `geometry` column and whose index is set to a unique `id` field. Any other fields (e.g. `census_name`) will be included in the final `PorygonDataFrame`.

In [8]:
census_tracts = load_chicago_census_tract_boundaries()

# Walk the tract records once, collecting the geometry, identifier and
# display name of each tract into parallel lists.
boundaries, ids, names = [], [], []
for tract in census_tracts:
    boundaries.append(shape(tract['the_geom']))
    ids.append(tract['geoid10'])
    names.append(tract['namelsad10'])

# Assemble the boundary frame and index it by the tract identifier.
census_columns = {'geometry': boundaries, 'id': ids, 'census_name': names}
gpdf_census = GeoDataFrame(census_columns).set_index('id')
gpdf_census.head()

Unnamed: 0_level_0,geometry,census_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1
17031842400,"MULTIPOLYGON (((-87.62405 41.73022, -87.62405 ...",Census Tract 8424
17031840300,"MULTIPOLYGON (((-87.68608 41.82296, -87.68607 ...",Census Tract 8403
17031841100,"MULTIPOLYGON (((-87.62935 41.85280, -87.62934 ...",Census Tract 8411
17031841200,"MULTIPOLYGON (((-87.68813 41.85569, -87.68816 ...",Census Tract 8412
17031839000,"MULTIPOLYGON (((-87.63312 41.87449, -87.63306 ...",Census Tract 8390


In [9]:
%%time
# Build a PorygonDataFrame from the per-record brand frame and the census-tract
# boundary frame, then preview it. This is slow: the recorded run of this cell
# reports a wall time of a little over nine minutes.
cdf = PorygonDataFrame().from_boundaries(df_brands, gpdf_census)
cdf.head()

CPU times: user 9min 10s, sys: 2 s, total: 9min 12s
Wall time: 9min 14s


Unnamed: 0_level_0,index,count,CHEVROLET,"TOYOTA MOTOR COMPANY, LTD.",FORD,NISSAN,HONDA,DODGE,HYUNDAI,JEEP,...,BUICK,LEXUS,MERCEDES-BENZ,MAZDA,BMW,INFINITI,CADILLAC,SUBARU,geometry,census_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17031010100,0,184.0,19.0,24.0,18.0,21.0,13.0,3.0,7.0,4.0,...,1.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,"MULTIPOLYGON (((-87.66368 42.01940, -87.66384 ...",Census Tract 101
17031010201,1,336.0,21.0,63.0,25.0,22.0,39.0,14.0,7.0,7.0,...,2.0,6.0,3.0,5.0,6.0,2.0,1.0,9.0,"MULTIPOLYGON (((-87.68010 42.01254, -87.68027 ...",Census Tract 102.01
17031010202,2,245.0,14.0,52.0,13.0,19.0,20.0,13.0,9.0,10.0,...,1.0,4.0,3.0,7.0,6.0,1.0,1.0,3.0,"MULTIPOLYGON (((-87.67336 42.01937, -87.67311 ...",Census Tract 102.02
17031010300,3,233.0,12.0,38.0,18.0,19.0,22.0,13.0,13.0,8.0,...,4.0,1.0,3.0,3.0,4.0,1.0,0.0,7.0,"MULTIPOLYGON (((-87.66506 42.01280, -87.66543 ...",Census Tract 103
17031010400,4,360.0,33.0,59.0,31.0,29.0,32.0,14.0,17.0,11.0,...,2.0,6.0,7.0,10.0,5.0,4.0,1.0,5.0,"MULTIPOLYGON (((-87.65080 41.99849, -87.65500 ...",Census Tract 104


In [10]:
def zscore(x):
    """Standardise ``x``: subtract its mean and divide by its standard deviation."""
    return (x - x.mean()) / (x.std())


# Standardise each brand column across all census tracts. `apply` hands `zscore`
# one column at a time, so every brand is scaled against its own mean and spread.
zscores = cdf[top_brands].apply(zscore)

# Per census tract: the largest brand z-score and the name of the brand it belongs to.
cdf['top_brand_val'] = zscores.max(axis=1)
cdf['top_brand_name'] = zscores.idxmax(axis=1)

# Draw the categorical map from those two columns, save it as HTML and embed it.
m = cdf.to_categorical_map('top_brand_val', 'top_brand_name', legend_title='Accidents by Vehicle Make')
m.save('accidents_by_censustract_categorical.html')
IFrame(src='./accidents_by_censustract_categorical.html', width=700, height=600)

In [12]:
# Rebuild the census-tract categorical map and write it to the same HTML file
# name used by the preceding cell. Nothing is displayed here: the cell ends with
# save() rather than with the map object or an IFrame.
m = cdf.to_categorical_map('top_brand_val', 'top_brand_name', legend_title='Accidents by Vehicle Make')
m.save('accidents_by_censustract_categorical.html')