## Imports

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from shapely import wkt

## Ingest Data

In [3]:
# because the dataset is so big, this takes around 50 seconds on a Macbook Pro.

df = pd.read_csv("../data/raw_parking_data.csv")

## Preliminary EDA

In [4]:
# what the data looks like

df.head()

Unnamed: 0,Citation Number,Citation Issued DateTime,Violation,Violation Description,Citation Location,Vehicle Plate State,Vehicle Plate,Fine Amount,Date Added,geom,Neighborhoods,SF Find Neighborhoods,Current Police Districts,Current Supervisor Districts,Analysis Neighborhoods
0,771857995,10/27/2009 03:36:00 PM,T32A.1,TWAWY ZN#1,669 MISSION ST,CA,6ANP484,83.0,10/27/2009 12:00:00 AM,POINT (-122.40131583199997 37.78670342300006),32.0,32.0,1.0,10.0,8.0
1,771104246,09/14/2009 11:05:00 AM,V5204A,REG TABS,7 GROVE ST,CA,5FEK647,63.0,09/14/2009 12:00:00 AM,POINT (-122.41511664599994 37.77876024500006),21.0,21.0,5.0,10.0,36.0
2,770561131,09/10/2009 03:30:00 PM,T202,PRK METER,2851 24TH ST,CA,6DEG966,53.0,09/10/2009 12:00:00 AM,POINT (-122.40919778399996 37.752743407000025),53.0,53.0,3.0,2.0,20.0
3,770024076,09/07/2009 03:21:00 PM,T202,PRK METER,417 BAY ST,CA,6DKY758,53.0,09/07/2009 12:00:00 AM,POINT (-122.41413599899994 37.80540600300003),106.0,106.0,6.0,3.0,23.0
4,770025093,09/09/2009 01:23:00 PM,T58A,BLK WHEELS,136 LAWTON ST,CA,4JTL645,48.0,09/09/2009 12:00:00 AM,POINT (-122.46446999799997 37.75869600400006),109.0,109.0,10.0,8.0,14.0


In [5]:
# number of rows and columns

print(f"Data has {df.shape[0]} rows and {df.shape[1]} columns")

Data has 19234535 rows and 15 columns


In [6]:
# what are the different types of violations sorted by popularity? looking at the 20 most popular violation types

violation_gb = df.groupby(["Violation", "Violation Description"]).size().sort_values(ascending=False)
violation_gb.head(20)

Violation    Violation Description
TRC7.2.22    STR CLEAN                4833134
T37C         ST CLEANIN               2397076
TRC7.2.23B   MTR OUT DT               1332915
T202         PRK METER                1271179
TRC7.2.20    RES/OT                   1145485
T315A        RESIDENTL                 769895
TRC7.2.23A   METER DTN                 639608
T202.1       PRK MTR #1                622286
V5204A       REG TABS                  607814
V5200        NO PLATES                 378102
V22500E      DRIVEWAY                  374623
V22500H      DBL PARK                  365086
V22500F      ON SIDEWLK                286463
TRC7.2.35    PRK GRADE                 283791
T58A         BLK WHEELS                277545
TRC7.2.40    PRK PROHIB                273685
TRC7.2.26    YEL ZONE                  232837
TRC7.2.41    PK PHB OTD                197878
TRC7.2.101A  FAIL DISPL                171151
T32A.1       TWAWY ZN#1                134705
dtype: int64

In [7]:
# only consider rows with geometry for now. load geometry with shapely

df_hasgeom = df[~df.geom.isnull()]
df_hasgeom.loc[:, 'geom'] = df_hasgeom.geom.apply(wkt.loads)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_hasgeom.loc[:, 'geom'] = df_hasgeom.geom.apply(wkt.loads)


In [8]:
# get latitude and longitude information from points

df_hasgeom[:, 'long'] = df_hasgeom.geom.apply(lambda point: point.x)
df_hasgeom[:, 'lat'] = df_hasgeom.geom.apply(lambda point: point.y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_hasgeom['lat'] = df_hasgeom.geom.apply(lambda point: point.x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_hasgeom['long'] = df_hasgeom.geom.apply(lambda point: point.y)


In [25]:
df_sample = df_hasgeom.sample(n=10000, random_state=42)
df_sample['count'] = 1

In [27]:
fig = px.density_mapbox(df_sample, lat='long', lon='lat', z='count', 
                        mapbox_style="stamen-terrain")
 
fig

In [15]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/earthquakes-23k.csv')

import plotly.express as px
fig = px.density_mapbox(df, lat='Latitude', lon='Longitude', z='Magnitude', radius=10,
                        center=dict(lat=0, lon=180), zoom=0,
                        mapbox_style="stamen-terrain")
fig.show()

In [16]:
df.head()

Unnamed: 0,Date,Latitude,Longitude,Magnitude
0,01/02/1965,19.246,145.616,6.0
1,01/04/1965,1.863,127.352,5.8
2,01/05/1965,-20.579,-173.972,6.2
3,01/08/1965,-59.076,-23.557,5.8
4,01/09/1965,11.938,126.427,5.8


In [17]:
df_sample

Unnamed: 0,Citation Number,Citation Issued DateTime,Violation,Violation Description,Citation Location,Vehicle Plate State,Vehicle Plate,Fine Amount,Date Added,geom,Neighborhoods,SF Find Neighborhoods,Current Police Districts,Current Supervisor Districts,Analysis Neighborhoods,lat,long,count
15871135,866302404,03/21/2016 11:02:00 AM,TRC7.2.35,PRK GRADE,1103 FULTON ST,CA,7RFL201,62.0,03/21/2016 12:00:00 AM,POINT (-122.43494182599994 37.77720898100006),23.0,23.0,4.0,11.0,9.0,-122.434942,37.777209,1
12949222,906463596,02/04/2019 07:09:00 AM,TRC7.2.41,PK PHB OTD,756 O'FARRELL ST,CA,7YZA313,102.0,04/30/2020 12:00:00 AM,POINT (-122.41731141099996 37.78532174800006),20.0,20.0,5.0,10.0,36.0,-122.417311,37.785322,1
16017241,875620410,11/18/2016 10:06:00 AM,TRC7.2.22,STR CLEAN,128 CLARENDON AVE,CA,7CRH733,71.0,12/12/2016 12:00:00 AM,POINT (-122.45209302899997 37.75862299700003),110.0,110.0,7.0,8.0,14.0,-122.452093,37.758623,1
11614142,821791530,01/22/2013 11:26:00 AM,T38C,WHITE ZONE,365 FULTON ST,CA,CESLEY,98.0,01/22/2013 12:00:00 AM,POINT (-122.42278897299997 37.77852201300004),21.0,21.0,4.0,11.0,9.0,-122.422789,37.778522,1
13311909,909409535,04/08/2019 07:12:00 AM,TRC7.2.22,STR CLEAN,1591 30TH AVE,CA,5XFX863,76.0,04/30/2020 12:00:00 AM,POINT (-122.48856944199997 37.75761689500007),39.0,39.0,10.0,7.0,35.0,-122.488569,37.757617,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7547551,790855656,01/14/2011 01:49:00 PM,T202,PRK METER,516 HAYES ST,CA,5L87346,55.0,01/14/2011 12:00:00 AM,POINT (-122.42491202499997 37.77681600200003),22.0,22.0,4.0,11.0,9.0,-122.424912,37.776816,1
16126743,PD28903416,10/22/2016 01:00:00 PM,TRC7.2.45,TMP PK RES,200 NAPOLEON ST,CA,4XOJ611,192.0,10/09/2017 12:00:00 AM,POINT (-122.39886601599994 37.74758600100006),85.0,85.0,2.0,9.0,1.0,-122.398866,37.747586,1
268229,837470222,01/25/2014 04:12:00 PM,TRC7.2.23B,MTR OUT DT,408 CLEMENT ST,CA,5FFL210,64.0,02/22/2018 12:00:00 AM,POINT (-122.46377698299995 37.78322501500003),5.0,5.0,8.0,4.0,11.0,-122.463777,37.783225,1
9015679,788328096,10/06/2010 03:35:00 PM,T202,PRK METER,27 29TH ST,CA,5E49364,55.0,10/06/2010 12:00:00 AM,POINT (-122.42149652899997 37.74395034600008),83.0,83.0,9.0,2.0,2.0,-122.421497,37.743950,1
