## Imports

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from shapely import wkt

## Ingest Data

In [29]:
# Load the full raw citations export. The file is large, so expect this
# to take roughly 50 seconds on a MacBook Pro.

raw_csv_path = "../data/raw_parking_data.csv"
df = pd.read_csv(raw_csv_path)

In [30]:
# preview the first five rows to see which columns we have to work with

df.head(n=5)

Unnamed: 0,Citation Number,Citation Issued DateTime,Violation,Violation Description,Citation Location,Vehicle Plate State,Vehicle Plate,Fine Amount,Date Added,geom,Neighborhoods,SF Find Neighborhoods,Current Police Districts,Current Supervisor Districts,Analysis Neighborhoods
0,771857995,10/27/2009 03:36:00 PM,T32A.1,TWAWY ZN#1,669 MISSION ST,CA,6ANP484,83.0,10/27/2009 12:00:00 AM,POINT (-122.40131583199997 37.78670342300006),32.0,32.0,1.0,10.0,8.0
1,771104246,09/14/2009 11:05:00 AM,V5204A,REG TABS,7 GROVE ST,CA,5FEK647,63.0,09/14/2009 12:00:00 AM,POINT (-122.41511664599994 37.77876024500006),21.0,21.0,5.0,10.0,36.0
2,770561131,09/10/2009 03:30:00 PM,T202,PRK METER,2851 24TH ST,CA,6DEG966,53.0,09/10/2009 12:00:00 AM,POINT (-122.40919778399996 37.752743407000025),53.0,53.0,3.0,2.0,20.0
3,770024076,09/07/2009 03:21:00 PM,T202,PRK METER,417 BAY ST,CA,6DKY758,53.0,09/07/2009 12:00:00 AM,POINT (-122.41413599899994 37.80540600300003),106.0,106.0,6.0,3.0,23.0
4,770025093,09/09/2009 01:23:00 PM,T58A,BLK WHEELS,136 LAWTON ST,CA,4JTL645,48.0,09/09/2009 12:00:00 AM,POINT (-122.46446999799997 37.75869600400006),109.0,109.0,10.0,8.0,14.0


In [31]:
# report the size of the raw frame (rows x columns)

shape_summary = f"Data has {df.shape[0]} rows and {df.shape[1]} columns"
print(shape_summary)

Data has 19234535 rows and 15 columns


## Initial Filtering

In [39]:
# Count citations with no 'SF Find Neighborhoods' value; these rows are
# treated as non-SF citations.

missing_neighborhood = df['SF Find Neighborhoods'].isna()
missing_neighborhood.sum()

6166567

In [40]:
# Keep only the citations that have an 'SF Find Neighborhoods' value,
# i.e. drop the rows counted above as non-SF, then show the new size.

has_sf_neighborhood = df['SF Find Neighborhoods'].notnull()
df = df[has_sf_neighborhood]
df.shape

(13067968, 15)

In [43]:
# Convert the citation issue time from text to datetime64 so it can be
# filtered and sorted chronologically. The values look like
# '10/27/2009 03:36:00 PM' (month/day/year, 12-hour clock with AM/PM), so the
# layout is passed explicitly: pandas then does not have to guess the format,
# which is markedly faster on millions of rows, and any value that does not
# match raises an error instead of being parsed ambiguously.

df['Citation Issued DateTime'] = pd.to_datetime(
    df['Citation Issued DateTime'],
    format='%m/%d/%Y %I:%M:%S %p',
)

In [47]:
# Only look at citations issued from 2020 onwards for now.
# The comparison is inclusive (>=) so that a citation stamped exactly
# 2020-01-01 00:00:00 is kept rather than silently dropped at the boundary.
# The result is sorted chronologically by issue time.

cutoff = pd.Timestamp(2020, 1, 1)
issued_since_cutoff = df['Citation Issued DateTime'] >= cutoff
short_df = df[issued_since_cutoff].sort_values('Citation Issued DateTime')
short_df.shape

(336100, 15)

In [48]:
# Save the filtered subset. index=False leaves the DataFrame index out of
# the written file.
output_csv_path = '../data/parking_data_2020.csv'
short_df.to_csv(output_csv_path, index=False)