In [103]:
# force geopandas to use shapely instead of pygeos
import os
os.environ['USE_PYGEOS'] = '0'

import numpy as np
import pandas as pd
import plotly.express as px
from shapely import wkt
from shapely.geometry import Point
import geopandas as gpd
import matplotlib.pyplot as plt

In [104]:
df = pd.read_csv("../data/data_2022_with_geom.csv")
df.head()

Unnamed: 0,Citation Number,Citation Issued DateTime,Violation,Violation Description,Citation Location,Vehicle Plate State,Vehicle Plate,Fine Amount,Date Added,geom
0,950226616,04/20/2022 12:10:00 PM,TRC7.2.22,STR CLEAN,1318 FILBERT ST,CA,V507699,84.0,05/01/2022 12:00:00 AM,POINT (-122.42119800399996 37.800016984000024)
1,952126114,06/17/2022 04:12:00 PM,TRC7.2.20,RES/OT,1318 FILBERT ST,CA,BV44D97,96.0,07/17/2022 12:00:00 AM,POINT (-122.42119800399996 37.800016984000024)
2,950441903,04/23/2022 03:13:00 PM,V5200,NO PLATES,1318 FILBERT ST,CA,V507699,121.0,05/18/2022 12:00:00 AM,POINT (-122.42119800399996 37.800016984000024)
3,948460144,02/22/2022 11:54:00 AM,TRC7.2.20,RES/OT,1318 FILBERT ST,TX,HVT6737,96.0,05/23/2022 12:00:00 AM,POINT (-122.42119800399996 37.800016984000024)
4,948169644,02/17/2022 01:39:00 PM,TRC7.2.20,RES/OT,1318 FILBERT ST,OH,HQA6554,96.0,04/03/2022 12:00:00 AM,POINT (-122.42119800399996 37.800016984000024)


In [105]:
df['geom'] = df.geom.apply(wkt.loads)
df = gpd.GeoDataFrame(df, geometry='geom')
df.crs = 'EPSG:4326'

In [106]:
df['Citation Issued DateTime'] = pd.to_datetime(df['Citation Issued DateTime'])

In [107]:
df['citation_day'] = df['Citation Issued DateTime'].dt.day_name()

## Feature Engineering

Features we need

1. Violation Description
2. Location Lat / Long
3. Weekday
4. Lag Features


Ideas:

1. Length of street?

In [108]:
# order by date for creating lag features / time series cross-validation

df.sort_values('Citation Issued DateTime', inplace=True)

In [109]:
# get only top violation types to simplify modeling

top_violations = df['Violation Description'].value_counts().head(n=5).index
raw_data = df[df['Violation Description'].isin(top_violations)]
raw_data.shape

(785710, 11)

### Create Lag Features

In [110]:
# load and process streets information

street_df = pd.read_csv('../data/street_sweeping.csv')
street_df = street_df[~street_df.Line.isnull()]
street_df.loc[:, 'Line'] = street_df.Line.apply(wkt.loads)
street_df = gpd.GeoDataFrame(street_df, geometry='Line')
street_df.crs = 'EPSG:4326'

In [111]:
street_df['segment_midpoint_x'] = street_df['Line'].apply(lambda line: line.centroid.x)
street_df['segment_midpoint_y'] = street_df['Line'].apply(lambda line: line.centroid.y)

In [112]:
# spatial join - may need to reproject to flat geometry but seems like points are so close together that it would make a neglible difference. 

raw_joined = gpd.sjoin_nearest(raw_data, street_df, distance_col='dist', how='left')




In [113]:
# drop duplicates from spatial join, still need to figure out why this happens

raw_joined = raw_joined.drop_duplicates(subset='Citation Number', keep="first")
assert raw_joined.shape[0] == raw_data.shape[0]

In [114]:
# get x, y data of each citation from geometry column

# raw_joined['x'] = raw_joined.geom.apply(lambda point: point.x)
# raw_joined['y'] = raw_joined.geom.apply(lambda point: point.y)

In [115]:
raw_joined['segment_id'] = raw_joined['Corridor'] + ' from ' + raw_joined['Limits']
raw_joined.head()

Unnamed: 0,Citation Number,Citation Issued DateTime,Violation,Violation Description,Citation Location,Vehicle Plate State,Vehicle Plate,Fine Amount,Date Added,geom,...,Week2,Week3,Week4,Week5,Holidays,BlockSweepID,segment_midpoint_x,segment_midpoint_y,dist,segment_id
1051672,PD36997575,2022-01-01 02:45:00,V5204A,REG TABS,2251 GREENWICH,CA,8XOD966,341.0,01/30/2023 12:00:00 AM,POINT (-122.43672 37.79858),...,0,1,0,0,0,1638225,-122.436596,37.798382,0.000214,Pixley St from Fillmore St - Steiner St
247467,PD36969144,2022-01-01 08:16:00,V5204A,REG TABS,100 YACHT ROAD,CA,7NVL746,121.0,11/20/2022 12:00:00 AM,POINT (-122.44857 37.80643),...,0,1,0,0,0,1640391,-122.448085,37.804952,0.001507,Marina Blvd from Marina Blvd - Lyon St
502110,PD36969155,2022-01-01 08:45:00,V5204A,REG TABS,1551 BAY STREET,CA,8MTL483,121.0,02/01/2023 12:00:00 AM,POINT (-122.43249 37.80311),...,0,1,0,0,0,1612113,-122.432504,37.803162,5.7e-05,Bay St from Laguna St - Buchanan St
494193,PD36969166,2022-01-01 08:51:00,V5204A,REG TABS,3401 LAGUNA STREET,CA,7CTD980,121.0,02/20/2022 12:00:00 AM,POINT (-122.43156 37.80242),...,0,1,0,0,0,1624216,-122.431589,37.802822,5.1e-05,Laguna St from Francisco St - Bay St
247504,PD36969225,2022-01-01 12:17:00,V5204A,REG TABS,100 YACHT ROAD,CA,8FJU233,121.0,09/04/2022 12:00:00 AM,POINT (-122.44857 37.80643),...,0,1,0,0,0,1640391,-122.448085,37.804952,0.001507,Marina Blvd from Marina Blvd - Lyon St


In [116]:
filter_joined = raw_joined[['Citation Issued DateTime', 'Violation Description', 'segment_midpoint_x', 'segment_midpoint_y', 'segment_id']]

In [117]:
# must be a better way to do this
filter_joined.loc[:, 'date'] = filter_joined['Citation Issued DateTime'].dt.date

s_x = filter_joined.groupby(['date', 'segment_id', 'Violation Description']).segment_midpoint_x.nth(0)
s_y = filter_joined.groupby(['date', 'segment_id', 'Violation Description']).segment_midpoint_y.nth(0)

s_num_citations = filter_joined.groupby(['date', 'segment_id', 'Violation Description']).size()

raw_feats = pd.concat([s_x, s_y, s_num_citations], axis=1)
raw_feats.rename({0: 'num_citations'}, axis=1, inplace=True)
raw_feats.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_joined.loc[:, 'date'] = filter_joined['Citation Issued DateTime'].dt.date


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,segment_midpoint_x,segment_midpoint_y,num_citations
date,segment_id,Violation Description,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-01-01,Bay St from Laguna St - Buchanan St,REG TABS,-122.432504,37.803162,1
2022-01-01,Blake St from Geary Blvd - Anza St,REG TABS,-122.450982,37.781303,1
2022-01-01,Laguna St from Francisco St - Bay St,REG TABS,-122.431589,37.802822,1
2022-01-01,Marina Blvd from Marina Blvd - Lyon St,REG TABS,-122.448085,37.804952,4
2022-01-01,Pixley St from Fillmore St - Steiner St,REG TABS,-122.436596,37.798382,1


In [118]:
all_idx = pd.MultiIndex.from_product([filter_joined.date.unique(), filter_joined.segment_id.unique(), 
filter_joined['Violation Description'].unique()], names=['date', 'segment_id', 'citation_type'])
feats_all = raw_feats.reindex(all_idx, fill_value=0)

In [119]:
joined_geometry = raw_joined[['segment_id', 'segment_midpoint_x', 'segment_midpoint_y']].drop_duplicates(subset='segment_id', keep='first').set_index('segment_id')
joined_geometry.rename({'segment_midpoint_x': 's_x', 'segment_midpoint_y': 's_y'}, axis=1, inplace=True)
feats_all = feats_all.join(joined_geometry, how='inner').drop(['segment_midpoint_x', 'segment_midpoint_y'], axis=1)
feats_all.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,num_citations,s_x,s_y
date,segment_id,citation_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-01-01,Pixley St from Fillmore St - Steiner St,REG TABS,1,-122.436596,37.798382
2022-01-01,Pixley St from Fillmore St - Steiner St,STR CLEAN,0,-122.436596,37.798382
2022-01-01,Pixley St from Fillmore St - Steiner St,MTR OUT DT,0,-122.436596,37.798382
2022-01-01,Pixley St from Fillmore St - Steiner St,METER DTN,0,-122.436596,37.798382
2022-01-01,Pixley St from Fillmore St - Steiner St,RES/OT,0,-122.436596,37.798382


In [120]:
feats_all.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,num_citations,s_x,s_y
date,segment_id,citation_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-01-01,Pixley St from Fillmore St - Steiner St,REG TABS,1,-122.436596,37.798382
2022-01-01,Pixley St from Fillmore St - Steiner St,STR CLEAN,0,-122.436596,37.798382
2022-01-01,Pixley St from Fillmore St - Steiner St,MTR OUT DT,0,-122.436596,37.798382
2022-01-01,Pixley St from Fillmore St - Steiner St,METER DTN,0,-122.436596,37.798382
2022-01-01,Pixley St from Fillmore St - Steiner St,RES/OT,0,-122.436596,37.798382


In [121]:
# create time-lagged features

for i in range(1, 15):
    feats_all[f'num_citations_shift{i}'] = feats_all.groupby(level=[1, 2])['num_citations'].shift(i)
feats_all

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,num_citations,s_x,s_y,num_citations_shift1,num_citations_shift2,num_citations_shift3,num_citations_shift4,num_citations_shift5,num_citations_shift6,num_citations_shift7,num_citations_shift8,num_citations_shift9,num_citations_shift10,num_citations_shift11,num_citations_shift12,num_citations_shift13,num_citations_shift14
date,segment_id,citation_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2022-01-01,Pixley St from Fillmore St - Steiner St,REG TABS,1,-122.436596,37.798382,,,,,,,,,,,,,,
2022-01-01,Pixley St from Fillmore St - Steiner St,STR CLEAN,0,-122.436596,37.798382,,,,,,,,,,,,,,
2022-01-01,Pixley St from Fillmore St - Steiner St,MTR OUT DT,0,-122.436596,37.798382,,,,,,,,,,,,,,
2022-01-01,Pixley St from Fillmore St - Steiner St,METER DTN,0,-122.436596,37.798382,,,,,,,,,,,,,,
2022-01-01,Pixley St from Fillmore St - Steiner St,RES/OT,0,-122.436596,37.798382,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-01-26,09th St from Folsom St - Ringold St,REG TABS,0,-122.411341,37.773551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-01-26,09th St from Folsom St - Ringold St,STR CLEAN,0,-122.411341,37.773551,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-01-26,09th St from Folsom St - Ringold St,MTR OUT DT,0,-122.411341,37.773551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-01-26,09th St from Folsom St - Ringold St,METER DTN,0,-122.411341,37.773551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [122]:
feats_all = feats_all.dropna().reset_index()

In [123]:
feats_all.shape

(20614360, 20)

In [124]:
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import r2_score, mean_squared_error

In [125]:
feats_all.head()

Unnamed: 0,date,segment_id,citation_type,num_citations,s_x,s_y,num_citations_shift1,num_citations_shift2,num_citations_shift3,num_citations_shift4,num_citations_shift5,num_citations_shift6,num_citations_shift7,num_citations_shift8,num_citations_shift9,num_citations_shift10,num_citations_shift11,num_citations_shift12,num_citations_shift13,num_citations_shift14
0,2022-01-15,Pixley St from Fillmore St - Steiner St,REG TABS,0,-122.436596,37.798382,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2022-01-15,Pixley St from Fillmore St - Steiner St,STR CLEAN,0,-122.436596,37.798382,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2022-01-15,Pixley St from Fillmore St - Steiner St,MTR OUT DT,3,-122.436596,37.798382,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,2022-01-15,Pixley St from Fillmore St - Steiner St,METER DTN,0,-122.436596,37.798382,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2022-01-15,Pixley St from Fillmore St - Steiner St,RES/OT,0,-122.436596,37.798382,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [126]:
feats_all['date']

0           2022-01-15
1           2022-01-15
2           2022-01-15
3           2022-01-15
4           2022-01-15
               ...    
20614355    2023-01-26
20614356    2023-01-26
20614357    2023-01-26
20614358    2023-01-26
20614359    2023-01-26
Name: date, Length: 20614360, dtype: object

In [127]:
feats_all['weekday'] = feats_all['date'].apply(lambda date: date.weekday())

In [128]:
train = feats_all.head(n=1000000)[feats_all.head(n=1000000)['date'] < pd.to_datetime('2022-2-01')]
train.shape

  train = feats_all.head(n=1000000)[feats_all.head(n=1000000)['date'] < pd.to_datetime('2022-2-01')]


(929560, 21)

In [129]:
valid = feats_all.iloc[900000: 1300000][feats_all.iloc[900000: 1300000]['date'] < pd.to_datetime('2022-2-05')]
valid.shape

  valid = feats_all.iloc[900000: 1300000][feats_all.iloc[900000: 1300000]['date'] < pd.to_datetime('2022-2-05')]


(248280, 21)

In [130]:
test = feats_all.iloc[1300000: 1600000][feats_all.iloc[1300000: 1600000]['date'] < pd.to_datetime('2022-2-10')]
test.shape

  test = feats_all.iloc[1300000: 1600000][feats_all.iloc[1300000: 1600000]['date'] < pd.to_datetime('2022-2-10')]


(121680, 21)

In [131]:
train.columns

Index(['date', 'segment_id', 'citation_type', 'num_citations', 's_x', 's_y',
       'num_citations_shift1', 'num_citations_shift2', 'num_citations_shift3',
       'num_citations_shift4', 'num_citations_shift5', 'num_citations_shift6',
       'num_citations_shift7', 'num_citations_shift8', 'num_citations_shift9',
       'num_citations_shift10', 'num_citations_shift11',
       'num_citations_shift12', 'num_citations_shift13',
       'num_citations_shift14', 'weekday'],
      dtype='object')

In [132]:
train_pool = Pool(train.drop(['date', 'num_citations'], axis=1),
                  label=train.num_citations,
                  cat_features=['segment_id', 'weekday', 'citation_type'])

valid_pool = Pool(valid.drop(['date', 'num_citations'], axis=1),
                  label=valid.num_citations,
                  cat_features=['segment_id', 'weekday', 'citation_type'])

test_pool = Pool(test.drop(['date', 'num_citations'], axis=1),
                  label=test.num_citations,
                  cat_features=['segment_id', 'weekday', 'citation_type'])

In [133]:
model = CatBoostRegressor(objective='Poisson')

model.fit(train_pool, plot=True, eval_set=valid_pool, verbose=500)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.8962967	test: 0.8970256	best: 0.8970256 (0)	total: 315ms	remaining: 5m 14s
500:	learn: 0.0598924	test: 0.1054298	best: 0.0961125 (251)	total: 1m 31s	remaining: 1m 31s
999:	learn: 0.0514366	test: 0.1147172	best: 0.0961125 (251)	total: 3m 2s	remaining: 0us

bestTest = 0.0961125415
bestIteration = 251

Shrink model to first 252 iterations.


<catboost.core.CatBoostRegressor at 0x7fa048ad1a00>

In [134]:
actual_counts = test.num_citations
predicted_counts_poisson = model.predict(test_pool)
r2_poisson = r2_score(actual_counts, predicted_counts_poisson)
rmse_score_poisson_model = np.sqrt(mean_squared_error(actual_counts, predicted_counts_poisson))
print('R2 score: {:.3f}\nRMSE score: {:.2f}'.format(r2_poisson, rmse_score_poisson_model))

R2 score: 0.237
RMSE score: 0.28


In [135]:
test['predicted'] = predicted_counts_poisson

In [147]:
test[[col for col in test.columns if 'shift' in col]].sum(axis=1).value_counts()

0.0      104307
1.0        7537
2.0        3549
3.0        2070
4.0        1197
          ...  
107.0         1
36.0          1
34.0          1
124.0         1
55.0          1
Length: 68, dtype: int64

In [146]:
test[[col for col in test.columns if 'shift' in col]][test.citation_type == 'STR CLEAN'].sum(axis=1).value_counts()

0.0      14077
1.0       4274
2.0       2242
3.0       1320
4.0        777
5.0        517
6.0        316
7.0        208
8.0        151
9.0        131
10.0        94
11.0        59
12.0        29
13.0        23
14.0        22
16.0        17
15.0        16
17.0        10
18.0         8
20.0         5
22.0         4
27.0         4
21.0         3
19.0         3
26.0         2
23.0         2
30.0         2
28.0         2
29.0         2
59.0         2
44.0         2
124.0        1
81.0         1
42.0         1
51.0         1
127.0        1
49.0         1
45.0         1
24.0         1
84.0         1
25.0         1
47.0         1
37.0         1
dtype: int64

In [141]:
r2_score(test[test.citation_type == 'STR CLEAN']['num_citations'], test[test.citation_type == 'STR CLEAN']['predicted'])

0.16346236184610774

In [148]:
test.predicted.astype(int).describe()

count    121680.000000
mean          0.008087
std           0.127242
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          12.000000
Name: predicted, dtype: float64

In [149]:
test.num_citations.describe()

count    121680.000000
mean          0.038067
std           0.315939
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          30.000000
Name: num_citations, dtype: float64