In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import random
import shapefile

import matplotlib.pyplot as plt
%matplotlib inline

import folium
from folium.plugins import MarkerCluster

from bokeh.plotting import figure, output_file, show, output_notebook
from bokeh.palettes import Category20, OrRd
from bokeh.models import (
    ColumnDataSource,
    HoverTool,
    LogColorMapper, LinearColorMapper,
    Legend, Range1d)
from bokeh.io import show

import pvd_crime as pc

In [2]:
# Configure Bokeh to render the figures passed to show() inline in this notebook.
output_notebook()

*My current plan is to publish this as a blog post, but in the last month or so I haven't had time to get a proper blog up and running. This is the outline for a post using data from April; it will be updated when I set the blog up.*

# When Do Providence Crimes Happen?

Providence only keeps the last 180 days of crime logs available (from what I could find; if anyone has a history of the logs, let me know), so there isn’t quite enough time to make definitive statements about when crime happens, but with # different cases it’s worth taking a look at when crime happened in the last 180 days. I hope to update this information every month or so.

In [3]:
# Read the master crime log and convert the report timestamps to datetimes.
pvd_crimes = pd.read_csv('pvd_crime_master.csv')
pvd_crimes['reported_date'] = pd.to_datetime(pvd_crimes['reported_date'])

# Rows with a zero count are set aside as "non-offenses"; only rows with a
# positive count stay in pvd_crimes for the rest of the analysis.
is_offense = pvd_crimes['counts'] > 0
non_offenses = pvd_crimes[pvd_crimes['counts'] == 0]
pvd_crimes = pvd_crimes[is_offense]

In [4]:
# Total offense count per description, most frequent first.  'counts' is
# selected *before* summing so that the text and datetime columns of the
# frame are never aggregated (summing them fails on current pandas).
counts_of_offenses = (
    pvd_crimes.groupby("offense_desc")['counts']
    .sum()
    .sort_values(ascending=False)
)

# Labels of the 15 most frequent offenses.
top_offenses = counts_of_offenses.head(15).index

How often do the top offenses in the dataset occur?

In [5]:
# Horizontal bar chart: one bar per top offense, bar length = total count.
offense_labels = list(top_offenses)

p = figure(
    x_range=(0, 1500),
    y_range=offense_labels[::-1],
    plot_width=700,
    plot_height=500,
    title="Top 15 Offense in Providence",
    toolbar_location=None,
    tools="",
)
p.hbar(
    y=offense_labels,
    height=0.85,
    right=counts_of_offenses[top_offenses].values,
    color='#ff9896',
)

# Strip the chart chrome: no axis lines, outline, grid lines or y-axis ticks.
p.outline_line_color = None
for axis in (p.xaxis, p.yaxis):
    axis.axis_line_color = None
for grid in (p.xgrid, p.ygrid):
    grid.grid_line_color = None
p.yaxis.major_tick_line_color = None

show(p)

Traffic Violations lead the way (you’d expect that if you drive in RI every day), followed by theft from a motor vehicle and then vandalism. I am a little surprised to see vandalism so high, and would like to know if that makes sense to everyone else. Feel free to email me your thoughts on that.

"Larceny, Other", "Burglary", and "Larceny from Building" all seem similar. Is there overlap in these charges, and are they often part of the same case?

In [6]:
# Boolean row selector for the three similar-sounding theft offenses.
similar_offenses = ['Larceny, Other', 'Burglary', 'Larceny from Building']
mask = pvd_crimes['offense_desc'].isin(similar_offenses)

In [7]:
# Number of selected rows minus the number of distinct case numbers among them.
selected_cases = pvd_crimes[mask].casenumber
len(selected_cases) - len(selected_cases.unique())

16

In [8]:
# Count, per case number, how many rows carry each of the selected offenses.
# pd.crosstab tallies the rows directly, instead of pivoting every column
# with aggfunc=len and then keeping one arbitrary column's counts.
selected = pvd_crimes[mask]
larceny_burglary = pd.crosstab(selected['casenumber'], selected['offense_desc'])

# Row total across whichever offense columns are present in the selection.
larceny_burglary['Total'] = larceny_burglary.sum(axis=1)

# Cases with more than one of these offense rows attached.
larceny_burglary[larceny_burglary['Total'] > 1]

offense_desc,Burglary,Larceny from Building,"Larceny, Other",Total
casenumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-00109327,1,0,1,2
2017-00130319,1,1,0,2
2017-00131354,1,1,0,2
2017-00132706,1,1,0,2
2017-00134052,1,1,0,2
2018-00010631,1,1,0,2
2018-00012517,1,0,1,2
2018-00012612,0,2,0,2
2018-00017154,1,0,1,2
2018-00018551,1,1,0,2


The difference of 16 computed above is the number of rows beyond one per case, so at most 16 cases have more than one of these charges attached, and there are none where all 3 occur. I've reached out to the dataset owner to get clarification on the different offenses.

Providence doesn’t include the category of the offense in the data, but from their weekly PDF reports, we can bin the offense into either violent, property, or other offenses. We will quickly see that violent crime is low(good) compared to other two. 

In [9]:
# Count offenses per category and derive the display labels from the category
# values themselves (e.g. 'other_crime' -> 'Other Crime'), so every label is
# tied to its own bar regardless of the order value_counts() returns.
category_counts = pvd_crimes["offense_cat"].value_counts()
category_counts.index = category_counts.index.str.replace('_', ' ').str.title()
data = pd.DataFrame(category_counts)
category_labels = list(data.index)

p = figure(x_range=(0, 5500), y_range=category_labels[::-1], plot_width=700, plot_height=500,
           title="Providence Offense by Category", toolbar_location=None, tools="")

# Bar lengths are passed as a flat 1-D array of counts.
p.hbar(y=category_labels, height=0.75, right=category_counts.values, color='#ff9896')

# Strip the chart chrome: no axis lines, outline, grid lines or y-axis ticks.
p.xaxis.axis_line_color = None
p.yaxis.axis_line_color = None

p.outline_line_color = None

p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None

p.yaxis.major_tick_line_color = None

show(p)

In [10]:
# Index the offenses by report timestamp so they can be resampled by time.
report_times = pd.DatetimeIndex(pvd_crimes['reported_date'])
time_crime_data = pvd_crimes.set_index(report_times)

# Whole days between the earliest and the latest report in the data.
first_report = pvd_crimes['reported_date'].min()
last_report = pvd_crimes['reported_date'].max()
delta = last_report - first_report
days = delta.days

First we can look at the number of daily crimes reported over the last 6 months or so. There is a massive spike on 11/21, which was the day the most incidents occurred (124).

In [11]:
# Offenses reported per calendar day.
# NOTE(review): .iloc[176:] drops the first 176 rows by position before
# resampling; the reason is not evident from this notebook -- confirm.
daily_crime_count = time_crime_data.iloc[176:].resample('D')['counts'].sum()

p = figure(
    plot_width=950,
    plot_height=500,
    x_axis_type='datetime',
    tools='tap',
    toolbar_location='above',
    title='Daily Crime (PVD)',
)
p.line(
    daily_crime_count.index.values,
    daily_crime_count.values,
    line_width=3,
    color='#ff9896',
)

# The x-axis line stays visible; the y-axis line, outline and grids are hidden.
p.yaxis.axis_line_color = None
p.outline_line_color = None
for grid in (p.xgrid, p.ygrid):
    grid.grid_line_color = None

show(p)

The data is very noisy, there is a small lull around the end of December and beginning of January, but otherwise the amount of crime seems pretty steady. Let’s look at the weekly crime data.

In [12]:
# Offenses reported per week.  The series is resampled with 'W', so it is
# named and titled as weekly data rather than reusing the daily name/title.
# NOTE(review): .iloc[176:] drops the first 176 rows by position before
# resampling; the reason is not evident from this notebook -- confirm.
weekly_crime_count = time_crime_data.iloc[176:].resample('W')['counts'].sum()

p = figure(plot_width=950, x_axis_type='datetime', plot_height=500, tools='tap', toolbar_location='above',
           title='Weekly Crime (PVD)')

p.line(weekly_crime_count.index.values, weekly_crime_count.values,
       line_width=3, color='#ff9896')

# The x-axis line stays visible; the y-axis line, outline and grids are hidden.
p.yaxis.axis_line_color = None

p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None
p.outline_line_color = None

# Fixed y-range starting at zero so the size of the drop is not exaggerated.
p.y_range = Range1d(0, 550)

show(p)

Pretty large drop in total crimes reported in January, it looks large enough to be meaningful, but without more data it's pretty difficult to know.

Let’s take a closer look at the crime on 11/21, the number of incidents that day is 67 offenses above the daily average.

In [13]:
# Average number of offenses per day, over the same row slice as the daily chart.
daily_totals = time_crime_data.iloc[176:].resample('D')['counts'].sum()
mean_daily_crime = daily_totals.mean()

In [14]:
# Number of distinct case numbers reported on 21 November 2017.
# Rows are selected with .loc and an unambiguous ISO date: picking rows with
# frame['<date string>'] is not supported by current pandas.
nov_21 = time_crime_data.loc['2017-11-21']
len(nov_21['casenumber'].unique())

75

In [15]:
# Offense rows on 21 November 2017 beyond the first row of each case, i.e.
# total rows minus distinct case numbers.  This counts the additional
# offenses, not the number of cases that have several offenses.
# Rows are selected with .loc and an ISO date: picking rows with
# frame['<date string>'] is not supported by current pandas.
nov_21 = time_crime_data.loc['2017-11-21']
len(nov_21['casenumber']) - len(nov_21['casenumber'].unique())

49

In [16]:
# Ten most frequent offense descriptions on 21 November 2017.
# Rows are selected with .loc and an ISO date (frame['<date string>'] row
# selection is not supported by current pandas).
nov_21_counts = time_crime_data.loc['2017-11-21']['offense_desc'].value_counts().head(10)
data = pd.DataFrame(nov_21_counts)
offense_labels = list(nov_21_counts.index)

p = figure(x_range=(0, 61), y_range=offense_labels[::-1], plot_width=700, plot_height=500,
           title="Offenses on November 21, 2017", toolbar_location=None, tools="")

# Bar lengths come straight from the counts Series, so the chart does not
# depend on what column name value_counts() gives the resulting frame.
p.hbar(y=offense_labels, height=0.85, right=nov_21_counts.values, color='#ff9896')

# Strip the chart chrome: no axis lines, outline, grid lines or y-axis ticks.
p.xaxis.axis_line_color = None
p.yaxis.axis_line_color = None

p.outline_line_color = None

p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None

p.yaxis.major_tick_line_color = None

show(p)

In [17]:
# Traffic-violation rows reported on 21 November 2017.
# The day is selected with .loc and an ISO date: picking rows with
# frame['<date string>'] is not supported by current pandas.
is_traffic = time_crime_data['offense_desc'] == 'Traffic Violation'
time_crime_data[is_traffic].loc['2017-11-21']

Unnamed: 0_level_0,casenumber,counts,location,month,offense_desc,reported_date,reporting_officer,statute_code,statute_desc,year,offense_cat,city,lat,lon,neighborhood
reported_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2017-11-21 20:29:00,2017-00122747,1,570 Douglas Ave,11,Traffic Violation,2017-11-21 20:29:00,JLanier,31-47-09,Owner/Owner Operating/Operating Without Insura...,2017,other_crime,Providence,41.845614,-71.428934,Wanskuck
2017-11-21 20:29:00,2017-00122747,1,570 Douglas Ave,11,Traffic Violation,2017-11-21 20:29:00,JLanier,31-3-32,Driving with Expired Registration,2017,other_crime,Providence,41.845614,-71.428934,Wanskuck
2017-11-21 19:30:00,2017-00122842,1,Cranston St At Fricker St,11,Traffic Violation,2017-11-21 19:30:00,KMatsumoto,31-22-22(g),No seat belt - Operator,2017,other_crime,Providence,41.817219,-71.42231,Federal Hill
2017-11-21 19:10:00,2017-00122838,1,Broad St At Potters Ave,11,Traffic Violation,2017-11-21 19:10:00,KMatsumoto,31-11-18,"Driving after Denial, Suspension or Revocation...",2017,other_crime,Providence,41.802993,-71.41891,Lower South Providence
2017-11-21 19:10:00,2017-00122838,1,Broad St At Potters Ave,11,Traffic Violation,2017-11-21 19:10:00,KMatsumoto,31-24-1,Times When Lights Required,2017,other_crime,Providence,41.802993,-71.41891,Lower South Providence
2017-11-21 19:10:00,2017-00122838,1,Broad St At Potters Ave,11,Traffic Violation,2017-11-21 19:10:00,KMatsumoto,31-11-20,Permitting Unauthorized Person to Drive,2017,other_crime,Providence,41.802993,-71.41891,Lower South Providence
2017-11-21 18:50:00,2017-00122836,1,Broad St At Potters Ave,11,Traffic Violation,2017-11-21 18:50:00,KMatsumoto,31-22-22(g),No seat belt - Operator,2017,other_crime,Providence,41.802993,-71.41891,Lower South Providence
2017-11-21 18:50:00,2017-00122836,1,Broad St At Potters Ave,11,Traffic Violation,2017-11-21 18:50:00,KMatsumoto,31-47-9,Operating a MV without Evidence of Insurance,2017,other_crime,Providence,41.802993,-71.41891,Lower South Providence
2017-11-21 18:50:00,2017-00122836,1,Broad St At Potters Ave,11,Traffic Violation,2017-11-21 18:50:00,KMatsumoto,31-23.3-5,Windshield/Window - Owner/Operating w/Unlawful...,2017,other_crime,Providence,41.802993,-71.41891,Lower South Providence
2017-11-21 18:50:00,2017-00122836,1,Broad St At Potters Ave,11,Traffic Violation,2017-11-21 18:50:00,KMatsumoto,31-8-2,Operation of MV When Registration Canceled,2017,other_crime,Providence,41.802993,-71.41891,Lower South Providence


The 124 offense rows that day belong to only 75 unique cases, so 49 rows are additional offenses attached to a case that is already listed. The largest number of offenses are Traffic Violations, and most of these have multiple offenses attached to each case number, with each unbuckled seatbelt, license suspension, and actual traffic-related offense receiving its own line in the data.

With the way traffic offenses skew the data, we might want to look at the day with the most offenses that are not related to traffic.

In [18]:
# Daily totals of every offense other than traffic violations.  The series is
# built once and only the 'counts' column is summed, so text/date columns
# are never aggregated.
non_traffic = time_crime_data[time_crime_data['offense_desc'] != 'Traffic Violation']
non_traffic_daily = non_traffic.resample('D')['counts'].sum()

# Day with the highest total, followed by that total as the cell output.
print(non_traffic_daily.idxmax())
non_traffic_daily.max()

2018-02-12 00:00:00


88

When we remove traffic violations, February 12th becomes the day with the most offenses.

In [19]:
# Ten most frequent non-traffic offense descriptions on 12 February 2018.
# Rows are selected with .loc and an ISO date (frame['<date string>'] row
# selection is not supported by current pandas).
non_traffic = time_crime_data[time_crime_data['offense_desc'] != 'Traffic Violation']
feb_12 = non_traffic.loc['2018-02-12']['offense_desc'].value_counts().head(10)
data = pd.DataFrame(feb_12)
offense_labels = list(feb_12.index)

p = figure(x_range=(0, 61), y_range=offense_labels[::-1], plot_width=700, plot_height=500,
           title="Offenses on February 12, 2018", toolbar_location=None, tools="")

# Bar lengths come straight from the counts Series, so the chart does not
# depend on what column name value_counts() gives the resulting frame.
p.hbar(y=offense_labels, height=0.85, right=feb_12.values, color='#ff9896')

# Strip the chart chrome: no axis lines, outline, grid lines or y-axis ticks.
p.xaxis.axis_line_color = None
p.yaxis.axis_line_color = None

p.outline_line_color = None

p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None

p.yaxis.major_tick_line_color = None

show(p)

About 18% of our offense records are additional offenses attached to a case that already appears in the data. I considered filtering to only look at cases, but I don't think that gives an accurate representation of the offenses occurring.

In [20]:
# Share of rows that are beyond the first row of their case number.
total_rows = pvd_crimes.casenumber.shape[0]
distinct_cases = pvd_crimes.casenumber.unique().shape[0]
(total_rows - distinct_cases) / total_rows

0.18252714708785786

The weekly/daily data above would appear to answer our question of when crime happens, but there aren't any clear trends. What if we look at what time of day these crimes tend to occur?

There are 86 unique offenses in the data set, some occurring only once or twice, so we are only going to look at the 15 most common offenses.

**The chart below is interactive; click an offense in the legend on the right to fill in its line color.**

In [21]:
# Total offense count per description, most frequent first.  'counts' is
# selected before summing so text/date columns are never aggregated.
counts_of_offenses = (
    pvd_crimes.groupby("offense_desc")['counts']
    .sum()
    .sort_values(ascending=False)
)
top_offenses = counts_of_offenses.head(15).index

# Rows per (offense, hour of day), divided by the number of days covered.
# After unstack/transpose the frame has one row per hour and one column per
# offense; hours in which an offense never occurred are filled with 0.
hourly_rows = time_crime_data.groupby(['offense_desc', time_crime_data.index.hour])['counts'].count()
for_bokeh = hourly_rows.div(days).unstack().T.fillna(0)

In [22]:
# Interactive line chart: one line per top offense, hourly rate on the y-axis.
p = figure(plot_width=950, plot_height=500, tools='tap', toolbar_location='above',
           title='Hourly Crime Rate (PVD)')

hours = for_bokeh.index.values
legend_it = []
for offense, color in zip(top_offenses, Category20[15]):
    c = p.line(hours, for_bokeh[offense].values,
               line_width=5, color=color, alpha=1,
               muted_color='gray', muted_alpha=0.15)
    # Every line starts muted except 'RI Statute Violation'.
    c.muted = offense != 'RI Statute Violation'
    legend_it.append((offense, [c]))

# Legend placed to the right of the plot; clicking an entry toggles muting.
legend = Legend(items=legend_it, location=(5, 75))
legend.click_policy = "mute"
legend.border_line_color = None
legend.inactive_fill_alpha = 0.45

# Strip the chart chrome: no axis lines, outline or grid lines.
p.outline_line_color = None
for axis in (p.xaxis, p.yaxis):
    axis.axis_line_color = None
for grid in (p.xgrid, p.ygrid):
    grid.grid_line_color = None

p.xaxis.axis_label = 'hour of day'
p.x_range = Range1d(0, 23)

p.add_layout(legend, 'right')

show(p)

Some clear patterns start to emerge. The most obvious, in regard to intuition and the data, is that traffic violations spike during the work commute times and around the time bars close. Car break-ins, on the other hand, peak around 10 p.m., which may not be that surprising, but the other peak around 9:30 a.m. is. It appears that 5 a.m. is the safest time in the city.

Home break-ins (Larceny, Other) climb during the workday, peaking at around 3 p.m., which makes sense. Burglary follows a similar pattern, but actually peaks around 6 p.m., when Larceny, Other is winding down.

In [23]:
# Rows per (offense category, hour of day), divided by the number of days
# covered, reshaped to one row per hour and one column per category.
rows_per_cat_hour = time_crime_data.groupby(['offense_cat', time_crime_data.index.hour]).count()['counts']
offense_occruences_cat = rows_per_cat_hour.div(days).unstack().T

Next we can group the offenses into the 3 broad categories outlined by the Providence Police and chart when they tend to occur.

In [24]:
# Line chart of the hourly rate for each offense category.
p = figure(plot_width=950, plot_height=500, tools='tap',
           toolbar_location='above', title='Hourly Crime Rate by Category')

hours = offense_occruences_cat.index.values
categories = offense_occruences_cat.columns.values

# One (label, [renderer]) legend entry per category line.
legend_it = [
    (category, [p.line(hours, offense_occruences_cat[category].values,
                       line_width=5, color=color, alpha=1)])
    for category, color in zip(categories, Category20[3])
]

legend = Legend(items=legend_it, location=(5, 225))
legend.border_line_color = None

# Hide axis lines and grid lines.
for axis in (p.xaxis, p.yaxis):
    axis.axis_line_color = None
for grid in (p.xgrid, p.ygrid):
    grid.grid_line_color = None

p.xaxis.axis_label = 'hour of day'
p.xaxis.axis_label_text_font = 'helvetica'
p.x_range = Range1d(0, 23)

p.add_layout(legend, 'right')

show(p)

The Other Crime category is heavily influenced by traffic violations, so we mostly see the ebb and flow of that offense when we map the rates by hour. Let’s try removing the Other Crime category entirely.

In [25]:
# Keep only the rows whose category is not 'other_crime'.
not_other_crime = time_crime_data['offense_cat'] != 'other_crime'
no_other_offenses = time_crime_data[not_other_crime]

In [26]:
# Same hourly-rate table as before, rebuilt from the filtered frame:
# rows per (category, hour) divided by the number of days covered.
rows_per_cat_hour = no_other_offenses.groupby(['offense_cat', no_other_offenses.index.hour]).count()['counts']
offense_occruences_cat = rows_per_cat_hour.div(days).unstack().T

In [27]:
# Line chart of the hourly rate for each remaining offense category.
p = figure(
    plot_width=950,
    plot_height=500,
    tools='tap',
    toolbar_location='above',
    title='Hourly Crime Rate by Category',
)

hour_axis = offense_occruences_cat.index.values
legend_it = []
for category, line_color in zip(offense_occruences_cat.columns.values, Category20[3]):
    renderer = p.line(hour_axis, offense_occruences_cat[category].values,
                      line_width=5, color=line_color, alpha=1)
    legend_it.append((category, [renderer]))

legend = Legend(items=legend_it, location=(5, 225))
legend.border_line_color = None

# Hide axis lines and grid lines.
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None
p.xaxis.axis_line_color = None
p.yaxis.axis_line_color = None

p.x_range = Range1d(0, 23)
p.xaxis.axis_label = 'hour of day'
p.xaxis.axis_label_text_font = 'helvetica'

p.add_layout(legend, 'right')

show(p)

Property crimes follow the trend outlined by Larceny, Other in the offense description chart, peaking right before 10 and again around 3. There is no real trend or clear peak in violent crime, which is to be expected. 

With more data we could feel more confident in the times we’ve stated that crime occurs. We could also get a better idea of the seasonality of the crimes, how the weekday vs weekend relation affects times or frequency and even how the weather affects crime rate in Providence. I will be hunting for other, possibly less detailed, datasets that can help answer those questions. 


When I set out to write this article the original title was Where and When Do Providence Crimes Occur, but the when portion has become pretty long, so I’ve broken the post into a second part.

>This part is below, but will be in a separate blog post when published.  

I also realized that the data contains a few (some might argue most) of the major holidays from 2017; click here to view a short analysis of what crimes happen on those holidays and when!

> This follows the 'where' analysis below, but would also be a separate blog post.

## Where Do Crimes Occur?

For this analysis, we are only looking at calls that did not result in a "Non-offense", located in Providence (the filter below also keeps rows whose city is North Providence or Pawtucket).

In [28]:
# Rows that can be placed on the map: a known coordinate AND one of the
# cities of interest.  The coordinate check is combined with the whole city
# test; written as `A & B | C | D` it would bind only to the first city,
# letting rows without coordinates through for the other two.
map_cities = ['Providence', 'North Providence', 'Pawtucket']
has_coordinates = pvd_crimes.lat.notnull() & pvd_crimes.lon.notnull()
mask = has_coordinates & pvd_crimes.city.isin(map_cities)

# Fresh 0..n-1 index on a new frame (no in-place change of a filtered slice).
pvd_crime_locs = pvd_crimes[mask].reset_index(drop=True)

In [29]:
# Map centred on the coordinates in PVD_Coor.
PVD_Coor = (41.824, -71.4128)

pvd_map = folium.Map(location=PVD_Coor, tiles='CartoDB positron', zoom_start=12)

# Every record in the filtered data becomes one marker inside a clustered layer.
marker_cluster = MarkerCluster().add_to(pvd_map)

marker_fields = zip(
    pvd_crime_locs['offense_desc'],
    pvd_crime_locs['reporting_officer'],
    pvd_crime_locs['reported_date'],
    pvd_crime_locs['lat'],
    pvd_crime_locs['lon'],
)
for offense, officer, reported, marker_lat, marker_lon in marker_fields:
    # Popup text: offense, reporting officer and report time, space separated.
    label = ' '.join([str(offense), str(officer), str(reported)])
    popup = folium.Popup(label, parse_html=True)

    marker = folium.Marker(
        location=[marker_lat, marker_lon],
        popup=popup,
        icon=folium.Icon(color='darkblue'),
    )
    marker.add_to(marker_cluster)

In [30]:
# Write the folium map to a standalone HTML file in the working directory.
pvd_map.save("pvd_map_04_10_18.html")

In [37]:
# Embed the saved map HTML file in the notebook through an iframe.
from IPython.display import IFrame
IFrame('pvd_map_04_10_18.html', width=700, height=500)

Above we have mapped the offenses using markers, but alternatively we could use a heat map to look at some of the high reported crime areas. Using all of the offense is a little overwhelming, so we will break out a few of the more broad categories.

First we need to load a shapefile of the neighborhoods and clean up some of the name difference between the shape file and our dataset.

In [38]:
# Open the neighborhood shapefile; its records and shapes are read in the next cells.
sf = shapefile.Reader('hood_shapefile/pvd.shp')

In [39]:
# Read the attribute records once; calling sf.records() inside the loop
# would re-read every record on each iteration.
records = sf.records()

# Field 2 of each record is used as the neighborhood name.
hoods = [record[2] for record in records]

# Geometry for each record, in the same order as `hoods`.
shapes = [sf.shape(i) for i in range(len(records))]

In [40]:
# For every shape, collect the first coordinate of each point into one list
# and the second coordinate into another (one inner list per shape).
county_xs = [[point[0] for point in shape.points] for shape in shapes]
county_ys = [[point[1] for point in shape.points] for shape in shapes]

In [41]:
# Value renames applied one after another, each across the whole frame,
# so the dataset's names line up with the shapefile's neighborhood names.
neighborhood_renames = [
    ('West End Providence', 'West End'),
    ('Ward 13', 'Federal Hill'),
    ('Downtown Providence', 'Downtown'),
    ('Jewelry District', 'Downtown'),
]

crime_heat_map = pvd_crimes
for old_name, new_name in neighborhood_renames:
    crime_heat_map = crime_heat_map.replace(old_name, new_name)

## Which neighborhood has less property crime (break in, theft, vandalism)?

In [42]:
# Number of property-crime rows per neighborhood, in the order of `hoods`.
is_property_crime = crime_heat_map.offense_cat == 'property_crime'
total_property_crimes = [
    crime_heat_map[(crime_heat_map.neighborhood == hood) & is_property_crime].shape[0]
    for hood in hoods
]

In [44]:
import os

from bokeh.io import show
from bokeh.models import (
    ColumnDataSource,
    HoverTool,
    LogColorMapper, LinearColorMapper,
    GMapOptions
)
from bokeh.palettes import OrRd
from bokeh.plotting import figure, gmap
from bokeh.tile_providers import CARTODBPOSITRON

# The Google Maps API key is read from the environment instead of being
# committed in the notebook.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable to draw the Google map.")

# Reversed *copy* of the 5-colour palette: the palette object handed out by
# bokeh is never mutated, and this also works when the palette is a tuple.
palette = list(OrRd[5])[::-1]

map_options = GMapOptions(lat=41.824, lng=-71.4128, map_type="roadmap", zoom=12)

TOOLS = "pan,wheel_zoom,reset,hover,save"

p = gmap(GOOGLE_API_KEY, map_options, title="Total Property Crime Providence Neighborhoods",
         tools=TOOLS, x_axis_location=None, y_axis_location=None)

color_mapper = LinearColorMapper(palette=palette)

# One patch per neighborhood: outline coordinates, name and property-crime total.
source = ColumnDataSource(data=dict(
    x=county_xs,
    y=county_ys,
    name=hoods,
    rate=total_property_crimes,
))

p.grid.grid_line_color = None

# Fill colour of each patch is driven by its 'rate' value through the colour mapper.
p.patches('x', 'y', source=source,
          fill_color={'field': 'rate', 'transform': color_mapper},
          fill_alpha=0.8, line_color="black", line_width=0.5)

hover = p.select_one(HoverTool)
hover.point_policy = "follow_mouse"
hover.tooltips = [
    ("Neighborhood", "@name"),
    ("Total Property Crime", "@rate")
]

p.axis.visible = False

show(p)

## Which neighborhood has the worst drivers?

In [45]:
# Number of traffic-violation rows per neighborhood, in the order of `hoods`.
is_traffic_violation = crime_heat_map.offense_desc == 'Traffic Violation'
total_traffic_violations = [
    crime_heat_map[(crime_heat_map.neighborhood == hood) & is_traffic_violation].shape[0]
    for hood in hoods
]

In [46]:
import os

from bokeh.io import show
from bokeh.models import (
    ColumnDataSource,
    HoverTool,
    LogColorMapper, LinearColorMapper,
    GMapOptions
)
from bokeh.palettes import OrRd
from bokeh.plotting import figure, gmap
from bokeh.tile_providers import CARTODBPOSITRON

# gmap and GMapOptions are imported here as well, so this cell does not
# depend on another cell's imports having run first.

# The Google Maps API key is read from the environment instead of being
# committed in the notebook.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable to draw the Google map.")

# Reversed *copy* of the 5-colour palette: the palette object handed out by
# bokeh is never mutated, and this also works when the palette is a tuple.
palette = list(OrRd[5])[::-1]

map_options = GMapOptions(lat=41.824, lng=-71.4128, map_type="roadmap", zoom=12)

TOOLS = "pan,wheel_zoom,reset,hover,save"

# The title names the quantity actually plotted (traffic violations).
p = gmap(GOOGLE_API_KEY, map_options, title="Total Traffic Violations Providence Neighborhoods",
         tools=TOOLS, x_axis_location=None, y_axis_location=None)

color_mapper = LinearColorMapper(palette=palette)

# One patch per neighborhood: outline coordinates, name and traffic-violation total.
source = ColumnDataSource(data=dict(
    x=county_xs,
    y=county_ys,
    name=hoods,
    rate=total_traffic_violations,
))

p.grid.grid_line_color = None

# Fill colour of each patch is driven by its 'rate' value through the colour mapper.
p.patches('x', 'y', source=source,
          fill_color={'field': 'rate', 'transform': color_mapper},
          fill_alpha=0.8, line_color="black", line_width=0.5)

hover = p.select_one(HoverTool)
hover.point_policy = "follow_mouse"
hover.tooltips = [
    ("Neighborhood", "@name"),
    ("total Traffic Violations", "@rate")
]

p.axis.visible = False

show(p)

## Holidays!

I realized the dataset contains some of the more major holidays, so let's take a look at what is happening crime-wise in Providence on these various days of celebration.

In [47]:
# Calendar date -> holiday name for the holidays looked at below.
holidays = {'2017-10-31':'Halloween', '2017-11-23':'Thanksgiving', '2017-12-23':'Festivus',
            '2017-12-24':'Christmas Eve', '2017-12-25':'Christmas',
            '2017-12-31':"New Year's Eve", '2018-01-01':'New Year Day', '2018-04-01':'April Fools Day'}

holiday_crime_count = {}   # holiday -> offense totals per offense description
holiday_crime_sums = {}    # holiday -> total offenses that day
df_list = []               # that day's rows, tagged with the holiday name

# Each holiday's rows are sliced once and reused for all three results.
# Only the 'counts' column is summed, so text/date columns are never aggregated.
for date, holiday in holidays.items():
    day_rows = time_crime_data.loc[date]
    holiday_crime_count[holiday] = day_rows.groupby('offense_desc')['counts'].sum()
    holiday_crime_sums[holiday] = day_rows['counts'].sum()
    df_list.append(day_rows.assign(holiday=holiday))

holidays_df = pd.concat(df_list)

# Total offenses per holiday, largest first, as a one-column ('counts') frame.
total_crime_on_holiday = pd.DataFrame(
    holidays_df.groupby('holiday')['counts'].sum().sort_values(ascending=False)
)

In [48]:
# Horizontal bar chart of total offenses per holiday.
data = total_crime_on_holiday
holiday_labels = list(data.index)

# The title describes the holiday totals being plotted.
p = figure(x_range=(0, 61), y_range=holiday_labels[::-1], plot_width=700, plot_height=500,
           title="Offenses on Holidays", toolbar_location=None, tools="")

# Bar lengths are the flat 'counts' column, not the frame's nested 2-D values.
p.hbar(y=holiday_labels, height=0.85, right=data['counts'].values, color='#ff9896')

# Strip the chart chrome: no axis lines, outline, grid lines or y-axis ticks.
p.xaxis.axis_line_color = None
p.yaxis.axis_line_color = None

p.outline_line_color = None

p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None

p.yaxis.major_tick_line_color = None

show(p)

In [49]:
# Offense totals per (holiday, offense description).  Only 'counts' is
# summed, so the text and datetime columns are never aggregated.
offenses_on_holidays = holidays_df.groupby(['holiday', 'offense_desc'])[['counts']].sum()

# One bar chart per holiday showing its three most frequent offenses.
for holiday in holidays.values():
    top_3 = offenses_on_holidays.loc[holiday].sort_values('counts', ascending=False)['counts'].head(3)

    data = top_3
    offense_labels = list(data.index)

    p = figure(x_range=(0, 13), y_range=offense_labels[::-1], plot_width=700, plot_height=500,
               title=holiday, toolbar_location=None, tools="")

    p.hbar(y=offense_labels, height=0.85, right=list(data.values), color='#ff9896')

    # Strip the chart chrome: no axis lines, outline, grid lines or y-axis ticks.
    p.xaxis.axis_line_color = None
    p.yaxis.axis_line_color = None

    p.outline_line_color = None

    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None

    p.yaxis.major_tick_line_color = None

    show(p)

You can access Timestamp as pandas.Timestamp
  if pd and isinstance(obj, pd.tslib.Timestamp):


## Non-Offense

The final piece of the data set I wanted to return to was the calls that resulted in a non-offense description. Below we can see that these are often a missing person report, but can also include a general request for assistance, lost and found matters and other events.

In [50]:
# Number of non-offense rows per description, most frequent first.
# NOTE: this rebinds counts_of_offenses / top_offenses, which earlier cells
# computed from the offense data.
rows_per_description = non_offenses.groupby("offense_desc").count()
counts_of_offenses = rows_per_description.sort_values('counts', ascending=False)['counts']
top_offenses = counts_of_offenses[:15].index

In [51]:
# Horizontal bar chart of the 15 most frequent non-offense descriptions.
# The title names the non-offense data actually plotted here.
call_labels = list(top_offenses)

p = figure(x_range=(0, 651), y_range=call_labels[::-1], plot_width=700, plot_height=500,
           title="Top 15 Non-Offense Calls in Providence", toolbar_location=None, tools="")

p.hbar(y=call_labels, height=0.85, right=counts_of_offenses[top_offenses].values, color='#ff9896')

# Strip the chart chrome: no axis lines, outline, grid lines or y-axis ticks.
p.xaxis.axis_line_color = None
p.yaxis.axis_line_color = None

p.outline_line_color = None

p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None

p.yaxis.major_tick_line_color = None

show(p)

In [52]:
# Index the non-offense rows by report timestamp so they can be resampled by time.
non_offense_times = pd.DatetimeIndex(non_offenses['reported_date'])
time_non_data = non_offenses.set_index(non_offense_times)

# Whole days between the earliest and latest non-offense report.
# NOTE: this rebinds `delta` and `days`, which an earlier cell computed from
# the offense data.
first_report = non_offenses['reported_date'].min()
last_report = non_offenses['reported_date'].max()
delta = last_report - first_report
days = delta.days

In [53]:
# Number of non-offense rows reported per calendar day.
# NOTE(review): .iloc[176:] drops the first 176 rows by position before
# resampling; the reason is not evident from this notebook -- confirm it is
# intended for the non-offense frame.
daily_crime_count = time_non_data.iloc[176:].resample('D')['counts'].count()

# The title names the non-offense data actually plotted here.
p = figure(plot_width=950, x_axis_type='datetime', plot_height=500, tools='tap', toolbar_location='above',
           title='Daily Non-Offense Calls (PVD)')

p.line(daily_crime_count.index.values, daily_crime_count.values,
       line_width=3, color='#ff9896')

# The x-axis line stays visible; the y-axis line, outline and grids are hidden.
p.yaxis.axis_line_color = None

p.outline_line_color = None

p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None

show(p)

We can see that the data is very noisy, but also fairly consistent. Let's break out the non-offenses by time of day.

In [54]:
# Number of non-offense rows per description, most frequent first.
counts_of_offenses = non_offenses.groupby("offense_desc")['counts'].count().sort_values(ascending=False)
top_offenses = counts_of_offenses.head(15).index

# Rows per (description, hour of day), divided by the number of days covered.
# After unstack/transpose: one row per hour, one column per description;
# hours in which a description never occurred are filled with 0.
hourly_rows = time_non_data.groupby(['offense_desc', time_non_data.index.hour])['counts'].count()
for_bokeh = hourly_rows.div(days).unstack().T.fillna(0)

# The title names the non-offense data actually plotted here.
p = figure(plot_width=950, plot_height=500, tools='tap', toolbar_location='above',
           title='Hourly Non-Offense Call Rate (PVD)')

# Exactly one line starts un-muted: 'RI Statute Violation' when it is among
# the plotted descriptions, otherwise the most frequent description, so the
# chart never opens with every line greyed out.
highlighted = 'RI Statute Violation'
if highlighted not in top_offenses and len(top_offenses) > 0:
    highlighted = top_offenses[0]

legend_it = []

for offense, color in zip(top_offenses, Category20[15]):
    c = p.line(for_bokeh.index.values, for_bokeh[offense].values,
               line_width=5, color=color, alpha=1,
               muted_color='gray', muted_alpha=0.15)
    c.muted = offense != highlighted
    legend_it.append((offense, [c]))

# Legend placed to the right of the plot; clicking an entry toggles muting.
legend = Legend(items=legend_it, location=(5, 75))
legend.click_policy = "mute"
legend.border_line_color = None

legend.inactive_fill_alpha = 0.45

# Strip the chart chrome: no axis lines, outline or grid lines.
p.xaxis.axis_line_color = None
p.yaxis.axis_line_color = None
p.outline_line_color = None

p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None

p.xaxis.axis_label = 'hour of day'

p.x_range = Range1d(0, 23)

p.add_layout(legend, 'right')

show(p)

We see non-offense calls rarely occur overnight, and most tend to spike around 3 in the afternoon.