# Crimes and Streetlight repairs

## Setup

In [1]:
# load our packages
import pandas as pd
pd.set_option('max_columns', 50)

import geopandas as gpd
import numpy as np

% matplotlib  inline

import urllib.request
import geopandas as gpd
import shapely

shapely.speedups.enable()

from pathlib import Path


## Loading data
We want to load (or convert) all of our geo data to the same coordinate system.
[DC uses](https://octo.dc.gov/page/coordinate-system-standards) Maryland state plane coordinates on the 1983 datum (NAD 83), which [has code `EPSG:2804`](http://spatialreference.org/ref/epsg/2804/) and units of meters.

In [2]:
def to_gdf(df, x, y, crs):
    """Build a GeoDataFrame of points from two coordinate columns of `df`.

    Parameters
    ----------
    df : table holding the coordinate columns.
    x, y : names of the columns with the x and y coordinates.
    crs : coordinate reference system passed straight to GeoDataFrame.

    Returns
    -------
    A GeoDataFrame wrapping `df` whose geometry is one Point per row.
    """
    # Pair the two columns row by row and turn each (x, y) pair into a Point.
    points = list(map(shapely.geometry.Point, zip(df[x], df[y])))
    return gpd.GeoDataFrame(df, crs=crs, geometry=points)

def check_download(source_url, target_file):
    """Download `source_url` to `target_file` unless that file already exists.

    Parameters
    ----------
    source_url : URL to fetch.
    target_file : local path (str or Path) the content is stored under.

    Returns
    -------
    None. Progress is reported with print().

    Raises
    ------
    Whatever urllib.request.urlretrieve raises when the fetch fails; in that
    case no file is left at `target_file`.
    """
    target_path = Path(target_file)
    if target_path.is_file():
        print(f'{target_file} exists; skipping download')
        return

    # Report the real source; the message used to print the target twice.
    print(f'downloading {source_url} to {target_file}')

    # Fetch into a sibling '.part' file and rename only once the transfer has
    # finished, so an interrupted download is never mistaken for a complete
    # file by the is_file() check on a later run.
    partial_path = target_path.with_name(target_path.name + '.part')
    try:
        urllib.request.urlretrieve(source_url, str(partial_path))
        partial_path.replace(target_path)
    finally:
        # Only still present if the fetch or the rename failed.
        if partial_path.exists():
            partial_path.unlink()

def download_and_parse(source_url, target_file):
    """Make sure `target_file` is available locally, then read it with geopandas.

    The download step is delegated to check_download; the file is then read
    with gpd.read_file and the result is returned.
    """
    check_download(source_url, target_file)
    print('reading into geopandas')
    parsed = gpd.read_file(target_file)
    return parsed

def download_parse_and_concat(soure_url_and_target_files):
    """Download and parse every (url, target file) pair, then stack the results.

    `soure_url_and_target_files` is an iterable of two-item sequences: the
    source URL at position 0 and the local file name at position 1. The parsed
    frames are concatenated with pd.concat in the order given.
    """
    frames = (
        download_and_parse(pair[0], pair[1])
        for pair in soure_url_and_target_files
    )
    return pd.concat(frames)

In [3]:
# Get our crimes data: one GeoJSON file per year, stacked into a single frame,
# with the date columns parsed and the geometry reprojected to EPSG:2804.
crimes = download_parse_and_concat([
    ['https://opendata.arcgis.com/datasets/38ba41dd74354563bce28a359b59324e_0.geojson', 'crimes_2018.geojson'],
    ["https://opendata.arcgis.com/datasets/6af5cb8dc38e4bcbac8168b27ee104aa_38.geojson", "crimes_2017.geojson"],
    ["https://opendata.arcgis.com/datasets/bda20763840448b58f8383bae800a843_26.geojson", "crimes_2016.geojson"],
]).assign(
    REPORT_DAT=lambda df: pd.to_datetime(df.REPORT_DAT),
    START_DATE=lambda df: pd.to_datetime(df.START_DATE),
    # errors='coerce' turns unparseable END_DATE values into NaT instead of raising.
    END_DATE=lambda df: pd.to_datetime(df.END_DATE, errors='coerce'),
).to_crs(
    # Ask for the target CRS by EPSG code; the {'init': 'epsg:...'} dict form
    # is deprecated in current geopandas/pyproj.
    epsg=2804
)


crimes_2018.geojson exists; skipping download
reading into geopandas
crimes_2017.geojson exists; skipping download
reading into geopandas
crimes_2016.geojson exists; skipping download
reading into geopandas


In [4]:
# Load the streetlight repair records and turn each row into a point.
repairs = to_gdf(
    pd.read_excel(
        'repairs.xlsx',
        thousands=',',
        converters={
            'Day of Datewoclosed': pd.to_datetime
        },
        # '0' is read as a missing value, so the dropna() below discards every
        # row that contains one (or any other missing cell).
        na_values='0'
    ).dropna(),
    x='Woxcoordinate',
    y='Woycoordinate',
    # Plain authority string instead of the deprecated {"init": ...} dict.
    # NOTE(review): assumes the spreadsheet coordinates are already in
    # EPSG:2804 -- confirm against the data source.
    crs="EPSG:2804"
)


##  Filtering

For each repair of a streetlight, we want to create a count of the crimes that happened a week before it, in the surrounding area, and the count for the crimes after it.

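First let's join the streetlight repair data to the crime data, so that there is a row for every nighttime crime that happened near a repair. We then label each of those crimes by whether it started in the week before or the week after the repair: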

In [104]:
# Search radius around each repaired streetlight.
WITHIN_M = 65  # half of city block

# Swap every repair point for a disc of radius WITHIN_M centred on it.
repairs_circles = repairs.assign(geometry=repairs.geometry.buffer(WITHIN_M))
# plot_with_roads(repairs_circles)

# Keep only crimes whose report hour is before 6 or after 19.
# NOTE(review): this filters on REPORT_DAT (report time), not START_DATE --
# confirm that is the intended definition of a night crime.
night_crimes = crimes.loc[
    lambda df: (df.REPORT_DAT.dt.hour > 19) | (df.REPORT_DAT.dt.hour < 6)
]

In [105]:
# Left spatial join: every repair disc is kept, paired with each night crime
# it matches (repairs without a match keep a single row with missing crime columns).
repairs_with_crimes = gpd.sjoin(repairs_circles, night_crimes, how='left')



In [106]:
from dateutil.relativedelta import relativedelta

In [107]:
# Seven-day margin as a plain int64, so it can be compared with timedeltas
# that have been viewed as int64 in the same way.
TIME_MARGIN = pd.to_timedelta('7 day').view('int64')

In [108]:
# Label every joined crime by when it started relative to the repair's close
# date: 'before' covers (-TIME_MARGIN, 0] and 'after' covers (0, TIME_MARGIN].
# Rows that fall outside both windows get no label and drop out of the
# groupby below.
repairs_with_crimes = repairs_with_crimes.assign(
    time_diff=lambda df: pd.cut(
        # View the timedeltas as int64 so they share TIME_MARGIN's representation.
        (df['START_DATE'] - df['Day of Datewoclosed']).view(np.int64),
        bins=[-TIME_MARGIN, 0, TIME_MARGIN],
        labels=['before', 'after']
    )
)
dif = repairs_with_crimes.groupby(['time_diff']).size()
print(dif)
# Look the counts up by label. dif[0] / dif[1] relied on integer keys falling
# back to positional access on a label index, which pandas has deprecated.
print((dif['before'] - dif['after']) / dif['before'])

time_diff
before    376
after     331
dtype: int64
0.119680851064


So we see about 12% fewer nearby nighttime crimes in the week after a repair than in the week before it.

In [109]:
def relative_crime_drop(joined, margin_days):
    """Return (before - after) / before for crimes within `margin_days` of a repair.

    Parameters
    ----------
    joined : frame with one row per (repair, crime) pair, holding the crime's
        'START_DATE' and the repair's 'Day of Datewoclosed'.
    margin_days : width, in days, of the window on each side of the repair.

    Returns
    -------
    The relative change in crime count from the 'before' window
    (-margin, 0] to the 'after' window (0, margin].
    """
    margin = pd.to_timedelta(str(margin_days) + ' day').view(np.int64)
    # View the timedeltas as int64 so they share the margin's representation.
    offsets = (joined['START_DATE'] - joined['Day of Datewoclosed']).view(np.int64)
    labels = pd.cut(offsets, bins=[-margin, 0, margin], labels=['before', 'after'])
    # Rows outside both windows are unlabelled and are not counted.
    counts = labels.value_counts()
    return (counts['before'] - counts['after']) / counts['before']


# Sweep the window from 1 to 14 days. Everything is computed on local values,
# so the notebook-level TIME_MARGIN and the time_diff column from the one-week
# analysis are not overwritten and that cell can still be re-run safely.
for margin_days in range(1, 15):
    print(relative_crime_drop(repairs_with_crimes, margin_days))

0.046511627907
0.132075471698
0.0890410958904
0.151658767773
0.109090909091
0.114906832298
0.119680851064
0.0943396226415
0.0672268907563
0.0772128060264
0.041166380789
0.0237717908082
0.0118694362018
0.0138696255201


And that difference shrinks as the time window widens beyond about a week, which makes sense if lights are not out for very long, and if darker places accrue risk over time.