# Complaints data preprocessing

### Setup

In [44]:
import json
import os
import random
from collections import OrderedDict

import pandas as pd
import numpy as np
import geopandas as gpd

import matplotlib  
import matplotlib.pyplot as plt  
from matplotlib import rcParams

In [45]:
from tqdm import tqdm, tqdm_pandas, tqdm_notebook
# Register `progress_apply` on pandas objects, rendered with the notebook progress bar.
# `pandas()` is a classmethod of the bar class itself, so it is called on
# `tqdm_notebook`; passing the bar class as an argument does not select it
# (it is forwarded as a bar argument or rejected, depending on the tqdm version).
tqdm_notebook.pandas()

In [46]:
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [47]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;

<IPython.core.display.Javascript object>

In [48]:
# Show up to 100 columns when a DataFrame is displayed, so wide frames are not elided.
pd.options.display.max_columns = 100

### Helpers

In [49]:
def read_csv_sample(file, sample_size):
    """Read a random sample of rows from a CSV file.

    Helps to avoid processing large files during development.

    Parameters
    ----------
    file : str
        Path of the CSV file. Its first line is treated as the header and is
        always kept.
    sample_size : int
        Number of data rows to keep. If the file has fewer data rows than
        this, every row is returned.

    Returns
    -------
    pandas.DataFrame
        The header plus `sample_size` randomly chosen data rows, in file order.

    Raises
    ------
    ValueError
        If `sample_size` is negative.
    """
    if sample_size < 0:
        raise ValueError("sample_size must be non-negative")

    # Count physical lines; the context manager closes the handle again.
    # NOTE(review): a quoted field containing a newline spans several lines,
    # so this assumes one record per line -- confirm for the input files.
    with open(file) as handle:
        total_lines = sum(1 for _ in handle)

    # Line 0 is the header, so only lines 1..total_lines-1 are candidates
    # for skipping; this keeps the header without any index juggling.
    data_lines = max(total_lines - 1, 0)
    n_skip = max(data_lines - sample_size, 0)
    skip = sorted(random.sample(range(1, total_lines), n_skip))
    return pd.read_csv(file, skiprows=skip)

### Download, read and format complaints data

In [50]:
%%bash

# NYPD Complaint Data Current YTD -- 2016
# https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-YTD/5uac-w243
#
# Download once. The transfer goes to a ".part" file that is renamed only on
# success: `wget -O` creates/truncates its target immediately, so a failed
# transfer straight into the final name would leave an empty file behind and
# the existence check would skip the download on every later run.
target="raw/NYPD_Complaint_Data_Current_YTD.csv"
if [ ! -f "$target" ]; then
  mkdir -p raw
  if wget -q "https://data.cityofnewyork.us/api/views/5uac-w243/rows.csv?accessType=DOWNLOAD" -O "$target.part"; then
    mv "$target.part" "$target"
  else
    rm -f "$target.part"
    echo "download failed: $target" >&2
    exit 1
  fi
fi

In [51]:
# Load the complaints extract. For quick development runs, swap the full read
# for the sampled one:
#   df = read_csv_sample(complaints_csv, 100)
complaints_csv = "raw/NYPD_Complaint_Data_Current_YTD.csv"
df = pd.read_csv(complaints_csv)
df.head(1)

Unnamed: 0,CMPLNT_NUM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,RPT_DT,KY_CD,OFNS_DESC,PD_CD,PD_DESC,CRM_ATPT_CPTD_CD,LAW_CAT_CD,JURIS_DESC,BORO_NM,ADDR_PCT_CD,LOC_OF_OCCUR_DESC,PREM_TYP_DESC,PARKS_NM,HADEVELOPT,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lat_Lon
0,645327550,12/27/2016,22:38:00,12/28/2016,07:05:00,12/28/2016,110,GRAND LARCENY OF MOTOR VEHICLE,441.0,"LARCENY,GRAND OF AUTO",COMPLETED,FELONY,N.Y. POLICE DEPT,MANHATTAN,19,FRONT OF,STREET,,,995467.0,218032.0,40.765125,-73.959508,"(40.765125007, -73.959508312)"


In [52]:
# Source column -> short name used in the rest of the notebook, in output order.
columns = OrderedDict([
    ('CMPLNT_FR_DT', "from_date"),
    ('CMPLNT_FR_TM', "from_time"),
    ('CMPLNT_TO_DT', "to_date"),
    ('CMPLNT_TO_TM', "to_time"),
    ('OFNS_DESC', "offense"),
    ('PD_DESC', "offense_description"),
    ('LOC_OF_OCCUR_DESC', "occurence"),
    ('PREM_TYP_DESC', "premise"),
    ('BORO_NM', "borough"),
    ('Latitude', "latitude"),
    ('Longitude', "longitude"),
])

# Keep only the mapped columns, rename them, and discard complaints that
# have no coordinates (they cannot be matched to a street later on).
df = (
    df[list(columns)]
    .rename(columns=columns)
    .dropna(subset=['latitude', 'longitude'])
)

df.head(1)

Unnamed: 0,from_date,from_time,to_date,to_time,offense,offense_description,occurence,premise,borough,latitude,longitude
0,12/27/2016,22:38:00,12/28/2016,07:05:00,GRAND LARCENY OF MOTOR VEHICLE,"LARCENY,GRAND OF AUTO",FRONT OF,STREET,MANHATTAN,40.765125,-73.959508


### Read and format [street/number to latitude/longitude] dataset

In [53]:
%%bash

# http://results.openaddresses.io/
#
# Download and extract the OpenAddresses US-northeast collection once.
# The download result is checked before unzipping, so a failed transfer is
# reported as such instead of surfacing as an unzip error on an empty archive.
if [ ! -d "raw/openaddr-collected-us_northeast" ]; then
  mkdir -p raw
  if ! wget -q "https://s3.amazonaws.com/data.openaddresses.io/openaddr-collected-us_northeast.zip" -O "raw/openaddr-collected-us_northeast.zip"; then
    echo "download failed: raw/openaddr-collected-us_northeast.zip" >&2
    exit 1
  fi
  ( cd raw && unzip -q openaddr-collected-us_northeast.zip -d openaddr-collected-us_northeast )
fi

In [54]:
# Load the New York City address points. For quick development runs, swap the
# full read for the sampled one:
#   dfs = read_csv_sample(streets_csv, 100)
streets_csv = 'raw/openaddr-collected-us_northeast/us/ny/city_of_new_york.csv'
dfs = pd.read_csv(streets_csv)

def fix_postcode(p):
    """Normalise a postcode value to a string of digits.

    Returns ``str(int(p))`` when `p` converts to an integer (so ``11238.0``
    becomes ``"11238"``) and ``""`` when it does not, e.g. for NaN, None or
    non-numeric text.
    """
    try:
        return str(int(p))
    # Only conversion failures mean "no usable postcode". A bare `except`
    # would also swallow KeyboardInterrupt/SystemExit, making it impossible
    # to interrupt the mapping over a large column.
    except (TypeError, ValueError, OverflowError):
        return ""

# Normalise dtypes: coordinates as floats, postcodes as digit strings
# ("" where the postcode cannot be parsed).
dfs = dfs.assign(
    LAT=dfs.LAT.astype(float),
    LON=dfs.LON.astype(float),
    POSTCODE=dfs.POSTCODE.map(fix_postcode),
)

# Source column -> short name used in the rest of the notebook, in output order.
columns = OrderedDict([
    ('LON', "longitude"),
    ('LAT', "latitude"),
    ('NUMBER', "number"),
    ('STREET', "street"),
    ('POSTCODE', "postcode"),
])

dfs = dfs[list(columns)].rename(columns=columns)
dfs.head(1)

Unnamed: 0,longitude,latitude,number,street,postcode
0,-73.963457,40.680729,932,ATLANTIC AVENUE,11238


### Download census tract geospatial data

In [55]:
%%bash

# https://data.cityofnewyork.us/City-Government/2000-Census-Tracts/ysjj-vb9j
#
# Download once. The transfer goes to a ".part" file that is renamed only on
# success: `wget -O` creates/truncates its target immediately, so a failed
# transfer straight into the final name would leave an empty file behind and
# the existence check would skip the download on every later run.
target="raw/2000 Census Tracts.geojson"
if [ ! -f "$target" ]; then
  mkdir -p raw
  if wget -q "https://data.cityofnewyork.us/api/geospatial/ysjj-vb9j?method=export&format=GeoJSON" -O "$target.part"; then
    mv "$target.part" "$target"
  else
    rm -f "$target.part"
    echo "download failed: $target" >&2
    exit 1
  fi
fi

### Extend [street/number to latitude/longitude] dataset to include borough and census tract

In [60]:
from shapely.geometry import shape, Point, Polygon
from rtree import index 

# Load the census tracts GeoJSON file. Its layout is:
# "features": [
#   "properties":{
#     "ntacode":"MN19",
#     "ntaname":"Turtle Bay-East Midtown",
#     "boro_name":"Manhattan",
#     "puma":"3808"},
#   "geometry": { <polygon data> }
# ], ...
with open('raw/2000 Census Tracts.geojson') as geojson_file:
    js = json.load(geojson_file)

# `geo` maps an integer key to a feature's polygon and tract attributes.
# `idx` is an R-tree over the polygons' bounding boxes, keyed identically,
# so candidate polygons for a point can be found without testing them all.
geo = {}
idx = index.Index()

for key, feature in enumerate(js['features']):
    props = feature['properties']
    outline = shape(feature['geometry'])

    geo[key] = {
        'polygon': outline,
        'borough': props['boro_name'],
        'census_tract': props['ntaname'],
        'census_tract_code': props['ntacode'],
        'puma': props['puma'],
    }

    # `bounds` is the (minx, miny, maxx, maxy) bounding box of the polygon
    idx.insert(key, outline.bounds)
    
def map_borough_census_tract(row):
    """Add borough, census_tract, census_tract_code and puma to a row.

    The row's longitude/latitude are looked up in the `idx` R-tree to get the
    polygons whose bounding box contains the point; the first of those whose
    polygon actually contains the point supplies the values. All four fields
    are "" when no polygon contains the point.
    """
    fields = ('borough', 'census_tract', 'census_tract_code', 'puma')
    lon, lat = row.longitude, row.latitude
    location = Point(lon, lat)

    # start from "not found"; overwritten below on the first real match
    for field in fields:
        row[field] = ""

    # a bounding-box hit is only a candidate, so confirm with the polygon itself
    for key in idx.intersection((lon, lat, lon, lat)):
        entry = geo[key]
        if entry['polygon'].contains(location):
            for field in fields:
                row[field] = entry[field]
            break

    return row

# Tag every street address with its borough / census tract, row by row,
# then save the enriched table.
dfs = dfs.progress_apply(map_borough_census_tract, axis=1)

# Create the output directory first so the write does not fail when it is missing.
os.makedirs('processed', exist_ok=True)
dfs.to_csv('processed/streets.csv.gz', compression="gzip")
dfs.head(1)

101it [00:00, 637.79it/s]            


Unnamed: 0,longitude,latitude,number,street,postcode,borough,census_tract,census_tract_code,puma
0,-73.963457,40.680729,932,ATLANTIC AVENUE,11238,Brooklyn,Prospect Heights,BK64,4006


### Add street, number, borough, census_tract to complaint data

In [57]:
from scipy.spatial import KDTree

# Index the street coordinates in a k-d tree to speed up the lookup of the
# street nearest to each complaint's latitude/longitude.
# https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.KDTree.html
street_coords = dfs[['latitude', 'longitude']].values
tree = KDTree(street_coords)

In [58]:
def map_street(row):
    """Copy the address fields of the nearest street entry onto a complaint row.

    The nearest entry is found by querying the `tree` k-d tree with the row's
    (latitude, longitude); its street, number, postcode, borough and tract
    fields are then written to the row (borough is overwritten).
    """
    # tree.query returns (distance, index); only the index is needed
    _, nearest = tree.query([row.latitude, row.longitude])
    match = dfs.iloc[nearest]

    copied = ('street', 'number', 'postcode', 'borough',
              'census_tract', 'census_tract_code', 'puma')
    for field in copied:
        row[field] = match[field]
    return row

# Geospatial join between the complaints and streets datasets: each complaint
# gets the address fields of its nearest street entry.
df = df.progress_apply(map_street, axis=1)

# Create the output directory first so the write does not fail when it is missing.
os.makedirs('processed', exist_ok=True)
df.to_csv('processed/complaints.csv.gz', compression="gzip")

100it [00:01, 94.09it/s]              


In [59]:
# Spot-check the enriched complaints: the street, number, postcode and tract columns added by map_street should be present.
df.head(1)

Unnamed: 0,from_date,from_time,to_date,to_time,offense,offense_description,occurence,premise,borough,latitude,longitude,street,number,postcode,census_tract,census_tract_code,puma
0,12/27/2016,22:38:00,12/28/2016,07:05:00,GRAND LARCENY OF MOTOR VEHICLE,"LARCENY,GRAND OF AUTO",FRONT OF,STREET,Manhattan,40.765125,-73.959508,EAST 64 STREET,410,10065,Lenox Hill-Roosevelt Island,MN31,3805
