In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from tqdm import tqdm
import re
from datetime import datetime

import warnings
warnings.filterwarnings("ignore")


In [5]:
# Offense descriptions to keep, each mapped to a numeric category code.
# The dictionary is built group by group so the code for every offense is
# visible at a glance; key order matches the order of the groups below.
Service_dict = {
    offense: category
    for category, offenses in (
        (1, ('MURDER & NON-NEGL. MANSLAUGHTER', 'RAPE', 'ROBBERY', 'FELONY ASSAULT')),
        (2, ('BURGLARY', 'GRAND LARCENY', 'GRAND LARCENY OF MOTOR VEHICLE', 'DANGEROUS DRUGS')),
    )
    for offense in offenses
}
# Offense descriptions in dictionary order, used to filter the raw complaints.
CrimeTypes = [*Service_dict]

#### NYPD Complaint

In [2]:
# Open the complaints CSV lazily: with a chunksize, read_csv returns a
# TextFileReader that yields DataFrames of up to 100000 rows each.
# The reader is single-pass, so this cell has to be re-run before the chunks
# can be iterated a second time.
NYPDComplaint = pd.read_csv(
    "../Data/NYPD_Complaint_Data_Historic.csv",
    chunksize=100000,
    iterator=True,
    low_memory=False,
)
NYPDComplaint

<pandas.io.parsers.TextFileReader at 0x296465f9a48>

In [3]:
# Accumulator for the filtered chunks; the pieces are appended one by one and
# concatenated into a single frame afterwards.
dataNYPD_useful = list()

In [7]:
# Columns needed downstream: coordinates, offense description, and the
# complaint start date and time.
wanted_columns = ['X_COORD_CD', 'Y_COORD_CD', "OFNS_DESC", "CMPLNT_FR_DT", "CMPLNT_FR_TM"]

for chunk in tqdm(NYPDComplaint):
    # Each stage produces a new frame instead of mutating a selection in place
    # (dropna(inplace=True) / column assignment on a filtered frame), which is
    # the pattern SettingWithCopyWarning is raised for.
    chunk = chunk[wanted_columns].dropna(axis=0, how='any')
    # Keep only the offenses listed in CrimeTypes.
    chunk = chunk[chunk["OFNS_DESC"].isin(CrimeTypes)]
    # The year is the last "/"-separated piece of CMPLNT_FR_DT.
    chunk = chunk.assign(year=chunk["CMPLNT_FR_DT"].apply(lambda x: int(x.split("/")[-1])))
    # Restrict to complaints dated 2015 through 2019 (inclusive).
    chunk = chunk[(chunk.year>=2015)&(chunk.year<=2019)]
    dataNYPD_useful.append(chunk)

0it [00:00, ?it/s]


In [39]:
def _coord_to_int(value):
    """Convert one raw coordinate cell to an ``int``.

    Strings have every non-digit character (e.g. thousands separators)
    removed before conversion. Only the part before a decimal point is used,
    so ``"1001868.0"`` yields 1001868 rather than 10018680. Values that are
    not strings are passed to ``int`` directly instead of failing when
    iterated character by character.

    Raises
    ------
    ValueError
        If a string contains no digits before the decimal point.
    """
    if isinstance(value, str):
        whole_part = value.split(".", 1)[0]
        return int("".join(filter(str.isdigit, whole_part)))
    # Already numeric, e.g. if pandas parsed the column as numbers for a chunk.
    return int(value)


# Stack the filtered chunks into one frame with a fresh 0..n-1 index.
finalNYPDdata = pd.concat(dataNYPD_useful,axis=0,ignore_index=True)
# Both coordinate columns get the same cleanup.
for coord_column in ("X_COORD_CD", "Y_COORD_CD"):
    finalNYPDdata[coord_column] = finalNYPDdata[coord_column].apply(_coord_to_int)
finalNYPDdata.head()

Unnamed: 0,X_COORD_CD,Y_COORD_CD,OFNS_DESC,CMPLNT_FR_DT,CMPLNT_FR_TM,year
0,1001868,183705,FELONY ASSAULT,07/23/2016,02:45:00,2016
1,1019366,259474,FELONY ASSAULT,02/26/2017,04:20:00,2017
2,986367,185688,DANGEROUS DRUGS,08/14/2017,21:00:00,2017
3,1002228,234677,DANGEROUS DRUGS,09/18/2016,23:15:00,2016
4,988554,216447,DANGEROUS DRUGS,03/17/2016,00:20:00,2016


In [40]:
# Number of rows in the combined, filtered complaints frame.
len(finalNYPDdata)

580256

#### Map NYPD complaints to ZIP codes using the ZIP-code shapefile

In [41]:
# Read the ZIP-code shapefile with geopandas and preview its first rows.
zipcode_shapefile = '../Data/ZIPCODE/ZIP_CODE_040114.shp'
zipcode = gpd.read_file(zipcode_shapefile)
zipcode.head()

Unnamed: 0,ZIPCODE,BLDGZIP,PO_NAME,POPULATION,AREA,STATE,COUNTY,ST_FIPS,CTY_FIPS,URL,SHAPE_AREA,SHAPE_LEN,geometry
0,11436,0,Jamaica,18681.0,22699300.0,NY,Queens,36,81,http://www.usps.com/,0.0,0.0,"POLYGON ((1038098.252 188138.380, 1038141.936 ..."
1,11213,0,Brooklyn,62426.0,29631000.0,NY,Kings,36,47,http://www.usps.com/,0.0,0.0,"POLYGON ((1001613.713 186926.440, 1002314.243 ..."
2,11212,0,Brooklyn,83866.0,41972100.0,NY,Kings,36,47,http://www.usps.com/,0.0,0.0,"POLYGON ((1011174.276 183696.338, 1011373.584 ..."
3,11225,0,Brooklyn,56527.0,23698630.0,NY,Kings,36,47,http://www.usps.com/,0.0,0.0,"POLYGON ((995908.365 183617.613, 996522.848 18..."
4,11218,0,Brooklyn,72280.0,36868800.0,NY,Kings,36,47,http://www.usps.com/,0.0,0.0,"POLYGON ((991997.113 176307.496, 992042.798 17..."


In [42]:
# (ZIPCODE, geometry) pairs extracted once as a NumPy array for the lookup below.
zipcode_polygon = zipcode[["ZIPCODE", "geometry"]].values


def getZipcode(p, polygons=None):
    """Return the ZIP code of the first polygon that contains ``p``.

    Parameters
    ----------
    p : geometry
        Location to look up; it is passed to each polygon's ``contains``
        method. Assumed to use the same coordinate system as the polygons.
    polygons : iterable of (code, geometry) pairs, optional
        Candidates to search, in order. Defaults to ``zipcode_polygon``.

    Returns
    -------
    int or None
        ``int(code)`` of the first polygon containing ``p``, or ``None`` when
        no polygon contains it.
    """
    if polygons is None:
        polygons = zipcode_polygon
    for code, geometry in polygons:
        if geometry.contains(p):
            return int(code)
    # Explicit so callers can see that unmatched points yield None.
    return None

In [43]:
# Look up the ZIP code for every complaint location. Only the two coordinate
# columns are needed, so they are iterated directly; iterrows() would build a
# Series per row and tie the loop to the frame's exact column list.
zipcode_list = []
for X_COORD_CD, Y_COORD_CD in tqdm(
    zip(finalNYPDdata["X_COORD_CD"], finalNYPDdata["Y_COORD_CD"]),
    total=len(finalNYPDdata),  # lets the progress bar show completion
):
    code = getZipcode(Point(X_COORD_CD, Y_COORD_CD))
    zipcode_list.append(code)

580256it [08:24, 1149.26it/s]


In [44]:
# Attach the looked-up codes as a "zipcode" column (one entry per row, in row
# order).
# NOTE(review): getZipcode yields None for points no polygon contains, so the
# column can hold missing values and display as floats (e.g. 11213.0).
finalNYPDdata = finalNYPDdata.assign(zipcode=zipcode_list)

In [45]:
# Display the frame to check the newly added zipcode column.
finalNYPDdata

Unnamed: 0,X_COORD_CD,Y_COORD_CD,OFNS_DESC,CMPLNT_FR_DT,CMPLNT_FR_TM,year,zipcode
0,1001868,183705,FELONY ASSAULT,07/23/2016,02:45:00,2016,11213.0
1,1019366,259474,FELONY ASSAULT,02/26/2017,04:20:00,2017,10467.0
2,986367,185688,DANGEROUS DRUGS,08/14/2017,21:00:00,2017,11231.0
3,1002228,234677,DANGEROUS DRUGS,09/18/2016,23:15:00,2016,10037.0
4,988554,216447,DANGEROUS DRUGS,03/17/2016,00:20:00,2016,10019.0
...,...,...,...,...,...,...,...
580251,1021600,260152,DANGEROUS DRUGS,10/14/2018,20:00:00,2018,10467.0
580252,1015133,247522,GRAND LARCENY,06/23/2018,09:00:00,2018,10457.0
580253,1041216,191470,GRAND LARCENY,12/14/2018,20:30:00,2018,11433.0
580254,987583,210248,FELONY ASSAULT,01/14/2018,02:50:00,2018,10001.0


In [46]:
# Keep only the columns used from here on (the coordinates are no longer
# needed once the ZIP code is known). The explicit copy makes the result an
# independent frame, so later cells can modify it without acting on a
# selection of the wider frame, the situation SettingWithCopyWarning is about.
finalNYPDdata = finalNYPDdata[["zipcode", "year", "CMPLNT_FR_DT", "OFNS_DESC", "CMPLNT_FR_TM"]].copy()
finalNYPDdata

Unnamed: 0,zipcode,year,CMPLNT_FR_DT,OFNS_DESC,CMPLNT_FR_TM
0,11213.0,2016,07/23/2016,FELONY ASSAULT,02:45:00
1,10467.0,2017,02/26/2017,FELONY ASSAULT,04:20:00
2,11231.0,2017,08/14/2017,DANGEROUS DRUGS,21:00:00
3,10037.0,2016,09/18/2016,DANGEROUS DRUGS,23:15:00
4,10019.0,2016,03/17/2016,DANGEROUS DRUGS,00:20:00
...,...,...,...,...,...
580251,10467.0,2018,10/14/2018,DANGEROUS DRUGS,20:00:00
580252,10457.0,2018,06/23/2018,GRAND LARCENY,09:00:00
580253,11433.0,2018,12/14/2018,GRAND LARCENY,20:30:00
580254,10001.0,2018,01/14/2018,FELONY ASSAULT,02:50:00


In [47]:
# Numeric category code -> label written to the crime_name column.
crimetype_name = {1:"Violent Crime", 2:"Property Crime"}

# Derive everything from a renamed copy instead of mutating finalNYPDdata
# column by column: the frame comes from a column selection, and modifying
# such a selection in place is what SettingWithCopyWarning is raised for.
renamed_complaints = finalNYPDdata.rename(columns={"CMPLNT_FR_DT":"date", "OFNS_DESC":"crime_name"})

# Dates are month/day/year strings; parse them once for month and weekday.
parsed_dates = pd.to_datetime(renamed_complaints["date"], format="%m/%d/%Y")

# Offense description -> category code. A plain dictionary lookup is kept so
# an offense missing from Service_dict raises KeyError rather than turning
# into a silent NaN.
crime_type_codes = renamed_complaints["crime_name"].apply(lambda x: Service_dict[x])

finalNYPDdata = renamed_complaints.assign(
    month=parsed_dates.dt.month.astype("int64"),
    # Hour is the integer before the first ":" of the time string; it is read
    # as text rather than parsed as a clock time, so the value is kept as is.
    timeinterval=renamed_complaints["CMPLNT_FR_TM"].str.split(":").str[0].astype("int64"),
    # dayofweek runs 0 (Monday) .. 6 (Sunday); 5 and 6 are the weekend.
    isWeekend=(parsed_dates.dt.dayofweek >= 5).astype("int64"),
    crime_type=crime_type_codes,
    # Replace the offense description with its category label.
    crime_name=crime_type_codes.apply(lambda x: crimetype_name[x]),
)[["zipcode", "date", "year", "month", "timeinterval", "isWeekend", "crime_name", "crime_type"]]

In [48]:
# Preview the first rows of the final feature table.
finalNYPDdata.head()

Unnamed: 0,zipcode,date,year,month,timeinterval,isWeekend,crime_name,crime_type
0,11213.0,07/23/2016,2016,7,2,1,Violent Crime,1
1,10467.0,02/26/2017,2017,2,4,1,Violent Crime,1
2,11231.0,08/14/2017,2017,8,21,0,Property Crime,2
3,10037.0,09/18/2016,2016,9,23,1,Property Crime,2
4,10019.0,03/17/2016,2016,3,0,0,Property Crime,2


In [None]:
# Write the feature table to CrimeData.csv without the row index. to_csv
# documents `index` as a bool, so False is passed explicitly instead of None.
finalNYPDdata.to_csv("CrimeData.csv", index=False)