# Exploration of Crime Dataset

In [34]:
import pandas as pd
import pickle

### We are only considering data from 2011 onward to train the model.

In [4]:
# Load one CSV per year (2011 through 2018) into a list of dataframes.
crimeDFs = [
    pd.read_csv("../data/raw/Crimes_-_" + str(year) + ".csv", header=0)
    for year in range(2011, 2019)
]

In [5]:
# Stack the yearly frames into a single dataframe. ignore_index=True gives the
# combined frame a fresh 0..n-1 index; without it each yearly file contributes
# its own 0-based labels, leaving duplicate index values in the result.
fullDF = pd.concat(crimeDFs, ignore_index=True)

In [6]:
# Drop the reference to the list of per-year frames; the combined data is in fullDF.
del crimeDFs

In [23]:
# Count the rows whose Ward is missing; Ward appears to be the most reliably recorded location field.
print(fullDF["Ward"].isnull().sum())

0


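Since there were just 29 crimes for which the ward was not recorded, we can drop these rows. (The count printed above is 0 because that cell, In [23], was last run after the rows had already been dropped in In [19].)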

In [19]:
# Keep only the rows where Ward is recorded (drops the 29 rows without one).
# .copy() makes fullDF an independent frame rather than a filtered selection of
# the old one, so later column assignments (such as casting Ward to int) do not
# raise SettingWithCopyWarning.
fullDF = fullDF[fullDF["Ward"].notnull()].copy()

In [21]:
# Preview the first rows of the combined crime data.
fullDF.head()

Unnamed: 0,Arrest,Beat,Block,Case Number,Community Area,Date,Description,District,Domestic,FBI Code,...,Latitude,Location,Location Description,Longitude,Primary Type,Updated On,Ward,X Coordinate,Y Coordinate,Year
0,False,631,080XX S VERNON AVE,JB152871,,01/01/2011 10:00:00 AM,FINAN EXPLOIT-ELDERLY/DISABLED,,False,11,...,,,RESIDENCE,,DECEPTIVE PRACTICE,,6.0,,,2011
1,False,823,028XX W 63RD ST,HT636329,,12/17/2011 01:50:00 PM,$500 AND UNDER,,False,6,...,41.77909,"(41.779090123, -87.695158983)",STREET,-87.695159,THEFT,,15.0,1158313.0,1862737.0,2011
2,True,823,028XX W 63RD ST,HT286883,,05/09/2011 10:44:00 AM,POSS: CANNABIS 30GMS OR LESS,,False,18,...,41.77909,"(41.779090123, -87.695158983)",STREET,-87.695159,NARCOTICS,,15.0,1158313.0,1862737.0,2011
3,False,823,028XX W 63RD ST,HT113240,,01/10/2011 11:00:00 AM,RETAIL THEFT,,False,6,...,41.77909,"(41.779090123, -87.695158983)",OTHER,-87.695159,THEFT,,15.0,1158313.0,1862737.0,2011
4,True,1624,040XX N MENARD AVE,HT498458,,09/15/2011 08:00:00 AM,UNLAWFUL ENTRY,,False,5,...,41.953965,"(41.95396528, -87.771918163)",RESIDENCE,-87.771918,BURGLARY,,38.0,1136960.0,1926315.0,2011


In [25]:
# Ward was read as float (e.g. 6.0); with no missing values left it can be
# stored as an integer.
fullDF["Ward"] = fullDF["Ward"].astype("int")

In [26]:
# Preview again to confirm Ward is now shown as an integer (6 rather than 6.0).
fullDF.head()

Unnamed: 0,Arrest,Beat,Block,Case Number,Community Area,Date,Description,District,Domestic,FBI Code,...,Latitude,Location,Location Description,Longitude,Primary Type,Updated On,Ward,X Coordinate,Y Coordinate,Year
0,False,631,080XX S VERNON AVE,JB152871,,01/01/2011 10:00:00 AM,FINAN EXPLOIT-ELDERLY/DISABLED,,False,11,...,,,RESIDENCE,,DECEPTIVE PRACTICE,,6,,,2011
1,False,823,028XX W 63RD ST,HT636329,,12/17/2011 01:50:00 PM,$500 AND UNDER,,False,6,...,41.77909,"(41.779090123, -87.695158983)",STREET,-87.695159,THEFT,,15,1158313.0,1862737.0,2011
2,True,823,028XX W 63RD ST,HT286883,,05/09/2011 10:44:00 AM,POSS: CANNABIS 30GMS OR LESS,,False,18,...,41.77909,"(41.779090123, -87.695158983)",STREET,-87.695159,NARCOTICS,,15,1158313.0,1862737.0,2011
3,False,823,028XX W 63RD ST,HT113240,,01/10/2011 11:00:00 AM,RETAIL THEFT,,False,6,...,41.77909,"(41.779090123, -87.695158983)",OTHER,-87.695159,THEFT,,15,1158313.0,1862737.0,2011
4,True,1624,040XX N MENARD AVE,HT498458,,09/15/2011 08:00:00 AM,UNLAWFUL ENTRY,,False,5,...,41.953965,"(41.95396528, -87.771918163)",RESIDENCE,-87.771918,BURGLARY,,38,1136960.0,1926315.0,2011


### Since the coordinates are frequently not available, we will use the ward to determine the approximate location and map it to a ZIP code using the following ward-office data:

In [10]:
# Load the ward offices table; it lists one office address (including ZIPCODE) per ward.
wardDF = pd.read_csv("../data/raw/Ward_Offices.csv")

In [11]:
# Preview the ward offices table to see which columns are available.
wardDF.head()

Unnamed: 0,WARD,ALDERMAN,ADDRESS,CITY,STATE,ZIPCODE,WARD PHONE,WARD FAX,EMAIL,WEBSITE,LOCATION,CITY HALL ADDRESS,CITY HALL CITY,CITY HALL STATE,CITY HALL ZIPCODE,CITY HALL PHONE
0,1,"Proco ""Joe"" Moreno",2058 North Western Avenue,Chicago,IL,60647,(773) 278-0101,(773) 278-2541,ward01@cityofchicago.org,http://www.ward1.org,"(41.9192937068, -87.6876139388)","121 North LaSalle Street, Room 300",Chicago,IL,60602,(312) 744-3063
1,2,Brian Hopkins,1400 North Ashland Avenue,Chicago,IL,60622,(312)643-2299,,Ward02@cityofchicago.org,http://www.brianhopkinsforchicago.com/,"(41.9070344812, -87.6677379022)","121 North LaSalle Street, Room 200",Chicago,IL,60602,(312) 744-6834
2,3,Pat Dowell,5046 South State Street,Chicago,IL,60609,(773) 373-9273,(773) 373-6852,Pat.Dowell@cityofchicago.org,http://www.dowellfor3rdward.com/,"(41.8023557628, -87.626046431)","121 North LaSalle Street, Room 200",Chicago,IL,60602,(312) 744-8734
3,4,William D. Burns,"435 East 35th Street, 1st Floor",Chicago,IL,60616,(773) 536-8103,(773) 536-7296,ward04@cityofchicago.org,http://www.aldwillburns.com,"(41.8310649884, -87.6155396214)","121 North LaSalle Street, Room 200",Chicago,IL,60602,(312) 744-2690
4,5,Leslie A. Hairston,2325 East 71st Street,Chicago,IL,60649,(773) 324-5555,(773) 324-1585,lhairston@cityofchicago.org,http://www.leslieahairston.com/,"(41.7660318787, -87.5678811948)","121 North LaSalle Street, Room 300",Chicago,IL,60602,(312) 744-6832


In [12]:
# Keep only the two columns needed for the ward -> ZIP code lookup.
wardDFtrim = wardDF.loc[:, ["WARD", "ZIPCODE"]]

In [28]:
# Display the ward -> ZIP code lookup table.
wardDFtrim

Unnamed: 0,WARD,ZIPCODE
0,1,60647
1,2,60622
2,3,60609
3,4,60616
4,5,60649
5,6,60619
6,7,60617
7,8,60619
8,9,60628
9,10,60617


In [29]:
# Rename WARD to "Ward" so it matches the join column in the crime data, and
# ZIPCODE to "Zipcode". index=str converts the row labels to strings.
wardDFtrim = wardDFtrim.rename(
    index=str,
    columns={"WARD": "Ward", "ZIPCODE": "Zipcode"},
)

In [31]:
# Attach each ward's office ZIP code to every crime record. The left join keeps
# every crime row; validate="many_to_one" makes pandas raise a MergeError if a
# ward appears more than once in wardDFtrim, which would otherwise silently
# duplicate crime rows.
updatedDF = fullDF.merge(wardDFtrim, on="Ward", how="left", validate="many_to_one")

In [39]:
# Preview the merged frame; Zipcode is appended as the last column.
updatedDF.head()

Unnamed: 0,Arrest,Beat,Block,Case Number,Community Area,Date,Description,District,Domestic,FBI Code,...,Location,Location Description,Longitude,Primary Type,Updated On,Ward,X Coordinate,Y Coordinate,Year,Zipcode
0,False,631,080XX S VERNON AVE,JB152871,,01/01/2011 10:00:00 AM,FINAN EXPLOIT-ELDERLY/DISABLED,,False,11,...,,RESIDENCE,,DECEPTIVE PRACTICE,,6,,,2011,60619
1,False,823,028XX W 63RD ST,HT636329,,12/17/2011 01:50:00 PM,$500 AND UNDER,,False,6,...,"(41.779090123, -87.695158983)",STREET,-87.695159,THEFT,,15,1158313.0,1862737.0,2011,60636
2,True,823,028XX W 63RD ST,HT286883,,05/09/2011 10:44:00 AM,POSS: CANNABIS 30GMS OR LESS,,False,18,...,"(41.779090123, -87.695158983)",STREET,-87.695159,NARCOTICS,,15,1158313.0,1862737.0,2011,60636
3,False,823,028XX W 63RD ST,HT113240,,01/10/2011 11:00:00 AM,RETAIL THEFT,,False,6,...,"(41.779090123, -87.695158983)",OTHER,-87.695159,THEFT,,15,1158313.0,1862737.0,2011,60636
4,True,1624,040XX N MENARD AVE,HT498458,,09/15/2011 08:00:00 AM,UNLAWFUL ENTRY,,False,5,...,"(41.95396528, -87.771918163)",RESIDENCE,-87.771918,BURGLARY,,38,1136960.0,1926315.0,2011,60634


In [43]:
# Double-check that every row received a Zipcode from the merge (expect 0 missing).
print(updatedDF["Zipcode"].isnull().sum())

0


In [44]:
# Save the distinct ZIP codes found in the merged data to a pickle file.
allZips = updatedDF["Zipcode"].unique()
with open('AllZipCodes.pkl', 'wb') as zip_file:
    pickle.dump(allZips, zip_file)