# SF Crime Classification

* Treating this as a walkthrough, guidance taken from beginner / intermediate submissions for this

## Imports

In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
# matplotlin inline is necessary for the notebook to display the plots


## Read in the data

In [35]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## Show Data Sample and light EDA

In [36]:
print(train.shape)
print(test.shape)

(878049, 9)
(884262, 7)


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Dates       878049 non-null  object 
 1   Category    878049 non-null  object 
 2   Descript    878049 non-null  object 
 3   DayOfWeek   878049 non-null  object 
 4   PdDistrict  878049 non-null  object 
 5   Resolution  878049 non-null  object 
 6   Address     878049 non-null  object 
 7   X           878049 non-null  float64
 8   Y           878049 non-null  float64
dtypes: float64(2), object(7)
memory usage: 60.3+ MB


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 884262 entries, 0 to 884261
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Id          884262 non-null  int64  
 1   Dates       884262 non-null  object 
 2   DayOfWeek   884262 non-null  object 
 3   PdDistrict  884262 non-null  object 
 4   Address     884262 non-null  object 
 5   X           884262 non-null  float64
 6   Y           884262 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 47.2+ MB


In [10]:
train.head(5)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


### Notes:

* Training data is 878,000 records over 9 variables, no nulls
* Testing data is similar number of records, no nulls
* Category is the target variable, categorical
* ID in test_data ought to be made the index column as in the train data
* Resolution should be dropped because it isn't in the test data, it can't help train the model
* X and Y are spatial coordinates
* Date is just an object - it should be datetime that could help with analysis so let's convert that

In [37]:
print('Number of Categories: ', train.Category.nunique())
print('Number of Days of the Week: ', train.DayOfWeek.nunique())
print('Number of Police Districts: ', train.PdDistrict.nunique())


Number of Categories:  39
Number of Days of the Week:  7
Number of Police Districts:  10


## Feature Engineering and Preprocessing

In [38]:
# drop the resolution column from training
train.drop('Resolution', axis=1, inplace=True)

In [15]:
train.head(3)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,VANNESS AV / GREENWICH ST,-122.424363,37.800414


### Convert Date to Datetime

In [39]:
# check again that there are no nulls in the dates column
print('Number of nulls in Dates: ', train.Dates.isnull().sum())
print('Number of nulls in Dates: ', test.Dates.isnull().sum())

Number of nulls in Dates:  0
Number of nulls in Dates:  0


In [40]:
# are the Dates in the correct format? regex search
assert train.Dates.str.match(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}').all() == True
assert test.Dates.str.match(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}').all() == True

In [41]:
# convert the dates to datetime in new column Date (not plural dates)
train['Date'] = pd.to_datetime(train['Dates'])
test['Date'] = pd.to_datetime(test['Dates'])

# drop the old Dates column
train.drop('Dates', axis=1, inplace=True)
test.drop('Dates', axis=1, inplace=True)
train.head(3)

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Address,X,Y,Date
0,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599,2015-05-13 23:53:00
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599,2015-05-13 23:53:00
2,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,VANNESS AV / GREENWICH ST,-122.424363,37.800414,2015-05-13 23:33:00


In [42]:
# confirm data type now
train['Date'].dtype # don't use dot notation for dtypes incase of more complex column names

dtype('<M8[ns]')

### Was a crime committed at night or during the daytime?

Intuitively this feels like it would predict different types of crimes, so let's engineer it from the Date

In [43]:
# engineer the day / night feature as a binary column, Day = 1, Night = 0
# 6am to 8pm is considered day time

train['IsDay'] = 0
train.loc[ (train.Date.dt.hour > 6) & (train.Date.dt.hour < 20), 'IsDay' ] = 1
test['IsDay'] = 0
test.loc[ (test.Date.dt.hour > 6) & (test.Date.dt.hour < 20), 'IsDay' ] = 1

train.sample(7)

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Address,X,Y,Date,IsDay
127118,WARRANTS,WARRANT ARREST,Wednesday,PARK,6TH AV / IRVING ST,-122.463102,37.764178,2013-09-04 22:40:00,0
661648,ASSAULT,BATTERY,Tuesday,BAYVIEW,1300 Block of SILVER AV,-122.409171,37.73129,2005-12-27 17:15:00,1
72847,VEHICLE THEFT,STOLEN AUTOMOBILE,Wednesday,BAYVIEW,1800 Block of OAKDALE AV,-122.395572,37.737504,2014-05-28 01:00:00,0
380288,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Sunday,BAYVIEW,100 Block of SANSOME ST,-122.400772,37.79166,2010-01-17 15:15:00,1
855221,LARCENY/THEFT,GRAND THEFT FROM A BUILDING,Saturday,TARAVAL,3600 Block of TARAVAL ST,-122.505162,37.741685,2003-04-19 12:16:00,1
219672,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Sunday,RICHMOND,BRODERICK ST / BUSH ST,-122.441727,37.785967,2012-06-03 21:00:00,0
266614,NON-CRIMINAL,FOUND PROPERTY,Tuesday,BAYVIEW,3800 Block of 3RD ST,-122.387939,37.74226,2011-10-04 12:59:00,1


See above: 6pm daytime, 2:12am nighttime...

### Day of the Week may be relevant but it needs to be a numerical format
Encode to integer

In [44]:
# a dictionary to encode the days of the week to integer values
days_to_integer = {
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday': 4,
    'Friday': 5,
    'Saturday': 6,
    'Sunday': 7
}
train['DayOfWeek'] = train['DayOfWeek'].map(days_to_integer)
test['DayOfWeek'] = test['DayOfWeek'].map(days_to_integer)

train.sample(5)

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Address,X,Y,Date,IsDay
717976,VEHICLE THEFT,RECOVERED VEHICLE - STOLEN OUTSIDE SF,5,SOUTHERN,6TH ST / BRYANT ST,-122.402528,37.776038,2005-03-11 17:41:00,1
78233,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,3,SOUTHERN,200 Block of INTERSTATE80 HY,-122.365565,37.809671,2014-04-30 19:00:00,1
400810,OTHER OFFENSES,PAROLE VIOLATION,7,CENTRAL,400 Block of BAY ST,-122.414428,37.805462,2009-09-20 17:44:00,1
71755,ROBBERY,"ROBBERY, BODILY FORCE",5,TENDERLOIN,MARKET ST / 8TH ST,-122.414744,37.778719,2014-05-30 22:17:00,0
465919,LARCENY/THEFT,GRAND THEFT FROM PERSON,5,CENTRAL,200 Block of JACKSON ST,-122.400117,37.796904,2008-10-17 19:00:00,1


### Hour, Month, Year Columns

In [45]:
train['Hour'] = train.Date.dt.hour
train['Month'] = train.Date.dt.month
train['Year'] = train.Date.dt.year
train['Year'] = train['Year'] - 2000 # to make it more readable, all the data is from the 2000s anyway

test['Hour'] = test.Date.dt.hour
test['Month'] = test.Date.dt.month
test['Year'] = test.Date.dt.year
test['Year'] = test['Year'] - 2000

train.sample(1)


Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Address,X,Y,Date,IsDay,Hour,Month,Year
159368,OTHER OFFENSES,MISCELLANEOUS INVESTIGATION,4,SOUTHERN,800 Block of BRYANT ST,-122.403405,37.775421,2013-04-04 17:10:00,1,17,4,13


The encoding of all of these date time features as individual numerical values should help us capture local or temporary spikes e.g. particular crimes that peaked in certain years, nighttime violence etc, seasonal crimes e.g. burglary at Christmas potentially...

The months/days/years are cyclical and we should use Cosine to capture the cyclical nature. 

For example, predictively this will capture that 11pm (23) is very close to 12am (0) despite it being the largest numerical gap in the 24 hour clock. Same with December (12) and January (1) and will lead to more sensible predictions

In [46]:
train['HourCos'] = np.cos(2 * np.pi * train['Hour'] / 24)
train['DayOfWeekCos'] = np.cos(2 * np.pi * train['DayOfWeek'] / 7)
train['MonthCos'] = np.cos(2 * np.pi * train['Month'] / 12)

test['HourCos'] = np.cos(2 * np.pi * test['Hour'] / 24)
test['DayOfWeekCos'] = np.cos(2 * np.pi * test['DayOfWeek'] / 7)
test['MonthCos'] = np.cos(2 * np.pi * test['Month'] / 12)

train.sample(1)

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Address,X,Y,Date,IsDay,Hour,Month,Year,HourCos,DayOfWeekCos,MonthCos
777646,LARCENY/THEFT,PETTY THEFT SHOPLIFTING,7,SOUTHERN,800 Block of MARKET ST,-122.406521,37.785063,2004-05-02 14:40:00,1,14,5,4,-0.866025,1.0,-0.866025


### PD District Encoding with Dummy Variable

In [47]:
train = pd.get_dummies(train, columns=['PdDistrict'])
test = pd.get_dummies(test, columns=['PdDistrict'])

train.sample(1)

Unnamed: 0,Category,Descript,DayOfWeek,Address,X,Y,Date,IsDay,Hour,Month,...,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN
260311,SUSPICIOUS OCC,SUSPICIOUS OCCURRENCE,5,1300 Block of STOCKTON ST,-122.408784,37.7981,2011-11-04 12:12:00,1,12,11,...,False,True,False,False,False,False,False,False,False,False


### Encode Categories

In [48]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train['CategoryCode'] = pd.Series(le.fit_transform(train['Category'])) # PD series to keep the same index and prevent possible misalignments
train.sample(5)

Unnamed: 0,Category,Descript,DayOfWeek,Address,X,Y,Date,IsDay,Hour,Month,...,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN,CategoryCode
859775,LARCENY/THEFT,PETTY THEFT SHOPLIFTING,2,1200 Block of SOUTH VAN NESS AV,-122.416269,37.753159,2003-04-01 17:55:00,1,17,4,...,False,False,True,False,False,False,False,False,False,16
753938,ASSAULT,THREATS AGAINST LIFE,2,1700 Block of HARRISON ST,-122.413354,37.769075,2004-08-31 14:15:00,1,14,8,...,False,False,True,False,False,False,False,False,False,1
674643,VEHICLE THEFT,ATTEMPTED STOLEN VEHICLE,3,1000 Block of PINE ST,-122.413096,37.790512,2005-10-19 21:00:00,0,21,10,...,True,False,False,False,False,False,False,False,False,36
334020,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM",7,100 Block of SOUTH VAN NESS AV,-122.4183,37.771538,2010-09-26 00:01:00,0,0,9,...,False,False,False,False,False,False,True,False,False,35
70393,OTHER OFFENSES,"DRIVERS LICENSE, SUSPENDED OR REVOKED",2,MISSION ST / 17TH ST,-122.419516,37.763429,2014-06-10 16:20:00,1,16,6,...,False,False,True,False,False,False,False,False,False,21


## XGBoost