# SF Crime Classification

* Treating this as a walkthrough, guidance taken from beginner / intermediate submissions for this

## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
# matplotlin inline is necessary for the notebook to display the plots


## Read in the data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## Show Data Sample and light EDA

In [36]:
print(train.shape)
print(test.shape)

(878049, 9)
(884262, 7)


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Dates       878049 non-null  object 
 1   Category    878049 non-null  object 
 2   Descript    878049 non-null  object 
 3   DayOfWeek   878049 non-null  object 
 4   PdDistrict  878049 non-null  object 
 5   Resolution  878049 non-null  object 
 6   Address     878049 non-null  object 
 7   X           878049 non-null  float64
 8   Y           878049 non-null  float64
dtypes: float64(2), object(7)
memory usage: 60.3+ MB


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 884262 entries, 0 to 884261
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Id          884262 non-null  int64  
 1   Dates       884262 non-null  object 
 2   DayOfWeek   884262 non-null  object 
 3   PdDistrict  884262 non-null  object 
 4   Address     884262 non-null  object 
 5   X           884262 non-null  float64
 6   Y           884262 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 47.2+ MB


In [3]:
train.head(5)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


### Notes:

* Training data is 878,000 records over 9 variables, no nulls
* Testing data is similar number of records, no nulls
* Category is the target variable, categorical
* ID in test_data ought to be made the index column as in the train data
* Resolution should be dropped because it isn't in the test data, it can't help train the model
* X and Y are spatial coordinates
* Date is just an object - it should be datetime that could help with analysis so let's convert that

In [37]:
print('Number of Categories: ', train.Category.nunique())
print('Number of Days of the Week: ', train.DayOfWeek.nunique())
print('Number of Police Districts: ', train.PdDistrict.nunique())

Number of Categories:  39
Number of Days of the Week:  7
Number of Police Districts:  10


## Feature Engineering and Preprocessing

In [4]:
# drop the resolution column from training
train.drop('Resolution', axis=1, inplace=True)

In [5]:
train.head(3)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,VANNESS AV / GREENWICH ST,-122.424363,37.800414


### Convert Date to Datetime

In [6]:
# check again that there are no nulls in the dates column
print('Number of nulls in Dates: ', train.Dates.isnull().sum())
print('Number of nulls in Dates: ', test.Dates.isnull().sum())

Number of nulls in Dates:  0
Number of nulls in Dates:  0


In [7]:
# are the Dates in the correct format? regex search
assert train.Dates.str.match(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}').all() == True
assert test.Dates.str.match(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}').all() == True

In [8]:
# convert the dates to datetime in new column Date (not plural dates)
train['Date'] = pd.to_datetime(train['Dates'])
test['Date'] = pd.to_datetime(test['Dates'])

# drop the old Dates column
train.drop('Dates', axis=1, inplace=True)
test.drop('Dates', axis=1, inplace=True)
train.head(3)

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Address,X,Y,Date
0,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599,2015-05-13 23:53:00
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599,2015-05-13 23:53:00
2,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,VANNESS AV / GREENWICH ST,-122.424363,37.800414,2015-05-13 23:33:00


In [9]:
# confirm data type now
train['Date'].dtype # don't use dot notation for dtypes incase of more complex column names

dtype('<M8[ns]')

### Was a crime committed at night or during the daytime?

Intuitively this feels like it would predict different types of crimes, so let's engineer it from the Date

In [10]:
# engineer the day / night feature as a binary column, Day = 1, Night = 0
# 6am to 8pm is considered day time

train['IsDay'] = 0
train.loc[ (train.Date.dt.hour > 6) & (train.Date.dt.hour < 20), 'IsDay' ] = 1
test['IsDay'] = 0
test.loc[ (test.Date.dt.hour > 6) & (test.Date.dt.hour < 20), 'IsDay' ] = 1

train.sample(7)

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Address,X,Y,Date,IsDay
270601,VEHICLE THEFT,STOLEN TRUCK,Friday,TARAVAL,100 Block of PALOMA AV,-122.471037,37.728943,2011-09-09 22:00:00,0
463060,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Friday,SOUTHERN,5TH ST / BRANNAN ST,-122.398759,37.776569,2008-10-31 20:00:00,0
507220,OTHER OFFENSES,VIOLATION OF RESTRAINING ORDER,Thursday,MISSION,2000 Block of MISSION ST,-122.419658,37.764221,2008-03-20 11:20:00,1
871372,PROSTITUTION,SOLICITS FOR ACT OF PROSTITUTION,Tuesday,MISSION,17TH ST / SHOTWELL ST,-122.41623,37.763634,2003-02-04 22:40:00,0
440034,FRAUD,FALSE ID TO PEACE OFFICER,Thursday,MISSION,2000 Block of MISSION ST,-122.419658,37.764221,2009-03-05 10:02:00,1
125412,NON-CRIMINAL,"AIDED CASE, DOG BITE",Sunday,NORTHERN,1700 Block of CALIFORNIA ST,-122.423157,37.790209,2013-09-08 13:51:00,1
26553,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Wednesday,SOUTHERN,1200 Block of HOWARD ST,-122.412447,37.775634,2015-01-07 14:02:00,1


See above: 6pm daytime, 2:12am nighttime...

### Day of the Week may be relevant but it needs to be a numerical format
Encode to integer

In [11]:
# a dictionary to encode the days of the week to integer values
days_to_integer = {
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday': 4,
    'Friday': 5,
    'Saturday': 6,
    'Sunday': 7
}
train['DayOfWeek'] = train['DayOfWeek'].map(days_to_integer)
test['DayOfWeek'] = test['DayOfWeek'].map(days_to_integer)

train.sample(5)

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Address,X,Y,Date,IsDay
835133,VEHICLE THEFT,TAMPERING WITH A VEHICLE,7,INGLESIDE,200 Block of MONTEREY BL,-122.440906,37.731475,2003-07-27 00:15:00,0
607048,VANDALISM,"MALICIOUS MISCHIEF, GRAFFITI",3,SOUTHERN,0 Block of ISIS ST,-122.414684,37.770111,2006-10-04 02:44:00,0
269991,ASSAULT,BATTERY,7,SOUTHERN,800 Block of BRYANT ST,-122.403405,37.775421,2011-09-11 14:05:00,1
26709,LARCENY/THEFT,GRAND THEFT OF PROPERTY,3,MISSION,4000 Block of 18TH ST,-122.434457,37.760977,2015-01-07 02:32:00,0
573187,OTHER OFFENSES,PEDDLING WITHOUT A LICENSE,5,MISSION,20TH ST / MISSION ST,-122.419053,37.758632,2007-04-06 10:53:00,1


### Hour, Month, Year Columns

In [12]:
train['Hour'] = train.Date.dt.hour
train['Month'] = train.Date.dt.month
train['Year'] = train.Date.dt.year
train['Year'] = train['Year'] - 2000 # to make it more readable, all the data is from the 2000s anyway

test['Hour'] = test.Date.dt.hour
test['Month'] = test.Date.dt.month
test['Year'] = test.Date.dt.year
test['Year'] = test['Year'] - 2000

train.sample(1)


Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Address,X,Y,Date,IsDay,Hour,Month,Year
502705,ASSAULT,BATTERY,2,MISSION,400 Block of SOUTH VAN NESS AV,-122.417583,37.765463,2008-04-15 17:14:00,1,17,4,8


The encoding of all of these date time features as individual numerical values should help us capture local or temporary spikes e.g. particular crimes that peaked in certain years, nighttime violence etc, seasonal crimes e.g. burglary at Christmas potentially...

The months/days/years are cyclical and we should use Cosine to capture the cyclical nature. 

For example, predictively this will capture that 11pm (23) is very close to 12am (0) despite it being the largest numerical gap in the 24 hour clock. Same with December (12) and January (1) and will lead to more sensible predictions

In [13]:
train['HourCos'] = np.cos(2 * np.pi * train['Hour'] / 24)
train['DayOfWeekCos'] = np.cos(2 * np.pi * train['DayOfWeek'] / 7)
train['MonthCos'] = np.cos(2 * np.pi * train['Month'] / 12)

test['HourCos'] = np.cos(2 * np.pi * test['Hour'] / 24)
test['DayOfWeekCos'] = np.cos(2 * np.pi * test['DayOfWeek'] / 7)
test['MonthCos'] = np.cos(2 * np.pi * test['Month'] / 12)

train.sample(1)

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Address,X,Y,Date,IsDay,Hour,Month,Year,HourCos,DayOfWeekCos,MonthCos
318521,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,6,INGLESIDE,500 Block of MELROSE AV,-122.453248,37.733448,2010-12-18 19:00:00,1,19,12,10,0.258819,0.62349,1.0


### PD District Encoding with Dummy Variable

In [14]:
train = pd.get_dummies(train, columns=['PdDistrict'])
test = pd.get_dummies(test, columns=['PdDistrict'])

train.sample(1)

Unnamed: 0,Category,Descript,DayOfWeek,Address,X,Y,Date,IsDay,Hour,Month,...,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN
337047,LARCENY/THEFT,GRAND THEFT FROM A BUILDING,6,600 Block of ALABAMA ST,-122.411918,37.760998,2010-09-11 14:00:00,1,14,9,...,False,False,False,True,False,False,False,False,False,False


### Encode Categories

In [15]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train['CategoryCode'] = pd.Series(le.fit_transform(train['Category'])) # PD series to keep the same index and prevent possible misalignments
train.sample(5)

Unnamed: 0,Category,Descript,DayOfWeek,Address,X,Y,Date,IsDay,Hour,Month,...,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN,CategoryCode
112386,NON-CRIMINAL,LOST PROPERTY,1,TAYLOR ST / BEACH ST,-122.415619,37.807276,2013-11-11 16:10:00,1,16,11,...,True,False,False,False,False,False,False,False,False,20
271558,ASSAULT,AGGRAVATED ASSAULT WITH A DEADLY WEAPON,3,GEARY ST / MASON ST,-122.409877,37.787149,2011-09-07 10:36:00,1,10,9,...,True,False,False,False,False,False,False,False,False,1
457285,LARCENY/THEFT,PETTY THEFT SHOPLIFTING,6,800 Block of HOWARD ST,-122.403793,37.782324,2008-11-29 07:00:00,1,7,11,...,False,False,False,False,False,False,True,False,False,16
179143,ASSAULT,AGGRAVATED ASSAULT WITH A DEADLY WEAPON,5,200 Block of EDDY ST,-122.411778,37.783981,2012-12-28 10:18:00,1,10,12,...,False,False,False,False,False,False,False,False,True,1
515921,RECOVERED VEHICLE,RECOVERED VEHICLE - STOLEN OUTSIDE SF,3,100 Block of DAKOTA ST,-122.395635,37.753565,2008-02-06 14:10:00,1,14,2,...,False,False,False,False,False,False,False,False,False,24


Something we could get out of specific addresses or Block vs intersection? Look at that later.

### Feature Selection

In [16]:
train.columns

Index(['Category', 'Descript', 'DayOfWeek', 'Address', 'X', 'Y', 'Date',
       'IsDay', 'Hour', 'Month', 'Year', 'HourCos', 'DayOfWeekCos', 'MonthCos',
       'PdDistrict_BAYVIEW', 'PdDistrict_CENTRAL', 'PdDistrict_INGLESIDE',
       'PdDistrict_MISSION', 'PdDistrict_NORTHERN', 'PdDistrict_PARK',
       'PdDistrict_RICHMOND', 'PdDistrict_SOUTHERN', 'PdDistrict_TARAVAL',
       'PdDistrict_TENDERLOIN', 'CategoryCode'],
      dtype='object')

Target Column is CategoryCode.

Remember that Id is still present in the test dataset as the first column

In [17]:
feature_cols = ['X', 'Y', 'IsDay', 'DayOfWeek', 'Month', 'Hour', 'Year',
                'PdDistrict_BAYVIEW', 'PdDistrict_CENTRAL', 'PdDistrict_INGLESIDE',
                'PdDistrict_MISSION', 'PdDistrict_NORTHERN', 'PdDistrict_PARK',
                'PdDistrict_RICHMOND', 'PdDistrict_SOUTHERN', 'PdDistrict_TARAVAL', 'PdDistrict_TENDERLOIN']

target_col = 'CategoryCode'

train_x = train[feature_cols]
train_y = train[target_col]

test_ids = test['Id'] # we will need this column for putting into the submission file
test_x = test[feature_cols]

We can add in the Day/Week/Month Cosine after, see initial results with numerical dates first

In [18]:
train_x.sample(3)

Unnamed: 0,X,Y,IsDay,DayOfWeek,Month,Hour,Year,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN
659058,-122.421261,37.801759,1,1,1,18,6,False,False,False,False,True,False,False,False,False,False
362394,-122.500519,37.774615,0,1,5,22,10,False,False,False,False,False,False,True,False,False,False
205322,-122.414758,37.741511,0,3,8,22,12,False,False,True,False,False,False,False,False,False,False


## XGBoost and GridSearchCV

We'll use Cross Validation

In [19]:
type(train_x), type(train_y)

(pandas.core.frame.DataFrame, pandas.core.series.Series)

In [24]:
import xgboost as xgb

# decision matrix is a data structure used by xgboost for optimization
train_xgb = xgb.DMatrix(train_x, label=train_y)
test_xgb = xgb.DMatrix(test_x)

### Parameter tuning and cross-val

In [23]:
params = {
    'max_depth': 4,  # the maximum depth of each tree
    'eta': 0.2,  # the training step for each iteration
    'silent': 1,  # quiet logging so that we don't see warnings
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 39,
}

In [25]:
# cross validation to find the best number of rounds
cv = xgb.cv(params, train_xgb, nfold=3, early_stopping_rounds=10, metrics='mlogloss', verbose_eval=True)

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.



[0]	train-mlogloss:3.31306+0.00150	test-mlogloss:3.31389+0.00159
[1]	train-mlogloss:3.14395+0.00208	test-mlogloss:3.14510+0.00183
[2]	train-mlogloss:3.02596+0.00131	test-mlogloss:3.02735+0.00087
[3]	train-mlogloss:2.93655+0.00108	test-mlogloss:2.93827+0.00118
[4]	train-mlogloss:2.86633+0.00135	test-mlogloss:2.86837+0.00145
[5]	train-mlogloss:2.80848+0.00134	test-mlogloss:2.81081+0.00125
[6]	train-mlogloss:2.76153+0.00101	test-mlogloss:2.76404+0.00084
[7]	train-mlogloss:2.72209+0.00109	test-mlogloss:2.72483+0.00083
[8]	train-mlogloss:2.68916+0.00108	test-mlogloss:2.69215+0.00081
[9]	train-mlogloss:2.66109+0.00074	test-mlogloss:2.66434+0.00058


In [26]:
cv

Unnamed: 0,train-mlogloss-mean,train-mlogloss-std,test-mlogloss-mean,test-mlogloss-std
0,3.313058,0.001504,3.31389,0.001589
1,3.143947,0.002082,3.145095,0.001834
2,3.025962,0.001312,3.027352,0.000872
3,2.936553,0.001076,2.938271,0.001178
4,2.86633,0.001349,2.868367,0.001446
5,2.808477,0.001338,2.810812,0.00125
6,2.761528,0.001009,2.764041,0.000839
7,2.722086,0.001094,2.724828,0.000827
8,2.689157,0.001084,2.692148,0.000806
9,2.661094,0.000738,2.664345,0.000577


### Train the model and make predictions

In [27]:
model = xgb.train(params, train_xgb, num_boost_round=cv.shape[0]) # appears to be 10 rounds
y_pred = model.predict(test_xgb)

Parameters: { "silent" } are not used.



### Set up Submission File

In [28]:
# now to convert y_pred into a dataframe with the id column and the crime categories
columns = ['Id'] + le.classes_ # le.classes_ is the list of crime categories that label encoder creates

output = pd.DataFrame(y_pred, columns=le.classes_) # create a dataframe from the predictions
output.insert(0, 'Id', test_ids) # add the index column to the start of the dataframe which is the IDs
output.to_csv('submission.csv', index=False)