In [2]:
# Import Libraries
%matplotlib inline
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

EDA

In [3]:
# Read the training and test sets from their CSV files (paths are relative to the notebook)
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))

In [5]:
# Report how many distinct values each categorical column holds
for label, column in (
    ('Number of Categories: ', 'Category'),
    ('Number of PdDistricts: ', 'PdDistrict'),
    ('Number of DayOfWeeks: ', 'DayOfWeek'),
):
    print(label, train_df[column].nunique())
print('_________________________________________________')
# Summarise column dtypes, non-null counts and memory usage
train_df.info()

Number of Categories:  39
Number of PdDistricts:  10
Number of DayOfWeeks:  7
_________________________________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Dates       878049 non-null  object 
 1   Category    878049 non-null  object 
 2   Descript    878049 non-null  object 
 3   DayOfWeek   878049 non-null  object 
 4   PdDistrict  878049 non-null  object 
 5   Resolution  878049 non-null  object 
 6   Address     878049 non-null  object 
 7   X           878049 non-null  float64
 8   Y           878049 non-null  float64
dtypes: float64(2), object(7)
memory usage: 60.3+ MB


Data Preprocessing

In [6]:
# Remove the Resolution column from the training frame
# (presumably because it is not available at prediction time — confirm against test.csv)
train_df = train_df.drop(columns=['Resolution'])
train_df.sample(n=1)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Address,X,Y
853559,2003-04-30 17:59:00,OTHER OFFENSES,PAROLE VIOLATION,Wednesday,SOUTHERN,800 Block of MARKET ST,-122.407634,37.784189


In [7]:
# Convert the raw Dates strings into a datetime column named Date, then discard the original column
train_df = train_df.assign(Date=pd.to_datetime(train_df['Dates'])).drop(columns='Dates')
test_df = test_df.assign(Date=pd.to_datetime(test_df['Dates'])).drop(columns='Dates')
train_df.sample(n=1)

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Address,X,Y,Date
97391,LARCENY/THEFT,GRAND THEFT FROM A BUILDING,Saturday,TARAVAL,1300 Block of JUDAH ST,-122.476481,37.761819,2014-01-25 03:30:00


In [8]:
# Flag whether the crime was committed during the day: IsDay is 1 for hours 7..19, otherwise 0
for frame in (train_df, test_df):
    hour_of_day = frame['Date'].dt.hour
    daytime_mask = (hour_of_day > 6) & (hour_of_day < 20)
    frame['IsDay'] = daytime_mask.astype('int64')

train_df.sample(n=3)

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Address,X,Y,Date,IsDay
256589,OTHER OFFENSES,EVADING A POLICE OFFICER RECKLESSLY,Tuesday,BAYVIEW,2300 Block of CESAR CHAVEZ ST,-122.399689,37.749595,2011-11-29 01:07:00,0
378618,OTHER OFFENSES,"DRIVERS LICENSE, SUSPENDED OR REVOKED",Friday,INGLESIDE,0 Block of COTTER ST,-122.433907,37.727721,2010-01-29 02:44:00,0
864713,BURGLARY,"BURGLARY, HOT PROWL, FORCIBLE ENTRY",Thursday,RICHMOND,200 Block of 29TH AV,-122.489409,37.784372,2003-03-06 18:00:00,1


In [9]:
# Encode DayOfWeek in place as an integer (Monday=1 ... Sunday=7).
# No new column is created: the original weekday names are overwritten.
days_to_int_dic = {
        'Monday': 1,
        'Tuesday': 2,
        'Wednesday': 3,
        'Thursday': 4,
        'Friday': 5,
        'Saturday': 6,
        'Sunday': 7,
}
for frame in (train_df, test_df):
    # Only map while the column still holds weekday names. Re-running this cell on
    # already-encoded integers would otherwise turn every value into NaN, because
    # the integers are not keys of the dictionary.
    if not pd.api.types.is_numeric_dtype(frame['DayOfWeek']):
        frame['DayOfWeek'] = frame['DayOfWeek'].map(days_to_int_dic)

train_df.DayOfWeek.unique()

array([3, 2, 1, 7, 6, 5, 4], dtype=int64)

In [10]:
# Derive Hour, Month and Year columns from the Date timestamp, for both frames
for frame in (train_df, test_df):
    timestamp = frame['Date'].dt
    frame['Hour'] = timestamp.hour
    frame['Month'] = timestamp.month
    # Store the year as an offset from 2000 (e.g. 2003 -> 3); this is only a
    # constant shift, chosen for readability
    frame['Year'] = timestamp.year - 2000

train_df.sample(n=1)

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Address,X,Y,Date,IsDay,Hour,Month,Year
821393,VEHICLE THEFT,"VEHICLE, RECOVERED, AUTO",4,CENTRAL,LEAVENWORTH ST / CLAY ST,-122.416116,37.793014,2003-10-02 21:30:00,0,21,10,3


In [11]:
# Encode the periodic time features with a cosine so the end of a cycle sits next to its start.
# NOTE(review): cosine alone is symmetric (e.g. hours 6 and 18 both map to 0); a matching
# sine column would be needed to make the encoding unambiguous. These columns are also not
# part of feature_cols in the feature-selection cell — confirm whether that is intended.
cyclic_specs = (
    ('Hour', 'HourCos', 24),
    ('DayOfWeek', 'DayOfWeekCos', 7),
    ('Month', 'MonthCos', 12),
)
for frame in (train_df, test_df):
    for source_col, cos_col, period in cyclic_specs:
        frame[cos_col] = np.cos((frame[source_col] * 2 * np.pi) / period)

train_df.sample(n=1)

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Address,X,Y,Date,IsDay,Hour,Month,Year,HourCos,DayOfWeekCos,MonthCos
355843,OTHER OFFENSES,"DRIVERS LICENSE, SUSPENDED OR REVOKED",4,BAYVIEW,MCKINNON AV / PHELPS ST,-122.392968,37.737961,2010-06-03 22:25:00,0,22,6,10,0.866025,-0.900969,-1.0


In [12]:
# One-hot encode PdDistrict in both frames; each district becomes a PdDistrict_<NAME> column.
# The frames are encoded independently, so this relies on train and test containing the
# same set of districts.
train_df, test_df = (
    pd.get_dummies(frame, columns=['PdDistrict'])
    for frame in (train_df, test_df)
)
train_df.sample(n=2)

Unnamed: 0,Category,Descript,DayOfWeek,Address,X,Y,Date,IsDay,Hour,Month,...,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN
772637,VEHICLE THEFT,"VEHICLE, RECOVERED, OTHER VEHICLE",5,32ND AV / JUDAH ST,-122.490923,37.761085,2004-05-28 10:10:00,1,10,5,...,0,0,0,0,0,0,0,0,1,0
700466,OTHER OFFENSES,CONSPIRACY,7,800 Block of MARKET ST,-122.406521,37.785063,2005-06-05 17:00:00,1,17,6,...,0,0,0,0,0,0,0,1,0,0


In [13]:
# Label-encode the target: each Category string becomes an integer class id in CategoryInt.
# cat_le is kept so that cat_le.classes_ can later name the prediction columns.
from sklearn.preprocessing import LabelEncoder

cat_le = LabelEncoder()
# Assign the encoded array directly (positionally). Wrapping it in pd.Series would
# re-align on a fresh 0..n-1 index and silently yield NaN / shifted labels if
# train_df's index were ever not the default RangeIndex.
train_df['CategoryInt'] = cat_le.fit_transform(train_df['Category'])
train_df.sample(5)

Unnamed: 0,Category,Descript,DayOfWeek,Address,X,Y,Date,IsDay,Hour,Month,...,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN,CategoryInt
318945,OTHER OFFENSES,TAMPERING WITH A VEHICLE,5,3400 Block of 20TH ST,-122.417424,37.758786,2010-12-17 15:31:00,1,15,12,...,0,0,1,0,0,0,0,0,0,21
141046,SEX OFFENSES FORCIBLE,CHILD ABUSE SEXUAL,6,2400 Block of HARRISON ST,-122.412492,37.758209,2013-06-29 19:21:00,1,19,6,...,0,0,1,0,0,0,0,0,0,28
17922,BURGLARY,"BURGLARY, UNLAWFUL ENTRY",3,200 Block of OFARRELL ST,-122.408496,37.786296,2015-02-18 00:16:00,0,0,2,...,0,0,0,0,0,0,0,0,1,4
227372,OTHER OFFENSES,HARASSING PHONE CALLS,2,1600 Block of OCEAN AV,-122.45961,37.724532,2012-05-01 00:01:00,0,0,5,...,0,0,0,0,0,0,0,1,0,21
406034,OTHER OFFENSES,PROBATION VIOLATION,2,TURK ST / TAYLOR ST,-122.410769,37.783215,2009-09-01 10:04:00,1,10,9,...,0,0,0,0,0,0,0,0,1,21


In [14]:
# Flag addresses that are not of the "... Block of ..." form: InIntersection is 0 when the
# address contains 'Block' and 1 otherwise (e.g. "TURK ST / TAYLOR ST")
for frame in (train_df, test_df):
    is_block_address = frame['Address'].str.contains('Block')
    frame['InIntersection'] = (~is_block_address).astype('int64')

Feature Selection

In [15]:
# List the columns present in train_df after preprocessing (candidates for feature_cols)
train_df.columns

Index(['Category', 'Descript', 'DayOfWeek', 'Address', 'X', 'Y', 'Date',
       'IsDay', 'Hour', 'Month', 'Year', 'HourCos', 'DayOfWeekCos', 'MonthCos',
       'PdDistrict_BAYVIEW', 'PdDistrict_CENTRAL', 'PdDistrict_INGLESIDE',
       'PdDistrict_MISSION', 'PdDistrict_NORTHERN', 'PdDistrict_PARK',
       'PdDistrict_RICHMOND', 'PdDistrict_SOUTHERN', 'PdDistrict_TARAVAL',
       'PdDistrict_TENDERLOIN', 'CategoryInt', 'InIntersection'],
      dtype='object')

In [16]:
# Columns fed to the model: coordinates, time features, the intersection flag and the
# one-hot district indicators. The *Cos columns created earlier are not included.
feature_cols = [
    'X', 'Y', 'IsDay', 'DayOfWeek', 'Month', 'Hour', 'Year', 'InIntersection',
] + [
    'PdDistrict_BAYVIEW', 'PdDistrict_CENTRAL', 'PdDistrict_INGLESIDE',
    'PdDistrict_MISSION', 'PdDistrict_NORTHERN', 'PdDistrict_PARK',
    'PdDistrict_RICHMOND', 'PdDistrict_SOUTHERN', 'PdDistrict_TARAVAL',
    'PdDistrict_TENDERLOIN',
]
# Integer-encoded crime category
target_col = 'CategoryInt'

# Training design matrix and labels
train_x = train_df[feature_cols]
train_y = train_df[target_col]

# Keep the test row ids aside for the submission file; the test matrix uses the same columns
test_ids = test_df['Id']
test_x = test_df[feature_cols]

In [17]:
import xgboost as xgb

# Wrap the feature frames in XGBoost DMatrix objects; only the training matrix carries labels
train_xgb = xgb.DMatrix(data=train_x, label=train_y)
test_xgb = xgb.DMatrix(data=test_x)

In [18]:
# Booster configuration passed to xgb.train / xgb.cv
params = {
    'max_depth': 4,  # the maximum depth of each tree
    'eta': 0.3,  # learning rate (step-size shrinkage applied after each boosting round)
    # 'silent' is no longer recognised by XGBoost (it only triggers a
    # "Parameters: { "silent" } are not used." warning); 'verbosity': 0 is the
    # supported way to request quiet logging.
    'verbosity': 0,
    'objective': 'multi:softprob',  # multiclass objective; predictions are per-class probabilities
    'num_class': 39,  # must equal the number of distinct Category values (39 in train.csv)
}

In [19]:
# Toggle: set to True to run 3-fold cross-validation instead of fitting and writing a submission
CROSS_VAL = False
if CROSS_VAL:
    print('Doing Cross-validation ...')
    # NOTE(review): num_boost_round is left at its default, so early_stopping_rounds=10
    # has little or no room to trigger — consider raising num_boost_round when tuning.
    cv = xgb.cv(params, train_xgb, nfold=3, early_stopping_rounds=10, metrics='mlogloss', verbose_eval=True)
    # A bare expression inside an `if` body is not auto-displayed by the notebook,
    # so print the per-round results explicitly.
    print(cv)

In [20]:
# Fit on the full training set and write the submission file, unless cross-validation was requested
SUBMIT = not CROSS_VAL
if SUBMIT:
    print('Fitting Model ...')
    # Train for 10 boosting rounds
    m = xgb.train(params, train_xgb, 10)
    # With the 'multi:softprob' objective each row of `res` holds one probability per class
    res = m.predict(test_xgb)
    # Name the probability columns after the original category strings; cat_le.classes_
    # is in the same order as the integer labels used for training.
    # (The former `cols = ['Id'] + cat_le.classes_` line was removed: it was unused and
    # performed list + ndarray arithmetic rather than building a column list.)
    submission = pd.DataFrame(res, columns=cat_le.classes_)
    submission.insert(0, 'Id', test_ids)
    submission.to_csv('submission.csv', index=False)
    print('Done Outputing !')
    print(submission.sample(3))
else:
    print('NOT SUBMITING')

Fitting Model ...
Parameters: { "silent" } are not used.

Done Outputing !
            Id     ARSON   ASSAULT  BAD CHECKS   BRIBERY  BURGLARY  \
416182  416182  0.004443  0.050354    0.003973  0.003934  0.055388   
593424  593424  0.006805  0.081276    0.006085  0.006026  0.041911   
723931  723931  0.005325  0.078894    0.006619  0.004239  0.024210   

        DISORDERLY CONDUCT  DRIVING UNDER THE INFLUENCE  DRUG/NARCOTIC  \
416182            0.005218                     0.004352       0.013902   
593424            0.007991                     0.006666       0.052002   
723931            0.008924                     0.005800       0.023132   

        DRUNKENNESS  ...  SEX OFFENSES NON FORCIBLE  STOLEN PROPERTY  \
416182     0.005602  ...                   0.003877         0.006165   
593424     0.008873  ...                   0.005938         0.009443   
723931     0.008913  ...                   0.004177         0.006563   

         SUICIDE  SUSPICIOUS OCC      TREA  TRESPASS  VAND