In [1]:
import numpy
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

'''
import theano
theano.config.device = 'gpu'
theano.config.floatX = 'float32'
'''

seed = 7
numpy.random.seed(seed)


Using Theano backend.


In [34]:
arrest_dataframe = pd.read_csv('BPD_Arrests.csv')

In [37]:
print len(arrest_dataframe)

129025


In [36]:
arrest_dataframe['NormalizedArrestDate'] = arrest_dataframe['ArrestDate'].apply(lambda date: pd.Timestamp(date))

In [40]:
#remove inconsistencies from ArrestTime like [all time have len either 4 or 5] 9.30, 9:30, 09:30, 09.30  ==> 09.30
temp = arrest_dataframe['ArrestTime'].apply(lambda time: time[0:1] + ':' + time[2:4] if len(time) == 4 else time[0:2] + ':' + time[3:5])

In [42]:
temp.unique()

array(['23:00', '22:45', '22:40', ..., '07:11', '06:53', '07:19'], dtype=object)

In [43]:
print len(temp)

129025


In [46]:
arrest_dataframe[5:10]

Unnamed: 0,Arrest,Age,Sex,Race,ArrestDate,ArrestTime,ArrestLocation,IncidentOffense,IncidentLocation,Charge,ChargeDescription,District,Post,Neighborhood,Location 1,NormalizedArrestDate
5,16147102.0,33.0,M,B,10/15/2016,22:00,0 N HOWARD ST,Unknown Offense,0 N HOWARD ST,1 0494,COUNTERFEIT CASH,Central,,,"(39.3032179309, -76.6205657790)",2016-10-15
6,16147101.0,24.0,M,U,10/15/2016,21:50,3500 S HANOVER ST,Unknown Offense,500 E PATAPSCO AVE,,DIRTBIKE VIOLATION,Southern,912.0,Brooklyn,"(39.2401435763, -76.6087811516)",2016-10-15
7,16147108.0,41.0,M,B,10/15/2016,21:50,1500 HARFORD AVE,6CLARCENY- SHOPLIFTING,1500 HARFORD AVE,1 0621,LARCENY,Eastern,343.0,Oliver,"(39.3073430147, -76.6007485175)",2016-10-15
8,16147111.0,37.0,M,W,10/15/2016,21:15,2100 HARFORD RD,Unknown Offense,2100 HARFORD RD,4 3550,CDS VIOLATION,Eastern,342.0,East Baltimore Midway,"(39.3144365009, -76.5977578661)",2016-10-15
9,16147082.0,26.0,M,B,10/15/2016,21:00,,Unknown Offense,,1 0088,VIOLATION OF PROBATION,,,,,2016-10-15


In [2]:

crime_dataframe['Inside/Outside'] = crime_dataframe['Inside/Outside'].replace(['Inside', 'Outside'], ['I', 'O'])
crime_dataframe.drop(['Post','Location 1','Total Incidents'],inplace=True,axis=1)
crime_dataframe.drop_duplicates(inplace = True)
crime_dataframe['CrimeHour'] = crime_dataframe['CrimeTime'].apply(lambda time: time[:2])
crime_dataframe = crime_dataframe.drop(196469)
crime_dataframe['CrimeDay'] = crime_dataframe['CrimeDate'].apply(lambda date: pd.Timestamp(date).weekday_name)



bpd_crime_dataframe = crime_dataframe.copy()
bpd_crime_dataframe.drop(['CrimeDate','CrimeTime','NormalizedCrimeTime', 'NormalizedCrimeDate'],inplace=True,axis=1)
bpd_crime_dataframe['CrimeCode'] = bpd_crime_dataframe['CrimeCode'].astype('category').cat.codes
bpd_crime_dataframe['Description'] = bpd_crime_dataframe['Description'].astype('category').cat.codes
bpd_crime_dataframe['Location'] = bpd_crime_dataframe['Location'].astype('category').cat.codes
bpd_crime_dataframe['Inside/Outside'] = bpd_crime_dataframe['Inside/Outside'].astype('category').cat.codes
bpd_crime_dataframe['Weapon'] = bpd_crime_dataframe['Weapon'].astype('category').cat.codes
bpd_crime_dataframe['District'] = bpd_crime_dataframe['District'].astype('category').cat.codes
bpd_crime_dataframe['Neighborhood'] = bpd_crime_dataframe['Neighborhood'].astype('category').cat.codes
bpd_crime_dataframe['CrimeDay'] = bpd_crime_dataframe['CrimeDay'].astype('category').cat.codes


In [13]:
#Split DataFrames into Training (Training + Validation) and Test
train_crime_dataframe = bpd_crime_dataframe.iloc[1:(len(bpd_crime_dataframe)/2)]
test_crime_dataframe = bpd_crime_dataframe.iloc[(len(bpd_crime_dataframe)/2) + 1: len(bpd_crime_dataframe)]


X = pd.concat([train_crime_dataframe['Location'], train_crime_dataframe['Inside/Outside'], train_crime_dataframe['Weapon'], train_crime_dataframe['District'], train_crime_dataframe['Neighborhood'], train_crime_dataframe['CrimeHour'], train_crime_dataframe['CrimeDay']], axis=1)
Y = train_crime_dataframe['CrimeCode']

X = X[:10000]
Y = Y[:10000]

In [14]:
len(train_crime_dataframe)

136081

In [15]:
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

In [16]:
print dummy_y[1]
print len(dummy_y)

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
10000


In [17]:
print encoded_Y
print len(encoded_Y)

[57 49 50 ..., 51 57 50]
10000


In [18]:
uniqueCrimeCodes = Y.unique() # OR crime_dataframe.CrimeCode.unique()
print uniqueCrimeCodes
print len(uniqueCrimeCodes)

[61 52 53  0 45 42  9 16 39 80  5 40 38 44 58 56 46 41 11 28 10 51 13  1 49
 47 54 43 63 50  3 26 37 12  6 21 17 24 34 48 64 55 31 76 14 27 32 25  4 29
 60 79 77 15  7 67  2 71  8 66 57 35 65 36 18 22 23 20 62]
69


In [19]:
X.values

array([[13150, 1, -1, ..., 84, '00', 2],
       [248, 0, -1, ..., 62, '00', 2],
       [7258, 1, -1, ..., 228, '01', 2],
       ..., 
       [14874, 1, -1, ..., 99, '23', 4],
       [13889, 1, -1, ..., 11, '23', 4],
       [17383, 1, -1, ..., 193, '23', 4]], dtype=object)

In [20]:
# define baseline model
def baseline_model():
    # create model
    model = Sequential()
    
    '''
    model.add(Dense(7, input_dim = 7, init='normal', activation='relu'))
    model.add(Dense(7, input_dim = 7, init='normal', activation='relu'))
    model.add(Dense(7, input_dim = 7, init='normal', activation='relu'))
    model.add(Dense(7, input_dim = 7, init='normal', activation='relu'))
    '''
    
    model.add(Dense(7, input_dim=7, init='uniform', activation='relu'))
    model.add(Dense(7, input_dim=7, init='uniform', activation='relu'))
    model.add(Dense(7, input_dim=7, init='uniform', activation='relu'))
    
    #model.add(Dense(input_dim=7, output_dim = 10, init='normal', activation='relu'))
    #model.add(Dense(input_dim=7, output_dim = 10, init='normal', activation='relu'))
    #model.add(Dense(input_dim=11, output_dim=5, init='normal', activation='softmax'))
    #model.add(Dense(input_dim=5, output_dim=7, init='normal', activation='relu'))
    
    model.add(Dense(len(Y.unique()), init='normal', activation='sigmoid'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [11]:
estimator = KerasClassifier(build_fn=baseline_model, nb_epoch=100, batch_size=5, verbose=0)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X.values, dummy_y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Baseline: 12.80% (1.78%)
