In [1]:
import numpy
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

'''
import theano
theano.config.device = 'gpu'
theano.config.floatX = 'float32'
'''

seed = 7
numpy.random.seed(seed)


Using Theano backend.


In [29]:
arrest_dataframe = pd.read_csv('BPD_Arrests.csv')
print len(arrest_dataframe)

129025


In [36]:
arrest_dataframe.head(1)#loc[arrest_dataframe['ArrestDate'] == '10/15/2016']

Unnamed: 0,Arrest,Age,Sex,Race,ArrestDate,ArrestTime,ArrestLocation,IncidentOffense,IncidentLocation,Charge,ChargeDescription,District,Post,Neighborhood,Location 1,ArrestDay,NormalizedArrestDate,ArrestHour
0,16147118.0,27.0,M,B,10/15/2016,23:00,200 N FRANKLINTOWN RD,Unknown Offense,400 N FRANKLINTOWN RD,2 0705,ARMED ROBBERY,Southwestern,842.0,Penrose/Fayette Street Outreach,"(39.2901207304, -76.6604437052)",Saturday,2016-10-15,23


In [31]:
arrest_dataframe['Arrest'] = arrest_dataframe['Arrest'].fillna(0)
arrest_dataframe['Age'] = arrest_dataframe['Age'].fillna(0)

arrest_dataframe['ArrestDay'] = arrest_dataframe['ArrestDate'].apply(lambda date: pd.Timestamp(date).weekday_name)
arrest_dataframe['NormalizedArrestDate'] = arrest_dataframe['ArrestDate'].apply(lambda date: pd.Timestamp(date))

In [33]:
#remove inconsistencies from ArrestTime like [all time have len either 4 or 5] 9.30, 9:30, 09:30, 09.30  ==> 09.30
arrest_dataframe['ArrestTime'] = arrest_dataframe['ArrestTime'].apply(lambda time: time[0:1] + ':' + time[2:4] if len(time) == 4 else time[0:2] + ':' + time[3:5])
arrest_dataframe['ArrestHour'] = arrest_dataframe['ArrestTime'].apply(lambda time: time[:2])

In [34]:
arrest_dataframe['Charge'] = arrest_dataframe['Charge'].fillna('0')
arrest_dataframe['ChargeDescription'] = arrest_dataframe['ChargeDescription'].fillna('Unknown Charge')

In [35]:
arrest_dataframe['District'] = arrest_dataframe['District'].fillna('U')
#if needed convert Northeastern to NE, Central to C and so on...

arrest_dataframe['Neighborhood'] = arrest_dataframe['Neighborhood'].fillna('Unknown')

#arrest_dataframe['Post'] = arrest_dataframe['Post'].fillna(0)


In [8]:
arrest_dataframe.drop(['Post','Location 1'],inplace=True,axis=1)
arrest_dataframe.drop_duplicates(inplace = True)

bpd_arrest_dataframe = arrest_dataframe.copy()
#bpd_arrest_dataframe.drop(['ArrestDate', 'ArrestTime', 'NormalizedArrestDate'],inplace=True,axis=1)
bpd_arrest_dataframe['ArrestDate'] = bpd_arrest_dataframe['ArrestDate'].astype('category').cat.codes
bpd_arrest_dataframe['ArrestTime'] = bpd_arrest_dataframe['ArrestTime'].astype('category').cat.codes
bpd_arrest_dataframe['NormalizedArrestDate'] = bpd_arrest_dataframe['NormalizedArrestDate'].astype('category').cat.codes
bpd_arrest_dataframe['Arrest'] = bpd_arrest_dataframe['Arrest'].astype('category').cat.codes
bpd_arrest_dataframe['Sex'] = bpd_arrest_dataframe['Sex'].astype('category').cat.codes
bpd_arrest_dataframe['Race'] = bpd_arrest_dataframe['Race'].astype('category').cat.codes
bpd_arrest_dataframe['ArrestLocation'] = bpd_arrest_dataframe['ArrestLocation'].astype('category').cat.codes
bpd_arrest_dataframe['IncidentOffense'] = bpd_arrest_dataframe['IncidentOffense'].astype('category').cat.codes
bpd_arrest_dataframe['IncidentLocation'] = bpd_arrest_dataframe['IncidentLocation'].astype('category').cat.codes
bpd_arrest_dataframe['Charge'] = bpd_arrest_dataframe['Charge'].astype('category').cat.codes
bpd_arrest_dataframe['ChargeDescription'] = bpd_arrest_dataframe['ChargeDescription'].astype('category').cat.codes
bpd_arrest_dataframe['District'] = bpd_arrest_dataframe['District'].astype('category').cat.codes
bpd_arrest_dataframe['Neighborhood'] = bpd_arrest_dataframe['Neighborhood'].astype('category').cat.codes


In [25]:
arrest_dataframe[2:3]

Unnamed: 0,Arrest,Age,Sex,Race,ArrestDate,ArrestTime,ArrestLocation,IncidentOffense,IncidentLocation,Charge,ChargeDescription,District,Neighborhood,ArrestDay,NormalizedArrestDate,ArrestHour
2,16147109.0,27.0,M,B,10/15/2016,22:45,2300 E JEFFERSON ST,4CAGG. ASSLT.- OTH.,2300 JEFFERSON ST,1 1420,ASSAULT,Southeastern,McElderry Park,Saturday,2016-10-15,22


In [28]:
print 'len(arrest_dataframe)', len(arrest_dataframe)
print 'len(bpd_arrest_dataframe)', len(bpd_arrest_dataframe)
print 'Arrest :', len(arrest_dataframe['Arrest'].unique())
print 'Age', len(arrest_dataframe['Age'].unique())
print 'Sex', len(arrest_dataframe['Sex'].unique())
print 'Race', len(arrest_dataframe['Race'].unique())
print 'ArrestDate', len(arrest_dataframe['ArrestDate'].unique())
print 'ArrestTime', len(arrest_dataframe['ArrestTime'].unique())
print 'ArrestLocation [Many Unknowns]', len(arrest_dataframe['ArrestLocation'].unique())
print 'IncidentOffense [Many Unknowns]', len(arrest_dataframe['IncidentOffense'].unique())
print 'IncidentLocation [Many Unknowns]', len(arrest_dataframe['IncidentLocation'].unique())
print 'Charge', len(arrest_dataframe['Charge'].unique())
print 'ChargeDescription', len(arrest_dataframe['ChargeDescription'].unique())
print 'District', len(arrest_dataframe['District'].unique())
print 'Neighborhood', len(arrest_dataframe['Neighborhood'].unique())

len(arrest_dataframe) 128668
len(bpd_arrest_dataframe) 128668
Arrest : 122084
Age 74
Sex 2
Race 5
ArrestDate 1384
ArrestTime 1728
ArrestLocation [Many Unknowns] 10134
IncidentOffense [Many Unknowns] 266
IncidentLocation [Many Unknowns] 14972
Charge 586
ChargeDescription 11510
District 10
Neighborhood 548


In [10]:
#Split DataFrames into Training (Training + Validation) and Test
train_arrest_dataframe = bpd_arrest_dataframe.iloc[1:(len(bpd_arrest_dataframe)/2)]
test_arrest_dataframe = bpd_arrest_dataframe.iloc[(len(bpd_arrest_dataframe)/2) + 1: len(bpd_arrest_dataframe)]

X = pd.concat([train_arrest_dataframe['Age'], train_arrest_dataframe['Sex'], train_arrest_dataframe['Race'], train_arrest_dataframe['District']], axis=1)
Y = train_arrest_dataframe['Charge']

#X = X[:10000]
#Y = Y[:10000]

In [11]:
len(train_arrest_dataframe)

64333

In [12]:
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

In [13]:
#print dummy_y[1]
print len(dummy_y)

64333


In [14]:
print encoded_Y
print len(encoded_Y)

[165 167 165 ..., 178   0  14]
64333


In [15]:
uniqueCrimeCodes = Y.unique() # OR crime_dataframe.CrimeCode.unique()
#print uniqueCrimeCodes
print len(uniqueCrimeCodes)

469


In [16]:
X.values

array([[ 33.,   1.,   1.,   0.],
       [ 27.,   1.,   1.,   5.],
       [ 24.,   1.,   1.,   6.],
       ..., 
       [ 38.,   1.,   1.,   8.],
       [ 30.,   1.,   1.,   8.],
       [ 54.,   1.,   1.,   8.]])

In [19]:
# define baseline model
def baseline_model():
    # create model
    model = Sequential()
    
    '''
    model.add(Dense(7, input_dim = 7, init='normal', activation='relu'))
    model.add(Dense(7, input_dim = 7, init='normal', activation='relu'))
    model.add(Dense(7, input_dim = 7, init='normal', activation='relu'))
    model.add(Dense(7, input_dim = 7, init='normal', activation='relu'))
    '''
    
    model.add(Dense(4, input_dim=4, init='uniform', activation='relu'))
    model.add(Dense(4, input_dim=4, init='uniform', activation='relu'))
    
    #model.add(Dense(input_dim=7, output_dim = 10, init='normal', activation='relu'))
    #model.add(Dense(input_dim=7, output_dim = 10, init='normal', activation='relu'))
    #model.add(Dense(input_dim=11, output_dim=5, init='normal', activation='softmax'))
    #model.add(Dense(input_dim=5, output_dim=7, init='normal', activation='relu'))
    
    model.add(Dense(len(Y.unique()), init='normal', activation='sigmoid'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [20]:
estimator = KerasClassifier(build_fn=baseline_model, nb_epoch=5, batch_size=5, verbose=0)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X.values, dummy_y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Baseline: 13.77% (1.51%)
