In [1]:
import numpy
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

'''
import theano
theano.config.device = 'gpu'
theano.config.floatX = 'float32'
'''

seed = 7
numpy.random.seed(seed)


Using Theano backend.


In [2]:
arrest_dataframe = pd.read_csv('BPD_Arrests.csv')
print len(arrest_dataframe)

129025


In [3]:
print len(arrest_dataframe[arrest_dataframe['ArrestTime'].str.contains(":")])
nonColonTime = arrest_dataframe['ArrestTime'][~arrest_dataframe['ArrestTime'].str.contains(":")]
#print nonColonTime.unique()
print len(nonColonTime)
print len(nonColonTime.str.contains("."))

124731
4294
4294


In [4]:
arrest_dataframe.head(2)#loc[arrest_dataframe['ArrestDate'] == '10/15/2016']

Unnamed: 0,Arrest,Age,Sex,Race,ArrestDate,ArrestTime,ArrestLocation,IncidentOffense,IncidentLocation,Charge,ChargeDescription,District,Post,Neighborhood,Location 1
0,16147118.0,27.0,M,B,10/15/2016,23:00,200 N FRANKLINTOWN RD,Unknown Offense,400 N FRANKLINTOWN RD,2 0705,ARMED ROBBERY,Southwestern,842.0,Penrose/Fayette Street Outreach,"(39.2901207304, -76.6604437052)"
1,16147119.0,33.0,M,B,10/15/2016,23:00,100 W NORTH AVE,4CAGG. ASSLT.- OTH.,W NORTH AV & N CHARLES ST,1 1415,DOMESTIC ASSAULT,Central,141.0,Charles North,"(39.3111354877, -76.6181214533)"


In [5]:
arrest_dataframe['Arrest'] = arrest_dataframe['Arrest'].fillna(0)
arrest_dataframe['Age'] = arrest_dataframe['Age'].fillna(0)

arrest_dataframe['ArrestDay'] = arrest_dataframe['ArrestDate'].apply(lambda date: pd.Timestamp(date).weekday_name)
arrest_dataframe['NormalizedArrestDate'] = arrest_dataframe['ArrestDate'].apply(lambda date: pd.Timestamp(date))

In [6]:
#remove inconsistencies from ArrestTime like [all time have len either 4 or 5] 9.30, 9:30, 09:30, 09.30  ==> 09.30
arrest_dataframe['ArrestTime'] = arrest_dataframe['ArrestTime'].apply(lambda time: time[0:2] + ':' + time[3:5] if len(time) == 5 else time[0:1] + ':' + time[2:4])

In [7]:
arrest_dataframe['ArrestHour'] = arrest_dataframe['ArrestTime'].apply(lambda time: '0' + time[:1] if len(time) == 4 else time[:2])

In [8]:
print arrest_dataframe['ArrestHour'].unique()

['23' '22' '21' '20' '19' '18' '17' '15' '13' '12' '10' '09' '06' '04' '02'
 '00' '16' '14' '11' '08' '05' '01' '03' '07']


In [9]:
arrest_dataframe['Charge'] = arrest_dataframe['Charge'].fillna('0')
arrest_dataframe['ChargeDescription'] = arrest_dataframe['ChargeDescription'].fillna('Unknown Charge')

In [10]:
arrest_dataframe['District'] = arrest_dataframe['District'].fillna('U')
#if needed convert Northeastern to NE, Central to C and so on...

arrest_dataframe['Neighborhood'] = arrest_dataframe['Neighborhood'].fillna('Unknown')

#arrest_dataframe['Post'] = arrest_dataframe['Post'].fillna(0)


In [11]:
arrest_dataframe.drop(['Post','Location 1'],inplace=True,axis=1)
arrest_dataframe.drop_duplicates(inplace = True)

bpd_arrest_dataframe = arrest_dataframe.copy()
#bpd_arrest_dataframe.drop(['ArrestDate', 'ArrestTime', 'NormalizedArrestDate'],inplace=True,axis=1)
'''
bpd_arrest_dataframe['ArrestDate'] = bpd_arrest_dataframe['ArrestDate'].astype('category').cat.codes
bpd_arrest_dataframe['ArrestTime'] = bpd_arrest_dataframe['ArrestTime'].astype('category').cat.codes
bpd_arrest_dataframe['NormalizedArrestDate'] = bpd_arrest_dataframe['NormalizedArrestDate'].astype('category').cat.codes
bpd_arrest_dataframe['Arrest'] = bpd_arrest_dataframe['Arrest'].astype('category').cat.codes
bpd_arrest_dataframe['Sex'] = bpd_arrest_dataframe['Sex'].astype('category').cat.codes
bpd_arrest_dataframe['Race'] = bpd_arrest_dataframe['Race'].astype('category').cat.codes
bpd_arrest_dataframe['ArrestLocation'] = bpd_arrest_dataframe['ArrestLocation'].astype('category').cat.codes
bpd_arrest_dataframe['IncidentOffense'] = bpd_arrest_dataframe['IncidentOffense'].astype('category').cat.codes
bpd_arrest_dataframe['IncidentLocation'] = bpd_arrest_dataframe['IncidentLocation'].astype('category').cat.codes
bpd_arrest_dataframe['Charge'] = bpd_arrest_dataframe['Charge'].astype('category').cat.codes
bpd_arrest_dataframe['ChargeDescription'] = bpd_arrest_dataframe['ChargeDescription'].astype('category').cat.codes
bpd_arrest_dataframe['District'] = bpd_arrest_dataframe['District'].astype('category').cat.codes
bpd_arrest_dataframe['Neighborhood'] = bpd_arrest_dataframe['Neighborhood'].astype('category').cat.codes
'''



"\nbpd_arrest_dataframe['ArrestDate'] = bpd_arrest_dataframe['ArrestDate'].astype('category').cat.codes\nbpd_arrest_dataframe['ArrestTime'] = bpd_arrest_dataframe['ArrestTime'].astype('category').cat.codes\nbpd_arrest_dataframe['NormalizedArrestDate'] = bpd_arrest_dataframe['NormalizedArrestDate'].astype('category').cat.codes\nbpd_arrest_dataframe['Arrest'] = bpd_arrest_dataframe['Arrest'].astype('category').cat.codes\nbpd_arrest_dataframe['Sex'] = bpd_arrest_dataframe['Sex'].astype('category').cat.codes\nbpd_arrest_dataframe['Race'] = bpd_arrest_dataframe['Race'].astype('category').cat.codes\nbpd_arrest_dataframe['ArrestLocation'] = bpd_arrest_dataframe['ArrestLocation'].astype('category').cat.codes\nbpd_arrest_dataframe['IncidentOffense'] = bpd_arrest_dataframe['IncidentOffense'].astype('category').cat.codes\nbpd_arrest_dataframe['IncidentLocation'] = bpd_arrest_dataframe['IncidentLocation'].astype('category').cat.codes\nbpd_arrest_dataframe['Charge'] = bpd_arrest_dataframe['Charge']

In [12]:
arrest_dataframe[2:3]

Unnamed: 0,Arrest,Age,Sex,Race,ArrestDate,ArrestTime,ArrestLocation,IncidentOffense,IncidentLocation,Charge,ChargeDescription,District,Neighborhood,ArrestDay,NormalizedArrestDate,ArrestHour
2,16147109.0,27.0,M,B,10/15/2016,22:45,2300 E JEFFERSON ST,4CAGG. ASSLT.- OTH.,2300 JEFFERSON ST,1 1420,ASSAULT,Southeastern,McElderry Park,Saturday,2016-10-15,22


In [13]:
print 'len(arrest_dataframe)', len(arrest_dataframe)
print 'len(bpd_arrest_dataframe)', len(bpd_arrest_dataframe)
print 'Arrest :', len(arrest_dataframe['Arrest'].unique())
print 'Age', len(arrest_dataframe['Age'].unique())
print 'Sex', len(arrest_dataframe['Sex'].unique())
print 'Race', len(arrest_dataframe['Race'].unique())
print 'ArrestDate', len(arrest_dataframe['ArrestDate'].unique())
print 'ArrestTime', len(arrest_dataframe['ArrestTime'].unique())
print 'ArrestLocation [Many Unknowns]', len(arrest_dataframe['ArrestLocation'].unique())
print 'IncidentOffense [Many Unknowns]', len(arrest_dataframe['IncidentOffense'].unique())
print 'IncidentLocation [Many Unknowns]', len(arrest_dataframe['IncidentLocation'].unique())
print 'Charge', len(arrest_dataframe['Charge'].unique())
print 'ChargeDescription', len(arrest_dataframe['ChargeDescription'].unique())
print 'District', len(arrest_dataframe['District'].unique())
print 'Neighborhood', len(arrest_dataframe['Neighborhood'].unique())

len(arrest_dataframe) 128668
len(bpd_arrest_dataframe) 128668
Arrest : 122084
Age 74
Sex 2
Race 5
ArrestDate 1384
ArrestTime 1728
ArrestLocation [Many Unknowns] 10134
IncidentOffense [Many Unknowns] 266
IncidentLocation [Many Unknowns] 14972
Charge 586
ChargeDescription 11510
District 10
Neighborhood 548


In [14]:
print 'Male count =>', len(bpd_arrest_dataframe[bpd_arrest_dataframe['Sex'] == 'M'])
print 'Female count =>', len(bpd_arrest_dataframe[bpd_arrest_dataframe['Sex'] == 'F'])
print ''

print 'A count =>', len(bpd_arrest_dataframe[bpd_arrest_dataframe['Race'] == 'A'])
print 'I count =>', len(bpd_arrest_dataframe[bpd_arrest_dataframe['Race'] == 'I'])
print 'U count =>', len(bpd_arrest_dataframe[bpd_arrest_dataframe['Race'] == 'U'])
print 'W count =>', len(bpd_arrest_dataframe[bpd_arrest_dataframe['Race'] == 'W'])
print 'B count =>', len(bpd_arrest_dataframe[bpd_arrest_dataframe['Race'] == 'B'])
print ''

print 'ArrestTime Midnight [00:00 to 06:00] =>', len(bpd_arrest_dataframe[(bpd_arrest_dataframe['ArrestHour'] >= '0') & (bpd_arrest_dataframe['ArrestHour'] < '6')])
print 'ArrestTime Morning [06:00 to 12:00] =>', len(bpd_arrest_dataframe[(bpd_arrest_dataframe['ArrestHour'] >= '6') & (bpd_arrest_dataframe['ArrestHour'] < '12')])
print 'ArrestTime Afternoon [12:00 to 18:00] =>', len(bpd_arrest_dataframe[(bpd_arrest_dataframe['ArrestHour'] >= '12') & (bpd_arrest_dataframe['ArrestHour'] < '18')])
print 'ArrestTime Evening [18:00 to 23:59] =>', len(bpd_arrest_dataframe[(bpd_arrest_dataframe['ArrestHour'] >= '18') & (bpd_arrest_dataframe['ArrestHour'] < '24')])

print ''

Male count => 104157
Female count => 24511

A count => 340
I count => 372
U count => 2621
W count => 20413
B count => 104922

ArrestTime Midnight [00:00 to 06:00] => 128668
ArrestTime Morning [06:00 to 12:00] => 0
ArrestTime Afternoon [12:00 to 18:00] => 38842
ArrestTime Evening [18:00 to 23:59] => 39568



In [15]:
Male_data = bpd_arrest_dataframe[bpd_arrest_dataframe['Sex'] == 'M']
Female_data = bpd_arrest_dataframe[bpd_arrest_dataframe['Sex'] == 'F']
print len(Male_data)
print len(Female_data)

X_male = pd.concat([Male_data['Age'], pd.get_dummies(Male_data['Race'], prefix = 'R')], axis=1)
Y_male = Male_data['Sex']

X_male = X_male[:5000]
Y_male = Y_male[:5000]

X_female = pd.concat([Female_data['Age'], pd.get_dummies(Female_data['Race'], prefix = 'R')], axis=1)
Y_female = Female_data['Sex']

X_female = X_female[:5000]
Y_female = Y_female[:5000]

X = pd.concat([X_male, X_female], axis=0)
Y = pd.concat([Y_male, Y_female], axis=0)

print X.head(2)
print Y.head(2)

104157
24511
    Age  R_A  R_B  R_I  R_U  R_W
0  27.0    0    1    0    0    0
1  33.0    0    1    0    0    0
0    M
1    M
Name: Sex, dtype: object


In [16]:
bpd_arrest_knownIncident = bpd_arrest_dataframe.copy()

print 'Total records:', len(bpd_arrest_knownIncident)
print 'With Unknown Offense:', len(bpd_arrest_knownIncident[bpd_arrest_knownIncident['IncidentOffense'] == 'Unknown Offense'])
print 'Without:', len(bpd_arrest_knownIncident[bpd_arrest_knownIncident['IncidentOffense'] != 'Unknown Offense'])

bpd_arrest_knownIncident = bpd_arrest_knownIncident[bpd_arrest_knownIncident['IncidentOffense'] != 'Unknown Offense']

Total records: 128668
With Unknown Offense: 80390
Without: 48278


In [17]:
print 'New records in bpd_arrest_knownIncident:', len(bpd_arrest_knownIncident)
bpd_arrest_dataframe = bpd_arrest_knownIncident

New records in bpd_arrest_knownIncident: 48278


In [18]:
#Split DataFrames into Training (Training + Validation) and Test
#X = pd.concat([bpd_arrest_dataframe['Age'], pd.get_dummies(bpd_arrest_dataframe['Sex'], prefix = 'S'), pd.get_dummies(bpd_arrest_dataframe['Race'], prefix = 'R'), pd.get_dummies(bpd_arrest_dataframe['District'], prefix = 'Dist')], axis=1)
X = pd.concat([bpd_arrest_dataframe['Age'], pd.get_dummies(bpd_arrest_dataframe['Sex'], prefix = 'S'), pd.get_dummies(bpd_arrest_dataframe['Race'], prefix = 'R')], axis=1)
Y = bpd_arrest_dataframe['IncidentOffense']
#Y = pd.get_dummies(bpd_arrest_dataframe['Charge'], prefix = 'C')

'''
X_train = X.iloc[1:(len(X)/2)]
X_test = X.iloc[(len(X)/2) + 1: len(X)]

Y_train = Y.iloc[1:(len(Y)/2)]
Y_test = Y.iloc[(len(Y)/2) + 1: len(Y)]
'''
#train_arrest_dataframe = bpd_arrest_dataframe.iloc[1:(len(bpd_arrest_dataframe)/2)]
#test_arrest_dataframe = bpd_arrest_dataframe.iloc[(len(bpd_arrest_dataframe)/2) + 1: len(bpd_arrest_dataframe)]

#X = X[:10000]
#Y = Y[:10000]

'\nX_train = X.iloc[1:(len(X)/2)]\nX_test = X.iloc[(len(X)/2) + 1: len(X)]\n\nY_train = Y.iloc[1:(len(Y)/2)]\nY_test = Y.iloc[(len(Y)/2) + 1: len(Y)]\n'

In [19]:
print len(X)
print len(Y)

48278
48278


In [20]:
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

In [155]:
encoder = LabelEncoder()
encoder.fit(Y_train)
encoded_Y_train = encoder.transform(Y_train)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y_train = np_utils.to_categorical(encoded_Y_train)

encoder.fit(Y_test)
encoded_Y_test = encoder.transform(Y_test)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y_test = np_utils.to_categorical(encoded_Y_test)

NameError: name 'Y_train' is not defined

In [103]:
#print dummy_y[1]
print len(dummy_y_train)
print len(dummy_y_test)

64333
64333


In [104]:
print encoded_Y_train
print len(encoded_Y_train)
print encoded_Y_test
print len(encoded_Y_test)

[165 167 165 ..., 178   0  14]
64333
[ 21  34 329 ...,  21  14 140]
64333


In [105]:
uniqueCrimeCodes_train = Y_train.unique() # OR crime_dataframe.CrimeCode.unique()
print len(uniqueCrimeCodes_train)

uniqueCrimeCodes_test = Y_test.unique() # OR crime_dataframe.CrimeCode.unique()
print len(uniqueCrimeCodes_test)

469
424


In [21]:
X_cols = len(X.columns)
print X_cols

#Y_cols = len(Y.columns)
#print Y_cols
print len(Y.unique())
print Y.unique()
print Y.count()

8
265
['4CAGG. ASSLT.- OTH.' '4ECOMMON ASSAULT' '6CLARCENY- SHOPLIFTING'
 '49FAMILY DISTURBANCE' '7ASTOLEN AUTO' '4BAGG. ASSLT.- CUT'
 '54ARMED PERSON' '5ABURG. RES. (FORCE)' '3BROBB HIGHWAY (UA)'
 '118BURGLARY - FOURTH DEGREE' '4DAGG. ASSLT.- HAND' '4AAGG. ASSLT.- GUN'
 '3AFROBB HWY-FIREARM' '111PROTECTIVE ORDER' '119ISSUED IN ERROR'
 '4FASSAULT BY THREAT' '87NARCOTICS' '75DESTRUCT. OF PROPERTY' '79OTHER'
 '55APROSTITUTION' '77DOG BITE' '87ONARCOTICS (OUTSIDE)'
 '3AOROBB HWY-OTHER WPN' '6DLARCENY- FROM AUTO' '3KROBB RES. (UA)'
 '26RECOVERED VEHICLE' '5DBURG. OTH. (FORCE)' '5EBURG. OTH. (ATT.)'
 '81RECOVERED PROPERTY' '6JLARCENY- OTHER' '3JKROBB RESIDENCE-KNIFE'
 '2ARAPE (FORCE)' '23UNAUTHORIZED USE' '3CKROBB COMM-KNIFE'
 '87VNARCOTICS (ONVIEW)' '115TRESPASSING' '1AMURDER' '2BRAPE (ATTEMPT)'
 '88UNFOUNDED CALL' '3AJFROBB CARJACK-FIREARM' '24TOWED VEHICLE'
 '8AOARSON SIN RES STR-OCC' 'UNKNOWN OFFENSE' '3DROBB COMM. (UA)'
 '6BLARCENY- PURSE SNATCH' '5BBURG. RES. (ATT.)' '3HROBB CONV. STO

In [22]:
print X.columns

Index([u'Age', u'S_F', u'S_M', u'R_A', u'R_B', u'R_I', u'R_U', u'R_W'], dtype='object')


In [23]:
model = Sequential()

model.add(Dense(X_cols, input_dim=X_cols, init='uniform', activation='relu'))
model.add(Dense(X_cols, init='uniform', activation='relu'))
model.add(Dense(X_cols, init='uniform', activation='relu'))
model.add(Dense(len(Y.unique()), init='uniform', activation='sigmoid'))

# Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit model
model.fit(X.values, dummy_y, nb_epoch=50, batch_size=10, shuffle=True, verbose=1)

# Evaluate model
model.evaluate(X.values, dummy_y, batch_size=5, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
 5210/48278 [==>...........................] - ETA: 8s - loss: 3.1825 - acc: 0.2622

KeyboardInterrupt: 

In [101]:
# define baseline model
def baseline_model():
    # create model
    model = Sequential()
    
    '''
    model.add(Dense(7, input_dim = 7, init='normal', activation='relu'))
    model.add(Dense(7, input_dim = 7, init='normal', activation='relu'))
    model.add(Dense(7, input_dim = 7, init='normal', activation='relu'))
    model.add(Dense(7, input_dim = 7, init='normal', activation='relu'))
    '''
    
    model.add(Dense(X_train_cols, input_dim=X_train_cols, init='uniform', activation='relu'))
    model.add(Dense(X_train_cols, input_dim=X_train_cols, init='uniform', activation='relu'))
    
    #model.add(Dense(input_dim=7, output_dim = 10, init='normal', activation='relu'))
    #model.add(Dense(input_dim=7, output_dim = 10, init='normal', activation='relu'))
    #model.add(Dense(input_dim=11, output_dim=5, init='normal', activation='softmax'))
    #model.add(Dense(input_dim=5, output_dim=7, init='normal', activation='relu'))
    
    model.add(Dense(len(Y_train.unique()), init='normal', activation='sigmoid'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    #model.fit(X_train, Y_train, validation_split=0.33, nb_epoch=150, batch_size=10)
    return model

In [102]:
estimator = KerasClassifier(build_fn=baseline_model, nb_epoch=1, batch_size=5, verbose=0)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X_train.values, dummy_y_train, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Baseline: 13.26% (2.76%)
