In [1]:
import numpy
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

'''
import theano
theano.config.device = 'gpu'
theano.config.floatX = 'float32'
'''

seed = 7
numpy.random.seed(seed)


Using Theano backend.


In [47]:
crime_dataframe = pd.read_csv('BPD_Part_1_Victim_Based_Crime_Data.csv')
crime_dataframe['NormalizedCrimeTime'] = crime_dataframe['CrimeTime'].apply(lambda time: time[:5] if len(time) == 8 else time[0:2] + ':' + time[2:])
crime_dataframe['Inside/Outside'] = crime_dataframe['Inside/Outside'].replace(['Inside', 'Outside'], ['I', 'O'])
crime_dataframe.drop(['Post','Location 1','Total Incidents'],inplace=True,axis=1)
crime_dataframe.drop_duplicates(inplace = True)
crime_dataframe['CrimeHour'] = crime_dataframe['CrimeTime'].apply(lambda time: time[:2])
crime_dataframe = crime_dataframe.drop(196469)
crime_dataframe['CrimeDay'] = crime_dataframe['CrimeDate'].apply(lambda date: pd.Timestamp(date).weekday_name)
crime_dataframe['NormalizedCrimeDate'] = crime_dataframe['CrimeDate'].apply(lambda date: pd.Timestamp(date))
crime_dataframe['Weapon'] = crime_dataframe['Weapon'].fillna('Unknown');

bpd_crime_dataframe = crime_dataframe.copy()
bpd_crime_dataframe.drop(['CrimeDate','CrimeTime','NormalizedCrimeTime', 'NormalizedCrimeDate'],inplace=True,axis=1)
bpd_crime_dataframe['CrimeCode'] = bpd_crime_dataframe['CrimeCode'].astype('category').cat.codes
bpd_crime_dataframe['Description'] = bpd_crime_dataframe['Description'].astype('category').cat.codes
bpd_crime_dataframe['Location'] = bpd_crime_dataframe['Location'].astype('category').cat.codes
bpd_crime_dataframe['Inside/Outside'] = bpd_crime_dataframe['Inside/Outside'].astype('category').cat.codes
bpd_crime_dataframe['Weapon'] = bpd_crime_dataframe['Weapon'].astype('category').cat.codes
bpd_crime_dataframe['District'] = bpd_crime_dataframe['District'].astype('category').cat.codes
bpd_crime_dataframe['Neighborhood'] = bpd_crime_dataframe['Neighborhood'].astype('category').cat.codes
bpd_crime_dataframe['CrimeDay'] = bpd_crime_dataframe['CrimeDay'].astype('category').cat.codes


In [3]:
#Split DataFrames into Training (Training + Validation) and Test
train_crime_dataframe = bpd_crime_dataframe.iloc[1:(len(bpd_crime_dataframe)/2)]
test_crime_dataframe = bpd_crime_dataframe.iloc[(len(bpd_crime_dataframe)/2) + 1: len(bpd_crime_dataframe)]


X = pd.concat([train_crime_dataframe['Inside/Outside'], train_crime_dataframe['Weapon'], train_crime_dataframe['Neighborhood'], train_crime_dataframe['CrimeHour'], train_crime_dataframe['CrimeDay']], axis=1)
Y = train_crime_dataframe['CrimeCode']

X = X[:10000]
Y = Y[:10000]

In [50]:
print len(crime_dataframe['Weapon'])
print crime_dataframe['Weapon'].unique();

88279
['FIREARM' 'HANDS' 'KNIFE' 'OTHER']


In [48]:
crime_dataframe = crime_dataframe[crime_dataframe['Weapon'] != 'Unknown']

In [57]:
X = pd.concat([pd.get_dummies(crime_dataframe['Inside/Outside'], prefix = 'IO'), pd.get_dummies(crime_dataframe['Weapon'], prefix = 'W'), pd.get_dummies(crime_dataframe['Neighborhood'], prefix = 'N')], axis=1)
Y = crime_dataframe['CrimeCode']

X = X[:10000]
Y = Y[:10000]

In [58]:
print len(train_crime_dataframe)
X_cols = len(X.columns)
print X_cols

136081
283


In [59]:
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

In [60]:
print dummy_y[1]
print len(dummy_y)

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.]
10000


In [61]:
print encoded_Y
print len(encoded_Y)

[ 0 31  8 ..., 28 31 31]
10000


In [62]:
uniqueCrimeCodes = Y.unique() # OR crime_dataframe.CrimeCode.unique()
print uniqueCrimeCodes
print len(uniqueCrimeCodes)

['1F' '4E' '3AK' '4B' '9S' '3AF' '4C' '4A' '4D' '3AO' '3CF' '1K' '2A' '3JK'
 '3AJF' '3GF' '3EF' '3NF' '3LO' '3CK' '3JO' '3JF' '3LF' '3CO' '3AJK' '1O'
 '3AJO' '3NK' '3NO' '3EK' '3GK' '3GO' '3EO']
33


In [63]:
X.values

array([[0, 1, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ..., 
       [0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]], dtype=uint8)

In [64]:
model = Sequential()

model.add(Dense(X_cols, input_dim=X_cols, init='uniform', activation='relu'))
model.add(Dense(X_cols, init='uniform', activation='relu'))
model.add(Dense(X_cols, init='uniform', activation='relu'))
model.add(Dense(len(Y.unique()), init='uniform', activation='sigmoid'))

# Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit model
model.fit(X.values, dummy_y, nb_epoch=50, batch_size=10, shuffle=True, verbose=1)

# Evaluate model
model.evaluate(X.values, dummy_y, batch_size=5, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

[nan, 0.017500000000000002]

In [20]:
# define baseline model
def baseline_model():
    # create model
    model = Sequential()
    
    '''
    model.add(Dense(7, input_dim = 7, init='normal', activation='relu'))
    model.add(Dense(7, input_dim = 7, init='normal', activation='relu'))
    model.add(Dense(7, input_dim = 7, init='normal', activation='relu'))
    model.add(Dense(7, input_dim = 7, init='normal', activation='relu'))
    '''
    
    model.add(Dense(X_cols, input_dim=X_cols, init='uniform', activation='relu'))
    model.add(Dense(X_cols, input_dim=X_cols, init='uniform', activation='relu'))
    model.add(Dense(X_cols, input_dim=X_cols, init='uniform', activation='relu'))
    
    #model.add(Dense(input_dim=7, output_dim = 10, init='normal', activation='relu'))
    #model.add(Dense(input_dim=7, output_dim = 10, init='normal', activation='relu'))
    #model.add(Dense(input_dim=11, output_dim=5, init='normal', activation='softmax'))
    #model.add(Dense(input_dim=5, output_dim=7, init='normal', activation='relu'))
    
    model.add(Dense(len(Y.unique()), init='normal', activation='sigmoid'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [21]:
estimator = KerasClassifier(build_fn=baseline_model, nb_epoch=100, batch_size=5, verbose=0)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X.values, dummy_y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

KeyboardInterrupt: 