# Crime Analysis and Prediction

This file contains a set of experiments for analyzing and extrapolating data regarding crimes.

## 1. Data Loading and Sanitization

In [31]:
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from data import CrimeLoader
import numpy as np

# Crime loader now handles all of the data saving and parsing
#    force_refresh - If true, will read from the original files rather than a saved pickle file
#    force_save - If true, will save any loaded data into a pickle for later loading
#    data_limit - The number of crimes to load (used for speedup)
data = CrimeLoader()
data.load_data(force_refresh=True, force_save=True, data_limit=10000, randomize=True)

# List of possible features to include:
#    day                - Day of the week (ex: SUN - SAT) 1-hot encoded
#    time               - Time of day (ex: Morning, Afternoon, Evening, Late Night) 1-hot encoded
#    time min           - Time in minutes (ex: 1420 minutes) Integer value (not 1-hot encoded)
#    hour               - Time in hour (ex: 13) Integer value (not 1-hot encoded)
#    location           - Location of crime, (ex: lat, long) 2 float values
#    crime condensed    - Encoding of crime, 1-hot encoded of length 9 (?)
#    crime full         - Encoding of crime, 1-hot encoded of length ~ 30
#    all                - All of the above

X_features = ["day", "time", "time min", "hour", "location"]
Y_features = ["crime condensed"]
X, Y, X_decoder, Y_decoder = data.get_workable_data(X_features, Y_features)
print("Featurization achieved")

Loading crime data
Saving checkpoint
Finished loading data
Creating X feature matrix
Creating Y feature matrix
Featurization complete
Featurization achieved


## 2. K-Nearest Neighbors

In [26]:
def train_KNN(feature_matrix, targets, n):
    knn = KNeighborsClassifier(n_neighbors=n)
    Y = np.zeros(targets.shape[0])
    for i in range(targets.shape[0]):
        Y[i] = np.argmax(targets[i])
    knn.fit(feature_matrix,Y)
    return knn


In [27]:
def int_to_category(prediction):
    categories = ["KIDNAPPING / CHILDREN","ROBBERY/BURGLARY/THEFT","ASSAULT/VIOLENCE","NARCOTICS","PUBLIC-RELATED CRIME","DAMAGE/ARSON","OTHER/NON-CRIMINAL","WEAPON-RELATED","PROHIBITIVE CRIME"]
    return categories[int(prediction)]


In [28]:
def evaluate_KNN(X, Y):
    # Loading data
    X_train = X[1:300]
    Y_train = Y[1:300]
    X_validate = X[300:500]
    Y_validate = Y[300:500]
    X_test = X[500:600]
    Y_test = Y[500:600]
    
    #Tuning on the validation set for value of K
    K = [k for k in range(1,30)]
    best_K = 0
    max_score = 0.0
    for k in K:
        knn = train_KNN(X_train, Y_train, k)
        predictions = knn.predict(X_validate)
        correct = 0.0
        for i in range(predictions.shape[0]):
            if int_to_category(predictions[i]) == Y_decoder(Y_validate[i])[0]:
                correct += 1.0
        if correct > max_score:
            max_score = correct
            best_K = k
        print("Validation set Performance with k = " + str(k) + ":" + str(correct/predictions.shape[0]))
    
    #Evaluating on Test set with best K value
    knn = train_KNN(X_train, Y_train, best_K)
    predictions = knn.predict(X_test)
    correct = 0.0
    for i in range(predictions.shape[0]):
        if int_to_category(predictions[i]) == Y_decoder(Y_test[i])[0]:
            correct += 1.0
    if correct > max_score:
        max_score = correct
        best_K = k
    print("Test set Performance with k = " + str(best_K) + ":  " + str(correct/predictions.shape[0]))

print(X.shape)
evaluate_KNN(X, Y)       
        
    
    
    
    

(9740, 15)
Validation set Performance with k = 1:0.26
Validation set Performance with k = 2:0.36
Validation set Performance with k = 3:0.375
Validation set Performance with k = 4:0.365
Validation set Performance with k = 5:0.35
Validation set Performance with k = 6:0.375
Validation set Performance with k = 7:0.35
Validation set Performance with k = 8:0.38
Validation set Performance with k = 9:0.375
Validation set Performance with k = 10:0.365
Validation set Performance with k = 11:0.37
Validation set Performance with k = 12:0.37
Validation set Performance with k = 13:0.36
Validation set Performance with k = 14:0.365
Validation set Performance with k = 15:0.37
Validation set Performance with k = 16:0.36
Validation set Performance with k = 17:0.335
Validation set Performance with k = 18:0.34
Validation set Performance with k = 19:0.355
Validation set Performance with k = 20:0.335
Validation set Performance with k = 21:0.34
Validation set Performance with k = 22:0.35
Validation set Perfor

## 3. Neural Network

In [13]:
import keras

Using TensorFlow backend.
  return f(*args, **kwds)


In [20]:
# dividing training data with 60-20-20 train-val-test split

num_training = int(0.6 * X.shape[0])
num_val_or_test = int(0.2 * X.shape[0])

# shuffling with fixed seed

seed = 42

# create big matrix and shuffle to ensure that things get shuffled properly
# see https://stackoverflow.com/questions/35646908/numpy-shuffle-multidimensional-array-by-row-only-keep-column-order-unchanged

total = np.hstack((X,Y))

np.random.seed(42)

np.random.shuffle(total)

print(X.shape)
print(Y.shape)

# split back apart to partition into training, validation, and test sets

X_new = total[:,0:X.shape[1]]
Y_new = total[:,X.shape[1]:]

#X_new = X
#Y_new = Y # lol

print(X_new.shape)
print(Y_new.shape)

print(X[0])
print(Y[0])
print(X_new[0])
print(Y_new[0])

X_train = X_new[0:num_training,:]
Y_train = Y_new[0:num_training,:]

X_val = X_new[num_training:num_training+num_val_or_test,:]
Y_val = Y_new[num_training:num_training+num_val_or_test,:]

X_test = X_new[num_training+num_val_or_test:,:]
Y_test = Y_new[num_training+num_val_or_test:,:]

print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(9778, 15)
(9778, 9)
(9778, 15)
(9778, 9)
[  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   1.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   1.00000000e+00   0.00000000e+00   1.32000000e+03
   2.20000000e+01   4.17503243e+01  -8.76661668e+01]
[ 0.  0.  0.  0.  1.  0.  0.  0.  0.]
[   0.            1.            0.            0.            0.            0.
    0.            0.            1.            0.            0.          750.
   12.           41.72435991  -87.63288418]
[ 0.  0.  0.  0.  0.  0.  1.  0.  0.]
(5866, 15)
(1955, 15)
(1957, 15)


In [29]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop, SGD, Adam
#u ready for some neural nets?

batch_size = 2048
epochs = 5

model = Sequential()
model.add(Dense(activation='relu', input_shape=(len(X[0]),), units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='relu', units=100))
model.add(Dropout(0.2))
model.add(Dense(activation='softmax',units=9))

model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

history = model.fit(X_train, Y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(X_val, Y_val))
score = model.evaluate(X_test, Y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_79 (Dense)             (None, 100)               1600      
_________________________________________________________________
dropout_76 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_80 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_77 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_81 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_78 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_82 (Dense)             (None, 100)               10100     
__________