# Crime Analysis and Prediction

This file contains a set of experiments for analyzing and extrapolating data regarding crimes.

## 1. Data Loading and Sanitization

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

import data

# Load the crime data through the project's `data` module. `results` is later
# indexed as results["CH"][0] to obtain the list of crime records.
# NOTE(review): force_refresh=True presumably rebuilds the data instead of
# reusing a saved checkpoint — confirm against data.load_data.
results = data.load_data(force_refresh=True)

Loading crime data
Saving checkpoint
Finished loading data


In [8]:
# Time test
# Smoke test of the featurization on a small sample: featurize the first 10
# records with "time min" as the only input feature and "location" as target.
# NOTE: the recorded run of this cell ended in
#   "ValueError: all the input arrays must have same number of dimensions"
# and "Featurization achieved" was never printed, so the failure presumably
# originates inside data.get_workable_data — TODO confirm.
import data
test = results["CH"][0][0:10]  # first 10 entries of results["CH"][0]
print(test[0].time_of_day[0])
# NOTE(review): "time min" is not among the feature names listed in the how-to
# comment of the next cell — verify that get_workable_data supports it.
X_features = ["time min"]
Y_features = ["location"]
# Returns the encoded X and Y matrices plus one decoder callable for each.
X, Y, X_decoder, Y_decoder = data.get_workable_data(test, X_features, Y_features)
print("Featurization achieved")
print(X)
print(X.shape)
print(Y.shape)
# Round-trip the first encoded row of each matrix through its decoder.
print(X_decoder(X[0]))
print(Y_decoder(Y[0]))

[ 0.  0.  1.  0.]
Test


ValueError: all the input arrays must have same number of dimensions

In [None]:
# ATTENTION: HERE IS HOW EVERYTHING WORKS!!! ------------
#   1) Load the data using the data-loading cell (section 1).
#   2) (Optional) If using non-condensed crimes (i.e. no combination of crime
#      types), pass a list of all crimes to data.save_full_crime_encoding().
#   3) Call data.get_workable_data with the data from 1) and the lists of
#      features you want to include for X and for the target Y.
#      Possible values are:
#         "day" "time" "location" "crime condensed" "crime full"
#   4) You will receive the encoded X matrix, the encoded Y matrix, and two
#      functions for decoding rows of X and Y back into features.

# Only `import data` is in scope in this notebook, so the helpers are reached
# through the module, matching the data.get_workable_data call in the
# time-test cell.
# NOTE(review): save_full_crime_encoding is assumed to live in the `data`
# module next to get_workable_data — confirm.

# Use this next line for enabling the full crime set
data.save_full_crime_encoding(list({record.crime for record in results["CH"][0]}))

X_features = ["day", "time", "location"]
Y_features = ["crime condensed"]
X, Y, X_decoder, Y_decoder = data.get_workable_data(results["CH"][0], X_features, Y_features)
print("Featurization achieved")
print(X)
print(X.shape)
print(Y.shape)
# Round-trip the first encoded row of each matrix through its decoder.
print(X_decoder(X[0]))
print(Y_decoder(Y[0]))

In [None]:
def train_KNN(feature_matrix, targets, n):
    """Fit a k-nearest-neighbours classifier on encoded targets.

    Args:
        feature_matrix: 2-D array of encoded input features, one row per sample.
        targets: 2-D array with one row per sample; the position of the
            largest entry in a row is used as that sample's class label.
        n: number of neighbours (passed as n_neighbors).

    Returns:
        The fitted KNeighborsClassifier.
    """
    # Collapse each target row to the index of its largest entry in a single
    # vectorized call. The labels are kept as floats, as before, so the dtype
    # of the classifier's predictions does not change for callers.
    labels = np.argmax(targets, axis=1).astype(float)
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(feature_matrix, labels)
    return knn


In [None]:
def int_to_category(prediction):
    """Translate a numeric class label into its crime-category name.

    `prediction` is converted with int() and used as an index into the fixed
    sequence of nine category names below.
    """
    labels = (
        "KIDNAPPING / CHILDREN",
        "ROBBERY/BURGLARY/THEFT",
        "ASSAULT/VIOLENCE",
        "NARCOTICS",
        "PUBLIC-RELATED CRIME",
        "DAMAGE/ARSON",
        "OTHER/NON-CRIMINAL",
        "WEAPON-RELATED",
        "PROHIBITIVE CRIME",
    )
    index = int(prediction)
    return labels[index]


In [None]:
def _knn_accuracy(knn, X_eval, Y_eval):
    """Return the fraction of rows in X_eval whose predicted category matches Y_eval."""
    predictions = knn.predict(X_eval)
    # A prediction counts as correct when its category name equals the first
    # element of the decoded target row (Y_decoder is the notebook-level
    # decoder returned by get_workable_data).
    correct = sum(
        1
        for prediction, target in zip(predictions, Y_eval)
        if int_to_category(prediction) == Y_decoder(target)[0]
    )
    return correct / float(predictions.shape[0])


def evaluate_KNN(X, Y):
    """Tune k for a KNN classifier on a validation slice and report test accuracy.

    Rows 1-299 are used for training, rows 300-499 for validation and rows
    500-599 for testing. Every k in 1..29 is trained and scored on the
    validation slice; the first k with the highest validation accuracy is then
    retrained and scored on the test slice. Results are printed; nothing is
    returned.

    Relies on the notebook-level names train_KNN, int_to_category and
    Y_decoder.

    Args:
        X: encoded feature matrix, one row per sample.
        Y: encoded target matrix, row-aligned with X.
    """
    # Loading data
    # NOTE(review): the training slice starts at row 1, so row 0 is never
    # used — kept as in the original; confirm whether this is intentional.
    X_train = X[1:300]
    Y_train = Y[1:300]
    X_validate = X[300:500]
    Y_validate = Y[300:500]
    X_test = X[500:600]
    Y_test = Y[500:600]

    # Tuning on the validation set for value of K
    best_K = None
    best_accuracy = -1.0  # below any real accuracy, so the first k always wins initially
    for k in range(1, 30):
        knn = train_KNN(X_train, Y_train, k)
        accuracy = _knn_accuracy(knn, X_validate, Y_validate)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_K = k
        print("Validation set Performance with k = " + str(k) + ":" + str(accuracy))

    # Evaluating on Test set with best K value. best_K is fixed at this point:
    # the test score must not feed back into model selection, otherwise the
    # reported k would not be the one that was actually evaluated.
    knn = train_KNN(X_train, Y_train, best_K)
    test_accuracy = _knn_accuracy(knn, X_test, Y_test)
    print("Test set Performance with k = " + str(best_K) + ":  " + str(test_accuracy))
        
# Run the KNN experiment on the notebook-level encoded matrices X and Y.
evaluate_KNN(X, Y)
        
    
    
    
    

In [None]:
import keras

In [None]:
# Dividing the data with a 60-20-20 train-val-test split.
# The test set takes whatever remains after the training and validation rows,
# so rows lost to integer truncation end up in the test set.

num_training = int(0.6 * X.shape[0])
num_val_or_test = int(0.2 * X.shape[0])

# Shuffling with a fixed seed so the split is reproducible.

seed = 42

# Create one big matrix [X | Y] and shuffle its rows, so every sample stays
# aligned with its target while the row order is randomized.
# see https://stackoverflow.com/questions/35646908/numpy-shuffle-multidimensional-array-by-row-only-keep-column-order-unchanged

total = np.hstack((X, Y))

np.random.seed(seed)

np.random.shuffle(total)

print(X.shape)
print(Y.shape)

# Split back apart to partition into training, validation, and test sets:
# the first X.shape[1] columns are the features, the rest are the targets.

X_new = total[:, 0:X.shape[1]]
Y_new = total[:, X.shape[1]:]

print(X_new.shape)
print(Y_new.shape)

# Compare the first row before and after shuffling as a quick sanity check.
print(X[0])
print(Y[0])
print(X_new[0])
print(Y_new[0])

X_train = X_new[0:num_training, :]
Y_train = Y_new[0:num_training, :]

X_val = X_new[num_training:num_training + num_val_or_test, :]
Y_val = Y_new[num_training:num_training + num_val_or_test, :]

X_test = X_new[num_training + num_val_or_test:, :]
Y_test = Y_new[num_training + num_val_or_test:, :]

print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop, SGD, Adam
#u ready for some neural nets?

# Training hyperparameters.
batch_size = 2048
epochs = 5

# Architecture: a stack of identical fully connected ReLU layers, each
# followed by dropout, ending in a softmax layer with one unit per target
# column.
num_hidden_layers = 25
hidden_units = 100
dropout_rate = 0.2

model = Sequential()
# Only the first layer declares the input width; it is taken from the training
# data (previously hard-coded as 13) so the model follows the chosen features.
model.add(Dense(activation='relu', input_shape=(X_train.shape[1],), units=hidden_units))
model.add(Dropout(dropout_rate))
for _ in range(num_hidden_layers - 1):
    model.add(Dense(activation='relu', units=hidden_units))
    model.add(Dropout(dropout_rate))
# One output unit per target column (previously hard-coded as 9).
model.add(Dense(activation='softmax', units=Y_train.shape[1]))

model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

history = model.fit(X_train, Y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(X_val, Y_val))
# With metrics=['accuracy'], evaluate() returns [loss, accuracy].
score = model.evaluate(X_test, Y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])