# Crime Analysis and Prediction

This file contains a set of experiments for analyzing and extrapolating data regarding crimes.

## 1. Data Loading and Sanitization

In [42]:
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from data import CrimeLoader
import numpy as np

# Crime loader now handles all of the data saving and parsing
#    force_refresh - If true, will read from the original files rather than a saved pickle file
#    force_save - If true, will save any loaded data into a pickle for later loading
#    data_limit - The number of crimes to load (used for speedup)
data = CrimeLoader()
data.load_data(force_refresh=False, force_save=False, data_limit=100000, randomize=True)

# List of possible features to include:
#    day                - Day of the week (ex: SUN - SAT) 1-hot encoded
#    time               - Time of day (ex: Morning, Afternoon, Evening, Late Night) 1-hot encoded
#    time min           - Time in minutes (ex: 1420 minutes) Integer value (not 1-hot encoded)
#    hour               - Time in hour (ex: 13) Integer value (not 1-hot encoded)
#    location           - Location of crime, (ex: lat, long) 2 float values
#    crime condensed    - Encoding of crime, 1-hot encoded of length 9 (?)
#    crime full         - Encoding of crime, 1-hot encoded of length ~ 30
#    all                - All of the above

X_features = ["day", "time min", "location"]
Y_features = ["crime condensed"]
X, Y, X_decoder, Y_decoder = data.get_workable_data(X_features, Y_features)
print("Featurization achieved")

Creating checkpoint for crimes
Loading crime data
Finished loading data
Creating X feature matrix
Creating Y feature matrix
Featurization complete
Featurization achieved


## 2. K-Nearest Neighbors

In [39]:
def train_KNN(feature_matrix, targets, n):
    knn = KNeighborsClassifier(n_neighbors=n)
    Y = np.zeros(targets.shape[0])
    for i in range(targets.shape[0]):
        Y[i] = np.argmax(targets[i])
    knn.fit(feature_matrix,Y)
    return knn


In [40]:
def int_to_category(prediction):
    categories = ["KIDNAPPING / CHILDREN","ROBBERY/BURGLARY/THEFT","ASSAULT/VIOLENCE","NARCOTICS","PUBLIC-RELATED CRIME","DAMAGE/ARSON","OTHER/NON-CRIMINAL","WEAPON-RELATED","PROHIBITIVE CRIME"]
    return categories[int(prediction)]


In [12]:
def evaluate_KNN(X, Y):
    # Loading data
    X_train = X[1:300]
    Y_train = Y[1:300]
    X_validate = X[300:500]
    Y_validate = Y[300:500]
    X_test = X[500:600]
    Y_test = Y[500:600]
    
    #Tuning on the validation set for value of K
    K = [k for k in range(1,30)]
    best_K = 0
    max_score = 0.0
    for k in K:
        knn = train_KNN(X_train, Y_train, k)
        predictions = knn.predict(X_validate)
        correct = 0.0
        for i in range(predictions.shape[0]):
            if int_to_category(predictions[i]) == Y_decoder(Y_validate[i])[0]:
                correct += 1.0
        if correct > max_score:
            max_score = correct
            best_K = k
        print("Validation set Performance with k = " + str(k) + ":" + str(correct/predictions.shape[0]))
    
    #Evaluating on Test set with best K value
    knn = train_KNN(X_train, Y_train, best_K)
    predictions = knn.predict(X_test)
    correct = 0.0
    for i in range(predictions.shape[0]):
        if int_to_category(predictions[i]) == Y_decoder(Y_test[i])[0]:
            correct += 1.0
    if correct > max_score:
        max_score = correct
        best_K = k
    print("Test set Performance with k = " + str(best_K) + ":  " + str(correct/predictions.shape[0]))

print(X.shape)
evaluate_KNN(X, Y)       
        
    
    
    
    

(97472, 10)


NameError: global name 'train_KNN' is not defined

## 3. Neural Network

In [None]:
import keras

In [13]:
# dividing training data with 60-20-20 train-val-test split

num_training = int(0.6 * X.shape[0])
num_val_or_test = int(0.2 * X.shape[0])

# shuffling with fixed seed

seed = 42

# create big matrix and shuffle to ensure that things get shuffled properly
# see https://stackoverflow.com/questions/35646908/numpy-shuffle-multidimensional-array-by-row-only-keep-column-order-unchanged

total = np.hstack((X,Y))

np.random.seed(42)

np.random.shuffle(total)

print(X.shape)
print(Y.shape)

# split back apart to partition into training, validation, and test sets

X_new = total[:,0:X.shape[1]]
Y_new = total[:,X.shape[1]:]

#X_new = X
#Y_new = Y # lol

print(X_new.shape)
print(Y_new.shape)

print(X[0])
print(Y[0])
print(X_new[0])
print(Y_new[0])

X_train = X_new[0:num_training,:]
Y_train = Y_new[0:num_training,:]

X_val = X_new[num_training:num_training+num_val_or_test,:]
Y_val = Y_new[num_training:num_training+num_val_or_test,:]

X_test = X_new[num_training+num_val_or_test:,:]
Y_test = Y_new[num_training+num_val_or_test:,:]

print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(97472, 10)
(97472, 9)
(97472, 10)
(97472, 9)
[   0.            0.            0.            0.            0.            1.
    0.          990.           41.78299076  -87.61447837]
[ 0.  0.  0.  0.  0.  0.  1.  0.  0.]
[  0.00000000e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   1.35500000e+03
   4.17863817e+01  -8.76951563e+01]
[ 0.  0.  0.  0.  0.  0.  1.  0.  0.]
(58483, 10)
(19494, 10)
(19495, 10)


In [15]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop, SGD, Adam
#u ready for some neural nets?

batch_size = 2048
epochs = 5
hidden_layers = 5
units_per_layer = 100
dropout_rate = 0.2

model = Sequential()
for i in range(hidden_layers):
    if i == 0:
        model.add(Dense(activation='relu', input_shape=(len(X[0]),), units=units_per_layer))
    else:
        model.add(Dense(activation='relu', units=units_per_layer))
    model.add(Dropout(dropout_rate))

model.add(Dense(activation='softmax',units=Y.shape[1]))

model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

history = model.fit(X_train, Y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(X_val, Y_val))
score = model.evaluate(X_test, Y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

#this is kinda like witchcraft, so I'm just wrapping it in a function so it looks like it makes more sense
def k_largest(k, arr):
    return arr.argsort()[-k:][::-1]
#array([4, 3, 1])

predictions = model.predict(X_test)
total = X_test.shape[0]
correct_with_top_2 = 0
correct_with_top_3 = 0
for i in range(X_test.shape[0]):
    pred = predictions[i,:]
    top2 = k_largest(2, pred)
    top3 = k_largest(3, pred)
    found = False
    for j in range(len(top2)):
        if Y_test[i,:][top2[j]] == 1.0:
            correct_with_top_2 += 1.0
            correct_with_top_3 += 1.0
            found = True
            break
    if found:
        continue
    if Y_test[i,:][top3[-1]] == 1.0:
        correct_with_top_3 += 1.0
            
#print(Y_train[0])
print('Test accuracy with top 2:', correct_with_top_2 / total)
print('Test accuracy with top 3:', correct_with_top_3 / total)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_43 (Dense)             (None, 100)               1100      
_________________________________________________________________
dropout_36 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_44 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_37 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_45 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_38 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_46 (Dense)             (None, 100)               10100     
__________

In [41]:
#bucket_sizes = [3,4,5,2,7,2,5,2,3]
breakpoints = [3,7,12,14,21,23,28,30,33]
Xs = [[],[],[],[],[],[],[],[],[]]
Ys = [[],[],[],[],[],[],[],[],[]]

for i in range(X.shape[0]):
    index = np.argmax(Y[i,:])
    for j in range(len(breakpoints)):
        if index < breakpoints[j]:
            Xs[j].append(X[i,:].reshape((1,X.shape[1])))
            if j == 0:
                Ys[j].append(Y[i,0:breakpoints[j]].reshape((1,breakpoints[j])))
            else:
                Ys[j].append(Y[i,breakpoints[j-1]:breakpoints[j]].reshape((1,breakpoints[j]-breakpoints[j-1])))
            break
# was going to handle the NON-CRIMINAL vs NON -CRIMINAL case but got lazy...
#print(Xs[0][0])
#print(Ys[0][4])
X_arrs = []
Y_arrs = []

for i in range(len(Xs)):
    X_arrs.append(np.concatenate(Xs[i],axis=0))
    Y_arrs.append(np.concatenate(Ys[i],axis=0))
    print(X_arrs[i].shape)
    print(Y_arrs[i].shape)

(97421, 31)
30
33
(1,)


ValueError: cannot reshape array of size 1 into shape (1,3)