# Crime Analysis and Prediction

This file contains a set of experiments for analyzing and extrapolating data regarding crimes.

## 1. Data Loading and Sanitization

In [2]:
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from data import CrimeLoader
import numpy as np

# Crime loader now handles all of the data saving and parsing
#    force_refresh - If true, will read from the original files rather than a saved pickle file
#    force_save - If true, will save any loaded data into a pickle for later loading
#    data_limit - The number of crimes to load (used for speedup)
data = CrimeLoader()
data.load_data(force_refresh=False, force_save=False, data_limit=10000, randomize=True)

# List of possible features to include:
#    day                - Day of the week (ex: SUN - SAT) 1-hot encoded
#    time               - Time of day (ex: Morning, Afternoon, Evening, Late Night) 1-hot encoded
#    time min           - Time in minutes (ex: 1420 minutes) Integer value (not 1-hot encoded)
#    hour               - Time in hour (ex: 13) Integer value (not 1-hot encoded)
#    location           - Location of crime, (ex: lat, long) 2 float values
#    location normalized - Location of crime [0,1]
#    crime condensed    - Encoding of crime, 1-hot encoded of length 9 (?)
#    crime full         - Encoding of crime, 1-hot encoded of length ~ 30
#    neighborhood       - Neighborhood of the crime (1-hot encoded, length 77)
#    below poverty count - A number representing the rate of people below the poverty level
#    crowded            - Crowding rate
#    no diploma         - No diploma rate
#    income             - Per capita income in that neighborhood
#    unemployment       - Unemployment percentage for that neighborhood
#    all                - Some combination of the above (see source)

X_features = ["time min", "location"]
Y_features = ["crime condensed"]
X, Y, X_decoder, Y_decoder = data.get_workable_data(X_features, Y_features)
print("Featurization achieved")

Loading existing pickled crimes
Creating checkpoint for crimes
Loading crime data
Finished loading crime data
Now loading social / economics data
Finished loading Chicago social data
Creating X feature matrix
Creating Y feature matrix
Featurization complete
Featurization achieved


## 2. K-Nearest Neighbors

In [3]:
def train_KNN(feature_matrix, targets, n):
    knn = KNeighborsClassifier(n_neighbors=n)
    Y = np.zeros(targets.shape[0])
    for i in range(targets.shape[0]):
        Y[i] = np.argmax(targets[i])
    knn.fit(feature_matrix,Y)
    return knn


In [4]:
def int_to_category(prediction):
    categories = ["KIDNAPPING / CHILDREN","ROBBERY/BURGLARY/THEFT","ASSAULT/VIOLENCE","NARCOTICS","PUBLIC-RELATED CRIME","DAMAGE/ARSON","OTHER/NON-CRIMINAL","WEAPON-RELATED","PROHIBITIVE CRIME"]
    return categories[int(prediction)]


In [5]:
def evaluate_KNN(X, Y):
    # Loading data
    X_train = X[1:300]
    Y_train = Y[1:300]
    X_validate = X[300:500]
    Y_validate = Y[300:500]
    X_test = X[500:600]
    Y_test = Y[500:600]
    
    #Tuning on the validation set for value of K
    K = [k for k in range(1,30)]
    best_K = 0
    max_score = 0.0
    for k in K:
        knn = train_KNN(X_train, Y_train, k)
        predictions = knn.predict(X_validate)
        correct = 0.0
        for i in range(predictions.shape[0]):
            if int_to_category(predictions[i]) == Y_decoder(Y_validate[i])[0]:
                correct += 1.0
        if correct > max_score:
            max_score = correct
            best_K = k
        print("Validation set Performance with k = " + str(k) + ":" + str(correct/predictions.shape[0]))
    
    #Evaluating on Test set with best K value
    knn = train_KNN(X_train, Y_train, best_K)
    predictions = knn.predict(X_test)
    correct = 0.0
    for i in range(predictions.shape[0]):
        if int_to_category(predictions[i]) == Y_decoder(Y_test[i])[0]:
            correct += 1.0
    if correct > max_score:
        max_score = correct
        best_K = k
    print("Test set Performance with k = " + str(best_K) + ":  " + str(correct/predictions.shape[0]))

print(X.shape)
evaluate_KNN(X, Y)       
        
    
    
    
    

(9723, 3)
Validation set Performance with k = 1:0.29
Validation set Performance with k = 2:0.31
Validation set Performance with k = 3:0.35
Validation set Performance with k = 4:0.34
Validation set Performance with k = 5:0.32
Validation set Performance with k = 6:0.31
Validation set Performance with k = 7:0.32
Validation set Performance with k = 8:0.32
Validation set Performance with k = 9:0.31
Validation set Performance with k = 10:0.32
Validation set Performance with k = 11:0.335
Validation set Performance with k = 12:0.325
Validation set Performance with k = 13:0.325
Validation set Performance with k = 14:0.34
Validation set Performance with k = 15:0.345
Validation set Performance with k = 16:0.35
Validation set Performance with k = 17:0.365
Validation set Performance with k = 18:0.355
Validation set Performance with k = 19:0.375
Validation set Performance with k = 20:0.365
Validation set Performance with k = 21:0.375
Validation set Performance with k = 22:0.37
Validation set Perform

In [6]:
def train_MaxEnt(feature_matrix, targets, c):
    maxEnt = LogisticRegression(C = c)
    maxEnt.fit(feature_matrix,targets)
    return maxEnt

In [7]:
def evaluate_MaxEnt(X, Y):
    # Loading data
    X_train = X[1:300]
    Y_train = Y[1:300]
    X_validate = X[300:500]
    Y_validate = Y[300:500]
    X_test = X[500:600]
    Y_test = Y[500:600]
    train_targets = np.zeros(Y_train.shape[0])
    for i in range(Y_train.shape[0]):
        train_targets[i] = np.argmax(Y_train[i])
    validation_targets = np.zeros(Y_validate.shape[0])
    for i in range(Y_validate.shape[0]):
        validation_targets[i] = np.argmax(Y_validate[i])
    test_targets = np.zeros(Y_test.shape[0])
    for i in range(Y_test.shape[0]):
        test_targets[i] = np.argmax(Y_test[i])
    
    print train_targets
    print validation_targets
    print test_targets
    
        
    
    #Tuning on the validation set for value of C
    C = [.001, .01, .1, 1.0, 2.0, 5.0, 10.0]
    best_C = 0
    max_score = 0.0
    for c in C:
        maxEnt = train_MaxEnt(X_train, train_targets, c)
        
        score = maxEnt.score(X_validate, validation_targets)
        if score > max_score:
            max_score = score
            best_C = c
        print("Validation set Performance with C = " + str(c) + ":" + str(score))
    
    #Evaluating on Test set with best K value
    maxEnt = train_MaxEnt(X_train, train_targets, .001)
    test_score = maxEnt.score(X_test, test_targets)
    print("Test set Performance with C = " + str(best_C) + ":  " + str(test_score))

evaluate_MaxEnt(X, Y)       
    

[ 1.  1.  1.  1.  7.  2.  6.  2.  4.  2.  3.  1.  3.  1.  8.  2.  1.  2.
  4.  2.  6.  3.  2.  4.  1.  1.  6.  2.  2.  2.  2.  2.  1.  5.  5.  3.
  1.  3.  1.  1.  3.  2.  2.  2.  1.  6.  1.  2.  0.  3.  4.  1.  1.  2.
  5.  1.  5.  2.  1.  2.  3.  1.  1.  2.  2.  1.  6.  1.  1.  1.  1.  1.
  3.  3.  2.  1.  2.  3.  3.  4.  3.  6.  2.  1.  2.  1.  2.  1.  1.  8.
  1.  1.  5.  0.  2.  5.  3.  1.  1.  1.  1.  1.  6.  1.  5.  1.  1.  2.
  2.  1.  2.  1.  1.  2.  6.  1.  8.  1.  2.  6.  1.  1.  3.  3.  5.  5.
  1.  2.  2.  6.  1.  2.  2.  1.  3.  1.  1.  1.  2.  1.  1.  1.  6.  2.
  3.  0.  6.  1.  6.  1.  6.  2.  1.  1.  6.  2.  2.  2.  6.  5.  2.  1.
  1.  5.  1.  1.  2.  6.  1.  1.  0.  2.  6.  4.  3.  1.  1.  2.  1.  3.
  2.  1.  5.  1.  1.  3.  6.  5.  1.  1.  1.  4.  1.  2.  1.  1.  2.  6.
  1.  2.  2.  2.  3.  2.  8.  6.  1.  1.  2.  2.  4.  5.  1.  5.  6.  5.
  6.  3.  1.  1.  1.  6.  6.  6.  1.  6.  1.  1.  2.  1.  3.  8.  7.  1.
  6.  4.  5.  1.  4.  1.  1.  2.  6.  3.  2.  6.  1

## 4. Neural Network

In [8]:
import keras

ImportError: No module named keras

In [9]:
# dividing training data with 60-20-20 train-val-test split

num_training = int(0.6 * X.shape[0])
num_val_or_test = int(0.2 * X.shape[0])

# shuffling with fixed seed

seed = 42

# create big matrix and shuffle to ensure that things get shuffled properly
# see https://stackoverflow.com/questions/35646908/numpy-shuffle-multidimensional-array-by-row-only-keep-column-order-unchanged

total = np.hstack((X,Y))

np.random.seed(42)

np.random.shuffle(total)

print(X.shape)
print(Y.shape)

# split back apart to partition into training, validation, and test sets

X_new = total[:,0:X.shape[1]]
Y_new = total[:,X.shape[1]:]

#X_new = X
#Y_new = Y # lol

print(X_new.shape)
print(Y_new.shape)

print(X[0])
print(Y[0])
print(X_new[0])
print(Y_new[0])

X_train = X_new[0:num_training,:]
Y_train = Y_new[0:num_training,:]

X_val = X_new[num_training:num_training+num_val_or_test,:]
Y_val = Y_new[num_training:num_training+num_val_or_test,:]

X_test = X_new[num_training+num_val_or_test:,:]
Y_test = Y_new[num_training+num_val_or_test:,:]

print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(9723, 3)
(9723, 9)
(9723, 3)
(9723, 9)
[ 860.           41.71151799  -87.6417403 ]
[ 0.  0.  0.  1.  0.  0.  0.  0.  0.]
[ 1389.            41.99998378   -87.79972797]
[ 0.  0.  0.  1.  0.  0.  0.  0.  0.]
(5833, 3)
(1944, 3)
(1946, 3)


In [10]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop, SGD, Adam
#u ready for some neural nets?

batch_size = 2048
epochs = 5
hidden_layers = 5
units_per_layer = 100
dropout_rate = 0.2

model = Sequential()
for i in range(hidden_layers):
    if i == 0:
        model.add(Dense(activation='relu', input_shape=(len(X[0]),), units=units_per_layer))
    else:
        model.add(Dense(activation='relu', units=units_per_layer))
    model.add(Dropout(dropout_rate))

model.add(Dense(activation='softmax',units=Y.shape[1]))

model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

history = model.fit(X_train, Y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(X_val, Y_val))
score = model.evaluate(X_test, Y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

#this is kinda like witchcraft, so I'm just wrapping it in a function so it looks like it makes more sense
def k_largest(k, arr):
    return arr.argsort()[-k:][::-1]
#array([4, 3, 1])

predictions = model.predict(X_test)
total = X_test.shape[0]
correct_with_top_2 = 0
correct_with_top_3 = 0
for i in range(X_test.shape[0]):
    pred = predictions[i,:]
    top2 = k_largest(2, pred)
    top3 = k_largest(3, pred)
    found = False
    for j in range(len(top2)):
        if Y_test[i,:][top2[j]] == 1.0:
            correct_with_top_2 += 1.0
            correct_with_top_3 += 1.0
            found = True
            break
    if found:
        continue
    if Y_test[i,:][top3[-1]] == 1.0:
        correct_with_top_3 += 1.0
            
#print(Y_train[0])
print('Test accuracy with top 2:', correct_with_top_2 / total)
print('Test accuracy with top 3:', correct_with_top_3 / total)

ImportError: No module named keras

In [11]:
#bucket_sizes = [3,4,5,2,7,2,5,2,3]
breakpoints = [3,7,12,14,21,23,28,30,33]
Xs = [[],[],[],[],[],[],[],[],[]]
Ys = [[],[],[],[],[],[],[],[],[]]

for i in range(X.shape[0]):
    index = np.argmax(Y[i,:])
    for j in range(len(breakpoints)):
        if index < breakpoints[j]:
            Xs[j].append(X[i,:].reshape((1,X.shape[1])))
            if j == 0:
                Ys[j].append(Y[i,0:breakpoints[j]].reshape((1,breakpoints[j])))
            else:
                Ys[j].append(Y[i,breakpoints[j-1]:breakpoints[j]].reshape((1,breakpoints[j]-breakpoints[j-1])))
            break
# was going to handle the NON-CRIMINAL vs NON -CRIMINAL case but got lazy...
#print(Xs[0][0])
#print(Ys[0][4])
X_arrs = []
Y_arrs = []

for i in range(len(Xs)):
    X_arrs.append(np.concatenate(Xs[i],axis=0))
    Y_arrs.append(np.concatenate(Ys[i],axis=0))
    print(X_arrs[i].shape)
    print(Y_arrs[i].shape)

ValueError: cannot reshape array of size 2 into shape (1,5)

## 5. Decision Tree

In [12]:
from sklearn import tree
import graphviz
clf = tree.DecisionTreeClassifier()

clf = clf.fit(X_train, Y_train)
print(clf.score(X_test,Y_test))
print(clf.score(X_train,Y_train))
#dot_data = tree.export_graphviz(clf, out_file=None) 
#graph = graphviz.Source(dot_data) 
#graph.render("crime") 
#graph

ImportError: No module named graphviz

In [13]:
best_layer_num = 0
best_accuracy = 0
best_class = None
layers = np.arange(3,50)
max_nodes = np.arange(Y_train.shape[1],5*Y_train.shape[1])
for layer in layers:
    clf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=layer)
    clf = clf.fit(X_train, Y_train)
    curr_acc = clf.score(X_val, Y_val)
    if curr_acc > best_accuracy:
        best_accuracy = curr_acc
        best_class = clf
        best_layer_num = layer

print(best_layer_num)
print(best_accuracy)
print(X_decoder(X_train[0]))
print(Y_decoder(Y_train[0]))
dot_data = tree.export_graphviz(best_class, out_file=None) 
graph = graphviz.Source(dot_data) 
#graph.render("crime") 
graph

40
0.269032921811
['23:9', array([ 41.99998378, -87.79972797])]
['NARCOTICS']


NameError: name 'graphviz' is not defined

## 6. GMM (because why not)

In [None]:
from sklearn import mixture
best_likelihood = -10000000
best_gmm = None
best_n = 0
for i in range(2,9):
    
    gmm = mixture.GaussianMixture(n_components=i)
    gmm.fit(X)
    if gmm.score(X) > best_likelihood:
        best_likelihood = gmm.score(X)
        best_gmm = gmm
        best_n = i
        
print(best_gmm.score(X))
print(best_n)
print(best_gmm.means_)
print(X_decoder(X[0]))
preds = best_gmm.predict(X)
#print(preds)
#print(Y)
for i in range(preds[0:50].size):
    print('Actual: ' + str(Y_decoder(Y[i])) + ' Predicted: ' + str(preds[i]))

In [17]:
from sklearn.dummy import DummyClassifier
def baselines(X, Y):
    # Loading data
    X_train = X[1:300]
    Y_train = Y[1:300]
    X_validate = X[300:500]
    Y_validate = Y[300:500]
    X_test = X[500:600]
    Y_test = Y[500:600]
    train_targets = np.zeros(Y_train.shape[0])
    for i in range(Y_train.shape[0]):
        train_targets[i] = np.argmax(Y_train[i])
    validation_targets = np.zeros(Y_validate.shape[0])
    for i in range(Y_validate.shape[0]):
        validation_targets[i] = np.argmax(Y_validate[i])
    test_targets = np.zeros(Y_test.shape[0])
    for i in range(Y_test.shape[0]):
        test_targets[i] = np.argmax(Y_test[i])
    #majority vote
    clf = DummyClassifier(strategy='most_frequent')
    clf.fit(X_train, train_targets)
    print("Majority Vote validation score", clf.score(X_validate, validation_targets) )
    print("Majority Vote test score", clf.score(X_test, Y_test))
    clf2 = DummyClassifier(strategy='uniform')
    clf2.fit(X_train, Y_train)
    print("Random uniform sampling validation score", clf2.score(X_validate, Y_validate))
    print("Random uniform sampling test score", clf2.score(X_test, Y_test))

baselines(X, Y)

ValueError: Classification metrics can't handle a mix of multilabel-indicator and binary targets