In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from numpy import mean
from numpy import array
from prettytable import PrettyTable
from tqdm import tqdm_notebook

# from keras.models import Sequential
# from keras.layers import LSTM
# from keras.layers import Dense
# from keras.layers import Bidirectional
# from keras.layers import Flatten
# from keras.layers import TimeDistributed
# from keras.layers.convolutional import Conv1D
# from keras.layers.convolutional import MaxPooling1D

from sklearn.metrics import mean_squared_error

The main method used in this notebook (moving-average smoothing for time-series forecasting) is described in this article: https://machinelearningmastery.com/moving-average-smoothing-for-time-series-forecasting-python/

In [2]:
# Load every incident record from the combined CSV.
# low_memory=False makes pandas read the file in one pass instead of chunked type inference.
incidents = pd.read_csv(
    filepath_or_buffer="../incident_data/yearly_incident_data/all_incidents.csv",
    low_memory=False,
)
incidents

Unnamed: 0,publicAddress,caseNumber,precinct,offense,description,ucrCode,lat,lon,neighborhood,incidentDate,incidentTime
0,0056Xx Lyndale Av S,MP2010000051,5,CSCR,Crim Sex Cond-Rape,3.0,44.900291,-93.288239,Windom,2010/01/01,00:30
1,0001Xx 62 St W,MP2010000099,5,ROBPAG,Robbery Per Agg,4.0,44.890636,-93.280041,Windom,2010/01/01,01:05
2,0039Xx 5 Av S,MP2010000373,3,BURGD,Burglary Of Dwelling,6.0,44.931448,-93.268841,Bryant,2010/01/01,04:18
3,0013Xx Nicollet Av S,MP2010000305,1,ASLT2,Asslt W/Dngrs Weapon,5.0,44.970506,-93.277714,Loring Park,2010/01/01,03:00
4,0034Xx Oliver Av N,MP2010000229,4,BURGD,Burglary Of Dwelling,6.0,45.017746,-93.306988,Folwell,2010/01/01,02:23
...,...,...,...,...,...,...,...,...,...,...,...
235545,00008X 8Th St S,MP2020708915,1,SHOPLF,Shoplifting,7.0,44.975633,-93.272376,Downtown West,2020/12/24,14:45
235546,00008X 8Th St S,MP2020708916,1,SHOPLF,Shoplifting,7.0,44.975633,-93.272376,Downtown West,2020/12/24,15:45
235547,0006Xx Washington Ave Se,MP2020708917,2,BIKETF,Bike Theft,7.0,44.973658,-93.229494,University Of Minnesota,2020/12/22,11:00
235548,0025Xx 36Th Ave N,MP2020321240,4,THFTSW,Theft By Swindle,7.0,45.020468,-93.312715,Cleveland,2020/12/27,17:00


In [3]:
# Drop every record that has a missing value in any column.
# The explicit .copy() gives later cells an independent frame to add columns to,
# rather than one pandas may treat as derived from the raw load.
incidents = incidents.dropna().copy()
incidents

Unnamed: 0,publicAddress,caseNumber,precinct,offense,description,ucrCode,lat,lon,neighborhood,incidentDate,incidentTime
0,0056Xx Lyndale Av S,MP2010000051,5,CSCR,Crim Sex Cond-Rape,3.0,44.900291,-93.288239,Windom,2010/01/01,00:30
1,0001Xx 62 St W,MP2010000099,5,ROBPAG,Robbery Per Agg,4.0,44.890636,-93.280041,Windom,2010/01/01,01:05
2,0039Xx 5 Av S,MP2010000373,3,BURGD,Burglary Of Dwelling,6.0,44.931448,-93.268841,Bryant,2010/01/01,04:18
3,0013Xx Nicollet Av S,MP2010000305,1,ASLT2,Asslt W/Dngrs Weapon,5.0,44.970506,-93.277714,Loring Park,2010/01/01,03:00
4,0034Xx Oliver Av N,MP2010000229,4,BURGD,Burglary Of Dwelling,6.0,45.017746,-93.306988,Folwell,2010/01/01,02:23
...,...,...,...,...,...,...,...,...,...,...,...
235545,00008X 8Th St S,MP2020708915,1,SHOPLF,Shoplifting,7.0,44.975633,-93.272376,Downtown West,2020/12/24,14:45
235546,00008X 8Th St S,MP2020708916,1,SHOPLF,Shoplifting,7.0,44.975633,-93.272376,Downtown West,2020/12/24,15:45
235547,0006Xx Washington Ave Se,MP2020708917,2,BIKETF,Bike Theft,7.0,44.973658,-93.229494,University Of Minnesota,2020/12/22,11:00
235548,0025Xx 36Th Ave N,MP2020321240,4,THFTSW,Theft By Swindle,7.0,45.020468,-93.312715,Cleveland,2020/12/27,17:00


In [None]:
# Index the frame by incident date, then derive the calendar fields used for grouping
incidents.index = pd.DatetimeIndex(incidents['incidentDate'])
date_index = incidents.index

incidents['month'] = date_index.month
incidents['year'] = date_index.year
incidents['dayOfWeek'] = date_index.dayofweek
incidents['dayOfMonth'] = date_index.day
incidents['dayOfYear'] = date_index.dayofyear
# Days 1-7 are week 1 of the month, days 8-14 week 2, and so on
incidents['weekOfMonth'] = (incidents['dayOfMonth'] - 1) // 7 + 1

# Week of year counted in 7-day blocks from January 1st (not ISO weeks)
incidents['weekOfYear'] = np.ceil(date_index.dayofyear.to_numpy() / 7).astype(int)

incidents = incidents.sort_index()

# Normalize the one neighborhood name that carries a stray apostrophe / capital S
incidents['neighborhood'] = incidents['neighborhood'].replace(
    {"Steven'S Square - Loring Heights": 'Stevens Square - Loring Heights'}
)

incidents.head()

In [None]:
# Train on 2010-2017 and test on 2019.
# TODO: investigate 2018 - a large number of records may have been lost when NAs were dropped during cleaning.
train_period = slice('2010-01-01', '2017-12-31')
test_period = slice('2019-01-01', '2019-12-31')

incidents_train = incidents.loc[train_period]
incidents_test = incidents.loc[test_period]

# Setup Train and Test Data Sets for Monthly Predictions

In [None]:
# Distinct neighborhood names, in order of first appearance in the data
neighborhood_list = list(pd.unique(incidents['neighborhood']))
print(neighborhood_list)

In [None]:
# Training data - count of crimes by neighborhood by month.
# groupby().count() silently omits months with no incidents, which would make the
# series different lengths and shift later months forward. Reindex onto the full
# (year, month) grid so every neighborhood has one value per month, 0 when empty.
train_years = sorted(incidents_train['year'].unique())
train_months = pd.MultiIndex.from_product(
    [train_years, range(1, 13)], names=['year', 'month']
)

train_n = []
for neighborhood in neighborhood_list:
    df = incidents_train[incidents_train['neighborhood'] == neighborhood]
    monthly_counts = df.groupby(['year', 'month'])['incidentDate'].count()
    train_n.append(monthly_counts.reindex(train_months, fill_value=0).tolist())

In [None]:
# Testing data - count of crimes by neighborhood by month.
# Months with no incidents are filled with 0 so that position t in every list
# is the same calendar month (groupby().count() alone would drop empty months).
test_months = list(range(1, 13))

test_n = []
for neighborhood in neighborhood_list:
    df = incidents_test[incidents_test['neighborhood'] == neighborhood]
    monthly_counts = df.groupby('month')['incidentDate'].count()
    test_n.append(monthly_counts.reindex(test_months, fill_value=0).tolist())

## Simple Moving Average

In [None]:
# Walk-forward simple moving average per neighborhood.
# Window of 3 (3 months - a quarter of a year): each forecast is the mean of the
# last `window` observed values, after which the actual value joins the history.
window = 3
predTot = list()
testTot = list()

for neighborhoodNum in range(len(train_n)):

    # Work on copies: appending observations to the original lists would leak test
    # data into train_n and change the result every time this cell is re-run.
    history = list(train_n[neighborhoodNum])
    test = list(test_n[neighborhoodNum])

    if not history:
        # Nothing to average for the first forecast; skip rather than crash.
        print('Skipping {}: no training history'.format(neighborhood_list[neighborhoodNum]))
        continue

    preds = []
    for obs in test:
        # history[-window:] never wraps around when fewer than `window` points exist
        yhat = mean(history[-window:])
        preds.append(yhat)
        history.append(obs)

    print('Neighborhood #: {}'.format(neighborhoodNum+1))
    print('Name: {}'.format(neighborhood_list[neighborhoodNum]))
    print('Actuals: {}'.format(test))
    print('Predictions: {}'.format(preds))

    # Actual crimes for the test period in blue, walk-forward prediction in red
    plt.plot(test, label='Actual')
    plt.plot(preds, color='red', label='Predicted')
    plt.title(neighborhood_list[neighborhoodNum])
    plt.xlabel('Month')
    plt.ylabel('Count of crimes')
    plt.legend(loc='upper right')
    plt.show()

    testTot = testTot + test
    predTot = predTot + preds

# RMSE pooled over every neighborhood and month (y_true first, y_pred second)
error = mean_squared_error(testTot, predTot) ** .5
print('Test RMSE: %.3f' % error)

# Neighborhood weekly predictions

In [None]:
# Training data - count of crimes by neighborhood by week.
# weekOfYear is ceil(dayOfYear / 7), so it runs from 1 to 53 in every year.
# Reindex onto the full (year, week) grid so weeks without incidents count as 0
# instead of being dropped, which would misalign the series.
train_years = sorted(incidents_train['year'].unique())
train_weeks = pd.MultiIndex.from_product(
    [train_years, range(1, 54)], names=['year', 'weekOfYear']
)

train_n = []
for neighborhood in neighborhood_list:
    df = incidents_train[incidents_train['neighborhood'] == neighborhood]
    weekly_counts = df.groupby(['year', 'weekOfYear'])['incidentDate'].count()
    train_n.append(weekly_counts.reindex(train_weeks, fill_value=0).tolist())

In [None]:
# Testing data - count of crimes by neighborhood by week.
# weekOfYear is ceil(dayOfYear / 7), i.e. 1..53; weeks without incidents are
# filled with 0 so position t in every list is the same week of the test year.
test_weeks = list(range(1, 54))

test_n = []
for neighborhood in neighborhood_list:
    df = incidents_test[incidents_test['neighborhood'] == neighborhood]
    weekly_counts = df.groupby('weekOfYear')['incidentDate'].count()
    test_n.append(weekly_counts.reindex(test_weeks, fill_value=0).tolist())

In [None]:
# Walk-forward simple moving average per neighborhood, weekly series.
# Window of 4 (4 week rolling average): each forecast is the mean of the last
# `window` observed values, after which the actual value joins the history.
window = 4
predTot = list()
testTot = list()

for neighborhoodNum in range(len(train_n)):

    # Work on copies: appending observations to the original lists would leak test
    # data into train_n, which later cells (and re-runs of this one) read again.
    history = list(train_n[neighborhoodNum])
    test = list(test_n[neighborhoodNum])

    if not history:
        # Nothing to average for the first forecast; skip rather than crash.
        print('Skipping {}: no training history'.format(neighborhood_list[neighborhoodNum]))
        continue

    preds = []
    for obs in test:
        # history[-window:] never wraps around when fewer than `window` points exist
        yhat = mean(history[-window:])
        preds.append(yhat)
        history.append(obs)

    print('Neighborhood #: {}'.format(neighborhoodNum+1))
    print('Name: {}'.format(neighborhood_list[neighborhoodNum]))
    # print('Actuals: {}'.format(test))
    # print('Predictions: {}'.format(preds))

    # Actual crimes for the test period in blue, walk-forward prediction in red
    plt.plot(test, label='Actual')
    plt.plot(preds, color='red', label='Predicted')
    plt.title(neighborhood_list[neighborhoodNum])
    plt.xlabel('Week No.')
    plt.ylabel('Count of crimes')
    plt.legend(loc='upper right')
    plt.show()

    testTot = testTot + test
    predTot = predTot + preds

# RMSE pooled over every neighborhood and week (y_true first, y_pred second)
error = mean_squared_error(testTot, predTot) ** .5
print('Test RMSE: %.3f' % error)

# Precinct Monthly Predictions

In [None]:
# Distinct precinct identifiers, in order of first appearance in the data
precinct_list = list(pd.unique(incidents['precinct']))

In [None]:
# Training data - count of crimes by precinct by month.
# Reindex onto the full (year, month) grid so a month without incidents is a 0
# rather than a missing entry that would shift the rest of the series.
train_years = sorted(incidents_train['year'].unique())
train_months = pd.MultiIndex.from_product(
    [train_years, range(1, 13)], names=['year', 'month']
)

train_p = []
for precinct in precinct_list:
    df = incidents_train[incidents_train['precinct'] == precinct]
    monthly_counts = df.groupby(['year', 'month'])['incidentDate'].count()
    train_p.append(monthly_counts.reindex(train_months, fill_value=0).tolist())

In [None]:
# Testing data - count of crimes by precinct by month.
# Months without incidents are filled with 0 so every list has one entry per
# calendar month, in month order.
test_months = list(range(1, 13))

test_p = []
for precinct in precinct_list:
    df = incidents_test[incidents_test['precinct'] == precinct]
    monthly_counts = df.groupby('month')['incidentDate'].count()
    test_p.append(monthly_counts.reindex(test_months, fill_value=0).tolist())

In [None]:
# Walk-forward simple moving average per precinct, monthly series.
# Each forecast is the mean of the last `window` observed values; the actual
# value is then added to the history before forecasting the next month.
window = 3
predTot = list()
testTot = list()

for precinctNum in range(len(train_p)):

    # Work on copies: appending observations to the original lists would leak test
    # data into train_p, which the weighted-average cell reads again.
    history = list(train_p[precinctNum])
    test = list(test_p[precinctNum])

    if not history:
        # Nothing to average for the first forecast; skip rather than crash.
        print('Skipping precinct {}: no training history'.format(precinct_list[precinctNum]))
        continue

    preds = []
    for obs in test:
        # history[-window:] never wraps around when fewer than `window` points exist
        yhat = mean(history[-window:])
        preds.append(yhat)
        history.append(obs)

    # Report the real precinct id; the loop position is only the order in which
    # precincts first appear in the data and is not the precinct number.
    print('Precinct: {}'.format(precinct_list[precinctNum]))
    print('Actuals: {}'.format(test))
    print('Predictions: {}'.format(preds))

    # Actual crimes in blue, walk-forward prediction in red
    plt.plot(test, label='Actual')
    plt.plot(preds, color='red', label='Predicted')
    plt.title('Precinct {}'.format(precinct_list[precinctNum]))
    plt.xlabel('Month')
    plt.ylabel('Count of crimes')
    plt.legend(loc='upper right')
    plt.show()

    testTot = testTot + test
    predTot = predTot + preds

# RMSE pooled over every precinct and month (y_true first, y_pred second)
error = mean_squared_error(testTot, predTot) ** .5
print('Test RMSE: %.3f' % error)

## Weighted Moving Average

In [None]:
# Walk-forward weighted moving average per precinct, monthly series.
# The last `window` observations are averaged with linearly increasing weights
# (1 for the oldest ... `window` for the newest), so recent months count more.
window = 3
# One weight per value in the window; np.average raises if the lengths differ.
weights = np.arange(1, window + 1)
predTot = list()
testTot = list()

for precinctNum in range(len(train_p)):

    # Work on copies so the appended test observations never end up in train_p.
    history = list(train_p[precinctNum])
    test = list(test_p[precinctNum])

    if not history:
        # Nothing to average for the first forecast; skip rather than crash.
        print('Skipping precinct {}: no training history'.format(precinct_list[precinctNum]))
        continue

    preds = []
    for obs in test:
        recent = history[-window:]
        # With fewer than `window` points, keep the largest (most recent) weights
        yhat = np.average(recent, weights=weights[-len(recent):])
        preds.append(yhat)
        history.append(obs)

    # Report the real precinct id rather than the loop position
    print('Precinct: {}'.format(precinct_list[precinctNum]))
    print('Actuals: {}'.format(test))
    print('Predictions: {}'.format(preds))

    # Actual crimes in blue, walk-forward prediction in red
    plt.plot(test, label='Actual')
    plt.plot(preds, color='red', label='Predicted')
    plt.title('Precinct {}'.format(precinct_list[precinctNum]))
    plt.xlabel('Month')
    plt.ylabel('Count of crimes')
    plt.legend(loc='upper right')
    plt.show()

    testTot = testTot + test
    predTot = predTot + preds

# RMSE pooled over every precinct and month (y_true first, y_pred second)
error = mean_squared_error(testTot, predTot) ** .5
print('Test RMSE: %.3f' % error)

## Exponential Moving Average

In [None]:
# Walk-forward exponential moving average over the neighborhood series currently
# held in train_n / test_n.
# level_t = alpha * x_t + (1 - alpha) * level_{t-1}; the forecast for the next
# step is the current level.
predTot = list()
testTot = list()
alpha = 0.6

for neighborhoodNum in range(len(train_n)):

    # Work on copies so the original train/test lists are never mutated.
    history = list(train_n[neighborhoodNum])
    test = list(test_n[neighborhoodNum])

    if not history:
        # No data to initialize the smoothed level from; skip rather than crash.
        print('Skipping {}: no training history'.format(neighborhood_list[neighborhoodNum]))
        continue

    # Initialize the level by smoothing over the training history. Starting it at 0
    # would drag the first forecasts towards zero regardless of the data.
    level = history[0]
    for value in history[1:]:
        level = alpha * value + (1 - alpha) * level

    preds = []
    for obs in test:
        preds.append(level)
        # Fold the newly observed value into the level for the next forecast
        level = alpha * obs + (1 - alpha) * level

    # Actual crimes in blue, walk-forward prediction in red
    plt.plot(test, label='Actual')
    plt.plot(preds, color='red', label='Predicted')
    plt.title(neighborhood_list[neighborhoodNum])
    plt.ylabel('Count of crimes')
    plt.legend(loc='upper right')
    plt.show()

    testTot = testTot + test
    predTot = predTot + preds

# RMSE pooled over every neighborhood and period (y_true first, y_pred second)
error = mean_squared_error(testTot, predTot) ** .5
print('Test RMSE: %.3f' % error)

## Predictions by offense - weekly

In [None]:
# Distinct offense codes as they appear in the data, before any cleanup
offense_list = list(pd.unique(incidents['offense']))
print(offense_list)

In [None]:
# Clean up the offense field: remove every space so that variants of the same
# code collapse into one value.
incidents['offense'] = incidents['offense'].str.replace(' ', '', regex=False)

# incidents_train / incidents_test were sliced out before this cleanup, so apply
# the same normalization to them. Otherwise the cleaned codes in offense_list
# may not match the raw codes those frames still hold.
incidents_train = incidents_train.assign(
    offense=incidents_train['offense'].str.replace(' ', '', regex=False)
)
incidents_test = incidents_test.assign(
    offense=incidents_test['offense'].str.replace(' ', '', regex=False)
)

offense_list = list(incidents['offense'].unique())
print(offense_list)

In [None]:
# Training data - count of crimes by offense by week.
# weekOfYear is ceil(dayOfYear / 7), so it runs from 1 to 53 in every year.
# Reindex onto the full (year, week) grid so weeks without incidents count as 0
# instead of being dropped, which would misalign the series.
train_years = sorted(incidents_train['year'].unique())
train_weeks = pd.MultiIndex.from_product(
    [train_years, range(1, 54)], names=['year', 'weekOfYear']
)

train_o = []
for offense in offense_list:
    df = incidents_train[incidents_train['offense'] == offense]
    weekly_counts = df.groupby(['year', 'weekOfYear'])['incidentDate'].count()
    train_o.append(weekly_counts.reindex(train_weeks, fill_value=0).tolist())

In [None]:
# Testing data - count of crimes by offense by week.
# weekOfYear is ceil(dayOfYear / 7), i.e. 1..53; weeks without incidents are
# filled with 0 so position t in every list is the same week of the test year.
test_weeks = list(range(1, 54))

test_o = []
for offense in offense_list:
    df = incidents_test[incidents_test['offense'] == offense]
    weekly_counts = df.groupby('weekOfYear')['incidentDate'].count()
    test_o.append(weekly_counts.reindex(test_weeks, fill_value=0).tolist())

In [None]:
# Walk-forward simple moving average per offense, weekly series.
# Each forecast is the mean of the last `window` (12) observed weeks; the actual
# value is then added to the history before forecasting the next week.
window = 12
predTot = list()
testTot = list()

for offenseNum in range(len(train_o)):

    # Work on copies: appending observations to the original lists would leak test
    # data into train_o and change the result every time this cell is re-run.
    history = list(train_o[offenseNum])
    test = list(test_o[offenseNum])

    if not history:
        # Nothing to average for the first forecast; skip rather than crash.
        print('Skipping {}: no training history'.format(offense_list[offenseNum]))
        continue

    preds = []
    for obs in test:
        # history[-window:] never wraps around when fewer than `window` points exist
        yhat = mean(history[-window:])
        preds.append(yhat)
        history.append(obs)

    print('Offense #: {}'.format(offenseNum+1))
    print('Offense: {}'.format(offense_list[offenseNum]))
    print('Actuals: {}'.format(test))
    print('Predictions: {}'.format(preds))

    # Actual crimes in blue, walk-forward prediction in red
    plt.plot(test, label='Actual')
    plt.plot(preds, color='red', label='Predicted')
    plt.title(offense_list[offenseNum])
    plt.xlabel('Week No.')
    plt.ylabel('Count of crimes')
    plt.legend(loc='upper right')
    plt.show()

    testTot = testTot + test
    predTot = predTot + preds

# RMSE pooled over every offense and week (y_true first, y_pred second)
error = mean_squared_error(testTot, predTot) ** .5
print('Test RMSE: %.3f' % error)

In [None]:
# Haven't used this yet...they used it in the NN part of their project (LSTM)
def split_sequence(sequence, n_steps):
    """Slice a sequence into sliding input windows and their next-step targets.

    For each start position, the `n_steps` consecutive values form one input
    sample and the value immediately after them is the matching target.
    Generation stops as soon as a window has no following value.

    Args:
        sequence: Indexable, sliceable sequence of observations.
        n_steps: Number of consecutive values in each input window.

    Returns:
        Tuple `(X, y)` of numpy arrays: the input windows and their targets.
    """
    final_index = len(sequence) - 1
    windows, targets = [], []
    for start in range(len(sequence)):
        stop = start + n_steps
        # The target lives at `stop`; past the final index there is nothing to predict
        if stop > final_index:
            break
        windows.append(sequence[start:stop])
        targets.append(sequence[stop])
    return array(windows), array(targets)