In [None]:
# Mount Google Drive inside the Colab filesystem so files stored on Drive can
# be opened through ordinary paths under the mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


**Preparation**

In [None]:
import pickle
# import data
# Both inputs live in the same Drive folder; keeping the folder in one place
# means relocating the project only requires editing a single line.
DATA_DIR = '/content/drive/My Drive/CS 547/DeepDiveProject'
num_crime_data = f'{DATA_DIR}/feature_num_crime.pickle'
labels_data = f'{DATA_DIR}/labels.pickle'


def load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    SECURITY: ``pickle.load`` can execute arbitrary code while deserializing,
    so only use this on files that were produced by a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Per-sample features and their matching labels (see the cells below for the
# expected shapes).
num_crime = load_pickle(num_crime_data)
labels = load_pickle(labels_data)

We use the distribution of Primary Crime Type over the previous 24 hours as the feature to predict the distribution of Primary Crime Type in the next hour.

In [None]:
# Peek at the first sample: its feature entry first, then its label entry.
for first_entry in (num_crime[0], labels[0]):
    print(first_entry)

[[0, 4, 18, 2, 0, 21, 48, 2, 4, 36, 0, 0, 0, 0, 0, 0, 0, 0, 9, 2, 0, 0, 0, 0, 68, 0, 15, 0, 0, 0, 0, 2, 52, 0, 119, 14], [0, 4, 25, 1, 0, 3, 23, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 1, 0, 0, 0, 0, 2, 0, 4, 0, 0, 1, 0, 4, 1, 0, 9, 7], [0, 5, 27, 2, 0, 0, 23, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2, 1, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 1, 1, 0, 3, 3], [0, 0, 19, 1, 0, 3, 15, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 6, 1, 0, 12, 1], [0, 2, 23, 2, 0, 2, 15, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0], [0, 3, 13, 1, 0, 0, 7, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 3, 0, 0, 0, 0, 2, 0, 0, 7, 0], [0, 0, 5, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 3, 0], [0, 1, 4, 4, 0, 0, 3, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 10, 0], [0, 1, 4, 2, 0, 3, 5, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 6, 0, 11, 0, 0, 0, 0, 0, 

We use batch learning for memory efficiency

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error



In [None]:
# Convert the loaded inputs to NumPy arrays.
# Expected shapes (per the original notes):
#   features -> (length, 24, 36)
#   labels   -> (length, 36)
features = np.array(num_crime)
labels = np.array(labels)

# Collapse each sample's trailing axes into a single row vector, giving
# (length, 864) when the shapes above hold.
length = len(features)
flattened_features = features.reshape((length, -1))

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible.
# NOTE(review): train_test_split shuffles by default. If consecutive samples
# are overlapping 24-hour windows, near-identical rows can land in both
# train and test -- TODO confirm, and consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(
    flattened_features,
    labels,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Fit the linear-regression baseline on the training split (fit() returns the
# estimator itself, so construction and fitting are chained), then produce
# predictions for the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)


In [None]:
# Mean squared error between the held-out labels and the predictions.
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


# Absolute percentage error for every entry. Entries whose true value is 0
# divide by zero (warnings are silenced) and are replaced with NaN right after.
has_nonzero_truth = y_test != 0
with np.errstate(divide='ignore', invalid='ignore'):
    percentage_errors = 100 * np.abs((y_test - y_pred) / y_test)
percentage_errors = np.where(has_nonzero_truth, percentage_errors, np.nan)

# nanmean skips the NaN placeholders, so only entries with a non-zero true
# value contribute to the summary figure.
mean_percentage_error = np.nanmean(percentage_errors)

print(f"Mean Percentage Error: {mean_percentage_error:.2f}%")

Mean Squared Error: 1.5620927755579954
Mean Percentage Error: 54.80%


In [None]:
# Variant of the percentage error that keeps entries whose true value is 0
# instead of dropping them:
#   * true value 0 and prediction <= 0.1 -> 0% error
#   * true value 0 otherwise             -> 100% error
#   * true value non-zero                -> usual absolute percentage error
# NOTE(review): the `<= 1e-1` test has no lower bound, so arbitrarily negative
# predictions also score 0% -- confirm this is intended.
truth_is_zero = y_test == 0
with np.errstate(divide='ignore', invalid='ignore'):
    tmp_percentage_errors = 100 * np.abs((y_test - y_pred) / y_test)

zero_truth_penalty = np.where(y_pred <= 1e-1, 0, 100)
percentage_errors = np.where(truth_is_zero, zero_truth_penalty, tmp_percentage_errors)

# Overall mean across every entry, and the mean per column (one per crime type).
overall_percentage_error = np.mean(percentage_errors)
per_type_percentage_error = np.mean(percentage_errors, axis=0)
print(f"Mean Percentage Error in Total: {overall_percentage_error}%")
print(f"Mean Percentage Error for each Crime Type: \n{per_type_percentage_error}%")

Mean Percentage Error in Total: 34.408069796816285%
Mean Percentage Error for each Crime Type: 
[2.41089834e+01 6.11904388e+01 4.99207680e+01 6.27736225e+01
 6.64043548e-01 5.76516564e+01 5.80208147e+01 1.87610600e+01
 6.84400487e+01 6.76017145e+01 0.00000000e+00 2.77604488e+01
 1.97305164e+01 3.58945933e-02 4.37700710e+01 2.36371840e+00
 8.54616121e+00 2.67307389e+01 6.15300182e+01 6.09831183e+01
 1.91734419e-02 7.88578735e-02 2.39818366e-03 4.69386653e-01
 7.69945634e+01 7.90282935e-02 6.16810874e+01 5.39203575e+01
 6.94003439e-02 6.76757583e+01 7.18280966e-03 6.16162329e+01
 7.14506017e+01 2.65748363e+00 4.94479640e+01 7.19372000e+01]%


The mean percentage error of the predicted primary crime type distribution for the next hour is 54.8% in our baseline model.


In [None]:
# Absolute error of every prediction, then its mean per column (one column
# per crime type) and across all entries.
residuals = np.asarray(y_test - y_pred)
absolute_errors = np.abs(residuals)
mean_absolute_error = absolute_errors.mean(axis=0)

print(f"Mean Absolute Error in Total: {absolute_errors.mean()}")
print(f"Mean Absolute Error for each Crime Type: \n{mean_absolute_error}")

Mean Absolute Error in Total: 0.5387270314942209
Mean Absolute Error for each Crime Type: 
[1.22106575e-01 1.29177455e+00 2.31942710e+00 1.19965628e+00
 1.53180046e-02 2.20654078e-01 1.79040320e+00 9.06786639e-02
 8.08047724e-01 1.21315470e+00 1.17468498e-04 1.25318591e-01
 1.19268160e-01 1.65710399e-03 1.63347759e-01 4.70741605e-02
 6.78141886e-02 1.26127438e-01 1.14405021e+00 1.62332725e+00
 9.14774489e-04 2.72712335e-03 3.16863566e-04 9.96256146e-03
 4.08865609e-01 2.30969332e-03 1.26911126e+00 4.14772413e-01
 2.56210874e-03 3.56985596e-01 7.58497570e-04 9.76570025e-01
 2.57366884e-01 5.22562510e-02 2.56856918e+00 5.80801098e-01]
