In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pickle
import numpy as np
import pandas as pd

dataset_path = '/content/drive/My Drive/CS 547/DeepDiveProject/working_dataset.pickle'

# Load the dataset
with open(dataset_path, 'rb') as file:
    data = pickle.load(file)

In [3]:
data.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,11037294,JA371270,2015-03-18 12:00:00,0000X W WACKER DR,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,BANK,False,False,...,42.0,32.0,11,,,2015,2017-08-01 15:52:26,,,
1,11646293,JC213749,2018-12-20 15:00:00,023XX N LOCKWOOD AVE,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,APARTMENT,False,False,...,36.0,19.0,11,,,2018,2019-04-06 16:04:43,,,
2,11645836,JC212333,2016-05-01 00:25:00,055XX S ROCKWELL ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,...,15.0,63.0,11,,,2016,2019-04-06 16:04:43,,,
3,11645959,JC211511,2018-12-20 16:00:00,045XX N ALBANY AVE,2820,OTHER OFFENSE,TELEPHONE THREAT,RESIDENCE,False,False,...,33.0,14.0,08A,,,2018,2019-04-06 16:04:43,,,
4,11645601,JC212935,2014-06-01 00:01:00,087XX S SANGAMON ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,RESIDENCE,False,False,...,21.0,71.0,11,,,2014,2019-04-06 16:04:43,,,


In [4]:
data = data[['ID', 'Date', 'Primary Type', 'Arrest']]

In [5]:
crime_types = sorted(data['Primary Type'].unique())
crime_type_to_index = {crime: idx for idx, crime in enumerate(crime_types)}
crime_type_to_index

{'ARSON': 0,
 'ASSAULT': 1,
 'BATTERY': 2,
 'BURGLARY': 3,
 'CONCEALED CARRY LICENSE VIOLATION': 4,
 'CRIM SEXUAL ASSAULT': 5,
 'CRIMINAL DAMAGE': 6,
 'CRIMINAL SEXUAL ASSAULT': 7,
 'CRIMINAL TRESPASS': 8,
 'DECEPTIVE PRACTICE': 9,
 'DOMESTIC VIOLENCE': 10,
 'GAMBLING': 11,
 'HOMICIDE': 12,
 'HUMAN TRAFFICKING': 13,
 'INTERFERENCE WITH PUBLIC OFFICER': 14,
 'INTIMIDATION': 15,
 'KIDNAPPING': 16,
 'LIQUOR LAW VIOLATION': 17,
 'MOTOR VEHICLE THEFT': 18,
 'NARCOTICS': 19,
 'NON - CRIMINAL': 20,
 'NON-CRIMINAL': 21,
 'NON-CRIMINAL (SUBJECT SPECIFIED)': 22,
 'OBSCENITY': 23,
 'OFFENSE INVOLVING CHILDREN': 24,
 'OTHER NARCOTIC VIOLATION': 25,
 'OTHER OFFENSE': 26,
 'PROSTITUTION': 27,
 'PUBLIC INDECENCY': 28,
 'PUBLIC PEACE VIOLATION': 29,
 'RITUALISM': 30,
 'ROBBERY': 31,
 'SEX OFFENSE': 32,
 'STALKING': 33,
 'THEFT': 34,
 'WEAPONS VIOLATION': 35}

In [6]:
data['Date'] = pd.to_datetime(data['Date'])
data['Hour'] = data['Date'].dt.floor('h')
data.head()

Unnamed: 0,ID,Date,Primary Type,Arrest,Hour
0,11037294,2015-03-18 12:00:00,DECEPTIVE PRACTICE,False,2015-03-18 12:00:00
1,11646293,2018-12-20 15:00:00,DECEPTIVE PRACTICE,False,2018-12-20 15:00:00
2,11645836,2016-05-01 00:25:00,DECEPTIVE PRACTICE,False,2016-05-01 00:00:00
3,11645959,2018-12-20 16:00:00,OTHER OFFENSE,False,2018-12-20 16:00:00
4,11645601,2014-06-01 00:01:00,DECEPTIVE PRACTICE,False,2014-06-01 00:00:00


In [12]:
start_time = data['Date'].min().floor('h')
end_time = data['Date'].max().ceil('h')
all_hours = pd.date_range(start=start_time, end=end_time, freq='h')

total_number_of_hours = len(all_hours)
print(total_number_of_hours)

208609


In [None]:
hour_idx_dict = {hour: idx for idx, hour in enumerate(all_hours)}
print(len(hour_idx_dict))

208609


In [None]:
num_crime_by_hour_and_type = [[0] * len(crime_types) for _ in range(total_number_of_hours)]
hourly_data_crime_type = data.groupby(['Hour', 'Primary Type']).size().reset_index(name='Count')
hourly_data_crime_type.head()

Unnamed: 0,Hour,Primary Type,Count
0,2001-01-01,ASSAULT,4
1,2001-01-01,BATTERY,18
2,2001-01-01,BURGLARY,2
3,2001-01-01,CRIM SEXUAL ASSAULT,21
4,2001-01-01,CRIMINAL DAMAGE,48


In [None]:
import time

start = time.time()

for i in range(len(hourly_data_crime_type)):
    if (i+1) % 100000 == 0:
        print(f'{i+1}/{len(hourly_data_crime_type)}, time spent: {time.time() - start}s')
        start = time.time()
    num_crime_by_hour_and_type[hour_idx_dict[hourly_data_crime_type['Hour'][i]]][crime_type_to_index[hourly_data_crime_type['Primary Type'][i]]] = hourly_data_crime_type['Count'][i]

100000/2180454, time spent: 2.955807685852051s
200000/2180454, time spent: 3.281691551208496s
300000/2180454, time spent: 1.9522316455841064s
400000/2180454, time spent: 1.9621853828430176s
500000/2180454, time spent: 1.949103832244873s
600000/2180454, time spent: 1.9481897354125977s
700000/2180454, time spent: 2.229464054107666s
800000/2180454, time spent: 3.0919835567474365s
900000/2180454, time spent: 1.9365780353546143s
1000000/2180454, time spent: 1.9514646530151367s
1100000/2180454, time spent: 1.9896957874298096s
1200000/2180454, time spent: 1.9528672695159912s
1300000/2180454, time spent: 1.962998628616333s
1400000/2180454, time spent: 3.224454641342163s
1500000/2180454, time spent: 2.1067652702331543s
1600000/2180454, time spent: 1.9943420886993408s
1700000/2180454, time spent: 1.9528489112854004s
1800000/2180454, time spent: 1.971214771270752s
1900000/2180454, time spent: 1.9547595977783203s
2000000/2180454, time spent: 2.943700075149536s
2100000/2180454, time spent: 2.341639

In [None]:
percentage_arrest_by_hour_and_type = [[0] * len(crime_types) for _ in range(total_number_of_hours)]
hourly_data_num_arrest = data[data['Arrest']==True].groupby(['Hour', 'Primary Type']).size().reset_index(name='Count')
hourly_data_num_arrest.head()

Unnamed: 0,Hour,Primary Type,Count
0,2001-01-01,BATTERY,1
1,2001-01-01,CRIM SEXUAL ASSAULT,5
2,2001-01-01,CRIMINAL DAMAGE,1
3,2001-01-01,CRIMINAL TRESPASS,2
4,2001-01-01,DECEPTIVE PRACTICE,8


In [None]:
start = time.time()

for i in range(len(hourly_data_num_arrest)):
    if (i+1) % 100000 == 0:
        print(f'{i+1}/{len(hourly_data_num_arrest)}, time spent: {time.time() - start}s')
        start = time.time()
    percentage_arrest_by_hour_and_type[hour_idx_dict[hourly_data_num_arrest['Hour'][i]]][crime_type_to_index[hourly_data_num_arrest['Primary Type'][i]]] = hourly_data_num_arrest['Count'][i] / num_crime_by_hour_and_type[hour_idx_dict[hourly_data_num_arrest['Hour'][i]]][crime_type_to_index[hourly_data_num_arrest['Primary Type'][i]]]


100000/1000103, time spent: 3.369065284729004s
200000/1000103, time spent: 3.892021417617798s
300000/1000103, time spent: 4.3117454051971436s
400000/1000103, time spent: 3.3499338626861572s
500000/1000103, time spent: 3.3451039791107178s
600000/1000103, time spent: 4.764123201370239s
700000/1000103, time spent: 3.3935065269470215s
800000/1000103, time spent: 4.22903299331665s
900000/1000103, time spent: 3.9319682121276855s
1000000/1000103, time spent: 4.417592525482178s


In [13]:
all_hours

DatetimeIndex(['2001-01-01 00:00:00', '2001-01-01 01:00:00',
               '2001-01-01 02:00:00', '2001-01-01 03:00:00',
               '2001-01-01 04:00:00', '2001-01-01 05:00:00',
               '2001-01-01 06:00:00', '2001-01-01 07:00:00',
               '2001-01-01 08:00:00', '2001-01-01 09:00:00',
               ...
               '2024-10-18 15:00:00', '2024-10-18 16:00:00',
               '2024-10-18 17:00:00', '2024-10-18 18:00:00',
               '2024-10-18 19:00:00', '2024-10-18 20:00:00',
               '2024-10-18 21:00:00', '2024-10-18 22:00:00',
               '2024-10-18 23:00:00', '2024-10-19 00:00:00'],
              dtype='datetime64[ns]', length=208609, freq='h')

In [None]:
seq_len = 24

In [None]:
feature_num_crime = [num_crime_by_hour_and_type[i:i+seq_len] for i in range(len(num_crime_by_hour_and_type)-seq_len)]
feature_percentage_arrest = [percentage_arrest_by_hour_and_type[i+1:i+seq_len+1] for i in range(len(num_crime_by_hour_and_type)-seq_len)]

In [None]:
labels = [num_crime_by_hour_and_type[i+seq_len] for i in range(len(num_crime_by_hour_and_type)-seq_len)]

In [None]:
from sklearn.model_selection import train_test_split

x_train, x_val_test, y_train, y_val_test = train_test_split(list(zip(feature_num_crime, feature_percentage_arrest)), labels, test_size=0.3)
x_val, x_test, y_val, y_test = train_test_split(x_val_test, y_val_test, test_size=0.5)

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

class custom_dataset(Dataset):
    def __init__(self, num_crime, pencentage_arrest, labels):
        self.num_crime = num_crime
        self.pencentage_arrest = pencentage_arrest
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return torch.Tensor(self.num_crime[idx]), torch.Tensor(self.pencentage_arrest[idx]), torch.Tensor(self.labels[idx])

In [None]:
print(np.array(x_test).shape)
print(np.array(y_test).shape)

(31288, 2, 24, 36)
(31288, 36)


In [None]:
np.array(x_test)[:, 0, :, :].shape

(31288, 24, 36)

In [None]:
train_dataset = custom_dataset(np.array(x_train)[:, 0, :, :], np.array(x_train)[:, 1, :, :], y_train)
val_dataset = custom_dataset(np.array(x_val)[:, 0, :, :], np.array(x_val)[:, 1, :, :], y_val)
test_dataset = custom_dataset(np.array(x_test)[:, 0, :, :], np.array(x_test)[:, 1, :, :], y_test)

In [None]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, nonlinearity, bias, batch_first, dropout, bidirectional):
        super(RNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers=num_layers, nonlinearity=nonlinearity, bias=bias, batch_first=batch_first, dropout=dropout, bidirectional=bidirectional)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x, _ = self.rnn(x)
        x = self.fc(x[:, -1, :])
        return x

In [None]:
def model_evaluation(batch_size, optimizer, hidden_size, num_layers, nonlinearity, bias, dropout, bidirectional, num_epoch, learning_rate):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    rnn = RNN(input_size=72, hidden_size=hidden_size, output_size=36, num_layers=num_layers, nonlinearity=nonlinearity, bias=bias, batch_first=True, dropout=dropout, bidirectional=bidirectional).to(device)
    criterion = nn.MSELoss()
    if optimizer == 'Adam':
        optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
    elif optimizer == 'SGD':
        optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate)
    else:
        print('Invalid optimizer')
        return

    best_val_loss = float('inf')
    best_model = None

    for epoch in range(num_epoch):
        rnn.train()
        train_loss = 0
        for batch_x1, batch_x2, batch_y in train_loader:
            input = torch.cat((batch_x1, batch_x2), dim=-1).to(device)
            predictions = rnn(input)
            loss = criterion(predictions, batch_y.to(device))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        train_loss /= len(train_loader)
        rnn.eval()
        val_loss = 0
        with torch.no_grad():
            for batch_x1, batch_x2, batch_y in val_loader:
                input = torch.cat((batch_x1, batch_x2), dim=-1).to(device)
                predictions = rnn(input)
                loss = criterion(predictions, batch_y.to(device))
                val_loss += loss.item()

        val_loss /= len(val_loader)
        print(f"Epoch {epoch + 1}/{num_epoch}, Train Loss: {train_loss}, Validation Loss: {val_loss}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = rnn.state_dict()
            torch.save(best_model, '/content/drive/My Drive/CS 547/DeepDiveProject/best_model.pth')

    if best_model:
      rnn.load_state_dict(best_model)
    rnn.eval()
    test_loss = 0
    with torch.no_grad():
        for batch_x1, batch_x2, batch_y in test_loader:
            input = torch.cat((batch_x1, batch_x2), dim=-1).to(device)
            predictions = rnn(input)
            loss = criterion(predictions, batch_y.to(device))
            test_loss += loss.item()

    test_loss /= len(test_loader)
    rmse = torch.sqrt(torch.tensor(test_loss))
    print(f"Test Loss: {test_loss}, Test RMSE: {rmse}")
    return test_loss, rmse

In [None]:
batch_size = [32, 64]
optimizer = ['Adam', 'SGD']
hidden_size = [64, 256]
num_layers = [1, 2]
nonlinearity = ['relu']
bias = [True]
dropout = [0]
bidirectional = [False]
num_epoch = 10
learning_rate = [0.001, 0.01]

In [None]:
eval_results_loss = {}
eval_results_rmse = {}

for a in batch_size:
    for b in optimizer:
        for c in hidden_size:
            for d in num_layers:
                for e in nonlinearity:
                    for f in bias:
                        for g in dropout:
                            for h in bidirectional:
                                for l in learning_rate:
                                    print(f'batch_size: {a}, optimizer: {b}, hidden_size: {c}, num_layers: {d}, nonlinearity: {e}, bias: {f}, dropout: {g}, bidirectional: {h}, learning_rate: {l}')
                                    test_loss, rmse = model_evaluation(a, b, c, d, e, f, g, h, num_epoch, l)
                                    eval_results_loss[(a, b, c, d, e, f, g, h, l)] = test_loss
                                    eval_results_rmse[(a, b, c, d, e, f, g, h, l)] = rmse

In [None]:
import matplotlib.pyplot as plt

loss_values = list(eval_results_loss.values())
rmse_values = list(eval_results_rmse.values())
hyperparameter_combinations = list(eval_results_loss.keys())
batch_size_vals, optimizer_vals, hidden_size_vals, num_layers_vals, nonlinearity_vals, bias_vals, dropout_vals, bidirectional_vals, learning_rate_vals = zip(*hyperparameter_combinations)
hyperparameters = ['batch_size', 'optimizer', 'hidden_size', 'num_layers', 'nonlinearity', 'bias', 'dropout', 'bidirectional', 'learning_rate']
hyperparameter_values = [batch_size_vals, optimizer_vals, hidden_size_vals, num_layers_vals, nonlinearity_vals, bias_vals, dropout_vals, bidirectional_vals, learning_rate_vals]

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 15))
axes = axes.flatten()

for i, (hyperparam, values) in enumerate(zip(hyperparameters, hyperparameter_values)):
    axes[i].scatter(values, loss_values, c='blue', label='Loss', alpha=0.5)
    axes[i].set_title(f'Loss vs {hyperparam}')
    axes[i].set_xlabel(hyperparam)
    axes[i].set_ylabel('Loss')
    axes[i].tick_params(axis='x', rotation=45)
    axes[i].legend()

plt.tight_layout()
plt.show()

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 15))
axes = axes.flatten()

for i, (hyperparam, values) in enumerate(zip(hyperparameters, hyperparameter_values)):
    axes[i].scatter(values, rmse_values, c='green', label='RMSE', alpha=0.5)
    axes[i].set_title(f'RMSE vs {hyperparam}')
    axes[i].set_xlabel(hyperparam)
    axes[i].set_ylabel('RMSE')
    axes[i].tick_params(axis='x', rotation=45)
    axes[i].legend()

plt.tight_layout()
plt.show()


The effect of different mini-batch, optimizers, and some other hyperparameters are investigated.

Due to VRAM limitations, we were not able to test between single batch learning and mini-batch learning, but we tested on batch size of 32 and 64, and it seems that batch size of 64 performs slightly better than 32.

For optimizers, we tested on Adam and SGD, and Adam performs slightly better.

For hyperparameters, we tested on learning rate and hidden dimension of the RNN model, with 0.001 better than 0.01 as the learning rate and very close performance between the 64 and 128 hidden dimension.

In [None]:
optimizer = 'Adam'
num_epoch = 10
batch_size = 64
learning_rate = 0.001
hidden_size = 64
num_layers = 2
nonlinearity = 'relu'
bias = True
batch_first = True
dropout = 0
bidirectional = False

In [None]:
model_evaluation(batch_size, optimizer, hidden_size, num_layers, nonlinearity, bias, dropout, bidirectional, num_epoch, learning_rate)

Epoch 1/10, Train Loss: 1.7046964497131596, Validation Loss: 1.6899622135123349
Epoch 2/10, Train Loss: 1.5160789383962214, Validation Loss: 1.6209261569752527
Epoch 3/10, Train Loss: 1.472626588122036, Validation Loss: 1.5988999578362837
Epoch 4/10, Train Loss: 1.4529382417486176, Validation Loss: 1.6029629809724772
Epoch 5/10, Train Loss: 1.4400657231770098, Validation Loss: 1.5756049987484828
Epoch 6/10, Train Loss: 1.423398136909754, Validation Loss: 1.5873370893406233
Epoch 7/10, Train Loss: 1.4180718793825138, Validation Loss: 1.5613359941782883
Epoch 8/10, Train Loss: 1.4111966654775856, Validation Loss: 1.6671853455541568
Epoch 9/10, Train Loss: 1.4062163863497592, Validation Loss: 1.570977018898494
Epoch 10/10, Train Loss: 1.3991730977159964, Validation Loss: 1.5622303404203466
Test Loss: 1.417698829451953, Test RMSE: 1.1906715631484985


(1.417698829451953, tensor(1.1907))