In [None]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [None]:
# Loading data
on_colab = False

if on_colab:
    from google.colab import drive
    drive.mount('/content/drive')
    path = '/content/drive/My Drive/CS 547/DeepDiveProject/'
else:
    path = ''


def _load_pickled_tensor(filename):
    """Unpickle `path + filename`, wrap it in a torch tensor, print its shape and return it."""
    # NOTE: pickle.load can execute arbitrary code -- only point this at trusted files.
    with open(path + filename, 'rb') as handle:
        tensor = torch.Tensor(np.array(pickle.load(handle)))
    print(tensor.shape)
    return tensor


# One tensor per pickled file; the load order matches the order of the printed shapes.
feature_hour_of_day = _load_pickled_tensor('feature_hour_of_day.pickle')

feature_day_of_week = _load_pickled_tensor('feature_day_of_week.pickle')

feature_day_of_month = _load_pickled_tensor('feature_day_of_month.pickle')

feature_week_of_year = _load_pickled_tensor('feature_week_of_year.pickle')

feature_month_of_year = _load_pickled_tensor('feature_month_of_year.pickle')

feature_year = _load_pickled_tensor('feature_year.pickle')

feature_num_crime = _load_pickled_tensor('feature_num_crime.pickle')

feature_percentage_arrest = _load_pickled_tensor('feature_percentage_arrest.pickle')

labels = _load_pickled_tensor('labels.pickle')

torch.Size([208585, 24, 24])
torch.Size([208585, 24, 7])
torch.Size([208585, 24, 31])
torch.Size([208585, 24, 53])
torch.Size([208585, 24, 12])
torch.Size([208585, 24, 24])
torch.Size([208585, 24, 36])
torch.Size([208585, 24, 36])
torch.Size([208585, 36])


In [None]:
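# Size of each feature's last dimension, listed in the order the features are
# concatenated in the next cell (36 + 36 + 24 + 7 + 31 + 53 + 12 + 24 = 223):
# feature_num_crime 36
# feature_percentage_arrest 36
# feature_hour_of_day 24
# feature_day_of_week 7
# feature_day_of_month 31
# feature_week_of_year 53
# feature_month_of_year 12
# feature_year 24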

In [None]:
# Combine the features
# The order of this list fixes the column layout of the last dimension.
feature_blocks = [
    feature_num_crime,
    feature_percentage_arrest,
    feature_hour_of_day,
    feature_day_of_week,
    feature_day_of_month,
    feature_week_of_year,
    feature_month_of_year,
    feature_year,
]
features = torch.cat(feature_blocks, dim=-1)
features.shape

torch.Size([208585, 24, 223])

In [None]:
# Split train, val, test set
from sklearn.model_selection import train_test_split

# Fixed seed so that "Restart & Run All" reproduces the same split (and therefore
# comparable metrics); without it every run shuffles the samples differently.
SPLIT_SEED = 42

# First hold out 30% of the samples, then cut that hold-out in half:
# 70% train / 15% validation / 15% test.
# NOTE(review): the split is a random shuffle over samples. If the samples are
# overlapping time windows, neighbouring windows can land in different splits -- confirm
# whether a chronological split is needed.
x_train, x_val_test, y_train, y_val_test = train_test_split(
    features, labels, test_size=0.3, random_state=SPLIT_SEED
)
x_val, x_test, y_val, y_test = train_test_split(
    x_val_test, y_val_test, test_size=0.5, random_state=SPLIT_SEED
)

# Drop the references to the intermediate hold-out so it can be garbage collected.
x_val_test = None
y_val_test = None

In [None]:
# Creating torch dataset for use in dataloader
class custom_dataset(Dataset):
    """Map-style dataset that pairs each feature sample with its label.

    Args:
        features: Indexable collection of samples (tensor, array or list).
        labels: Indexable collection of targets with the same length as `features`.

    Raises:
        ValueError: If `features` and `labels` have different lengths.
    """

    def __init__(self, features, labels):
        if len(features) != len(labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(features)} and {len(labels)}"
            )
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the `(features, label)` pair at `idx` as float32 tensors."""
        # torch.as_tensor is used instead of the legacy torch.Tensor(...) constructor:
        # the legacy constructor treats a Python int as a *size* (returning an
        # uninitialised tensor), whereas as_tensor converts ints and integer-dtype
        # inputs to float32 values. Float32 tensors are passed through without a copy.
        sample = torch.as_tensor(self.features[idx], dtype=torch.float32)
        target = torch.as_tensor(self.labels[idx], dtype=torch.float32)
        return sample, target

# Wrap each split in a dataset object; the order is train, validation, test.
train_dataset, val_dataset, test_dataset = (
    custom_dataset(split_x, split_y)
    for split_x, split_y in ((x_train, y_train), (x_val, y_val), (x_test, y_test))
)

In [None]:
# Defining the model
class RNN(nn.Module):
    """`nn.RNN` encoder followed by a linear layer applied to the final hidden state.

    Args:
        input_size: Number of features per timestep.
        hidden_size: Width of each recurrent layer (per direction).
        output_size: Number of values predicted per sequence.
        num_layers, nonlinearity, bias, batch_first, dropout, bidirectional:
            Forwarded unchanged to `nn.RNN`.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, nonlinearity, bias, batch_first, dropout, bidirectional):
        super(RNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers=num_layers, nonlinearity=nonlinearity, bias=bias, batch_first=batch_first, dropout=dropout, bidirectional=bidirectional)
        # A bidirectional RNN yields one hidden state per direction, so the head has to
        # accept twice as many inputs; sizing it to hidden_size alone fails at run time.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_size * num_directions, output_size)

    def forward(self, x):
        """Map a batch of sequences to one `output_size` vector per sequence."""
        # h_n is laid out as (num_layers * num_directions, batch, hidden) regardless of
        # batch_first, so indexing it (rather than slicing the output on a fixed axis)
        # picks the right state for both layouts.
        _, h_n = self.rnn(x)
        if self.rnn.bidirectional:
            # Last two entries are the top layer's forward and backward final states.
            summary = torch.cat((h_n[-2], h_n[-1]), dim=-1)
        else:
            # Top layer's final state; equal to the output at the last timestep.
            summary = h_n[-1]
        return self.fc(summary)

In [None]:
def model_evaluation(batch_size, optimizer, hidden_size, num_layers, nonlinearity, bias, dropout, bidirectional, num_epoch, learning_rate):
    """Train an RNN, keep the weights with the lowest validation loss, and score them on the test set.

    Reads the module-level `train_dataset`, `val_dataset`, `test_dataset`, `RNN` and
    `path`. Every time the validation loss improves, the weights are also written to
    `path + 'best_model.pth'`.

    Args:
        batch_size: Batch size for all three data loaders.
        optimizer: `'Adam'` or `'SGD'`.
        hidden_size, num_layers, nonlinearity, bias, dropout, bidirectional:
            Forwarded to `RNN` (which is always built with `batch_first=True`).
        num_epoch: Number of training epochs.
        learning_rate: Learning rate given to the optimizer.

    Returns:
        `(test_loss, rmse, best_model)`: the mean per-batch MSE on the test set (float),
        its square root (0-dim tensor), and the state dict of the best epoch (or `None`
        if no epoch produced a comparable validation loss).

    Raises:
        ValueError: If `optimizer` is not one of the supported names.
    """
    # Validate before doing any work. Printing and returning None here would only
    # surface later as an unpacking error in the caller.
    if optimizer == 'Adam':
        optimizer_cls = torch.optim.Adam
    elif optimizer == 'SGD':
        optimizer_cls = torch.optim.SGD
    else:
        raise ValueError(f"Invalid optimizer: {optimizer!r} (expected 'Adam' or 'SGD')")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize the model; layer sizes are read from one sample instead of being
    # hard-coded, so the function keeps working if the feature set changes.
    sample_x, sample_y = train_dataset[0]
    rnn = RNN(input_size=sample_x.shape[-1], hidden_size=hidden_size, output_size=sample_y.shape[-1], num_layers=num_layers, nonlinearity=nonlinearity, bias=bias, batch_first=True, dropout=dropout, bidirectional=bidirectional).to(device)

    criterion = nn.MSELoss()
    # Separate name so the `optimizer` argument (a string) is not shadowed.
    optim = optimizer_cls(rnn.parameters(), lr=learning_rate)

    best_val_loss = float('inf')
    best_model = None

    # Training and validating
    for epoch in range(num_epoch):
        rnn.train()
        train_loss = 0
        for batch_x, batch_y in train_loader:
            predictions = rnn(batch_x.to(device))
            loss = criterion(predictions, batch_y.to(device))
            optim.zero_grad()
            loss.backward()
            optim.step()
            train_loss += loss.item()

        train_loss /= len(train_loader)
        rnn.eval()
        val_loss = 0
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                predictions = rnn(batch_x.to(device))
                loss = criterion(predictions, batch_y.to(device))
                val_loss += loss.item()

        val_loss /= len(val_loader)

        print(f"Epoch {epoch + 1}/{num_epoch}, Train Loss: {train_loss}, Validation Loss: {val_loss}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() hands back tensors that share storage with the live
            # parameters, so they must be cloned; otherwise later optimizer steps
            # overwrite the "best" snapshot with the last epoch's weights.
            best_model = {name: tensor.detach().clone() for name, tensor in rnn.state_dict().items()}
            torch.save(best_model, path+'best_model.pth')

    # Restore the best epoch before scoring on the test set.
    if best_model is not None:
        rnn.load_state_dict(best_model)

    rnn.eval()
    test_loss = 0

    # Evaluate on test set
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            predictions = rnn(batch_x.to(device))
            loss = criterion(predictions, batch_y.to(device))
            test_loss += loss.item()

    # Mean of the per-batch MSE values.
    test_loss /= len(test_loader)
    rmse = torch.sqrt(torch.tensor(test_loss))


    print(f"Test Loss: {test_loss:.2f}, Test RMSE: {rmse:.2f}")
    return test_loss, rmse, best_model

In [None]:
# Optimisation settings
optimizer = 'Adam'
learning_rate = 1e-3
batch_size = 64
num_epoch = 20

# Network architecture
hidden_size = 64
num_layers = 2
nonlinearity = 'relu'
bias = True
bidirectional = False
dropout = 0
batch_first = True

In [None]:
# Train with the settings from the previous cell; only the best weights are kept.
_, _, best_model = model_evaluation(
    batch_size=batch_size,
    optimizer=optimizer,
    hidden_size=hidden_size,
    num_layers=num_layers,
    nonlinearity=nonlinearity,
    bias=bias,
    dropout=dropout,
    bidirectional=bidirectional,
    num_epoch=num_epoch,
    learning_rate=learning_rate,
)

Epoch 1/20, Train Loss: 1.614947163175847, Validation Loss: 1.4172786024450525
Epoch 2/20, Train Loss: 1.425435262484889, Validation Loss: 1.3760635861346082
Epoch 3/20, Train Loss: 1.3763091262237321, Validation Loss: 1.3540276694882867
Epoch 4/20, Train Loss: 1.3463152948698383, Validation Loss: 1.3795988129448062
Epoch 5/20, Train Loss: 1.3267485215463313, Validation Loss: 1.3133004468642862
Epoch 6/20, Train Loss: 1.313497017811099, Validation Loss: 1.3233411264565824
Epoch 7/20, Train Loss: 1.2999854849212322, Validation Loss: 1.2857606507274026
Epoch 8/20, Train Loss: 1.2931341077809162, Validation Loss: 1.2930412606958963
Epoch 9/20, Train Loss: 1.2784452332570613, Validation Loss: 1.2916413110458047
Epoch 10/20, Train Loss: 1.2674394916746305, Validation Loss: 1.2777662272346044
Epoch 11/20, Train Loss: 1.267148902654439, Validation Loss: 1.301818911527076
Epoch 12/20, Train Loss: 1.2582494748150643, Validation Loss: 1.2689996488499007
Epoch 13/20, Train Loss: 1.250883019853327

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Loading model weights and predicting
# Rebuild the network from the same hyperparameter variables that were passed to
# model_evaluation (and from the data's own shapes) instead of repeating them as
# literals, so the architecture cannot drift from the weights being loaded.
model = RNN(
    input_size=x_test.shape[-1],
    hidden_size=hidden_size,
    output_size=y_test.shape[-1],
    num_layers=num_layers,
    nonlinearity=nonlinearity,
    bias=bias,
    batch_first=True,  # model_evaluation always builds the network with batch_first=True
    dropout=dropout,
    bidirectional=bidirectional,
).to(device)
model.load_state_dict(best_model)
model.eval()

# The whole test split goes through the network in a single forward pass.
with torch.no_grad():
    y_pred = model(x_test.to(device)).cpu()

In [None]:
# Evaluate on percentage error metrics
# Work on plain NumPy arrays so np.errstate actually governs the division below.
y_true_np = np.asarray(y_test)
y_pred_np = np.asarray(y_pred)

zero_target = y_true_np == 0
# NOTE(review): this threshold is one-sided, so strongly negative predictions for a
# zero target also count as error-free -- confirm that is intended.
small_prediction = y_pred_np <= 1e-1

# Division by a zero target yields inf/nan here; those entries are overwritten below.
with np.errstate(divide='ignore', invalid='ignore'):
    tmp_percentage_errors = 100 * np.abs((y_true_np - y_pred_np) / y_true_np)

# Zero target: 100% error by default, 0% when the prediction is at most 0.1.
percentage_errors = np.where(zero_target, 100, tmp_percentage_errors)
percentage_errors = np.where(zero_target & small_prediction, 0, percentage_errors)


print(f"Mean Percentage Error in Total: {np.mean(percentage_errors)}%")
print(f"Mean Percentage Error for each Crime Type: \n{np.mean(percentage_errors, axis=0)}%")

Mean Percentage Error in Total: 31.457881927490234%
Mean Percentage Error for each Crime Type: 
[1.7182867e+01 6.0543976e+01 4.4721729e+01 6.2010773e+01 6.7549711e-01
 6.7789017e+01 5.4064777e+01 1.5468224e+01 6.2109032e+01 6.6694435e+01
 3.1963959e-03 2.9148979e+01 2.0922464e+01 4.7853474e-02 3.8934994e+01
 2.5524652e+00 1.0267479e+01 3.0388924e+01 6.0184338e+01 4.6431259e+01
 2.5571320e-02 8.5346766e-02 6.3922270e-03 5.0346261e-01 7.5144554e+01
 4.8182636e-02 5.9402275e+01 1.7317640e+01 1.2125890e-01 5.2558949e+01
 6.3895397e-03 6.1062572e+01 7.6728203e+01 3.0475271e+00 4.2326382e+01
 5.3956314e+01]%


In [None]:
# Evaluate on absolute error metrics
absolute_errors = np.abs(np.array(y_test - y_pred))
# Column-wise mean: one value per output column (crime type).
mean_absolute_error = absolute_errors.mean(axis=0)

print(f"Mean Absolute Error in Total: {absolute_errors.mean()}")
print(f"Mean Absolute Error for each Crime Type: \n{mean_absolute_error}")

Mean Absolute Error in Total: 0.49269866943359375
Mean Absolute Error for each Crime Type: 
[1.1171017e-01 1.2584739e+00 2.2140641e+00 1.1519334e+00 1.5595963e-02
 2.2493929e-01 1.7131293e+00 8.9520767e-02 5.9117591e-01 1.0085377e+00
 2.1684723e-04 1.3489152e-01 1.2921342e-01 5.1166443e-03 1.6011500e-01
 4.5134712e-02 6.7371242e-02 1.3725871e-01 1.1267688e+00 1.3349626e+00
 8.9255511e-04 1.9263355e-02 1.7786860e-03 1.0619721e-02 3.9502022e-01
 6.3158283e-03 1.2241113e+00 2.2717051e-01 3.8253551e-03 2.7235913e-01
 6.1950512e-04 9.8242754e-01 2.7928570e-01 4.6038892e-02 2.3330932e+00
 4.1420642e-01]
