In [None]:
# Attach Google Drive to this Colab runtime; the files read later in the
# notebook are addressed through the /content/drive mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pickle
import numpy as np
import pandas as pd

# Location of the pickled working dataset on the mounted Drive.
dataset_path = '/content/drive/My Drive/CS 547/DeepDiveProject/working_dataset.pickle'

# Deserialize the dataset.
# NOTE: unpickling can execute arbitrary code, so only point this at a file
# you created yourself or otherwise trust.
with open(dataset_path, mode='rb') as pickle_handle:
    data = pickle.load(pickle_handle)

# Preview the first rows as the cell output.
data.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,11037294,JA371270,2015-03-18 12:00:00,0000X W WACKER DR,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,BANK,False,False,...,42.0,32.0,11,,,2015,2017-08-01 15:52:26,,,
1,11646293,JC213749,2018-12-20 15:00:00,023XX N LOCKWOOD AVE,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,APARTMENT,False,False,...,36.0,19.0,11,,,2018,2019-04-06 16:04:43,,,
2,11645836,JC212333,2016-05-01 00:25:00,055XX S ROCKWELL ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,...,15.0,63.0,11,,,2016,2019-04-06 16:04:43,,,
3,11645959,JC211511,2018-12-20 16:00:00,045XX N ALBANY AVE,2820,OTHER OFFENSE,TELEPHONE THREAT,RESIDENCE,False,False,...,33.0,14.0,08A,,,2018,2019-04-06 16:04:43,,,
4,11645601,JC212935,2014-06-01 00:01:00,087XX S SANGAMON ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,RESIDENCE,False,False,...,21.0,71.0,11,,,2014,2019-04-06 16:04:43,,,


In [None]:
import time

# Keep only the columns used below. `.copy()` makes this an independent frame
# so the column assignments that follow operate on real data rather than on a
# slice of the original frame (avoids SettingWithCopyWarning).
data = data[['ID', 'Date', 'Primary Type', 'Arrest']].copy()

# Stable column index for every crime type (sorted order).
crime_types = sorted(data['Primary Type'].unique())
crime_type_to_index = {crime: idx for idx, crime in enumerate(crime_types)}

data['Date'] = pd.to_datetime(data['Date'])
data['Hour'] = data['Date'].dt.floor('h')

# Continuous hourly axis covering the whole dataset; hours without any record
# keep an all-zero row in the matrices built below.
start_time = data['Date'].min().floor('h')
end_time = data['Date'].max().ceil('h')
all_hours = pd.date_range(start=start_time, end=end_time, freq='h')
total_number_of_hours = len(all_hours)
hour_idx_dict = {hour: idx for idx, hour in enumerate(all_hours)}


def _to_hour_type_matrix(grouped_counts):
    """Scatter (Hour, Primary Type, Count) rows into an hours x crime-types array.

    Args:
        grouped_counts: DataFrame with 'Hour', 'Primary Type' and 'Count'
            columns, one row per (hour, crime type) pair.

    Returns:
        int64 array of shape (total_number_of_hours, len(crime_types)); cells
        with no matching row stay 0.
    """
    matrix = np.zeros((total_number_of_hours, len(crime_types)), dtype=np.int64)
    hour_positions = all_hours.get_indexer(grouped_counts['Hour'])
    # get_indexer marks unknown hours with -1, which would silently write to
    # the last row; every floored hour must lie on the hourly axis.
    assert (hour_positions >= 0).all(), "found an hour outside the hourly axis"
    type_positions = grouped_counts['Primary Type'].map(crime_type_to_index).to_numpy(dtype=np.intp)
    matrix[hour_positions, type_positions] = grouped_counts['Count'].to_numpy()
    return matrix


# Number of crimes per (hour, crime type). One vectorised scatter replaces the
# former row-by-row Python loop over ~2M grouped rows.
hourly_data_crime_type = data.groupby(['Hour', 'Primary Type']).size().reset_index(name='Count')
crime_count_matrix = _to_hour_type_matrix(hourly_data_crime_type)

# Number of arrests per (hour, crime type), then the arrest fraction
# arrests / crimes. Cells with no crime keep a fraction of 0.
hourly_data_num_arrest = data[data['Arrest']==True].groupby(['Hour', 'Primary Type']).size().reset_index(name='Count')
arrest_count_matrix = _to_hour_type_matrix(hourly_data_num_arrest)
arrest_fraction_matrix = np.divide(
    arrest_count_matrix,
    crime_count_matrix,
    out=np.zeros(crime_count_matrix.shape, dtype=float),
    where=crime_count_matrix > 0,
)

# Later cells slice these as nested lists (one inner list per hour), so expose
# the results in that form.
num_crime_by_hour_and_type = crime_count_matrix.tolist()
percentage_arrest_by_hour_and_type = arrest_fraction_matrix.tolist()


100000/2180454, time spent: 1.9435040950775146s
200000/2180454, time spent: 1.96675705909729s
300000/2180454, time spent: 1.9392452239990234s
400000/2180454, time spent: 1.9726746082305908s
500000/2180454, time spent: 3.0952436923980713s
600000/2180454, time spent: 2.1999096870422363s
700000/2180454, time spent: 1.9365739822387695s
800000/2180454, time spent: 1.924309253692627s
900000/2180454, time spent: 1.9569284915924072s
1000000/2180454, time spent: 1.918813705444336s
1100000/2180454, time spent: 2.7907285690307617s
1200000/2180454, time spent: 2.5606703758239746s
1300000/2180454, time spent: 1.9332435131072998s
1400000/2180454, time spent: 1.9319427013397217s
1500000/2180454, time spent: 1.934542179107666s
1600000/2180454, time spent: 1.9869754314422607s
1700000/2180454, time spent: 2.4215683937072754s
1800000/2180454, time spent: 2.8685688972473145s
1900000/2180454, time spent: 1.9932587146759033s
2000000/2180454, time spent: 1.9458446502685547s
2100000/2180454, time spent: 1.925

In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn

In [None]:
class custom_dataset(Dataset):
    """Dataset pairing two feature containers with one label container.

    Indexing yields ``(num_crime[idx], pencentage_arrest[idx], labels[idx])``,
    each wrapped with ``torch.Tensor``.
    """

    def __init__(self, num_crime, pencentage_arrest, labels):
        # The three containers are stored as given; nothing is copied or
        # converted until an item is requested.
        self.num_crime, self.pencentage_arrest, self.labels = num_crime, pencentage_arrest, labels

    def __len__(self):
        # The sample count is taken from the labels container.
        sample_count = len(self.labels)
        return sample_count

    def __getitem__(self, idx):
        # Same order as the constructor arguments: counts, arrest rates, labels.
        sources = (self.num_crime, self.pencentage_arrest, self.labels)
        return tuple(torch.Tensor(source[idx]) for source in sources)

In [None]:
seq_len = 24     # hours of history per sample
SPLIT_SEED = 42  # fixed seed so the train/val/test partition is reproducible

num_windows = len(num_crime_by_hour_and_type) - seq_len

# Crime counts for hours [i, i + seq_len).
feature_num_crime = [num_crime_by_hour_and_type[i:i+seq_len] for i in range(num_windows)]
# Arrest rates for hours [i + 1, i + seq_len].
# NOTE(review): this window is shifted by one hour and therefore includes the
# arrest rate of the target hour i + seq_len, which can only be non-zero when
# a crime was recorded in that hour - looks like target leakage; confirm this
# is intended before relying on the arrest features.
feature_percentage_arrest = [percentage_arrest_by_hour_and_type[i+1:i+seq_len+1] for i in range(num_windows)]

# Target: crime counts of the hour immediately after the count window.
labels = [num_crime_by_hour_and_type[i+seq_len] for i in range(num_windows)]

# 70 / 15 / 15 split. Without random_state every run produced a different
# partition, so metrics could not be reproduced.
# NOTE(review): consecutive windows overlap heavily, so a random split places
# near-duplicate sequences in different partitions; a chronological split may
# be more appropriate - confirm.
x_train, x_val_test, y_train, y_val_test = train_test_split(
    list(zip(feature_num_crime, feature_percentage_arrest)), labels,
    test_size=0.3, random_state=SPLIT_SEED)
x_val, x_test, y_val, y_test = train_test_split(
    x_val_test, y_val_test, test_size=0.5, random_state=SPLIT_SEED)

print(np.array(x_test).shape)
print(np.array(y_test).shape)

(31288, 2, 24, 36)
(31288, 36)


In [None]:
# Shape of the test features after selecting entry 0 on the second axis.
np.asarray(x_test)[:, 0].shape

(31288, 24, 36)

In [None]:
def _split_feature_pair(samples):
    """Return the (crime-count, arrest-rate) arrays stacked on axis 1 of `samples`.

    Each sample is a (num_crime_window, arrest_rate_window) pair, so after
    stacking, index 0 on axis 1 holds the crime counts and index 1 the arrest
    rates. The array is built once and sliced twice.
    """
    stacked = np.array(samples)
    return stacked[:, 0, :, :], stacked[:, 1, :, :]


# Previously index 0 was passed for BOTH features, so the arrest-rate slot was
# filled with a second copy of the crime counts.
train_dataset = custom_dataset(*_split_feature_pair(x_train), y_train)
val_dataset = custom_dataset(*_split_feature_pair(x_val), y_val)
test_dataset = custom_dataset(*_split_feature_pair(x_test), y_test)

In [None]:
# Model with the best result
model_path = '/content/drive/My Drive/CS 547/DeepDiveProject/best_model.pth'

# The same model as in deep learning part
class RNN(nn.Module):
    """Recurrent network followed by a linear layer on the last time step.

    Args:
        input_size: number of features per time step.
        hidden_size: width of each recurrent layer.
        output_size: number of values predicted per sequence.
        num_layers, nonlinearity, bias, batch_first, dropout, bidirectional:
            forwarded unchanged to ``torch.nn.RNN``.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, nonlinearity, bias, batch_first, dropout, bidirectional):
        super(RNN, self).__init__()
        # Remembered so forward() can locate the time axis.
        self.batch_first = batch_first
        self.rnn = nn.RNN(input_size, hidden_size, num_layers=num_layers, nonlinearity=nonlinearity, bias=bias, batch_first=batch_first, dropout=dropout, bidirectional=bidirectional)
        # A bidirectional RNN concatenates both directions, doubling the
        # feature width seen by the linear head. For the unidirectional case
        # the layer shape (and thus any saved state_dict) is unchanged.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_size * num_directions, output_size)

    def forward(self, x):
        """Map a batch of sequences to ``(batch, output_size)`` predictions.

        ``x`` is ``(batch, seq, feature)`` when ``batch_first`` is true and
        ``(seq, batch, feature)`` otherwise.
        """
        x, _ = self.rnn(x)
        # Select the last time step along whichever axis is the time axis.
        last_step = x[:, -1, :] if self.batch_first else x[-1]
        return self.fc(last_step)

In [None]:
# Load model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Architecture hyperparameters: these must match the values used when the
# checkpoint was trained, otherwise load_state_dict fails on shape mismatch.
input_size = 72
output_size = 36
batch_size = 64
learning_rate = 0.001
hidden_size = 64
num_layers = 2
nonlinearity = 'relu'
bias = True
batch_first = True
dropout = 0
bidirectional = False

rnn = RNN(input_size=input_size, hidden_size=hidden_size, output_size=output_size,
          num_layers=num_layers, nonlinearity=nonlinearity, bias=bias,
          batch_first=batch_first, dropout=dropout, bidirectional=bidirectional).to(device)
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict needs, and silences the torch.load FutureWarning.
state_dict = torch.load(model_path, map_location=device, weights_only=True)
rnn.load_state_dict(state_dict)
rnn.eval()
print("model loaded")

model loaded


  rnn.load_state_dict(torch.load(model_path))


In [None]:
def _mean_batch_loss(model, loader, criterion, device):
    """Return the mean of the per-batch losses of `model` over `loader` (no gradients)."""
    model.eval()
    total = 0
    with torch.no_grad():
        for batch_x1, batch_x2, batch_y in loader:
            inputs = torch.cat((batch_x1, batch_x2), dim=-1).to(device)
            total += criterion(model(inputs), batch_y.to(device)).item()
    return total / len(loader)


def model_evaluation(batch_size, optimizer, hidden_size, num_layers, nonlinearity, bias, dropout, bidirectional, num_epoch, learning_rate):
    """Train an RNN, keep the weights with the lowest validation loss, and test them.

    Args:
        batch_size: batch size for the train/val/test loaders.
        optimizer: 'Adam' or 'SGD'.
        hidden_size, num_layers, nonlinearity, bias, dropout, bidirectional:
            forwarded to the RNN constructor.
        num_epoch: number of training epochs.
        learning_rate: optimizer learning rate.

    Returns:
        ``(test_loss, rmse)`` where ``test_loss`` is the mean per-batch MSE on
        the test loader and ``rmse`` is its square root as a 0-dim tensor.
        Returns ``None`` (after printing a message) for an unknown optimizer.

    Side effects:
        Prints per-epoch losses and overwrites the checkpoint file whenever
        the validation loss improves.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    model = RNN(input_size=72, hidden_size=hidden_size, output_size=36, num_layers=num_layers, nonlinearity=nonlinearity, bias=bias, batch_first=True, dropout=dropout, bidirectional=bidirectional).to(device)
    criterion = nn.MSELoss()
    if optimizer == 'Adam':
        optim = torch.optim.Adam(model.parameters(), lr=learning_rate)
    elif optimizer == 'SGD':
        optim = torch.optim.SGD(model.parameters(), lr=learning_rate)
    else:
        print('Invalid optimizer')
        return

    best_val_loss = float('inf')
    best_model = None

    for epoch in range(num_epoch):
        model.train()
        train_loss = 0
        for batch_x1, batch_x2, batch_y in train_loader:
            # The two feature sets are concatenated on the feature axis.
            inputs = torch.cat((batch_x1, batch_x2), dim=-1).to(device)
            predictions = model(inputs)
            loss = criterion(predictions, batch_y.to(device))
            optim.zero_grad()
            loss.backward()
            optim.step()
            train_loss += loss.item()

        train_loss /= len(train_loader)
        val_loss = _mean_batch_loss(model, val_loader, criterion, device)
        print(f"Epoch {epoch + 1}/{num_epoch}, Train Loss: {train_loss}, Validation Loss: {val_loss}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() returns references to the live parameters, which
            # later optimizer steps keep mutating. Clone them so the snapshot
            # really holds the best epoch's weights.
            best_model = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
            torch.save(best_model, '/content/drive/My Drive/CS 547/DeepDiveProject/best_model.pth')

    # Restore the best snapshot (absent when no epoch ran or improved).
    if best_model is not None:
        model.load_state_dict(best_model)

    test_loss = _mean_batch_loss(model, test_loader, criterion, device)
    rmse = torch.sqrt(torch.tensor(test_loss))
    print(f"Test Loss: {test_loss}, Test RMSE: {rmse}")
    return test_loss, rmse

In [None]:
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader

# Batches of `test_dataset` (not the validation split), 64 samples each and
# read in order; `test_dataset` must already be defined in this notebook.
value_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

def permutation_importance(model, X_val, y_val, groups=None):
    """Grouped permutation importance of `model` on one batch.

    For every feature group, the group's values are shuffled across the
    samples of the batch (all time steps of a sample move together) and the
    resulting increase in mean squared error over the unshuffled batch is
    reported.

    Args:
        model: module mapping ``X_val`` to predictions shaped like ``y_val``.
        X_val: input tensor indexed as ``[sample, time step, feature]``.
        y_val: target tensor.
        groups: optional list of ``(feature_index, name)`` pairs, where
            ``feature_index`` indexes the last axis of ``X_val``. Defaults to
            the two halves of a 72-feature input: ``[0, 36)`` and ``[36, 72)``.

    Returns:
        NumPy array with one entry per group: permuted MSE minus baseline MSE.
        Positive values mean shuffling the group hurt the predictions.

    Raises:
        ValueError: if the model output shape differs from ``y_val``.
    """
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    X_val, y_val = X_val.to(device), y_val.to(device)

    if groups is None:
        # Define the two groups: first 36 elements and latter 36 elements
        groups = [(slice(0, 36), "Group 1"), (slice(36, 72), "Group 2")]

    def _mse(inputs):
        preds = model(inputs)
        # Guard against silent broadcasting between mismatched shapes.
        if preds.shape != y_val.shape:
            raise ValueError(f"prediction shape {tuple(preds.shape)} does not match target shape {tuple(y_val.shape)}")
        return torch.mean((preds - y_val) ** 2).item()

    feature_importance = []

    # Inference only: no autograd graph is needed for any of the passes.
    with torch.no_grad():
        # Baseline performance
        baseline_loss = _mse(X_val)

        for group, _group_name in groups:
            # Shuffle along the batch dimension, for this group only.
            permuted_indices = torch.randperm(X_val.shape[0], device=device)
            X_val_permuted = X_val.clone()
            X_val_permuted[:, :, group] = X_val[permuted_indices][:, :, group]

            # Importance is the increase in loss. The MSE is already a
            # per-element mean, so it is not divided by the batch size again;
            # that made the score depend on batch size and overweighted a
            # smaller final batch.
            feature_importance.append(_mse(X_val_permuted) - baseline_loss)

    return np.array(feature_importance)

# Permutation importance of the loaded model, averaged over every batch that
# `value_loader` yields.
torch.manual_seed(0)  # fixed seed so the random permutations are reproducible

feature_importance = np.zeros(2)
num_batches = 0
for X1_val, X2_val, y_val in value_loader:
    # Rebuild the model input by concatenating both feature sets on the last axis.
    X_val = torch.cat((X1_val, X2_val), dim=-1)
    feature_importance += permutation_importance(rnn, X_val, y_val)
    num_batches += 1

# Report the per-batch mean instead of a raw sum, so the magnitude does not
# grow with the number of batches.
if num_batches:
    feature_importance /= num_batches

# Print feature importance
for idx, importance in enumerate(feature_importance):
    print(f"Feature {idx}: Importance = {importance}")

Feature 0: Importance = -9.06616249786956
Feature 1: Importance = 2.6200458322252547


Conclusion:
Feature 0 is the array of counts of each primary crime type within a given hour, while Feature 1 is the arrest percentage for each primary crime type over the same time frame. Using permutation importance, we observed that Feature 1 is more important than Feature 0: randomly permuting Feature 1 increases the loss (positive importance score), whereas permuting Feature 0 does not (negative importance score), which indicates that the model relies on Feature 1 to keep the loss low. This finding aligns with our experimental results, where including Feature 1 in the dataset leads to a noticeable decrease in the overall loss, further validating its relevance and impact on the model's performance.