In [None]:
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [None]:
# Loading data

on_colab = False

if on_colab:
    from google.colab import drive
    drive.mount('/content/drive')
    path = '/content/drive/My Drive/CS 547/DeepDiveProject/'
else:
    path = ''


def load_pickled_tensor(name):
    """Load ``<path><name>.pickle`` and return its contents as a float tensor.

    The unpickled object is converted through ``np.array`` and the
    ``torch.Tensor`` constructor, exactly as each hand-written stanza did
    before, and the resulting shape is printed as a quick sanity check.

    Args:
        name: File name without the ``.pickle`` extension, resolved against
            the module-level ``path`` prefix.

    Returns:
        The loaded data as a ``torch.Tensor``.
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserialising; only point this at files from a trusted source.
    with open(path + name + '.pickle', 'rb') as f:
        tensor = torch.Tensor(np.array(pickle.load(f)))
    print(tensor.shape)
    return tensor


# Loaded in the same order as before so the printed shapes line up.
feature_hour_of_day = load_pickled_tensor('feature_hour_of_day')
feature_day_of_week = load_pickled_tensor('feature_day_of_week')
feature_day_of_month = load_pickled_tensor('feature_day_of_month')
feature_week_of_year = load_pickled_tensor('feature_week_of_year')
feature_month_of_year = load_pickled_tensor('feature_month_of_year')
feature_year = load_pickled_tensor('feature_year')
feature_num_crime = load_pickled_tensor('feature_num_crime')
feature_percentage_arrest = load_pickled_tensor('feature_percentage_arrest')
labels = load_pickled_tensor('labels')

torch.Size([208585, 24, 24])
torch.Size([208585, 24, 7])
torch.Size([208585, 24, 31])
torch.Size([208585, 24, 53])
torch.Size([208585, 24, 12])
torch.Size([208585, 24, 24])
torch.Size([208585, 24, 36])
torch.Size([208585, 24, 36])
torch.Size([208585, 36])


In [None]:
# feature_num_crime 36
# feature_percentage_arrest 36
# feature_hour_of_day 24
# feature_day_of_week 7
# feature_day_of_month 31
# feature_week_of_year 53
# feature_month_of_year 12
# feature_year 24

In [None]:
# Combine the features
features = torch.cat((feature_num_crime, feature_percentage_arrest, feature_hour_of_day,
                      feature_day_of_week, feature_day_of_month, feature_week_of_year,
                      feature_month_of_year, feature_year), dim=-1)
features.shape

torch.Size([208585, 24, 223])

In [None]:
# Split the train, val, test set
from sklearn.model_selection import train_test_split

# A fixed seed makes the 70/15/15 split identical on every Restart & Run All.
# NOTE(review): the restored weights were trained elsewhere; this seed should
# match the one used for that training split, otherwise the "test" samples
# here may include data the model was fitted on. Confirm against the
# training notebook.
SPLIT_SEED = 42

x_train, x_val_test, y_train, y_val_test = train_test_split(
    features, labels, test_size=0.3, random_state=SPLIT_SEED
)
x_val, x_test, y_val, y_test = train_test_split(
    x_val_test, y_val_test, test_size=0.5, random_state=SPLIT_SEED
)

# Drop the references to the intermediate 30% hold-out so it can be freed.
x_val_test = None
y_val_test = None

In [None]:
# Creating custom dataset for use in dataloaders
class custom_dataset(Dataset):
    """Pair a feature container with a label container for use in a DataLoader.

    Args:
        features: Indexable collection of samples (tensor, array or list).
        labels: Indexable collection of targets with the same length as
            ``features``.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length.
    """

    def __init__(self, features, labels):
        if len(features) != len(labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(features)} and {len(labels)}"
            )
        self.features = features
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at ``idx`` as float32 tensors."""
        # torch.as_tensor converts values; the legacy torch.Tensor(...)
        # constructor would interpret a Python int as a *size* and return an
        # uninitialised tensor. Float32 tensors pass through without a copy.
        sample = torch.as_tensor(self.features[idx], dtype=torch.float32)
        target = torch.as_tensor(self.labels[idx], dtype=torch.float32)
        return sample, target

# Wrap each split in a dataset object so it can be handed to a DataLoader.
train_dataset, val_dataset, test_dataset = (
    custom_dataset(split_x, split_y)
    for split_x, split_y in (
        (x_train, y_train),
        (x_val, y_val),
        (x_test, y_test),
    )
)

In [None]:
# Defining the model
class RNN(nn.Module):
    """Elman RNN followed by a linear head applied to the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden state width of each RNN layer and direction.
        output_size: Number of values produced by the linear head.
        num_layers: Number of stacked RNN layers.
        nonlinearity: ``'tanh'`` or ``'relu'``, forwarded to ``nn.RNN``.
        bias: Whether the RNN layers use bias terms.
        batch_first: If True the input is ``(batch, seq, feature)``,
            otherwise ``(seq, batch, feature)``.
        dropout: Dropout probability between stacked RNN layers.
        bidirectional: Whether the RNN runs in both directions.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, nonlinearity, bias, batch_first, dropout, bidirectional):
        super().__init__()
        # Remembered so forward() can find the time axis of the RNN output.
        self.batch_first = batch_first
        self.rnn = nn.RNN(
            input_size,
            hidden_size,
            num_layers=num_layers,
            nonlinearity=nonlinearity,
            bias=bias,
            batch_first=batch_first,
            dropout=dropout,
            bidirectional=bidirectional,
        )
        # A bidirectional RNN concatenates both directions, so its output
        # is twice as wide as hidden_size.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_size * num_directions, output_size)

    def forward(self, x):
        """Return the head's prediction for the final time step of ``x``.

        The result has shape ``(batch, output_size)``.
        """
        out, _ = self.rnn(x)
        # The time axis is dim 1 when batch_first, dim 0 otherwise.
        last_step = out[:, -1, :] if self.batch_first else out[-1]
        return self.fc(last_step)

In [None]:
# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Rebuild the trained architecture, restore its weights and predict on the
# whole test split in a single forward pass.
# NOTE(review): `best_model` is not defined in any visible cell of this
# notebook. It presumably holds a state dict from a training run, so confirm
# it is loaded before this cell or Restart & Run All will fail here.
rnn_config = dict(
    input_size=223,
    hidden_size=64,
    output_size=36,
    num_layers=2,
    nonlinearity='relu',
    bias=True,
    batch_first=True,
    dropout=0,
    bidirectional=False,
)
model = RNN(**rnn_config)
model = model.to(device)
model.load_state_dict(best_model)
model.eval()

# Gradients are not needed for inference.
with torch.no_grad():
    y_pred = model(x_test.to(device)).cpu()

In [None]:
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader

test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Column ranges of each feature group inside the concatenated feature axis,
# in the order the groups were concatenated (36+36+24+7+31+53+12+24 = 223).
FEATURE_GROUPS = [
    (slice(0, 36), "feature_num_crime"),
    (slice(36, 72), "feature_percentage_arrest"),
    (slice(72, 96), "feature_hour_of_day"),
    (slice(96, 103), "feature_day_of_week"),
    (slice(103, 134), "feature_day_of_month"),
    (slice(134, 187), "feature_week_of_year"),
    (slice(187, 199), "feature_month_of_year"),
    (slice(199, 223), "feature_year"),
]


def permutation_importance(model, x_val, y_val, groups=None, generator=None):
    """Measure how much the MSE rises when each feature group is shuffled.

    For every group, the group's columns are permuted across the samples of
    the batch (all time steps move together) and the model is re-evaluated.

    Args:
        model: Trained module mapping ``(batch, seq, feature)`` inputs to
            ``(batch, output)`` predictions.
        x_val: Input batch of shape ``(batch, seq, feature)``.
        y_val: Targets for the batch.
        groups: Optional list of ``(slice, name)`` pairs selecting columns on
            the last axis; defaults to ``FEATURE_GROUPS``.
        generator: Optional CPU ``torch.Generator`` that makes the
            permutations reproducible.

    Returns:
        ``np.ndarray`` with one entry per group: permuted MSE minus baseline
        MSE for this batch. The MSE is already a per-element mean, so the
        value is not divided by the batch size again.
    """
    if groups is None:
        groups = FEATURE_GROUPS

    model.eval()
    # Follow the model's device instead of re-deriving it from CUDA
    # availability, which breaks if the model was left on the CPU.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = x_val.device
    x_val, y_val = x_val.to(device), y_val.to(device)
    y_true = y_val.cpu().numpy()

    feature_importance = []
    # No gradients are needed; this avoids building a graph per forward pass.
    with torch.no_grad():
        # Baseline performance
        baseline_preds = model(x_val).cpu().numpy()
        baseline_loss = mean_squared_error(y_true, baseline_preds)

        for group, _group_name in groups:
            # Shuffle the whole group of columns along the batch dimension.
            permuted_indices = torch.randperm(x_val.shape[0], generator=generator).to(device)
            x_val_permuted = x_val.clone()
            x_val_permuted[:, :, group] = x_val[permuted_indices, :, group]

            # Compute loss after permutation
            permuted_preds = model(x_val_permuted).cpu().numpy()
            permuted_loss = mean_squared_error(y_true, permuted_preds)

            # Importance is the increase in loss
            feature_importance.append(permuted_loss - baseline_loss)

    return np.array(feature_importance)


# Aggregate over the test set as a sample-weighted mean of the per-batch loss
# increases, so a short final batch is not over-weighted.
perm_generator = torch.Generator().manual_seed(0)
group_names = [name for _, name in FEATURE_GROUPS]
feature_importance = np.zeros(len(FEATURE_GROUPS))
total_samples = 0
# Loop names deliberately differ from x_val / y_val so the validation split
# is not overwritten by the last test batch.
for x_batch, y_batch in test_loader:
    batch_size = x_batch.shape[0]
    feature_importance += batch_size * permutation_importance(
        model, x_batch, y_batch, generator=perm_generator
    )
    total_samples += batch_size
feature_importance /= max(total_samples, 1)

# Print feature importance
for group_name, importance in zip(group_names, feature_importance):
    print(f"{group_name} importance = {importance}")

feature_num_crime importance = 4.2899253107607365
feature_percentage_arrest importance = 1.2773448530850666
feature_hour_of_day importance = 10.618078257223326
feature_day_of_week importance = 0.8638675891395126
feature_day_of_month importance = 1.2357479110360146
feature_week_of_year importance = 0.39645050479365246
feature_month_of_year importance = 0.5167712543958
feature_year importance = 3.696456347459129
