In [None]:
# Mount Google Drive into the Colab runtime; the cells below read the input
# dataset from, and write the generated feature files to, /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pickle
import numpy as np
import pandas as pd

# Location of the pickled working dataset on the mounted Drive.
dataset_path = '/content/drive/My Drive/CS 547/DeepDiveProject/working_dataset.pickle'

# Deserialize the dataset.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles that come from a trusted source.
with open(dataset_path, mode='rb') as dataset_file:
    data = pickle.load(dataset_file)

In [None]:
# Preview the first rows to see which columns the loaded dataset provides.
data.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,11037294,JA371270,2015-03-18 12:00:00,0000X W WACKER DR,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,BANK,False,False,...,42.0,32.0,11,,,2015,2017-08-01 15:52:26,,,
1,11646293,JC213749,2018-12-20 15:00:00,023XX N LOCKWOOD AVE,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,APARTMENT,False,False,...,36.0,19.0,11,,,2018,2019-04-06 16:04:43,,,
2,11645836,JC212333,2016-05-01 00:25:00,055XX S ROCKWELL ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,...,15.0,63.0,11,,,2016,2019-04-06 16:04:43,,,
3,11645959,JC211511,2018-12-20 16:00:00,045XX N ALBANY AVE,2820,OTHER OFFENSE,TELEPHONE THREAT,RESIDENCE,False,False,...,33.0,14.0,08A,,,2018,2019-04-06 16:04:43,,,
4,11645601,JC212935,2014-06-01 00:01:00,087XX S SANGAMON ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,RESIDENCE,False,False,...,21.0,71.0,11,,,2014,2019-04-06 16:04:43,,,


In [None]:
# Keep only the columns the feature pipeline uses. The explicit .copy() makes
# `data` an independent frame rather than a selection of the loaded one, so the
# column assignments performed on it later (Date, Hour) are unambiguous writes
# and cannot raise pandas' SettingWithCopyWarning.
data = data[['ID', 'Date', 'Primary Type', 'Arrest']].copy()

In [None]:
# Build the crime-type -> column-index lookup: the distinct 'Primary Type'
# values in alphabetical order, numbered from 0.
# NOTE(review): the displayed mapping contains near-duplicate labels
# ('CRIM SEXUAL ASSAULT' / 'CRIMINAL SEXUAL ASSAULT', 'NON - CRIMINAL' /
# 'NON-CRIMINAL'); each currently gets its own index - confirm that is intended.
crime_types = sorted(data['Primary Type'].unique())
crime_type_to_index = dict(zip(crime_types, range(len(crime_types))))
# Show the mapping so it can be checked by eye.
crime_type_to_index

{'ARSON': 0,
 'ASSAULT': 1,
 'BATTERY': 2,
 'BURGLARY': 3,
 'CONCEALED CARRY LICENSE VIOLATION': 4,
 'CRIM SEXUAL ASSAULT': 5,
 'CRIMINAL DAMAGE': 6,
 'CRIMINAL SEXUAL ASSAULT': 7,
 'CRIMINAL TRESPASS': 8,
 'DECEPTIVE PRACTICE': 9,
 'DOMESTIC VIOLENCE': 10,
 'GAMBLING': 11,
 'HOMICIDE': 12,
 'HUMAN TRAFFICKING': 13,
 'INTERFERENCE WITH PUBLIC OFFICER': 14,
 'INTIMIDATION': 15,
 'KIDNAPPING': 16,
 'LIQUOR LAW VIOLATION': 17,
 'MOTOR VEHICLE THEFT': 18,
 'NARCOTICS': 19,
 'NON - CRIMINAL': 20,
 'NON-CRIMINAL': 21,
 'NON-CRIMINAL (SUBJECT SPECIFIED)': 22,
 'OBSCENITY': 23,
 'OFFENSE INVOLVING CHILDREN': 24,
 'OTHER NARCOTIC VIOLATION': 25,
 'OTHER OFFENSE': 26,
 'PROSTITUTION': 27,
 'PUBLIC INDECENCY': 28,
 'PUBLIC PEACE VIOLATION': 29,
 'RITUALISM': 30,
 'ROBBERY': 31,
 'SEX OFFENSE': 32,
 'STALKING': 33,
 'THEFT': 34,
 'WEAPONS VIOLATION': 35}

In [None]:
# Parse the timestamps once, store them back, and derive the hour bucket of
# every record (its timestamp truncated to the start of the hour).
parsed_dates = pd.to_datetime(data['Date'])
data['Date'] = parsed_dates
data['Hour'] = parsed_dates.dt.floor('h')
data.head()

Unnamed: 0,ID,Date,Primary Type,Arrest,Hour
0,11037294,2015-03-18 12:00:00,DECEPTIVE PRACTICE,False,2015-03-18 12:00:00
1,11646293,2018-12-20 15:00:00,DECEPTIVE PRACTICE,False,2018-12-20 15:00:00
2,11645836,2016-05-01 00:25:00,DECEPTIVE PRACTICE,False,2016-05-01 00:00:00
3,11645959,2018-12-20 16:00:00,OTHER OFFENSE,False,2018-12-20 16:00:00
4,11645601,2014-06-01 00:01:00,DECEPTIVE PRACTICE,False,2014-06-01 00:00:00


In [None]:
# Build the full hourly timeline covered by the data: from the hour containing
# the earliest record up to the first hour boundary at or after the latest one.
date_column = data['Date']
start_time = date_column.min().floor('h')
end_time = date_column.max().ceil('h')
all_hours = pd.date_range(start_time, end_time, freq='h')

# Number of hourly slots on the timeline (both endpoints included).
total_number_of_hours = len(all_hours)
print(total_number_of_hours)

208609


In [None]:
# Lookup from each hourly timestamp to its position on the timeline, used later
# to place aggregated values into the right row.
hour_idx_dict = dict(zip(all_hours, range(len(all_hours))))
print(len(hour_idx_dict))

208609


In [None]:
# Count crimes per (hour, crime type).
# The dense matrix is pre-filled with zeros - one row per hour on the timeline,
# one column per crime type - so combinations that never occur stay at 0.
num_crime_by_hour_and_type = [[0 for _ in crime_types] for _ in range(total_number_of_hours)]
hourly_data_crime_type = (
    data.groupby(['Hour', 'Primary Type'])
    .size()
    .reset_index(name='Count')
)
hourly_data_crime_type.head()

Unnamed: 0,Hour,Primary Type,Count
0,2001-01-01,ASSAULT,4
1,2001-01-01,BATTERY,18
2,2001-01-01,BURGLARY,2
3,2001-01-01,CRIM SEXUAL ASSAULT,21
4,2001-01-01,CRIMINAL DAMAGE,48


In [None]:
# Write every aggregated (hour, crime type) count into the dense matrix.
# The three columns are walked in lockstep instead of doing `frame['col'][i]`
# label lookups for every row, which is needlessly slow on large frames.
for hour, crime_type, count in zip(
    hourly_data_crime_type['Hour'],
    hourly_data_crime_type['Primary Type'],
    # .to_numpy() keeps the NumPy integer scalars that row-wise indexing returned.
    hourly_data_crime_type['Count'].to_numpy(),
):
    num_crime_by_hour_and_type[hour_idx_dict[hour]][crime_type_to_index[crime_type]] = count

In [None]:
# Count arrests per (hour, crime type).
# The rate matrix starts as all zeros (one row per hour, one column per crime
# type); the counts computed here are converted to rates in the following cell.
percentage_arrest_by_hour_and_type = [[0 for _ in crime_types] for _ in range(total_number_of_hours)]
arrested_only = data[data['Arrest'] == True]
hourly_data_num_arrest = (
    arrested_only.groupby(['Hour', 'Primary Type'])
    .size()
    .reset_index(name='Count')
)
hourly_data_num_arrest.head()

Unnamed: 0,Hour,Primary Type,Count
0,2001-01-01,BATTERY,1
1,2001-01-01,CRIM SEXUAL ASSAULT,5
2,2001-01-01,CRIMINAL DAMAGE,1
3,2001-01-01,CRIMINAL TRESPASS,2
4,2001-01-01,DECEPTIVE PRACTICE,8


In [None]:
# Turn the arrest counts into arrest rates (arrests / crimes, a fraction rather
# than a 0-100 percentage) and store them in the dense matrix.
# Each row's matrix position is resolved once, and the columns are walked in
# lockstep rather than via slow per-row `frame['col'][i]` lookups.
for hour, crime_type, num_arrests in zip(
    hourly_data_num_arrest['Hour'],
    hourly_data_num_arrest['Primary Type'],
    # .to_numpy() keeps the NumPy integer scalars that row-wise indexing returned.
    hourly_data_num_arrest['Count'].to_numpy(),
):
    hour_idx = hour_idx_dict[hour]
    type_idx = crime_type_to_index[crime_type]
    # Every arrest row comes from at least one crime in the same cell, so the
    # denominator is expected to be non-zero here.
    percentage_arrest_by_hour_and_type[hour_idx][type_idx] = (
        num_arrests / num_crime_by_hour_and_type[hour_idx][type_idx]
    )


In [None]:
# Number of consecutive hourly steps in each input window built below.
seq_len = 24

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode the time-related features

In [None]:
# One-hot encode the hour of day of every timestamp on the timeline, then cut
# the encoded rows into overlapping windows of `seq_len` consecutive hours
# (window k starts at hour k).
encoder = OneHotEncoder(sparse_output=False)
hour_of_day = [timestamp.hour for timestamp in all_hours]
hour_of_day_column = np.array(hour_of_day).reshape(-1, 1)
hour_of_day_onehot = list(encoder.fit_transform(hour_of_day_column))
feature_hour_of_day = [
    hour_of_day_onehot[start:start + seq_len]
    for start in range(len(hour_of_day_onehot) - seq_len)
]

In [None]:
# Persist the windowed hour-of-day features to Drive.
dataset_path = '/content/drive/My Drive/CS 547/DeepDiveProject/feature_hour_of_day.pickle'

with open(dataset_path, mode='wb') as output_file:
    pickle.dump(feature_hour_of_day, output_file)

In [None]:
# One-hot encode the weekday of every timestamp on the timeline, then cut the
# encoded rows into overlapping windows of `seq_len` consecutive hours.
encoder = OneHotEncoder(sparse_output=False)
day_of_week = [timestamp.weekday() for timestamp in all_hours]
day_of_week_column = np.array(day_of_week).reshape(-1, 1)
day_of_week_onehot = list(encoder.fit_transform(day_of_week_column))
feature_day_of_week = [
    day_of_week_onehot[start:start + seq_len]
    for start in range(len(day_of_week_onehot) - seq_len)
]

In [None]:
# Persist the windowed day-of-week features to Drive.
dataset_path = '/content/drive/My Drive/CS 547/DeepDiveProject/feature_day_of_week.pickle'

with open(dataset_path, mode='wb') as output_file:
    pickle.dump(feature_day_of_week, output_file)

In [None]:
# One-hot encode the day of month of every timestamp on the timeline, then cut
# the encoded rows into overlapping windows of `seq_len` consecutive hours.
encoder = OneHotEncoder(sparse_output=False)
day_of_month = [timestamp.day for timestamp in all_hours]
day_of_month_column = np.array(day_of_month).reshape(-1, 1)
day_of_month_onehot = list(encoder.fit_transform(day_of_month_column))
feature_day_of_month = [
    day_of_month_onehot[start:start + seq_len]
    for start in range(len(day_of_month_onehot) - seq_len)
]

In [None]:
# Persist the windowed day-of-month features to Drive.
dataset_path = '/content/drive/My Drive/CS 547/DeepDiveProject/feature_day_of_month.pickle'

with open(dataset_path, mode='wb') as output_file:
    pickle.dump(feature_day_of_month, output_file)

In [None]:
# One-hot encode the week number of every timestamp on the timeline, then cut
# the encoded rows into overlapping windows of `seq_len` consecutive hours.
encoder = OneHotEncoder(sparse_output=False)
week_of_year = [timestamp.week for timestamp in all_hours]
week_of_year_column = np.array(week_of_year).reshape(-1, 1)
week_of_year_onehot = list(encoder.fit_transform(week_of_year_column))
feature_week_of_year = [
    week_of_year_onehot[start:start + seq_len]
    for start in range(len(week_of_year_onehot) - seq_len)
]

In [None]:
# Persist the windowed week-of-year features to Drive.
dataset_path = '/content/drive/My Drive/CS 547/DeepDiveProject/feature_week_of_year.pickle'

with open(dataset_path, mode='wb') as output_file:
    pickle.dump(feature_week_of_year, output_file)

In [None]:
# One-hot encode the month of every timestamp on the timeline, then cut the
# encoded rows into overlapping windows of `seq_len` consecutive hours.
encoder = OneHotEncoder(sparse_output=False)
month_of_year = [timestamp.month for timestamp in all_hours]
month_of_year_column = np.array(month_of_year).reshape(-1, 1)
month_of_year_onehot = list(encoder.fit_transform(month_of_year_column))
feature_month_of_year = [
    month_of_year_onehot[start:start + seq_len]
    for start in range(len(month_of_year_onehot) - seq_len)
]

In [None]:
# Persist the windowed month-of-year features to Drive.
dataset_path = '/content/drive/My Drive/CS 547/DeepDiveProject/feature_month_of_year.pickle'

with open(dataset_path, mode='wb') as output_file:
    pickle.dump(feature_month_of_year, output_file)

In [None]:
# One-hot encode the calendar year of every timestamp on the timeline, then cut
# the encoded rows into overlapping windows of `seq_len` consecutive hours.
encoder = OneHotEncoder(sparse_output=False)
year = [timestamp.year for timestamp in all_hours]
year_column = np.array(year).reshape(-1, 1)
year_onehot = list(encoder.fit_transform(year_column))
feature_year = [
    year_onehot[start:start + seq_len]
    for start in range(len(year_onehot) - seq_len)
]

In [None]:
# Persist the windowed year features to Drive.
dataset_path = '/content/drive/My Drive/CS 547/DeepDiveProject/feature_year.pickle'

with open(dataset_path, mode='wb') as output_file:
    pickle.dump(feature_year, output_file)

In [None]:
# Window length for the crime/arrest sequences and labels built below.
# Re-declares the same value (24) already assigned to seq_len earlier in the notebook.
seq_len = 24

In [None]:
# Build the RNN inputs: one window of `seq_len` consecutive hours per sample.
# Window i covers hours [i, i + seq_len); the label paired with it is the crime
# count of hour i + seq_len, so no feature may reach into that hour.
# The arrest-rate window is therefore aligned with the crime-count window
# instead of being shifted one hour forward: a shifted window would end on the
# target hour, and a non-zero arrest rate there already reveals that a crime of
# that type happened in the hour being predicted (label leakage).
# NOTE(review): this changes the contents of feature_percentage_arrest.pickle;
# retrain any model that was fitted on the shifted windows.
num_windows = len(num_crime_by_hour_and_type) - seq_len
feature_num_crime = [num_crime_by_hour_and_type[i:i+seq_len] for i in range(num_windows)]
feature_percentage_arrest = [percentage_arrest_by_hour_and_type[i:i+seq_len] for i in range(num_windows)]

In [None]:
# Saving feature datasets: each (destination path, windowed feature list) pair
# is pickled to Drive in turn.
for dataset_path, feature_windows in (
    ('/content/drive/My Drive/CS 547/DeepDiveProject/feature_num_crime.pickle', feature_num_crime),
    ('/content/drive/My Drive/CS 547/DeepDiveProject/feature_percentage_arrest.pickle', feature_percentage_arrest),
):
    with open(dataset_path, mode='wb') as output_file:
        pickle.dump(feature_windows, output_file)

In [None]:
# Create labels: the target for window i is the per-type crime count of the
# hour right after it (hour i + seq_len), i.e. every row of the count matrix
# from position seq_len onwards.
labels = num_crime_by_hour_and_type[seq_len:]

In [None]:
# Persist the labels to Drive.
dataset_path = '/content/drive/My Drive/CS 547/DeepDiveProject/labels.pickle'

with open(dataset_path, mode='wb') as output_file:
    pickle.dump(labels, output_file)