In [74]:
from scipy.io import arff
import pandas as pd
import numpy as np

In [75]:
# Read the ARFF dataset; loadarff hands back the records plus attribute metadata
data, meta = arff.loadarff('Seattle_Crime_Data_06-23-2019-4.arff')

# Wrap the loaded records in a DataFrame
df = pd.DataFrame(data)

# Object-typed columns come back as byte strings from the ARFF reader,
# so turn each of them into regular text
for col in df.select_dtypes([object]).columns:
    df[col] = df[col].str.decode('utf-8')

# Separate working copy for preprocessing; df itself stays as loaded
new_df = df.copy()

In [76]:
# Plain-text preview of the first five rows of the loaded frame
print(df.head(n=5))

   Report_Number  Occurred_Time  Reported_Time     Crime_Subcategory  \
0   1.975000e+12          900.0         1500.0  BURGLARY-RESIDENTIAL   
1   1.976000e+12            1.0         2359.0     SEX OFFENSE-OTHER   
2   1.979000e+12         1600.0         1430.0             CAR PROWL   
3   1.981000e+13         2029.0         2030.0              HOMICIDE   
4   1.981000e+12         2000.0          435.0  BURGLARY-RESIDENTIAL   

    Primary_Offense_Description   Precinct Sector Beat  \
0            BURGLARY-FORCE-RES      SOUTH      R   R3   
1     SEXOFF-INDECENT LIBERTIES    UNKNOWN      ?    ?   
2                THEFT-CARPROWL       EAST      G   G2   
3  HOMICIDE-PREMEDITATED-WEAPON      SOUTH      S   S2   
4            BURGLARY-FORCE-RES  SOUTHWEST      W   W3   

                     Neighborhood  
0            LAKEWOOD/SEWARD PARK  
1                         UNKNOWN  
2        CENTRAL AREA/SQUIRE PARK  
3                 BRIGHTON/DUNLAP  
4  ROXHILL/WESTWOOD/ARBOR HEIGHTS  


# Pre-preprocessing checks

In [77]:
# Range check on the HHMM-style occurrence time: both counts are expected to be 0
print(df['Occurred_Time'].gt(2359).sum())  # values past 23:59
print(df['Occurred_Time'].lt(0).sum())     # negative values

# Rows whose reported time-of-day is earlier than the occurred time-of-day.
# NOTE(review): neither column carries a date in the frame shown above, so a
# crime reported on a later day would also be counted here -- confirm before
# treating these rows as invalid.
reversed_times = df['Reported_Time'].lt(df['Occurred_Time'])
print(f"Reversed times: {reversed_times.sum()}")

# Frequency of each offense description (class balance overview)
print(df.loc[:, 'Primary_Offense_Description'].value_counts())

0
0
Reversed times: 140045
Primary_Offense_Description
THEFT-CARPROWL                   131297
THEFT-SHOPLIFT                    48638
THEFT-OTH                         47275
VEH-THEFT-AUTO                    37840
BURGLARY-FORCE-RES                27984
                                  ...  
NARC-SMUGGLE-HEROIN                   1
HOMICIDE-NEG-MANS-GUN                 1
NARC-SELL-BARBITUATE                  1
NARC-MANUFACTURE-HALLUCINOGEN         1
HOMICIDE-NEG-MANS-WEAPON              1
Name: count, Length: 144, dtype: int64


# Preprocessing

1. Drop Report_Number, as it’s a primary key (used to link multiple offenses to one report)

In [78]:
# NOTE(review): this drop is commented out, so Report_Number is still present in new_df.
# new_df = new_df.drop(columns=['Report_Number'])

2. drop Crime_Subcategory to prevent leakage

In [79]:
# NOTE(review): this drop is commented out, so Crime_Subcategory is still present in new_df.
# new_df = new_df.drop(columns=["Crime_Subcategory"])

3. Split Occurred_Time and Reported_Time into hour and minute, and sin/cos transform them to keep the cyclical relationship that 00:00 comes right after 23:59

In [81]:
def add_cyclical_time_features(frame, source_col, prefix):
    """Split an HHMM-encoded time column and add cyclical (sin/cos) encodings.

    Six columns are written to ``frame`` in place, in this order:
    ``{prefix}_hour``, ``{prefix}_minute`` (nullable ``Int64``),
    ``{prefix}_hour_sin``, ``{prefix}_hour_cos``,
    ``{prefix}_minute_sin``, ``{prefix}_minute_cos``.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame that holds ``source_col``; it is modified in place.
    source_col : str
        Name of a numeric column whose values encode a time as HHMM
        (e.g. ``1430.0`` for 14:30). Missing values are allowed.
    prefix : str
        Prefix used for the generated column names.

    Returns
    -------
    pd.DataFrame
        The same ``frame`` object, returned for convenience.
    """
    hhmm = frame[source_col]

    # Vectorized replacement for the row-wise ``int(x / 100)`` / ``x % 100``:
    # np.trunc truncates toward zero exactly like int(), and the nullable
    # Int64 cast turns NaN into <NA> instead of failing.
    frame[f"{prefix}_hour"] = np.trunc(hhmm / 100).astype("Int64")
    frame[f"{prefix}_minute"] = (hhmm % 100).astype("Int64")

    # Map hour (period 24) and minute (period 60) onto the unit circle so that
    # the end of a cycle sits next to its start (23 next to 0, 59 next to 0).
    hour_angle = 2 * np.pi * frame[f"{prefix}_hour"] / 24
    minute_angle = 2 * np.pi * frame[f"{prefix}_minute"] / 60
    frame[f"{prefix}_hour_sin"] = np.sin(hour_angle)
    frame[f"{prefix}_hour_cos"] = np.cos(hour_angle)
    frame[f"{prefix}_minute_sin"] = np.sin(minute_angle)
    frame[f"{prefix}_minute_cos"] = np.cos(minute_angle)
    return frame


# "Reported_Time" -> reported_hour / reported_minute + sin/cos features
add_cyclical_time_features(new_df, "Reported_Time", "reported")

# "Occurred_Time" -> occured_hour / occured_minute + sin/cos features
# (the "occured" spelling of the generated column names is kept unchanged)
add_cyclical_time_features(new_df, "Occurred_Time", "occured")

# drop Reported_Time, Occurred_Time, reported_hour, reported_minute, occured_hour, occured_minute
# new_df = new_df.drop(columns=["Reported_Time", "Occurred_Time", "reported_hour", "reported_minute", "occured_hour", "occured_minute"])

new_df

Unnamed: 0,Report_Number,Occurred_Time,Reported_Time,Crime_Subcategory,Primary_Offense_Description,Precinct,Sector,Beat,Neighborhood,reported_hour,...,reported_hour_sin,reported_hour_cos,reported_minute_sin,reported_minute_cos,occured_hour,occured_minute,occured_hour_sin,occured_hour_cos,occured_minute_sin,occured_minute_cos
0,1.975000e+12,900.0,1500.0,BURGLARY-RESIDENTIAL,BURGLARY-FORCE-RES,SOUTH,R,R3,LAKEWOOD/SEWARD PARK,15,...,-0.707107,-0.707107,0.0,1.0,9,0,0.707107,-0.707107,0.0,1.0
1,1.976000e+12,1.0,2359.0,SEX OFFENSE-OTHER,SEXOFF-INDECENT LIBERTIES,UNKNOWN,?,?,UNKNOWN,23,...,-0.258819,0.965926,-0.104528,0.994522,0,1,0.0,1.0,0.104528,0.994522
2,1.979000e+12,1600.0,1430.0,CAR PROWL,THEFT-CARPROWL,EAST,G,G2,CENTRAL AREA/SQUIRE PARK,14,...,-0.5,-0.866025,0.0,-1.0,16,0,-0.866025,-0.5,0.0,1.0
3,1.981000e+13,2029.0,2030.0,HOMICIDE,HOMICIDE-PREMEDITATED-WEAPON,SOUTH,S,S2,BRIGHTON/DUNLAP,20,...,-0.866025,0.5,0.0,-1.0,20,29,-0.866025,0.5,0.104528,-0.994522
4,1.981000e+12,2000.0,435.0,BURGLARY-RESIDENTIAL,BURGLARY-FORCE-RES,SOUTHWEST,W,W3,ROXHILL/WESTWOOD/ARBOR HEIGHTS,4,...,0.866025,0.5,-0.5,-0.866025,20,0,-0.866025,0.5,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523585,2.019000e+12,1713.0,1713.0,FAMILY OFFENSE-NONVIOLENT,CHILD-OTHER,SOUTH,O,O3,MID BEACON HILL,17,...,-0.965926,-0.258819,0.978148,0.207912,17,13,-0.965926,-0.258819,0.978148,0.207912
523586,2.019000e+12,730.0,1721.0,BURGLARY-RESIDENTIAL,BURGLARY-FORCE-RES,EAST,C,C2,MONTLAKE/PORTAGE BAY,17,...,-0.965926,-0.258819,0.809017,-0.587785,7,30,0.965926,-0.258819,0.0,-1.0
523587,2.019000e+12,1724.0,1724.0,ROBBERY-COMMERCIAL,ROBBERY-BUSINESS-BODYFORCE,SOUTH,S,S2,RAINIER BEACH,17,...,-0.965926,-0.258819,0.587785,-0.809017,17,24,-0.965926,-0.258819,0.587785,-0.809017
523588,2.019000e+12,1750.0,1904.0,THEFT-SHOPLIFT,THEFT-SHOPLIFT,NORTH,L,L2,NORTHGATE,19,...,-0.965926,0.258819,0.406737,0.913545,17,50,-0.965926,-0.258819,-0.866025,0.5


4. standard scale numerical values

In [82]:
# Step 4 placeholder (standard scaling of numerical values): no scaler is
# applied in this cell; it only prints the message below.
# NOTE(review): Occurred_Time / Reported_Time and the hour/minute columns are
# still numeric in new_df at this point -- confirm that none of them need scaling.
print("lmao, nothing to scale")

lmao, nothing to scale


5. one-hot encode nominal features

In [87]:
# new_df.select_dtypes(include=['object']).columns
# Nominal location features that get expanded into indicator columns.
nominal_cols = ['Precinct', 'Sector', 'Beat', 'Neighborhood']

# Encode only the columns that are still present. pd.get_dummies replaces the
# listed columns with their dummies, so running this cell a second time with
# the full list would raise a KeyError; with this guard a re-run is a no-op.
pending_cols = [name for name in nominal_cols if name in new_df.columns]
if pending_cols:
    new_df = pd.get_dummies(new_df, columns=pending_cols)