In [1]:
import pandas as pd
import numpy as np

# Input/output locations, relative to the notebook's working directory.
RAW_PATH = '../data/raw_crime_data.csv'
CLEAN_PATH = '../data/cleaned_crime_data.csv'

# Only time, location, and crime type are needed downstream.
# (Customize this list based on your actual columns.)
KEEP_COLS = ['Date', 'Primary Type', 'Description', 'Latitude', 'Longitude']

# Fixed cutoff year for "modern" data. This is NOT a rolling window: bump it
# by hand if the analysis should track "the last N years".
DEFAULT_MIN_YEAR = 2019


def clean_crime_data(raw, min_year=DEFAULT_MIN_YEAR):
    """Return a cleaned copy of the raw crime table; `raw` is not modified.

    Steps:
      1. Keep only the columns in KEEP_COLS.
      2. Drop rows with a missing Latitude or Longitude (geospatial analysis
         needs an exact location).
      3. Parse 'Date' into datetimes.
      4. Keep rows whose year is >= `min_year`. Rows with a missing date
         parse to NaT and are dropped here as well.
      5. Add 'Hour', 'Month' and 'Day_Name' derived from 'Date'.

    Parameters
    ----------
    raw : pandas.DataFrame
        Must contain every column named in KEEP_COLS.
    min_year : int, optional
        Earliest calendar year to keep (inclusive).

    Returns
    -------
    pandas.DataFrame
        KEEP_COLS plus 'Hour', 'Month', 'Day_Name'. The original row labels
        are preserved (the index is not reset).

    Raises
    ------
    KeyError
        If any column in KEEP_COLS is absent from `raw`.
    """
    # Work on an explicit copy: assigning into a column-sliced frame is what
    # triggers pandas' chained-assignment warnings and can silently fail to
    # write under copy-on-write.
    cleaned = raw[KEEP_COLS].dropna(subset=['Latitude', 'Longitude']).copy()

    # Convert the 'Date' column to proper datetimes (this takes a moment on
    # large inputs).
    cleaned['Date'] = pd.to_datetime(cleaned['Date'])

    # Filter BEFORE deriving features: NaT rows fail the comparison and are
    # removed here, so Hour/Month stay integer-typed instead of being
    # promoted to float (which would write "23.0" into the CSV).
    is_recent = cleaned['Date'].dt.year >= min_year
    cleaned = cleaned[is_recent].copy()

    # Time features used by the downstream analysis (Person B).
    when = cleaned['Date'].dt
    cleaned['Hour'] = when.hour
    cleaned['Month'] = when.month
    cleaned['Day_Name'] = when.day_name()

    return cleaned


# `__name__` is "__main__" inside a notebook kernel, so this cell still runs
# the whole pipeline and leaves `df` defined for later cells; the guard only
# keeps the file I/O from firing when the function is imported elsewhere.
if __name__ == "__main__":
    # 1. Load data. 'low_memory=False' makes pandas read the file in one pass
    #    so column dtypes are inferred consistently on big files.
    df = pd.read_csv(RAW_PATH, low_memory=False)
    print(f"Original Shape: {df.shape}")

    # 2-5. Select columns, drop missing coordinates, parse dates, filter by
    #      year, and add time features.
    df = clean_crime_data(df)
    print(f"Cleaned Shape: {df.shape}")
    print(df.head())

    # 6. Save the processed data. Person B will load THIS file.
    df.to_csv(CLEAN_PATH, index=False)
    print(f"✅ Data cleaned and saved to {CLEAN_PATH}")

Original Shape: (236077, 22)


  df['Date'] = pd.to_datetime(df['Date'])


Cleaned Shape: (236007, 8)
                 Date            Primary Type  \
0 2025-12-31 23:58:00                 ASSAULT   
1 2025-12-31 23:55:00     MOTOR VEHICLE THEFT   
2 2025-12-31 23:54:00  PUBLIC PEACE VIOLATION   
3 2025-12-31 23:54:00                 BATTERY   
4 2025-12-31 23:54:00  PUBLIC PEACE VIOLATION   

                                         Description   Latitude  Longitude  \
0                                             SIMPLE  41.802549 -87.667246   
1                      THEFT / RECOVERY - AUTOMOBILE  41.882329 -87.758411   
2                                    OTHER VIOLATION  41.976290 -87.905227   
3  AGGRAVATED P.O. - HANDS, FISTS, FEET, NO / MIN...  41.885427 -87.661759   
4                                    OTHER VIOLATION  41.976290 -87.905227   

   Hour  Month   Day_Name  
0    23     12  Wednesday  
1    23     12  Wednesday  
2    23     12  Wednesday  
3    23     12  Wednesday  
4    23     12  Wednesday  
✅ Data cleaned and saved to ../data/clean