In [5]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from google.colab import files

# Load the dataset (adjust the file path as needed)
file_path = '/content/Crime_Data_from_2020_to_Present.csv'  # Update this path

# Columns removed before any processing; none of them is read by a later step.
drop_cols = ['DR_NO', 'Date Rptd', 'Cross Street', 'LOCATION', 'Status', 'Status Desc',
             'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4', 'Mocodes']

# Format of the raw 'DATE OCC' strings, e.g. "03/01/2020 12:00:00 AM".
DATE_OCC_FORMAT = '%m/%d/%Y %I:%M:%S %p'

# Crime codes counted as violent: homicide (110), rape (121, 122), robbery
# (210, 220) and aggravated assault (230, 231, 235, 236).
# The list used previously (510, 330, 480) was the 'Crm Cd' of the first three
# rows of the file; 510 is "VEHICLE - STOLEN" in this dataset, so the feature
# was flagging property crimes.
# NOTE(review): codes are taken from the LAPD crime-code list as I understand
# it -- confirm against the 'Crm Cd Desc' column before relying on the feature.
VIOLENT_CRM_CODES = (110, 121, 122, 210, 220, 230, 231, 235, 236)


def load_crime_data(path: str) -> pd.DataFrame:
    """Read the raw crime CSV and print a short overview of it.

    Args:
        path: Location of the CSV file.

    Returns:
        The file contents as a DataFrame.

    Raises:
        FileNotFoundError: If ``path`` does not exist (a hint is printed first).
    """
    try:
        data = pd.read_csv(path)
    except FileNotFoundError:
        print(f"Error: File not found at {path}. Please check the path.")
        raise

    # Display initial info to verify loading
    print("Initial dataset shape:", data.shape)
    print("First few rows:\n", data.head())
    print("LAT and LON summary:\n", data[['LAT', 'LON']].describe())
    return data


def clean_crime_data(data: pd.DataFrame) -> pd.DataFrame:
    """Drop unused columns and remove or repair invalid records.

    Steps, in order: drop ``drop_cols`` (missing ones are ignored), remove rows
    whose LAT or LON is 0, replace non-positive victim ages with the median of
    the valid ages, fill missing victim sex/descent with ``'Unknown'``, parse
    'DATE OCC' and drop rows whose date cannot be parsed.

    Args:
        data: Raw records. The caller's frame is not modified.

    Returns:
        A new, cleaned DataFrame with 'DATE OCC' as datetime64.

    Raises:
        ValueError: If no rows survive the coordinate or the date filter.
    """
    data = data.drop(columns=drop_cols, errors='ignore')  # errors='ignore' handles missing columns
    print("Shape after dropping columns:", data.shape)

    # A coordinate of exactly 0 is treated as "location unknown".
    bad_coords = (data['LAT'] == 0) | (data['LON'] == 0)
    print(f"Number of rows with invalid coordinates (LAT=0 or LON=0): {int(bad_coords.sum())}")
    data = data[~bad_coords].copy()
    print("Shape after filtering invalid coordinates:", data.shape)

    if data.empty:
        print("Error: DataFrame is empty after filtering. Check LAT/LON values or dataset.")
        raise ValueError("No valid data remains after filtering.")

    # An age <= 0 cannot be real. Treat zero AND negative values as missing so
    # they neither enter the median nor survive into the per-group mean.
    valid_age = data['Vict Age'].where(data['Vict Age'] > 0)
    data['Vict Age'] = valid_age.fillna(valid_age.median())

    # Handle missing categorical data
    data['Vict Sex'] = data['Vict Sex'].fillna('Unknown')
    data['Vict Descent'] = data['Vict Descent'].fillna('Unknown')

    # Inspect DATE OCC values to confirm format
    print("Sample DATE OCC values:\n", data['DATE OCC'].head(10))

    # Strings that do not match the expected format become NaT instead of raising.
    data['DATE OCC'] = pd.to_datetime(data['DATE OCC'], format=DATE_OCC_FORMAT, errors='coerce')
    print("Number of invalid DATE OCC values (NaT):", data['DATE OCC'].isna().sum())

    # Drop rows with invalid DATE OCC to ensure valid grouping
    data = data[data['DATE OCC'].notna()].copy()
    print("Shape after dropping invalid DATE OCC rows:", data.shape)

    if data.empty:
        print("Error: DataFrame is empty after dropping invalid DATE OCC rows.")
        raise ValueError("No valid data remains after date filtering.")

    return data


def add_features(data: pd.DataFrame) -> pd.DataFrame:
    """Add the per-record features used by the aggregation step.

    Rewrites 'TIME OCC' (HHMM) as the hour of day and adds 'Day of Week'
    (0 = Monday, 6 = Sunday) and 'Is Violent' (1 when 'Crm Cd' is in
    ``VIOLENT_CRM_CODES``, else 0).

    Args:
        data: Cleaned records with a datetime 'DATE OCC' column. Modified in place.

    Returns:
        The same DataFrame object, for chaining.
    """
    # HHMM -> hour. Integer division gives the same result as slicing the
    # zero-padded string for integer input, and also works if the column was
    # read as float (where "630.0"[:2] would wrongly yield 63).
    data['TIME OCC'] = (pd.to_numeric(data['TIME OCC']) // 100).astype(int)

    data['Day of Week'] = data['DATE OCC'].dt.dayofweek  # 0 = Monday, 6 = Sunday
    data['Is Violent'] = data['Crm Cd'].isin(VIOLENT_CRM_CODES).astype(int)  # 1 = Violent, 0 = Non-violent
    print("Shape after feature engineering:", data.shape)
    return data


def aggregate_by_district_month(data: pd.DataFrame) -> pd.DataFrame:
    """Aggregate records per reporting district and calendar month.

    Args:
        data: Records with the columns produced by ``add_features``.

    Returns:
        One row per ('Rpt Dist No', month) with the aggregated features, a
        'Crime Count' column (rows in the group) and a binary 'Target' that is
        1 when 'Crime Count' is strictly above its 80th percentile.
        'DATE OCC' holds monthly periods.

    Raises:
        ValueError: If the aggregation yields no rows.
    """
    month = data['DATE OCC'].dt.to_period('M')
    groups = data.groupby(['Rpt Dist No', month])

    grouped = groups.agg({
        'LAT': 'mean',
        'LON': 'mean',
        'TIME OCC': 'mean',
        'Day of Week': 'mean',
        'Is Violent': 'sum',
        'Vict Age': 'mean',
        # NOTE(review): 'first' keeps whichever premise code appears first in
        # the group; the mode may be more representative.
        'Premis Cd': 'first',
    })
    # Taken from the same GroupBy object and assigned by index, so the counts
    # cannot be misaligned with the aggregated rows.
    grouped['Crime Count'] = groups.size()
    grouped = grouped.reset_index()
    print("Shape after aggregation:", grouped.shape)

    if grouped.empty:
        print("Error: Aggregated DataFrame is empty. Check grouping or data validity.")
        raise ValueError("No data remains after aggregation.")

    # Create binary target: Top 20% crime counts = high-crime
    threshold = grouped['Crime Count'].quantile(0.80)
    grouped['Target'] = (grouped['Crime Count'] > threshold).astype(int)  # 1 = High-crime, 0 = Not
    return grouped


def encode_categoricals(grouped: pd.DataFrame, columns) -> dict:
    """Label-encode ``columns`` of ``grouped`` in place.

    Args:
        grouped: Frame whose columns are replaced by their integer codes.
        columns: Names of the columns to encode.

    Returns:
        ``{column: {original_value: encoded_value}}`` for every encoded column.
    """
    encoders = {}
    for col in columns:
        le = LabelEncoder()
        grouped[col] = le.fit_transform(grouped[col])
        encoders[col] = dict(zip(le.classes_, le.transform(le.classes_)))
    return encoders


def encodings_to_frame(encoders: dict) -> pd.DataFrame:
    """Flatten ``{column: {original: encoded}}`` into a three-column table.

    Returns:
        DataFrame with columns 'Variable', 'Original_Value', 'Encoded_Value',
        one row per (column, original value) pair, in mapping order.
    """
    rows = []
    for col, mapping in encoders.items():
        rows.extend([col, original, encoded] for original, encoded in mapping.items())
    return pd.DataFrame(rows, columns=['Variable', 'Original_Value', 'Encoded_Value'])


# In a notebook cell __name__ is "__main__", so the pipeline runs there exactly
# as before and leaves the same variables (df, df_grouped, ...) in the
# namespace; importing this file no longer triggers file I/O and downloads.
if __name__ == "__main__":
    df = load_crime_data(file_path)
    df = clean_crime_data(df)
    df = add_features(df)

    # Aggregate by Rpt Dist No and month
    df_grouped = aggregate_by_district_month(df)

    # Label encode categorical features
    label_encoders = encode_categoricals(df_grouped, ['Rpt Dist No', 'Premis Cd'])

    # Save label encodings to CSV
    encoding_df = encodings_to_frame(label_encoders)
    encoding_file = 'label_encodings.csv'
    encoding_df.to_csv(encoding_file, index=False)

    # Scale numerical features
    # NOTE(review): 'Target' is derived directly from 'Crime Count', so using
    # the scaled 'Crime Count' as a model input leaks the label.
    scaler = StandardScaler()
    numerical_cols = ['LAT', 'LON', 'TIME OCC', 'Day of Week', 'Vict Age', 'Is Violent', 'Crime Count']
    df_grouped[numerical_cols] = scaler.fit_transform(df_grouped[numerical_cols])

    # Convert DATE OCC (monthly period) to a first-of-month string for CSV compatibility
    df_grouped['DATE OCC'] = df_grouped['DATE OCC'].dt.to_timestamp().dt.strftime('%Y-%m-%d')

    # Save preprocessed data as CSV
    output_file = 'preprocessed_crime_data.csv'
    df_grouped.to_csv(output_file, index=False)

    # Download both files in Google Colab
    files.download(output_file)
    files.download(encoding_file)

    print(f"Preprocessed data has been saved as {output_file} and encodings as {encoding_file}. Both are ready for download!")

Initial dataset shape: (1005199, 28)
First few rows:
        DR_NO               Date Rptd                DATE OCC  TIME OCC  AREA  \
0  190326475  03/01/2020 12:00:00 AM  03/01/2020 12:00:00 AM      2130     7   
1  200106753  02/09/2020 12:00:00 AM  02/08/2020 12:00:00 AM      1800     1   
2  200320258  11/11/2020 12:00:00 AM  11/04/2020 12:00:00 AM      1700     3   
3  200907217  05/10/2023 12:00:00 AM  03/10/2020 12:00:00 AM      2037     9   
4  200412582  09/09/2020 12:00:00 AM  09/09/2020 12:00:00 AM       630     4   

    AREA NAME  Rpt Dist No  Part 1-2  Crm Cd  \
0    Wilshire          784         1     510   
1     Central          182         1     330   
2   Southwest          356         1     480   
3    Van Nuys          964         1     343   
4  Hollenbeck          413         1     510   

                                Crm Cd Desc  ... Status   Status Desc  \
0                          VEHICLE - STOLEN  ...     AA  Adult Arrest   
1                     BURGLARY

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Preprocessed data has been saved as preprocessed_crime_data.csv and encodings as label_encodings.csv. Both are ready for download!
