# SVM classification on the Seattle Crime dataset
Author: Tomas Hobza

In [4]:
from scipy.io import arff
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
from sklearn.svm import LinearSVC
import time
from sklearn.metrics import balanced_accuracy_score

## 0. Load the dataset

In [5]:
# Read the ARFF file; loadarff returns the records together with the
# attribute metadata
data, meta = arff.loadarff('Seattle_Crime_Data_06-23-2019-4.arff')

# Wrap the records in a pandas DataFrame
df = pd.DataFrame(data)

# Object-dtype columns hold byte strings (that is how the ARFF loader
# delivers nominal/string attributes), so turn them into regular text
object_columns = df.select_dtypes(include=[object]).columns
for col in object_columns:
    df[col] = df[col].str.decode('utf-8')

In [6]:
def _add_cyclical_time_features(frame, source_col, prefix):
    """Return a copy of ``frame`` with sin/cos encodings of an HHMM time column.

    The values in ``frame[source_col]`` are read as numbers of the form
    ``hour * 100 + minute``. Four columns are appended, in this order:
    ``{prefix}_hour_sin``, ``{prefix}_hour_cos``, ``{prefix}_minute_sin`` and
    ``{prefix}_minute_cos``. The source column itself is left in place.
    """
    hhmm = pd.to_numeric(frame[source_col])
    # Truncating division yields the hour, the remainder the minute.
    # The Int64 cast keeps the derived features on pandas' nullable dtype.
    hour = np.trunc(hhmm / 100).astype("Int64")
    minute = (hhmm % 100).astype("Int64")
    # Mapping onto the unit circle keeps 23:59 adjacent to 00:00.
    return frame.assign(**{
        f"{prefix}_hour_sin": np.sin(2 * np.pi * hour / 24),
        f"{prefix}_hour_cos": np.cos(2 * np.pi * hour / 24),
        f"{prefix}_minute_sin": np.sin(2 * np.pi * minute / 60),
        f"{prefix}_minute_cos": np.cos(2 * np.pi * minute / 60),
    })


def preprocess_crime(df, RARITY_THRESHOLD = 1000):
    """Clean the crime DataFrame and split it into train/test feature sets.

    Steps, in order:
      1. Treat 'UNKNOWN', '?', 'Unknown' and 'unknown' as missing, drop rows
         without an Occurred_Time or Reported_Time, and fill every other
         missing value with the single category 'UNKNOWN'.
      2. Drop the Report_Number identifier column.
      3. Drop Crime_Subcategory to prevent target leakage.
      4. Replace Reported_Time / Occurred_Time with sin/cos encodings of
         their hour and minute parts.
      5. Relabel every target class with fewer than ``RARITY_THRESHOLD``
         rows as 'OTHER'.
      6. Make a stratified 80/20 train/test split (random_state=42).
      7. One-hot encode Precinct, Sector, Beat and Neighborhood separately
         on each split, then align the test columns to the training columns.

    Diagnostic summaries (missing-value counts, class distributions, split
    shapes) are printed along the way.

    Args:
        df: Raw crime data. It is copied, not modified. It must contain the
            columns named above plus the target column
            'Primary_Offense_Description'.
        RARITY_THRESHOLD: Minimum number of rows a target class needs in
            order to keep its own label.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    target_col = 'Primary_Offense_Description'
    nominal_cols = ['Precinct', 'Sector', 'Beat', 'Neighborhood']

    new_df = df.copy()

    # == 1. handle missing values ==
    # unify the various "unknown" spellings into real NAs
    new_df = new_df.replace(['UNKNOWN', '?', 'Unknown', 'unknown'], pd.NA)
    na_counts = new_df.isnull().sum()
    print(na_counts[na_counts > 0])
    # rows without a timestamp cannot be encoded, so they are dropped
    new_df = new_df.dropna(subset=['Occurred_Time', 'Reported_Time'])
    # every remaining gap becomes one unified "UNKNOWN" category
    new_df = new_df.fillna('UNKNOWN')
    na_counts = new_df.isnull().sum()
    print("Number of NAs:", na_counts[na_counts > 0])

    # == 2. drop the UID column Report_Number ==
    # == 3. drop Crime_Subcategory to prevent leakage ==
    new_df = new_df.drop(columns=['Report_Number', 'Crime_Subcategory'])

    # == 4. sin/cos transform the hour and minute of both timestamps so that
    # 00:00 stays next to 23:59, then drop the raw time columns ==
    # ("occured" is misspelled on purpose: it is the established column prefix)
    new_df = _add_cyclical_time_features(new_df, "Reported_Time", "reported")
    new_df = _add_cyclical_time_features(new_df, "Occurred_Time", "occured")
    new_df = new_df.drop(columns=["Reported_Time", "Occurred_Time"])

    # == 5. handle class imbalance ==
    class_counts = new_df[target_col].value_counts()
    print("Original target class distribution: ", class_counts)
    rare_classes = class_counts[class_counts < RARITY_THRESHOLD].index
    print(f"Number of rare classes (less than {RARITY_THRESHOLD} instances): {len(rare_classes)}")
    # fold all rare labels into a single 'OTHER' bucket (vectorised)
    is_rare = new_df[target_col].isin(rare_classes)
    new_df[target_col] = new_df[target_col].mask(is_rare, 'OTHER')
    print("New target class distribution: ", new_df[target_col].value_counts())

    # == 6. split the data into training and testing ==
    X = new_df.drop(columns=[target_col])
    y = new_df[target_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

    # == 7. one-hot encode nominal features ==
    # encode each split on its own
    X_train = pd.get_dummies(X_train, columns=nominal_cols)
    X_test = pd.get_dummies(X_test, columns=nominal_cols)
    # align to the training columns: categories missing from the test split
    # are added as all-zero columns, test-only categories are discarded
    X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

    return X_train, X_test, y_train, y_test

## 1. Experiment - find the optimal RARITY_THRESHOLD

In [7]:
thresholds = [100, 500, 1000, 2000, 5000]
results = []

# Upper bound on the number of training rows handed to the SVM
# (100k here; adjust as needed)
SAMPLE_SIZE = 100000

for threshold in thresholds:
    print(f"\n=== Experiment with RARITY_THRESHOLD = {threshold} ===")

    # Rebuild the train/test split with the rarity cut-off under test
    X_train, X_test, y_train, y_test = preprocess_crime(df, RARITY_THRESHOLD=threshold)
    n_classes = y_train.nunique()
    full_train_size = len(X_train)

    print(f"Original training size: {full_train_size:,}")

    if full_train_size <= SAMPLE_SIZE:
        # Already within the cap: train on everything
        X_train_sample, y_train_sample = X_train, y_train
        print(f"No downsampling needed (dataset smaller than {SAMPLE_SIZE:,})")
    else:
        # Stratified subsample, so the class distribution carries over
        X_train_sample, _, y_train_sample, _ = train_test_split(
            X_train,
            y_train,
            train_size=SAMPLE_SIZE,
            stratify=y_train,
            random_state=42,
        )
        print(f"Downsampled to: {len(X_train_sample):,} samples")
        print(f"Classes preserved: {y_train_sample.nunique()} / {n_classes}")

    # C stays fixed at 1.0 in this experiment; only the threshold varies.
    # max_iter is kept at 1000 for speed; set verbose=1 for progress output.
    model = LinearSVC(
        C=1.0,
        class_weight='balanced',
        max_iter=1000,
        random_state=42,
        verbose=0,
    )

    # Time the fit on the (possibly downsampled) training data
    start = time.time()
    model.fit(X_train_sample, y_train_sample)
    train_time = time.time() - start

    # Score on the complete test split; the test data is never downsampled
    y_pred = model.predict(X_test)
    acc = balanced_accuracy_score(y_test, y_pred)

    results.append(dict(
        threshold=threshold,
        n_classes=n_classes,
        train_samples=len(X_train_sample),  # rows actually used for fitting
        original_train_size=full_train_size,
        balanced_accuracy=acc,
        train_time=train_time,
        C=1.0,
    ))

    print(f"Classes: {n_classes}")
    print(f"Balanced Accuracy: {acc:.4f}")
    print(f"Time: {train_time:.1f}s ({train_time/60:.1f} min)")

# Summarise all runs in one table
df_results = pd.DataFrame(results)
separator = "=" * 60
print("\n" + separator)
print("EXPERIMENT RESULTS")
print(separator)
print(df_results)


=== Experiment with RARITY_THRESHOLD = 100 ===
Occurred_Time           2
Reported_Time           2
Crime_Subcategory     262
Precinct             3352
Sector               3346
Beat                 3298
Neighborhood         3366
dtype: int64
Number of NAs: Series([], dtype: int64)
Original target class distribution:  Primary_Offense_Description
THEFT-CARPROWL                   131297
THEFT-SHOPLIFT                    48637
THEFT-OTH                         47275
VEH-THEFT-AUTO                    37840
BURGLARY-FORCE-RES                27984
                                  ...  
NARC-SMUGGLE-HEROIN                   1
HOMICIDE-NEG-MANS-GUN                 1
NARC-SELL-BARBITUATE                  1
NARC-MANUFACTURE-HALLUCINOGEN         1
HOMICIDE-NEG-MANS-WEAPON              1
Name: count, Length: 144, dtype: int64
Number of rare classes (less than 100 instances): 45
New target class distribution:  Primary_Offense_Description
THEFT-CARPROWL                      131297
THEFT-SHOPLIFT   