# SVM classification on the Seattle Crime dataset
Author: Tomas Hobza

In [4]:
from scipy.io import arff
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
from sklearn.svm import LinearSVC
import time
from sklearn.metrics import balanced_accuracy_score

## 0. Load the dataset

In [5]:
# Read the ARFF file from disk (returns the records plus their metadata)
data, meta = arff.loadarff('Seattle_Crime_Data_06-23-2019-4.arff')

# Wrap the loaded records in a pandas DataFrame
df = pd.DataFrame(data)

# Object-typed columns hold byte strings after loading; decode each to str
for name, column in df.select_dtypes([object]).items():
    df[name] = column.str.decode('utf-8')

In [6]:
def _add_cyclical_time_features(frame, source_col, prefix):
    """Append sin/cos encodings of the hour and minute held in an HHMM column.

    Adds four columns to ``frame`` in place: ``{prefix}_hour_sin``,
    ``{prefix}_hour_cos``, ``{prefix}_minute_sin`` and ``{prefix}_minute_cos``.
    The sin/cos pair keeps the relationship that 00:00 comes after 23:59.

    ``source_col`` must contain no missing values (the caller drops those rows
    beforehand); the resulting columns are plain float64.
    """
    hhmm = pd.to_numeric(frame[source_col]).astype("float64")
    # 1234 -> hour 12, minute 34 (truncation matches int(x / 100))
    hour = np.trunc(hhmm / 100)
    minute = hhmm % 100

    frame[f"{prefix}_hour_sin"] = np.sin(2 * np.pi * hour / 24)
    frame[f"{prefix}_hour_cos"] = np.cos(2 * np.pi * hour / 24)
    frame[f"{prefix}_minute_sin"] = np.sin(2 * np.pi * minute / 60)
    frame[f"{prefix}_minute_cos"] = np.cos(2 * np.pi * minute / 60)


def preprocess_crime(df, RARITY_THRESHOLD = 1000):
    """Clean the crime DataFrame and return a stratified train/test split.

    Steps, in order:
      1. unify 'UNKNOWN'/'?'/'Unknown'/'unknown' into NA, drop rows without an
         Occurred_Time or Reported_Time, fill remaining NAs with 'UNKNOWN';
      2. drop the Report_Number identifier;
      3. drop Crime_Subcategory to prevent leakage;
      4. replace Reported_Time / Occurred_Time with sin/cos encodings of
         their hour and minute;
      5. relabel target classes with fewer than RARITY_THRESHOLD rows as 'OTHER';
      6. split 80/20, stratified on the target, with random_state=42;
      7. one-hot encode Precinct, Sector, Beat and Neighborhood, aligning the
         test columns to the training columns.

    Parameters:
      df: input DataFrame; it is copied, not modified. It must contain the
        columns named above plus the target Primary_Offense_Description.
      RARITY_THRESHOLD: minimum number of rows a target class needs in order
        to keep its own label.

    Returns:
      (X_train, X_test, y_train, y_test)

    Side effects: prints missing-value counts, class distributions and the
    shapes of the split.
    """
    new_df = df.copy()

    # == 1. handle missing values ==
    # unify unknown values into NAs
    new_df = new_df.replace(['UNKNOWN', '?', 'Unknown', 'unknown'], pd.NA)
    na_counts = new_df.isnull().sum()
    print(na_counts[na_counts > 0])
    # drop rows with missing Occurred_Time or Reported_Time
    new_df = new_df.dropna(subset=['Occurred_Time', 'Reported_Time'])
    # missing values in the other columns become a unified "UNKNOWN" category
    new_df = new_df.fillna('UNKNOWN')
    remaining_na = new_df.isnull().sum()
    print("Number of NAs:", remaining_na[remaining_na > 0])

    # == 2. drop the UID column Report_Number ==
    # == 3. drop Crime_Subcategory to prevent leakage ==
    new_df = new_df.drop(columns=['Report_Number'])
    new_df = new_df.drop(columns=["Crime_Subcategory"])

    # == 4. cyclical (sin/cos) encoding of the reported and occurred times ==
    # Rows with missing times were dropped in step 1, so no NA handling is
    # needed here and the work can be done column-wise instead of per row.
    # The "occured" spelling is kept because it is part of the output column names.
    _add_cyclical_time_features(new_df, "Reported_Time", "reported")
    _add_cyclical_time_features(new_df, "Occurred_Time", "occured")
    new_df = new_df.drop(columns=["Reported_Time", "Occurred_Time"])

    # == 5. handle class imbalance ==
    target = new_df['Primary_Offense_Description']
    class_counts = target.value_counts()
    print("Original target class distribution: ", class_counts)
    rare_classes = class_counts[class_counts < RARITY_THRESHOLD].index
    print(f"Number of rare classes (less than {RARITY_THRESHOLD} instances): {len(rare_classes)}")
    # collapse every rare class into a single 'OTHER' label
    new_df['Primary_Offense_Description'] = target.mask(target.isin(rare_classes), 'OTHER')
    print("New target class distribution: ", new_df['Primary_Offense_Description'].value_counts())

    # == 6. split the data into training and testing ==
    X = new_df.drop(columns=['Primary_Offense_Description'])
    y = new_df['Primary_Offense_Description']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

    # == 7. one-hot encode nominal features ==
    # encode train and test separately, then align on the training columns:
    # categories seen only in test are dropped, categories missing from test
    # are added as all-zero columns
    nominal_cols = ['Precinct', 'Sector', 'Beat', 'Neighborhood']
    X_train = pd.get_dummies(X_train, columns=nominal_cols)
    X_test = pd.get_dummies(X_test, columns=nominal_cols)
    X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

    return X_train, X_test, y_train, y_test

In [None]:
# Downsampling parameter: upper bound on the number of training rows used to
# fit a model; larger training sets are reduced to this size with a stratified
# split before fitting.
SAMPLE_SIZE = 100000  # Use 100k samples (adjust as needed)

## 1. Experiment - find the optimal RARITY_THRESHOLD

In [None]:
# Candidate RARITY_THRESHOLD values; one LinearSVC is fitted per value and its
# summary row is collected in `results`.
thresholds = [100, 500, 1000, 2000, 5000]
results = []

for threshold in thresholds:
    print(f"\n=== Experiment with RARITY_THRESHOLD = {threshold} ===")

    # Rebuild the train/test split for this threshold
    X_train, X_test, y_train, y_test = preprocess_crime(df, RARITY_THRESHOLD=threshold)

    print(f"Original training size: {len(X_train):,}")

    # Cap the training set at SAMPLE_SIZE rows; the split is stratified so the
    # class proportions of y_train are kept in the sample.
    if len(X_train) <= SAMPLE_SIZE:
        X_train_sample, y_train_sample = X_train, y_train
        print(f"No downsampling needed (dataset smaller than {SAMPLE_SIZE:,})")
    else:
        X_train_sample, _, y_train_sample, _ = train_test_split(
            X_train,
            y_train,
            train_size=SAMPLE_SIZE,
            stratify=y_train,
            random_state=42,
        )
        print(f"Downsampled to: {len(X_train_sample):,} samples")
        print(f"Classes preserved: {y_train_sample.nunique()} / {y_train.nunique()}")

    # C is held fixed at 1.0 here; it is tuned in a later experiment
    model = LinearSVC(C=1.0, class_weight='balanced', max_iter=1000, random_state=42, verbose=0)

    # Time only the fit on the (possibly downsampled) training data
    start = time.time()
    model.fit(X_train_sample, y_train_sample)
    train_time = time.time() - start

    # Score on the complete test split (the test set is never downsampled)
    y_pred = model.predict(X_test)
    acc = balanced_accuracy_score(y_test, y_pred)

    run_summary = {
        'threshold': threshold,
        'n_classes': y_train.nunique(),
        'train_samples': len(X_train_sample),
        'original_train_size': len(X_train),
        'balanced_accuracy': acc,
        'train_time': train_time,
        'C': 1.0,
    }
    results.append(run_summary)

    print(f"Classes: {y_train.nunique()}")
    print(f"Balanced Accuracy: {acc:.4f}")
    print(f"Time: {train_time:.1f}s ({train_time/60:.1f} min)")

# Tabulate one row per threshold
df_results = pd.DataFrame(results)
banner = "="*60
print("\n" + banner)
print("EXPERIMENT RESULTS")
print(banner)
print(df_results)


=== Experiment with RARITY_THRESHOLD = 100 ===
Occurred_Time           2
Reported_Time           2
Crime_Subcategory     262
Precinct             3352
Sector               3346
Beat                 3298
Neighborhood         3366
dtype: int64
Number of NAs: Series([], dtype: int64)
Original target class distribution:  Primary_Offense_Description
THEFT-CARPROWL                   131297
THEFT-SHOPLIFT                    48637
THEFT-OTH                         47275
VEH-THEFT-AUTO                    37840
BURGLARY-FORCE-RES                27984
                                  ...  
NARC-SMUGGLE-HEROIN                   1
HOMICIDE-NEG-MANS-GUN                 1
NARC-SELL-BARBITUATE                  1
NARC-MANUFACTURE-HALLUCINOGEN         1
HOMICIDE-NEG-MANS-WEAPON              1
Name: count, Length: 144, dtype: int64
Number of rare classes (less than 100 instances): 45
New target class distribution:  Primary_Offense_Description
THEFT-CARPROWL                      131297
THEFT-SHOPLIFT   

All the models performed very poorly

In [9]:
# Per-class row counts of the most recent downsampled training labels
class_dist = y_train_sample.value_counts()
smallest_class, largest_class = class_dist.min(), class_dist.max()
tiny_class_count = class_dist.lt(100).sum()

print("\nClass distribution in sample:")
print(f"Smallest class: {smallest_class} samples")
print(f"Largest class: {largest_class} samples")
print(f"Classes with <100 samples: {tiny_class_count}")


Class distribution in sample:
Smallest class: 1449 samples
Largest class: 25076 samples
Classes with <100 samples: 0


Let's do a sanity check with a baseline classifier that classifies everything as the most frequent class.

In [10]:
from sklearn.dummy import DummyClassifier

# Baseline: a classifier that always predicts the most frequent training class,
# scored with the same metric and on the same test split as the SVM.
dummy = DummyClassifier(strategy='most_frequent').fit(X_train_sample, y_train_sample)
y_pred_dummy = dummy.predict(X_test)
dummy_acc = balanced_accuracy_score(y_test, y_pred_dummy)

# `acc` is the balanced accuracy left over from the last SVM evaluated above
improvement = acc - dummy_acc

print(f"Dummy classifier (always predict most common): {dummy_acc:.4f}")
print(f"Your SVM: {acc:.4f}")
print(f"Improvement over baseline: {improvement:.4f}")

Dummy classifier (always predict most common): 0.0556
Your SVM: 0.1785
Improvement over baseline: 0.1229


Our last model performs better than the dummy baseline.

In [11]:
# Use threshold that gave best results
# (passed to preprocess_crime as RARITY_THRESHOLD: target classes with fewer
# rows than this are merged into the 'OTHER' label)
BEST_THRESHOLD = 5000

## 2. Experiment - find the optimal C parameter for the SVM

We're going to use the best-performing RARITY_THRESHOLD of 5000, and now we'll tune the SVM's C parameter on that configuration.

In [13]:
print("="*60)
print(f"Focused experiments: RARITY_THRESHOLD={BEST_THRESHOLD}")
print("="*60)

# Preprocess with best threshold
X_train, X_test, y_train, y_test = preprocess_crime(df, RARITY_THRESHOLD=BEST_THRESHOLD)

print(f"Classes: {y_train.nunique()}")
print(f"Original training size: {len(X_train):,}")

# Downsample (stratified, so class proportions are kept).
# train_test_split raises ValueError when train_size is not smaller than the
# number of rows, so only downsample when the training set is actually larger
# than SAMPLE_SIZE -- the same guard the threshold experiment uses.
if len(X_train) > SAMPLE_SIZE:
    X_train_sample, _, y_train_sample, _ = train_test_split(
        X_train, y_train,
        train_size=SAMPLE_SIZE,
        stratify=y_train,
        random_state=42
    )
else:
    X_train_sample, y_train_sample = X_train, y_train

print(f"Sample size: {len(X_train_sample):,}")
print(f"Samples per class (avg): {len(X_train_sample) / y_train_sample.nunique():.0f}")

# Check class distribution
class_dist = y_train_sample.value_counts()
print(f"Smallest class: {class_dist.min()} samples")
print(f"Largest class: {class_dist.max()} samples")


Focused experiments: RARITY_THRESHOLD=5000
Occurred_Time           2
Reported_Time           2
Crime_Subcategory     262
Precinct             3352
Sector               3346
Beat                 3298
Neighborhood         3366
dtype: int64
Number of NAs: Series([], dtype: int64)
Original target class distribution:  Primary_Offense_Description
THEFT-CARPROWL                   131297
THEFT-SHOPLIFT                    48637
THEFT-OTH                         47275
VEH-THEFT-AUTO                    37840
BURGLARY-FORCE-RES                27984
                                  ...  
NARC-SMUGGLE-HEROIN                   1
HOMICIDE-NEG-MANS-GUN                 1
NARC-SELL-BARBITUATE                  1
NARC-MANUFACTURE-HALLUCINOGEN         1
HOMICIDE-NEG-MANS-WEAPON              1
Name: count, Length: 144, dtype: int64
Number of rare classes (less than 5000 instances): 127
New target class distribution:  Primary_Offense_Description
THEFT-CARPROWL                 131297
OTHER                    

Now let's train the SVM with a bunch of C options to see which one performs the best.

In [14]:
# Now tune C parameter on this configuration: fit one LinearSVC per candidate C
# on the downsampled training data and score it on the full test split.
C_values = [0.01, 0.1, 1.0, 10.0, 100.0]
MAX_ITER = 2000  # iteration cap handed to LinearSVC
results = []

for C in C_values:
    print(f"\n--- Testing C={C} ---")

    model = LinearSVC(
        C=C,
        class_weight='balanced',
        max_iter=MAX_ITER,
        random_state=42,
        verbose=1
    )

    start = time.time()
    model.fit(X_train_sample, y_train_sample)
    train_time = time.time() - start

    # Check convergence.
    # LinearSVC does not expose a `converged_` attribute, so testing for it
    # would always report success. Instead compare `n_iter_` (the maximum
    # number of iterations run across all classes) with the cap: reaching the
    # cap means the solver stopped before converging.
    n_iter = getattr(model, 'n_iter_', None)
    if n_iter is None:
        converged = None  # iteration count unavailable; convergence unknown
    else:
        converged = bool(n_iter < MAX_ITER)
        if converged:
            print(f"Converged in {n_iter} iterations")
        else:
            print(f"Did NOT converge within {MAX_ITER} iterations")

    y_pred = model.predict(X_test)
    acc = balanced_accuracy_score(y_test, y_pred)

    results.append({
        'C': C,
        'balanced_accuracy': acc,
        'train_time': train_time,
        'converged': converged
    })

    print(f"Balanced Accuracy: {acc:.4f}")
    print(f"Time: {train_time:.1f}s")

# One row per C value
df_results = pd.DataFrame(results)
print("\n" + "="*60)
print("C PARAMETER TUNING RESULTS")
print("="*60)
print(df_results)


--- Testing C=0.01 ---
[LibLinear]iter  1 act 8.274e+02 pre 8.272e+02 delta 7.477e-01 f 1.037e+03 |g| 2.321e+03 CG   2
iter  2 act 4.740e+00 pre 5.158e+00 delta 7.477e-01 f 2.096e+02 |g| 6.168e+01 CG  12
iter  3 act 3.460e-01 pre 3.400e-01 delta 7.477e-01 f 2.048e+02 |g| 1.910e+01 CG  14
iter  4 act 2.765e-02 pre 2.749e-02 delta 7.477e-01 f 2.045e+02 |g| 1.753e+00 CG  28
iter  5 act 7.405e-04 pre 7.403e-04 delta 7.477e-01 f 2.045e+02 |g| 1.868e-01 CG  35
iter  6 act 5.410e-06 pre 5.410e-06 delta 7.477e-01 f 2.045e+02 |g| 1.470e-02 CG  35
iter  1 act 8.189e+02 pre 8.185e+02 delta 7.512e-01 f 1.027e+03 |g| 2.296e+03 CG   2
iter  2 act 6.149e+00 pre 6.554e+00 delta 7.512e-01 f 2.077e+02 |g| 7.680e+01 CG   9
iter  3 act 5.011e-01 pre 4.789e-01 delta 7.512e-01 f 2.015e+02 |g| 2.331e+01 CG  15
iter  4 act 6.051e-02 pre 6.014e-02 delta 7.512e-01 f 2.010e+02 |g| 3.762e+00 CG  20
iter  5 act 1.960e-03 pre 1.957e-03 delta 7.512e-01 f 2.010e+02 |g| 3.213e-01 CG  25
iter  6 act 2.519e-05 pre 2.51