In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVR
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
# Load data: the feature table and the target table live in two separate CSV files.
# Paths are home-relative ('~'), so the files must sit in the current user's Downloads folder.
X_train = pd.read_csv(filepath_or_buffer='~/Downloads/X_train.csv')
y_train = pd.read_csv(filepath_or_buffer='~/Downloads/y_train.csv')

In [3]:
# Put features and target side by side so that a NaN in any column removes the
# whole row from both at once (12k out of 900k+ rows have NaN).
train = pd.concat(objs=(X_train, y_train), axis='columns')
train_clean = train.dropna(axis='index', how='any')

# Split back by position: the right-most column is taken as the target and every
# column to its left as a feature.
# NOTE(review): this assumes y_train contributes exactly one column - confirm.
X_train_clean = train_clean.iloc[:, :-1]
y_train_clean = train_clean.iloc[:, -1]

In [4]:
# Parse coordinates
import ast

# Every target entry is evaluated as a Python literal (ast.literal_eval only accepts
# literals, so arbitrary expressions in the data are rejected rather than executed).
y_coords = y_train_clean.map(ast.literal_eval)

# Stack the parsed entries vertically: one row per sample in a single ndarray.
y_clean = np.vstack(y_coords.to_numpy())

In [11]:
# Sample data because dataset has over 900k rows
sample_size = 100000  # 100k samples

# Draw row positions without replacement from a seeded generator so the same
# subsample (and therefore the same downstream results) is produced on every
# Restart & Run All. The size is capped at the number of available rows because
# choice(..., replace=False) raises when asked for more items than exist.
sample_indices = np.random.default_rng(42).choice(
    len(X_train_clean),
    size=min(sample_size, len(X_train_clean)),
    replace=False,
)

# The same positional indices are applied to the features (.iloc) and to the
# target array, which keeps the sampled rows of X and y aligned.
X_sample = X_train_clean.iloc[sample_indices]
# errors='ignore' makes the drop a no-op for any listed column that is absent.
X_sample = X_sample.drop(columns=['Ward', 'Community Area', 'Beat', 'District', 'Block'], errors='ignore')
y_sample = y_clean[sample_indices]

In [12]:
# Columns that are fed to the model; every other column of X_sample is left out.
features_to_keep = [
    'IUCR',
    'Primary Type',
    'FBI Code',
    'Arrest',
    'Domestic',
    'Year',
]

# .copy() yields an independent frame instead of a slice tied to X_sample.
X_sample_clean = X_sample.loc[:, features_to_keep].copy()

In [13]:
# Preprocessing

# Partition the selected features by dtype: numeric columns on one side,
# every non-numeric column on the other.
numeric_cols = X_sample_clean.select_dtypes(include=['number']).columns
cat_cols = X_sample_clean.select_dtypes(exclude=['number']).columns

# SVM needs scaled data and one-hot encoded categorical variables.
# handle_unknown='ignore' lets categories unseen during fit pass through transform
# without raising; remainder='drop' discards any column not listed in either group.
preprocess = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=True), cat_cols),
    ],
    remainder='drop',
)

In [14]:
# LinearSVR with MultiOutputRegressor
# The LinearSVR is wrapped in MultiOutputRegressor because y_sample carries more
# than one target column.
pipeline = Pipeline(steps=[
    ('preprocess', preprocess),
    ('regressor', MultiOutputRegressor(LinearSVR(max_iter=5000, random_state=42))),
])

# Simplified parameter grid: only the regularisation strength C of the wrapped
# LinearSVR is tuned (pipeline step 'regressor' -> its 'estimator' -> 'C').
param_grid = {'regressor__estimator__C': [0.1, 1.0, 10.0]}

# Grid search with 3 CV folds.
# scoring is the *negated* MSE, so best_score_ is <= 0 and closer to 0 is better.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2,
)

# Report the real number of rows being searched instead of a hard-coded figure,
# so the message stays correct if the sample size changes.
print(f"Starting grid search on {len(X_sample_clean):,} samples...")
grid_search.fit(X_sample_clean, y_sample)

print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")

Starting grid search on 100,000 samples...
Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] END ........................regressor__estimator__C=0.1; total time=   2.3s
[CV] END ........................regressor__estimator__C=0.1; total time=   2.3s
[CV] END ........................regressor__estimator__C=0.1; total time=   2.3s




[CV] END ........................regressor__estimator__C=1.0; total time=  12.4s




[CV] END ........................regressor__estimator__C=1.0; total time=  12.7s
[CV] END ........................regressor__estimator__C=1.0; total time=  12.6s




[CV] END .......................regressor__estimator__C=10.0; total time=  45.3s




[CV] END .......................regressor__estimator__C=10.0; total time=  45.4s




[CV] END .......................regressor__estimator__C=10.0; total time=  43.7s





Best parameters: {'regressor__estimator__C': 1.0}
Best score: -0.0056




In [15]:
# Train final model with best parameters
# Same pipeline layout as the one that was tuned, with C fixed to the winning value.
# NOTE(review): GridSearchCV is created without a `refit` argument, so with its
# default (refit=True) grid_search.best_estimator_ should already hold an
# equivalent fitted pipeline - confirm before removing this retraining step.
best_model = Pipeline(steps=[
    ('preprocess', preprocess),
    ('regressor', MultiOutputRegressor(
        LinearSVR(
            C=grid_search.best_params_['regressor__estimator__C'],
            max_iter=5000,
            random_state=42,
        )
    )),
])

print("Training final model with best parameters...")
# Fit on X_sample_clean - the exact selected-feature frame the grid search used -
# rather than on X_sample, which still carries the unselected columns and only
# worked because the ColumnTransformer drops whatever it was not told to use.
best_model.fit(X_sample_clean, y_sample)
print("Training complete")

Training final model with best parameters...




Training complete




In [16]:
# Evaluate on training data
# These are in-sample metrics: the rows scored here are the rows the model was
# fitted on, so they say nothing about performance on unseen data.
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the selected-feature frame that the grid search was run on, instead
# of X_sample, which still contains columns the model never uses.
y_pred_train = best_model.predict(X_sample_clean)
train_mse = mean_squared_error(y_sample, y_pred_train)
train_r2 = r2_score(y_sample, y_pred_train)

print(f"Training MSE: {train_mse:.6f}")
print(f"Training R²:  {train_r2:.4f}")

Training MSE: 0.005442
Training R²:  0.0195
