In [78]:
#Imports
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import ast
import time

In [92]:
# Load and clean
# Read the feature table and the target table from the local Downloads folder.
X_train, y_train = (
    pd.read_csv(csv_path)
    for csv_path in ('~/Downloads/X_train.csv', '~/Downloads/y_train.csv')
)

# Join the two tables column-wise so a single dropna() removes a row from the
# features and the target together.
train = pd.concat([X_train, y_train], axis=1)
train_clean = train.dropna()

# The last column of the joined frame is taken as the target and every column
# before it as a feature.
# NOTE(review): assumes y_train.csv contributes exactly one column — confirm.
y_train_clean = train_clean.iloc[:, -1]
X_train_clean = train_clean.iloc[:, :-1]

# Each target cell is parsed with ast.literal_eval and the parsed rows are
# stacked into one 2-D array.
# NOTE(review): presumably each cell is the text of a coordinate pair — confirm.
y_coords = y_train_clean.apply(ast.literal_eval)
y_clean = np.vstack(y_coords.to_numpy())

In [93]:
# Sample data
# Draw a reproducible random subset of the cleaned rows for training.
# The request is capped at the number of available rows because
# np.random.choice(..., replace=False) raises ValueError when asked for more
# items than the population holds.
sample_size = min(100000, len(X_train_clean))
np.random.seed(42)  # fixed seed so the same rows are drawn on every run
sample_indices = np.random.choice(len(X_train_clean), size=sample_size, replace=False)

# The same positional indices are applied to the feature frame and the target
# array, so row i of X_sample corresponds to row i of y_sample.
X_sample = X_train_clean.iloc[sample_indices].reset_index(drop=True)
y_sample = y_clean[sample_indices]

print(f"Training samples: {len(X_sample):,}")

Training samples: 100,000


In [94]:
# Feature selection and encoding
# Report how many distinct values each non-numeric column of the sample holds,
# as input for deciding which of them are usable as features.
print("Checking categorical columns:")
non_numeric = X_sample.select_dtypes(exclude=['number'])
for col, n_unique in non_numeric.nunique().items():
    print(f"  {col}: {n_unique} unique values")

Checking categorical columns:
  Case Number: 99999 unique values
  Date: 84854 unique values
  Block: 23088 unique values
  IUCR: 297 unique values
  Primary Type: 31 unique values
  Description: 279 unique values
  Arrest: 2 unique values
  Domestic: 2 unique values
  FBI Code: 26 unique values
  Updated On: 1768 unique values


In [95]:
# Prepare features: remove identifier-like and location columns, then keep the
# numeric columns plus every categorical column with few enough distinct values.

# Highly unique columns.
high_cardinality_cols = ['ID', 'Case Number', 'Date', 'Block', 'Updated On']
# Location-related columns.
location_cols = ['Ward', 'Community Area', 'Beat', 'District']
# errors='ignore' lets the drop succeed even if a listed column is absent.
X_selected = X_sample.drop(columns=high_cardinality_cols + location_cols, errors='ignore')

# Partition what is left by dtype.
numeric_cols = list(X_selected.select_dtypes(include=['number']).columns)
categorical_cols = list(X_selected.select_dtypes(exclude=['number']).columns)

print(f"\nNumeric columns : {numeric_cols}")
print(f"Categorical columns : {categorical_cols}")

# A categorical column is kept only when it has at most 100 distinct values.
keep_categorical = []
for col in categorical_cols:
    n_unique = X_selected[col].nunique()
    if n_unique > 100:
        print(f"  Dropping {col}: {n_unique} unique values (too many)")
        continue
    keep_categorical.append(col)
    print(f"  Keeping {col}: {n_unique} unique values")

# Final feature set starts from the numeric columns; .copy() keeps the added
# columns below from touching X_selected.
X_processed = X_selected[numeric_cols].copy()

# Integer-encode each kept categorical column. Values are cast to str first,
# and the fitted encoder is stored per column so the mapping can be reused.
label_encoders = {}
for col in keep_categorical:
    as_text = X_selected[col].astype(str)
    le = LabelEncoder().fit(as_text)
    X_processed[col] = le.transform(as_text)
    label_encoders[col] = le

print(f"\nFinal feature set: {X_processed.shape[1]} features")
print(f"Columns: {X_processed.columns.tolist()}")


Numeric columns : ['Year']
Categorical columns : ['IUCR', 'Primary Type', 'Description', 'Arrest', 'Domestic', 'FBI Code']
  Dropping IUCR: 297 unique values (too many)
  Keeping Primary Type: 31 unique values
  Dropping Description: 279 unique values (too many)
  Keeping Arrest: 2 unique values
  Keeping Domestic: 2 unique values
  Keeping FBI Code: 26 unique values

Final feature set: 5 features
Columns: ['Year', 'Primary Type', 'Arrest', 'Domestic', 'FBI Code']


In [96]:
# GridSearchCV
# Tune the forest on a random subset of the training sample.
# The subset size is capped at the number of available rows because
# choice(..., replace=False) cannot draw more items than the population holds.
grid_sample_size = min(10000, len(X_processed))
# A dedicated seeded generator makes the subset identical on every execution of
# this cell. Drawing from the global NumPy RNG instead would make it depend on
# how many random draws earlier cell executions had already consumed.
grid_rng = np.random.default_rng(42)
grid_indices = grid_rng.choice(len(X_processed), size=grid_sample_size, replace=False)
# Same positional indices for features and targets keeps the rows aligned.
X_grid = X_processed.iloc[grid_indices]
y_grid = y_sample[grid_indices]

# 2 x 2 x 2 = 8 candidate settings.
param_grid = {
    'n_estimators': [30, 50],
    'max_depth': [10, 15],
    'min_samples_split': [20, 50]
}

# Settings shared by every candidate; only the param_grid entries vary.
rf_grid = RandomForestRegressor(
    max_features='sqrt',  # Changed from fixed 5
    min_samples_leaf=20,
    n_jobs=4,
    random_state=42
)

print(f"Grid search sample: {grid_sample_size:,}")
print(f"Features: {X_processed.shape[1]}")

# 3-fold CV over the 8 candidates (24 fits). The score is negated MSE, so a
# value closer to zero is better.
# NOTE(review): n_jobs=1 here presumably avoids nesting parallelism with the
# forest's own n_jobs=4 — confirm.
grid_search = GridSearchCV(
    rf_grid,
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=1,
    verbose=2
)

start = time.time()
grid_search.fit(X_grid, y_grid)
elapsed = time.time() - start

print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best CV score (neg MSE): {grid_search.best_score_:.4f}")
# Report the measured duration, which was previously computed but never shown.
print(f"Grid search completed in {elapsed:.1f} seconds")

Grid search sample: 10,000
Features: 5
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] END max_depth=10, min_samples_split=20, n_estimators=30; total time=   0.1s
[CV] END max_depth=10, min_samples_split=20, n_estimators=30; total time=   0.0s
[CV] END max_depth=10, min_samples_split=20, n_estimators=30; total time=   0.1s
[CV] END max_depth=10, min_samples_split=20, n_estimators=50; total time=   0.1s
[CV] END max_depth=10, min_samples_split=20, n_estimators=50; total time=   0.1s
[CV] END max_depth=10, min_samples_split=20, n_estimators=50; total time=   0.1s
[CV] END max_depth=10, min_samples_split=50, n_estimators=30; total time=   0.0s
[CV] END max_depth=10, min_samples_split=50, n_estimators=30; total time=   0.1s
[CV] END max_depth=10, min_samples_split=50, n_estimators=30; total time=   0.1s
[CV] END max_depth=10, min_samples_split=50, n_estimators=50; total time=   0.1s
[CV] END max_depth=10, min_samples_split=50, n_estimators=50; total time=   0.1s
[CV] END m

In [97]:
# Train final model
# Pull the three tuned hyper-parameters out of the grid-search result.
tuned = {
    name: grid_search.best_params_[name]
    for name in ('n_estimators', 'max_depth', 'min_samples_split')
}

# The remaining settings are fixed by hand; verbose=1 makes the forest log its
# parallel progress.
best_rf = RandomForestRegressor(
    **tuned,
    max_features='sqrt',
    min_samples_leaf=20,
    n_jobs=4,
    random_state=42,
    verbose=1,
)

# Fit on the full processed training sample and time the fit.
start = time.time()
best_rf.fit(X_processed, y_sample)
elapsed = time.time() - start

elapsed_minutes = elapsed / 60
print(f"\n✓ Training completed in {elapsed_minutes:.2f} minutes")

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.



✓ Training completed in 0.01 minutes


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.6s finished


In [98]:
# Evaluate on training data
# Predict for X_processed and score those predictions against y_sample with
# both MSE and R².
y_pred_train = best_rf.predict(X_processed)
train_mse, train_r2 = (
    metric(y_sample, y_pred_train)
    for metric in (mean_squared_error, r2_score)
)

print(f"Training MSE: {train_mse:.6f}")
print(f"Training R²:  {train_r2:.4f}")

Training MSE: 0.005366
Training R²:  0.0338


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished
