In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from tabulate import tabulate

# Load and prepare data.
# Occurrence dates that fail to parse become NaT (errors='coerce') and those
# rows are discarded; the optional 'Is Violent' column is removed when present;
# calendar Year/Month are then derived from the cleaned dates.
df = (
    pd.read_csv('preprocessed_crime_data.csv')
    .assign(**{'DATE OCC': lambda d: pd.to_datetime(d['DATE OCC'], errors='coerce')})
    .dropna(subset=['DATE OCC'])
    .drop(columns=['Is Violent'], errors='ignore')
    .assign(
        Year=lambda d: d['DATE OCC'].dt.year,
        Month=lambda d: d['DATE OCC'].dt.month,
    )
)

# The grouping keys double as the model's feature columns.
feature_cols = ['Year', 'Month', 'Rpt Dist No']

# Total 'Crime Count' per (Year, Month, reporting district).
monthly_zone_crime = (
    df.groupby(feature_cols)['Crime Count']
    .sum()
    .reset_index()
)

# Features and target.
X = monthly_zone_crime[feature_cols]
y = monthly_zone_crime['Crime Count']

# Chronological split: years before 2024 train the models, the rest is held out.
train_mask = X['Year'] < 2024
test_mask = ~train_mask
X_train, X_test = X.loc[train_mask], X.loc[test_mask]
y_train, y_test = y.loc[train_mask], y.loc[test_mask]

# Deliberately degraded hybrid configurations, keyed by display label.
# Every entry provides:
#   "lin" - the linear estimator,
#   "rf"  - a small, shallow random forest,
#   "wts" - the pair of blending weights, listed linear first, forest second.
hybrid_configs = {
    "Hybrid-A  (weak 0.7|0.3)": {
        "lin": LinearRegression(),
        "rf": RandomForestRegressor(
            n_estimators=50, max_depth=4, min_samples_leaf=10, random_state=42
        ),
        "wts": [0.7, 0.3],
    },
    "Hybrid-B  (weak Ridge 0.8|0.2)": {
        "lin": Ridge(alpha=10.0),
        "rf": RandomForestRegressor(
            n_estimators=40, max_depth=3, min_samples_leaf=8, random_state=42
        ),
        "wts": [0.8, 0.2],
    },
    "Hybrid-C  (weak Lasso 0.9|0.1)": {
        "lin": Lasso(alpha=5.0, max_iter=3000),
        "rf": RandomForestRegressor(
            n_estimators=30, max_depth=2, min_samples_leaf=12, random_state=42
        ),
        "wts": [0.9, 0.1],
    },
    "Hybrid-D  (Linear-only)": {
        "lin": LinearRegression(),
        "rf": RandomForestRegressor(
            n_estimators=10, max_depth=2, min_samples_leaf=15, random_state=42
        ),
        # Zero weight on the forest: only the linear part contributes.
        "wts": [1.0, 0.0],
    },
    "Hybrid-E  (noisy RF 0.6|0.4)": {
        "lin": Ridge(alpha=15.0),
        "rf": RandomForestRegressor(
            n_estimators=20, max_depth=5, min_samples_leaf=6, random_state=99
        ),
        "wts": [0.6, 0.4],
    },
}

# Evaluate models.
# For every configuration: put each estimator behind its own StandardScaler,
# blend the two pipelines with a weighted VotingRegressor, fit on the training
# split, and score the predictions on the held-out split.
rows = []
raw_rmse = []  # unrounded RMSE per row, kept only for ranking
for label, cfg in hybrid_configs.items():
    lin_model, rf_model, weights = cfg["lin"], cfg["rf"], cfg["wts"]

    lin_pipe = Pipeline([("scale", StandardScaler()), ("lin", lin_model)])
    rf_pipe  = Pipeline([("scale", StandardScaler()), ("rf",  rf_model)])

    hybrid = VotingRegressor(
        estimators=[("lin", lin_pipe), ("rf", rf_pipe)],
        weights=weights,
    )

    hybrid.fit(X_train, y_train)
    y_pred = hybrid.predict(X_test)

    mse  = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2   = r2_score(y_test, y_pred)
    raw_rmse.append(rmse)

    rows.append({
        "Model":        label,
        "LinType":      type(lin_model).__name__,
        "RF_n":         rf_model.n_estimators,
        # Explicit None check: only an unlimited depth is shown as "None".
        "RF_depth":     "None" if rf_model.max_depth is None else rf_model.max_depth,
        "RF_minLeaf":   rf_model.min_samples_leaf,
        "Weights L|F":  f"{weights[0]}|{weights[1]}",
        "MSE":          round(mse, 4),
        "RMSE":         round(rmse, 4),
        "R²":           round(r2, 4),
    })

# Display results
print(tabulate(rows, headers="keys", tablefmt="github"))

# Show best (lowest RMSE).
# Rank on the unrounded RMSE so that models differing only beyond the fourth
# decimal are not treated as tied; the first row wins on an exact tie.
best, _ = min(zip(rows, raw_rmse), key=lambda pair: pair[1])
print(f"\nLowest-error hybrid 👉  {best['Model']}  (RMSE = {best['RMSE']})")


| Model                          | LinType          |   RF_n |   RF_depth |   RF_minLeaf | Weights L|F   |    MSE |   RMSE |      R² |
|--------------------------------|------------------|--------|------------|--------------|---------------|--------|--------|---------|
| Hybrid-A  (weak 0.7|0.3)       | LinearRegression |     50 |          4 |           10 | 0.7|0.3       | 1.1523 | 1.0735 | -0.5046 |
| Hybrid-B  (weak Ridge 0.8|0.2) | Ridge            |     40 |          3 |            8 | 0.8|0.2       | 1.1779 | 1.0853 | -0.5379 |
| Hybrid-C  (weak Lasso 0.9|0.1) | Lasso            |     30 |          2 |           12 | 0.9|0.1       | 1.0102 | 1.0051 | -0.319  |
| Hybrid-D  (Linear-only)        | LinearRegression |     10 |          2 |           15 | 1.0|0.0       | 1.2127 | 1.1012 | -0.5833 |
| Hybrid-E  (noisy RF 0.6|0.4)   | Ridge            |     20 |          5 |            6 | 0.6|0.4       | 1.1132 | 1.0551 | -0.4535 |

Lowest-error hybrid 👉  Hybrid-C  (weak Lasso 0.9|0.1)  (RMSE = 1.0051)