In [10]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

# Load and preprocess data
file_path = 'preprocessed_crime_data.csv'
df = pd.read_csv(file_path)
df['DATE OCC'] = pd.to_datetime(df['DATE OCC'])
df['Year'] = df['DATE OCC'].dt.year
df['Month'] = df['DATE OCC'].dt.month

# Aggregate monthly crime count per zone
monthly_zone_crime = df.groupby(['Year', 'Month', 'Rpt Dist No'])['Crime Count'].sum().reset_index()

# Features and target
feature_cols = ['Year', 'Month', 'Rpt Dist No']
X = monthly_zone_crime[feature_cols]
y = monthly_zone_crime['Crime Count']

# Train-test split (2023-2024 for testing)
train_mask = X['Year'] < 2023
X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test = X[~train_mask], y[~train_mask]


# Define 10 parameter combinations
param_combinations = [
    {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 5, 'num_leaves': 31, 'min_child_samples': 20, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_alpha': 0.0, 'reg_beta': 0.0},
    {'n_estimators': 200, 'learning_rate': 0.05, 'max_depth': 7, 'num_leaves': 50, 'min_child_samples': 10, 'subsample': 0.9, 'colsample_bytree': 0.7, 'reg_alpha': 0.1, 'reg_beta': 0.1},
    {'n_estimators': 300, 'learning_rate': 0.01, 'max_depth': 3, 'num_leaves': 20, 'min_child_samples': 30, 'subsample': 0.6, 'colsample_bytree': 0.6, 'reg_alpha': 0.5, 'reg_beta': 0.5},
    {'n_estimators': 150, 'learning_rate': 0.2, 'max_depth': 10, 'num_leaves': 70, 'min_child_samples': 15, 'subsample': 0.7, 'colsample_bytree': 0.9, 'reg_alpha': 0.0, 'reg_beta': 0.2},
    {'n_estimators': 400, 'learning_rate': 0.03, 'max_depth': 6, 'num_leaves': 40, 'min_child_samples': 25, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_alpha': 0.3, 'reg_beta': 0.3},
    {'n_estimators': 250, 'learning_rate': 0.08, 'max_depth': 4, 'num_leaves': 25, 'min_child_samples': 20, 'subsample': 0.9, 'colsample_bytree': 0.7, 'reg_alpha': 0.2, 'reg_beta': 0.0},
    {'n_estimators': 500, 'learning_rate': 0.005, 'max_depth': 8, 'num_leaves': 60, 'min_child_samples': 10, 'subsample': 0.6, 'colsample_bytree': 0.6, 'reg_alpha': 0.4, 'reg_beta': 0.4},
    {'n_estimators': 100, 'learning_rate': 0.15, 'max_depth': 5, 'num_leaves': 35, 'min_child_samples': 30, 'subsample': 0.7, 'colsample_bytree': 0.8, 'reg_alpha': 0.1, 'reg_beta': 0.1},
    {'n_estimators': 350, 'learning_rate': 0.02, 'max_depth': 7, 'num_leaves': 45, 'min_child_samples': 15, 'subsample': 0.8, 'colsample_bytree': 0.9, 'reg_alpha': 0.0, 'reg_beta': 0.5},
    {'n_estimators': 200, 'learning_rate': 0.1, 'max_depth': 6, 'num_leaves': 30, 'min_child_samples': 20, 'subsample': 0.9, 'colsample_bytree': 0.7, 'reg_alpha': 0.3, 'reg_beta': 0.2}
]

# Function to evaluate a single parameter combination
def evaluate_model(params, X_train, y_train, X_test, y_test):
    model = LGBMRegressor(
        n_estimators=params['n_estimators'],
        learning_rate=params['learning_rate'],
        max_depth=params['max_depth'],
        num_leaves=params['num_leaves'],
        min_child_samples=params['min_child_samples'],
        subsample=params['subsample'],
        colsample_bytree=params['colsample_bytree'],
        reg_alpha=params['reg_alpha'],
        reg_beta=params['reg_beta'],
        random_state=42,
        verbose=-1
    )

    # Fit model
    model.fit(X_train, y_train)

    # Predict on test set
    y_pred = model.predict(X_test)

    # Calculate MSE
    mse = mean_squared_error(y_test, y_pred)

    # Identify most dangerous zone
    X_test_df = X_test.copy()
    X_test_df['Predicted Crime Count'] = y_pred
    zone_preds = X_test_df.groupby('Rpt Dist No')['Predicted Crime Count'].sum().reset_index()
    most_dangerous = zone_preds.sort_values(by='Predicted Crime Count', ascending=False).iloc[0]

    return mse, most_dangerous['Rpt Dist No'], most_dangerous['Predicted Crime Count']

# Evaluate all parameter combinations
results = []
for i, params in enumerate(param_combinations, 1):
    print(f"\n=== Evaluating Parameter Combination {i} ===")
    print(f"Parameters: {params}")
    mse, rpt_dist_no, pred_count = evaluate_model(params, X_train, y_train, X_test, y_test)
    print(f"MSE: {mse:.6f}")
    print(f"Most Dangerous Zone: Rpt Dist No {rpt_dist_no:.0f}, "
          f"Predicted Crime Count: {pred_count:.6f}")

    results.append({
        'Model': i,
        'n_estimators': params['n_estimators'],
        'learning_rate': params['learning_rate'],
        'max_depth': params['max_depth'],
        'num_leaves': params['num_leaves'],
        'min_child_samples': params['min_child_samples'],
        'subsample': params['subsample'],
        'colsample_bytree': params['colsample_bytree'],
        'reg_alpha': params['reg_alpha'],
        'reg_beta': params['reg_beta'],
        'Rpt Dist No': rpt_dist_no,
        'Predicted Crime Count': pred_count,
        'MSE': mse
    })

# Create and display results DataFrame
results_df = pd.DataFrame(results)
print("\n=== Model Evaluation Summary ===")
print(results_df.to_string(index=False))

# Identify best model
best_model = results_df.loc[results_df['MSE'].idxmin()]
print("\n=== Best Model ===")
print(f"Model {best_model['Model']}:")
print(f"Parameters: {best_model[['n_estimators', 'learning_rate', 'max_depth', 'num_leaves', 'min_child_samples', 'subsample', 'colsample_bytree', 'reg_alpha', 'reg_beta']].to_dict()}")
print(f"MSE: {best_model['MSE']:.6f}")
print(f"Most Dangerous Zone: Rpt Dist No {best_model['Rpt Dist No']:.0f}, "
      f"Predicted Crime Count: {best_model['Predicted Crime Count']:.6f}")
print("="*80)


=== Evaluating Parameter Combination 1 ===
Parameters: {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 5, 'num_leaves': 31, 'min_child_samples': 20, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_alpha': 0.0, 'reg_beta': 0.0}
MSE: 0.988204
Most Dangerous Zone: Rpt Dist No 991, Predicted Crime Count: 81.080523

=== Evaluating Parameter Combination 2 ===
Parameters: {'n_estimators': 200, 'learning_rate': 0.05, 'max_depth': 7, 'num_leaves': 50, 'min_child_samples': 10, 'subsample': 0.9, 'colsample_bytree': 0.7, 'reg_alpha': 0.1, 'reg_beta': 0.1}
MSE: 0.979440
Most Dangerous Zone: Rpt Dist No 989, Predicted Crime Count: 85.259763

=== Evaluating Parameter Combination 3 ===
Parameters: {'n_estimators': 300, 'learning_rate': 0.01, 'max_depth': 3, 'num_leaves': 20, 'min_child_samples': 30, 'subsample': 0.6, 'colsample_bytree': 0.6, 'reg_alpha': 0.5, 'reg_beta': 0.5}
MSE: 1.057540
Most Dangerous Zone: Rpt Dist No 989, Predicted Crime Count: 43.513258

=== Evaluating Parameter Combin