In [37]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np

**Load** **dataset**

In [38]:
file_path = 'preprocessed_crime_data.csv'
df = pd.read_csv(file_path)
df['DATE OCC'] = pd.to_datetime(df['DATE OCC'])

**Supervised-only feature**

In [39]:
df['hour'] = df['TIME OCC'].astype(int) // 100
df['dayofweek'] = df['DATE OCC'].dt.dayofweek
df['month'] = df['DATE OCC'].dt.month
df['year'] = df['DATE OCC'].dt.year
df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)

**Drop potential leakage and unused columns**

In [40]:
df = df.drop(columns=['Crime Count', 'Is Violent', 'DATE OCC'])

**Train-test split**

In [41]:
train_df = df[df['year'].isin([2020, 2021, 2022])]
test_df = df[df['year'].isin([2023, 2024])]

In [42]:
X_train = train_df.drop(columns=['Target', 'year'])
y_train = train_df['Target']
X_test = test_df.drop(columns=['Target', 'year'])
y_test = test_df['Target']

**Validation split from training**

In [43]:
X_train_final, X_val, y_train_final, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

**DMatrix for XGBoost**

In [44]:
dtrain = xgb.DMatrix(X_train_final, label=y_train_final)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

**Parameter configurations**

In [45]:
param_sets = [
    {'objective': 'binary:logistic', 'eval_metric': 'logloss',
     'max_depth': 3, 'eta': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.8, 'seed': 42},

    {'objective': 'binary:logistic', 'eval_metric': 'logloss',
     'max_depth': 4, 'eta': 0.05, 'subsample': 0.9, 'colsample_bytree': 0.9, 'seed': 42},

    {'objective': 'binary:logistic', 'eval_metric': 'logloss',
     'max_depth': 5, 'eta': 0.3, 'subsample': 0.7, 'colsample_bytree': 0.7, 'seed': 42},

    {'objective': 'binary:logistic', 'eval_metric': 'logloss',
     'max_depth': 6, 'eta': 0.2, 'subsample': 1.0, 'colsample_bytree': 1.0, 'seed': 42},

    {'objective': 'binary:logistic', 'eval_metric': 'logloss',
     'max_depth': 3, 'eta': 0.15, 'subsample': 0.85, 'colsample_bytree': 0.85, 'seed': 42},

    {'objective': 'binary:logistic', 'eval_metric': 'logloss',
     'max_depth': 4, 'eta': 0.25, 'subsample': 0.95, 'colsample_bytree': 0.75, 'seed': 42},

    {'objective': 'binary:logistic', 'eval_metric': 'logloss',
     'max_depth': 7, 'eta': 0.05, 'subsample': 0.6, 'colsample_bytree': 0.6, 'seed': 42},

    {'objective': 'binary:logistic', 'eval_metric': 'logloss',
     'max_depth': 8, 'eta': 0.01, 'subsample': 0.8, 'colsample_bytree': 0.8, 'seed': 42},

    {'objective': 'binary:logistic', 'eval_metric': 'logloss',
     'max_depth': 6, 'eta': 0.2, 'subsample': 0.9, 'colsample_bytree': 0.8,
     'gamma': 0.1, 'seed': 42},

    {'objective': 'binary:logistic', 'eval_metric': 'logloss',
     'max_depth': 5, 'eta': 0.3, 'subsample': 1.0, 'colsample_bytree': 1.0,
     'gamma': 0.2, 'lambda': 1.5, 'seed': 42}
]

**Training with early stopping**


In [46]:
for idx, params in enumerate(param_sets, 1):
    print(f"\n=== Model {idx} with params: {params} ===")
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=2500,
        evals=[(dval, 'validation')],
        early_stopping_rounds=20,
        verbose_eval=False
    )
    y_pred_prob = model.predict(dtest)
    y_pred = (y_pred_prob > 0.5).astype(int)
    acc = accuracy_score(y_test, y_pred)
    print(f"Stopped at round: {model.best_iteration}")
    print(f"Accuracy: {acc:.6f}")
    print(classification_report(y_test, y_pred, digits=6))


=== Model 1 with params: {'objective': 'binary:logistic', 'eval_metric': 'logloss', 'max_depth': 3, 'eta': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.8, 'seed': 42} ===
Stopped at round: 1629
Accuracy: 0.892342
              precision    recall  f1-score   support

           0   0.931495  0.941532  0.936487     21995
           1   0.666839  0.628265  0.646978      4097

    accuracy                       0.892342     26092
   macro avg   0.799167  0.784898  0.791732     26092
weighted avg   0.889939  0.892342  0.891028     26092


=== Model 2 with params: {'objective': 'binary:logistic', 'eval_metric': 'logloss', 'max_depth': 4, 'eta': 0.05, 'subsample': 0.9, 'colsample_bytree': 0.9, 'seed': 42} ===
Stopped at round: 2268
Accuracy: 0.895639
              precision    recall  f1-score   support

           0   0.934759  0.941941  0.938336     21995
           1   0.674898  0.647059  0.660685      4097

    accuracy                       0.895639     26092
   macro avg   0.804829  0