In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np

**Load Dataset**

In [None]:
file_path = 'preprocessed_crime_data.csv'
df = pd.read_csv(file_path)
df['DATE OCC'] = pd.to_datetime(df['DATE OCC'])

**Feature engineering**

In [None]:
df['hour'] = df['TIME OCC'].astype(int) // 100
df['dayofweek'] = df['DATE OCC'].dt.dayofweek
df['month'] = df['DATE OCC'].dt.month
df['year'] = df['DATE OCC'].dt.year
df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)

**Drop unused or leaky columns**

In [None]:
df = df.drop(columns=['Crime Count', 'Is Violent', 'DATE OCC'])

** Train-test split by year**

In [None]:
train_df = df[df['year'].isin([2020, 2021, 2022])]
test_df = df[df['year'].isin([2023, 2024])]

X_train = train_df.drop(columns=['Target', 'year'])
y_train = train_df['Target']
X_test = test_df.drop(columns=['Target', 'year'])
y_test = test_df['Target']

In [None]:
param_grid = [
    {'iterations': 300, 'learning_rate': 0.1,  'depth': 4, 'l2_leaf_reg': 3},
    {'iterations': 500, 'learning_rate': 0.05, 'depth': 6, 'l2_leaf_reg': 5},
    {'iterations': 400, 'learning_rate': 0.15, 'depth': 3, 'l2_leaf_reg': 2},
    {'iterations': 600, 'learning_rate': 0.08, 'depth': 5, 'l2_leaf_reg': 4},
    {'iterations': 350, 'learning_rate': 0.2,  'depth': 4, 'l2_leaf_reg': 6},
    {'iterations': 450, 'learning_rate': 0.07, 'depth': 7, 'l2_leaf_reg': 1},
    {'iterations': 500, 'learning_rate': 0.03, 'depth': 6, 'l2_leaf_reg': 3},
    {'iterations': 550, 'learning_rate': 0.09, 'depth': 5, 'l2_leaf_reg': 4},
    {'iterations': 300, 'learning_rate': 0.25, 'depth': 3, 'l2_leaf_reg': 2},
    {'iterations': 400, 'learning_rate': 0.1,  'depth': 8, 'l2_leaf_reg': 5},
]

In [8]:
for i, params in enumerate(param_grid, 1):
    model = CatBoostClassifier(
        iterations=params['iterations'],
        learning_rate=params['learning_rate'],
        depth=params['depth'],
        eval_metric='Accuracy',
        verbose=0,
        random_seed=42
    )
    model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=20)

    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    print(f"\n=== Model {i} with params: {params} ===")
    print(f"Accuracy: {acc:.6f}")
    print(classification_report(y_test, y_pred, digits=6))



=== Model 1 with params: {'iterations': 300, 'learning_rate': 0.1, 'depth': 4, 'l2_leaf_reg': 3} ===
Accuracy: 0.883911
              precision    recall  f1-score   support

           0   0.910946  0.955717  0.932795     21995
           1   0.677056  0.498413  0.574160      4097

    accuracy                       0.883911     26092
   macro avg   0.794001  0.727065  0.753477     26092
weighted avg   0.874221  0.883911  0.876482     26092


=== Model 2 with params: {'iterations': 500, 'learning_rate': 0.05, 'depth': 6, 'l2_leaf_reg': 5} ===
Accuracy: 0.890694
              precision    recall  f1-score   support

           0   0.918024  0.955672  0.936470     21995
           1   0.694836  0.541860  0.608886      4097

    accuracy                       0.890694     26092
   macro avg   0.806430  0.748766  0.772678     26092
weighted avg   0.882979  0.890694  0.885032     26092


=== Model 3 with params: {'iterations': 400, 'learning_rate': 0.15, 'depth': 3, 'l2_leaf_reg': 2} ===
