In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# =============================
# 1. Load the dataset
# =============================
DATA_PATH = "cleaned_police_overtime_data.csv"
df = pd.read_csv(DATA_PATH)

# =============================
# 2. Create a Classification Target
# Binary label: is a row's OVERTIME above the dataset-wide median?
#   1 = High overtime earner
#   0 = Low overtime earner
# =============================
# NOTE(review): the median is taken over every row, i.e. before any
# train/test split happens -- confirm that is the intended threshold.
median_ot = df["OVERTIME"].median()

# Strictly greater than the median counts as "high". Rows equal to the
# median, and rows whose OVERTIME is missing (NaN comparisons are False),
# end up in class 0.
df["HIGH_OT"] = df["OVERTIME"].gt(median_ot).astype(int)

# =============================
# 3. Declare feature columns by type
# Exclude obviously non-predictive fields like NAME
# =============================
categorical_cols = ["DEPARTMENT_NAME", "TITLE"]
# NOTE(review): POSTAL is listed with the numeric columns; if it holds ZIP
# codes it is a nominal label rather than a quantity -- confirm this is
# intended.
numeric_cols = ["REGULAR", "RETRO", "OTHER", "INJURED", "DETAIL", "POSTAL"]

# =============================
# 4. Select features and target
# =============================
# The feature list is simply the two typed lists joined together, so a
# column added to either list is automatically part of X.
feature_cols = categorical_cols + numeric_cols

X = df[feature_cols]
y = df["HIGH_OT"]

# =============================
# 5. Preprocessing steps
# One-hot encode categorical features
# =============================
# Each entry is (step name, transformer, columns it applies to):
#   "cat" -> one-hot encode the categorical columns; a category that was not
#            seen during fit is ignored at transform time instead of raising.
#   "num" -> numeric columns are forwarded unchanged.
preprocess = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ("num", "passthrough", numeric_cols),
])

# =============================
# 6. Build final ML pipeline
# =============================
# Chaining the preprocessing with the classifier means fit() and predict()
# always apply the same column handling. The fixed random_state makes the
# forest reproducible between runs.
model = Pipeline([
    ("preprocess", preprocess),
    ("clf", RandomForestClassifier(n_estimators=200, random_state=42)),
])

# =============================
# 7. Train/Test Split
# =============================
# Hold out 25% of the rows for evaluation. Stratifying on y keeps the
# high/low class ratio the same in both partitions, and the seed fixes the
# shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# =============================
# 8. Train the model
# =============================
# Fits the preprocessing step and the classifier on the training rows only.
model.fit(X_train, y_train)

# =============================
# 9. Evaluate
# =============================
# Hard class predictions (0 / 1) for the held-out rows.
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

# The multi-line reports are printed with sep="" so that print() does not
# insert a space right after the "\n" in the heading; that stray space would
# shift the first line of each report one column to the right of the rest.
print("\nClassification Report:\n", classification_report(y_test, y_pred), sep="")

# Rows are the true classes and columns are the predicted classes, both in
# sorted label order (0 = low, 1 = high).
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred), sep="")


Accuracy: 0.9541963015647226

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.95      0.95      1758
           1       0.95      0.96      0.95      1757

    accuracy                           0.95      3515
   macro avg       0.95      0.95      0.95      3515
weighted avg       0.95      0.95      0.95      3515


Confusion Matrix:
 [[1664   94]
 [  67 1690]]
