In [17]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tabulate import tabulate
import numpy as np
# ---------------------------------------------------------------------------#
# ---------------------------------------------------------------------------#
df = pd.read_csv("preprocessed_crime_data.csv")
df['DATE OCC'] = pd.to_datetime(df['DATE OCC'], errors='coerce', dayfirst=True)
df = df.dropna(subset=['DATE OCC'])
df['Year'] = df['DATE OCC'].dt.year
target_col = 'Target'
df = df[df[target_col].isin([0, 1])].copy()
df[target_col] = df[target_col].astype(int)
# ---------------------------------------------------------------------------#
# ---------------------------------------------------------------------------#
feature_cols = [c for c in df.columns
                if c not in [target_col, 'Crime Count', 'DATE OCC']]

# Print the features used for training
print("Features used for training:")
for idx, feature in enumerate(feature_cols, 1):
    print(f"{idx}. {feature}")
print(f"Total features: {len(feature_cols)}")
print("\n")
# ---------------------------------------------------------------------------#
# ---------------------------------------------------------------------------#
train_df = df[df['Year'].isin([2021, 2022, 2023])]
test_df  = df[df['Year'] == 2024]
if train_df.empty or test_df.empty:
    raise ValueError("Dataset is missing rows for the requested years (2021-2024).")
X_train, y_train = train_df[feature_cols], train_df[target_col]
X_test,  y_test  = test_df[feature_cols],  test_df[target_col]
# ---------------------------------------------------------------------------#
# 4. Five logistic-regression variants
# ---------------------------------------------------------------------------#
logit_configs = {
    "L2  C=1":        dict(penalty="l2",  C=1.0,  solver="lbfgs"),
    "L2  C=10":       dict(penalty="l2",  C=10.0, solver="lbfgs"),
    "L2  C=0.1":      dict(penalty="l2",  C=0.1,  solver="lbfgs"),
    "L1  C=1":        dict(penalty="l1",  C=1.0,  solver="liblinear"),
    "ElasticNet 0.5": dict(penalty="elasticnet", C=1.0,
                           l1_ratio=0.5, solver="saga")
}
rows = []
for label, params in logit_configs.items():
    pipe = Pipeline([
        ("scale", StandardScaler()),
        ("clf",   LogisticRegression(max_iter=2000,
                                     class_weight="balanced",
                                     random_state=42,
                                     **params))
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    # Collect hyper-params for the table
    rows.append({
        "Model":      label,
        "Penalty":    params.get("penalty"),
        "C":          params.get("C"),
        "Solver":     params.get("solver"),
        "l1_ratio":   params.get("l1_ratio", "—"),
        "Accuracy":   round(accuracy_score (y_test, y_pred), 5),
        "Precision":  round(precision_score(y_test, y_pred, zero_division=0), 5),
        "Recall":     round(recall_score   (y_test, y_pred, zero_division=0), 5),
        "F1":         round(f1_score      (y_test, y_pred, zero_division=0), 5)
    })
# ---------------------------------------------------------------------------#
# ---------------------------------------------------------------------------#
print(tabulate(rows, headers="keys", tablefmt="grid", missingval="—"))
best = max(rows, key=lambda r: r["F1"])
print(f"\nBest configuration by F1-score →  {best['Model']}  "
      f"(F1 = {best['F1']:.5f})")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Year'] = df['DATE OCC'].dt.year


Features used for training:
1. Rpt Dist No
2. LAT
3. LON
4. TIME OCC
5. Day of Week
6. Is Violent
7. Vict Age
8. Premis Cd
9. Year
Total features: 9


+----------------+------------+------+-----------+------------+------------+-------------+----------+---------+
| Model          | Penalty    |    C | Solver    | l1_ratio   |   Accuracy |   Precision |   Recall |      F1 |
| L2  C=1        | l2         |  1   | lbfgs     | —          |    0.74963 |     0.21569 |   0.8186 | 0.34142 |
+----------------+------------+------+-----------+------------+------------+-------------+----------+---------+
| L2  C=10       | l2         | 10   | lbfgs     | —          |    0.74982 |     0.21582 |   0.8186 | 0.34158 |
+----------------+------------+------+-----------+------------+------------+-------------+----------+---------+
| L2  C=0.1      | l2         |  0.1 | lbfgs     | —          |    0.74982 |     0.21582 |   0.8186 | 0.34158 |
+----------------+------------+------+-----------+------------+--