In [6]:
# Hybrid KNN + Random Forest Voting Classifier with 5-decimal Accuracy

import pandas as pd
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load data
df = pd.read_csv("preprocessed_crime_data.csv")
df['DATE OCC'] = pd.to_datetime(df['DATE OCC'])
df['Year'] = df['DATE OCC'].dt.year

# Select features and target
feature_cols = ['Rpt Dist No', 'LAT', 'LON', 'TIME OCC', 'Day of Week',
                'Vict Age', 'Premis Cd']
X_train = df[df['Year'].isin([2020, 2021, 2022])][feature_cols]
y_train = df[df['Year'].isin([2020, 2021, 2022])]['Target']
X_test = df[df['Year'].isin([2023, 2024])][feature_cols]
y_test = df[df['Year'].isin([2023, 2024])]['Target']

# Define models
knn = Pipeline([('scaler', StandardScaler()),
                ('knn', KNeighborsClassifier(n_neighbors=7))])
rf = RandomForestClassifier(n_estimators=200, random_state=42)

# Create Voting Classifier
voting_clf = VotingClassifier(estimators=[('knn', knn), ('rf', rf)], voting='hard')

# Train and evaluate
voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Print results
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy:.5f}")


              precision    recall  f1-score   support

           0       0.89      0.97      0.93     21995
           1       0.72      0.35      0.47      4097

    accuracy                           0.88     26092
   macro avg       0.81      0.66      0.70     26092
weighted avg       0.86      0.88      0.86     26092

Accuracy: 0.87686
