In [60]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

*Read CSV*

In [61]:
train = pd.read_csv("data/processed/train.csv")
valid = pd.read_csv("data/processed/validation.csv")
test = pd.read_csv("data/processed/test.csv")
data_stack = pd.read_csv("data/processed/data_stack.csv")

In [62]:
print(f"Train: {train.isna().sum()}")
#print(f"Valid: {valid.isna().sum()}")
#print(f"Test: {test.isna().sum()}")
#print(f"Data Stack: {data_stack.isna().sum()}")

Train: Unnamed: 0                    0
ZCTA                          0
incident_month_timestamp      0
crime_Non-Violent             0
crime_Violent                 0
violent_lag_1                 0
violent_lag_2                 0
violent_lag_3                 0
season                        0
month_sin                     0
month_cos                     0
pop_total                     0
median_household_income     243
gini_index                  216
unemployment_rate           108
poverty_rate                108
dtype: int64


*Convert incident_month_timestamp to integer month index*

In [63]:
for df in [train, valid, test, data_stack]:
    df["incident_month_timestamp"] = pd.to_datetime(df["incident_month_timestamp"])
    df["incident_month_numeric"] = df["incident_month_timestamp"].view("int64") // 10**9

  df["incident_month_numeric"] = df["incident_month_timestamp"].view("int64") // 10**9
  df["incident_month_numeric"] = df["incident_month_timestamp"].view("int64") // 10**9
  df["incident_month_numeric"] = df["incident_month_timestamp"].view("int64") // 10**9
  df["incident_month_numeric"] = df["incident_month_timestamp"].view("int64") // 10**9


*Encode Zipcode*

In [64]:
if train["ZCTA"].dtype == object:
    le = LabelEncoder()
    train["ZCTA"] = le.fit_transform(train["ZCTA"])
    valid["ZCTA"] = le.transform(valid["ZCTA"])
    test["ZCTA"] = le.transform(test["ZCTA"])
    data_stack["ZCTA"] = le.transform(data_stack["ZCTA"])

Convert Seasons

In [65]:
#Convert season to integers
season_map = {
    "winter": 0,
    "spring": 1,
    "summer": 2,
    "fall": 3,
}

for df in [train, valid, test, data_stack]:
    df["season"] = df["season"].str.lower().map(season_map)

Select Features:

In [66]:
feature_cols = [
    "ZCTA",
    "incident_month_numeric",
    "season",
    "month_sin",
    "month_cos",
    "pop_total",
    "violent_lag_1",
    "violent_lag_2",
    "violent_lag_3",
    "crime_Non-Violent"
]
    #"median_household_income",
    #"gini_index",
    #"poverty_rate",
    #"unemployment_rate"
#]

target_col = "crime_Violent"

X_train = train[feature_cols]
y_train = train[target_col]

X_valid = valid[feature_cols]
y_valid = valid[target_col]

X_test = test[feature_cols]
y_test = test[target_col]

Random Forest

In [67]:
rf = RandomForestRegressor(random_state=123)

param_grid = {
    "n_estimators": [200, 400, 600],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=3,
    n_jobs=-1,
    verbose=2
)

grid.fit(X_train, y_train)

best_rf = grid.best_estimator_
print("Best hyperparameters:", grid.best_params_)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 400}


Evaluate on validation and test

In [68]:
#Evaluation
def evaluate(model, X, y, label):
    preds = model.predict(X)
    mae = mean_absolute_error(y, preds)
    rmse = mean_squared_error(y, preds) ** 0.5
    r2 = r2_score(y, preds)

    print(f"{label} performance:")
    print(f"  Mean Absolute Error : {mae:.3f}")
    print(f"  Root Mean Squared Error: {rmse:.3f}")
    print(f"  R^2 : {r2:.3f}")
    print()
    return preds

#Train, Validation, Test performance
print("\nTrain results:")
train_preds = evaluate(best_rf, X_train, y_train, "Train")

print("\nValidation results:")
valid_preds = evaluate(best_rf, X_valid, y_valid, "Validation")

print("\nTest results:")
test_preds = evaluate(best_rf, X_test, y_test, "Test")


Train results:
Train performance:
  Mean Absolute Error : 4.242
  Root Mean Squared Error: 6.622
  R^2 : 0.980


Validation results:
Validation performance:
  Mean Absolute Error : 6.303
  Root Mean Squared Error: 9.925
  R^2 : 0.957


Test results:
Test performance:
  Mean Absolute Error : 6.419
  Root Mean Squared Error: 10.126
  R^2 : 0.961



Predict on Full Data Stack

In [69]:
#Predict on data_stack and save
X_stack = data_stack[feature_cols]
stack_preds = best_rf.predict(X_stack)

data_stack["predicted_violent_crime"] = stack_preds

#Save output
data_stack.to_csv("data_stack_with_predictions.csv", index=False)

test_output = test.copy()
test_output["predicted_violent_crime"] = test_preds

Summary

In [70]:
#print(test_output.head(20))

test_summary = test_output.groupby("ZCTA")[["crime_Violent", "predicted_violent_crime"]].mean()
print("\nMean actual vs predicted by ZCTA:")
print(test_summary.head(20))


Mean actual vs predicted by ZCTA:
       crime_Violent  predicted_violent_crime
ZCTA                                         
90001      40.000000                38.792936
90002      87.000000                83.928660
90003     261.000000               256.810688
90004      94.444444                94.186122
90005      76.000000                75.277940
90006     116.222222               110.764414
90007      93.555556                92.794396
90008      70.333333                68.202419
90010      22.222222                20.625574
90011     199.000000               202.085341
90012     135.222222               140.749099
90013     170.222222               166.259502
90014      60.222222                64.206505
90015     131.888889               139.343385
90016      91.777778                89.482517
90017     126.333333               132.562679
90018      79.777778                79.851405
90019     103.666667               101.725456
90020      63.888889                67.499380