In [1]:
# Notebook: Model Train, Eval, & Opt - MCPA 911
# Author: Thomas Purk
# Date: 2025-03-25
# Reference: 

# Model Training, Evaluation, & Optimization

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/emergency-911-calls-mcpa/911-processed.csv
/kaggle/input/emergency-911-calls-mcpa/911.csv
/kaggle/input/emergency-911-calls-mcpa/911-encoded.csv


In [3]:
# Validate package and version
!pip list | grep sklearn

sklearn-pandas                     2.2.0


In [5]:
# Notebook Step up steps

# SciKit Learn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error

# For saving models
import pickle

# Prevent Pandas and other warnings from displaying above outputcells, improve readability
import warnings
warnings.filterwarnings('ignore')

# Load visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load the data 
df_in_path = '/kaggle/input/emergency-911-calls-mcpa/911-processed.csv'
df_911 = pd.read_csv(df_in_path)
df_911.info() # validate

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649696 entries, 0 to 649695
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   twp           649696 non-null  object
 1   e             649696 non-null  int64 
 2   service_type  649696 non-null  object
 3   service_desc  649696 non-null  object
 4   month         649696 non-null  int64 
 5   day_of_week   649696 non-null  int64 
 6   day_night     649696 non-null  object
dtypes: int64(3), object(4)
memory usage: 34.7+ MB


In [6]:
# Prepare the data

#Group the data to count the number of events per category.
cols = ['twp', 'month', 'day_of_week','day_night', 'service_type', 'service_desc']
# Columns of interest - Which type of vehicles (service type) does a township need the most?
df_911_sub = df_911[cols]
# Grouping
group_911 = df_911_sub.groupby(cols)
# New DataFrame
df_911_grouped = group_911.size().reset_index(name='call_count')

# Encode data and labels
df_for_encoding = df_911_grouped[cols]

# One-hot encode categorical features
encoder = OneHotEncoder(drop='first', sparse=False)
encoded_features = encoder.fit_transform(df_for_encoding)


In [7]:
# Prepare training features and labels

# Convert encoded features to a DataFrame
X = pd.DataFrame(
    data=encoded_features, 
    columns=encoder.get_feature_names_out()
)
# Convert labels to numerical formate
y = df_911_grouped['call_count']

# Split the data training and testing
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2, 
    random_state=42
)


In [8]:
# Train the Model

# Train Random Forest Regressor
rfr_model = RandomForestRegressor(
    n_estimators=100, 
    random_state=42,
    verbose=1
)
rfr_model.fit(X_train, y_train)

print("Random Forest Model Trained Successfully!")

# Make predictions on test data
y_pred = rfr_model.predict(X_test)

# Evaluate model performance using MAE and RMSE
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  3.0min


Random Forest Model Trained Successfully!


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.9s


Mean Absolute Error (MAE): 1.52
Root Mean Squared Error (RMSE): 2.94


In [9]:
# Save the model
# Save the model
with open('rfr_model.pkl', 'wb') as f:
    pickle.dump(rfr_model, f)

# 
# Load the model - Start here unless model needs to be recreated
with open('rfr_model.pkl', 'rb') as f:
    rfr_model = pickle.load(f)

In [None]:
from scipy.optimize import linprog

# Example: Minimize distance-weighted allocation (simplified)
# Assume `predicted_calls` is the expected number of 911 calls per township

# Define supply constraints (50 police, 50 ambulances, 50 fire trucks)
total_vehicles = [50, 50, 50]

# Define demand per township (normalized predictions)
demand = predicted_calls / predicted_calls.sum() * sum(total_vehicles)

# Linear Programming to optimize allocation
res = linprog(
    c=-demand,  # We want to maximize allocations where demand is high
    A_eq=[[1] * 70],  # Ensure all vehicles are allocated
    b_eq=[sum(total_vehicles)],
    bounds=[(0, None) for _ in range(70)]  # Each township gets at least 0
)

allocated_vehicles = res.x  # Optimized vehicle allocation per township

In [None]:
import pandas as pd

# Predict call volumes for future data
future_predictions = model.predict(future_data)

# Create a DataFrame with township IDs and predicted call volumes
predicted_df = pd.DataFrame({
    'township_id': future_data['township_id'],  # Ensure this column is in future_data
    'predicted_calls': future_predictions
})

# Display sample predictions
print(predicted_df.head())

In [None]:
# Define available resources
total_vehicles = 150  # Total police, fire, and ambulance vehicles

# Normalize predicted calls to ensure proportional allocation
predicted_df['normalized_demand'] = (predicted_df['predicted_calls'] / predicted_df['predicted_calls'].sum()) * total_vehicles

# Display results
print(predicted_df.head())

In [None]:
# Example: Assume historical breakdown of call types per township
# (These should ideally be based on past data)
predicted_df['police_calls'] = predicted_df['normalized_demand'] * 0.4  # 40% police
predicted_df['fire_calls'] = predicted_df['normalized_demand'] * 0.3    # 30% fire
predicted_df['medical_calls'] = predicted_df['normalized_demand'] * 0.3 # 30% ambulance

# Round allocations to whole numbers
predicted_df[['police_calls', 'fire_calls', 'medical_calls']] = predicted_df[['police_calls', 'fire_calls', 'medical_calls']].round()

# Ensure total allocations do not exceed available vehicles
predicted_df['police_calls'] = predicted_df['police_calls'].clip(0, 50)
predicted_df['fire_calls'] = predicted_df['fire_calls'].clip(0, 50)
predicted_df['medical_calls'] = predicted_df['medical_calls'].clip(0, 50)

print(predicted_df.head())