In [1]:
# Notebook: 911 Demo - Feature Engineering - twp - 1
# Author: Thomas Purk
# Date: 2025-03-17
# Reference: https://www.kaggle.com/datasets/mchirico/montcoalert

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/montcoalert/911.csv


In [3]:
# Notebook setup steps

import warnings
# Suppress every warning for the rest of the session
warnings.filterwarnings('ignore')

# Load the 911 call data from the Kaggle input directory
raw_csv_path = '/kaggle/input/montcoalert/911.csv'
df_911 = pd.read_csv(raw_csv_path)

def report_null_empty(df, feature):
    ''' Prints the count and percent of total of null, empty, False and NaN feature (column) values.

    Parameters:
        df (dataframe): A Pandas dataframe which contains the feature.
        feature (string): The name of the feature in the dataframe to report on.

    Returns:
        None. The report is written to standard output.
    '''
    row_count = len(df)

    def percent_of_rows(count):
        # Guard the division so an empty dataframe reports 0.0% instead of nan
        if row_count == 0:
            return 0.0
        return round(count / row_count * 100, 6)

    null_count = df[feature].isnull().sum()
    empty_count = (df[feature] == "").sum()
    # NOTE: '== False' also matches numeric zeros, because 0 == False
    false_count = (df[feature] == False).sum()
    # isna() is an alias of isnull(), so this equals null_count; kept so the report layout is unchanged
    nan_count = (df[feature].isna()).sum()
    print('')
    print(f'{feature}: Null / Empty Report')
    print(f'\tRow count: {row_count}')
    print(f'\tNull count: {null_count}')
    print(f'\tNull percent: {percent_of_rows(null_count)}%')
    print(f'\tEmpty count: {empty_count}')
    print(f'\tEmpty percent: {percent_of_rows(empty_count)}%')
    print(f'\tFalse count: {false_count}')
    print(f'\tFalse percent: {percent_of_rows(false_count)}%')
    print(f'\tNAN count: {nan_count}')
    print(f'\tNAN percent: {percent_of_rows(nan_count)}%')

def get_one_offs(df, feature):
    ''' Returns the rows whose value in the given feature (column) occurs exactly once.

    Parameters:
        df (dataframe): A Pandas dataframe which contains the feature.
        feature (string): The name of the feature in the dataframe to inspect.

    Returns:
        dataframe: The rows of df holding a value that appears only once in the feature.
    '''
    column = df[feature]

    # Tally how often each distinct value occurs in the column
    tallies = column.value_counts()

    # Keep only the values whose tally is exactly one
    singletons = tallies.loc[tallies.eq(1)].index

    # Select the rows whose value is one of those singletons
    return df.loc[column.isin(singletons)]

## twp - Data Exploration

In [11]:
# Display basic information
# DataFrame.info() prints its summary itself and returns None, so it is called directly;
# wrapping it in display() only adds a stray 'None' to the cell output.
df_911.info()
display(df_911.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 663522 entries, 0 to 663521
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   lat        663522 non-null  float64
 1   lng        663522 non-null  float64
 2   desc       663522 non-null  object 
 3   zip        583323 non-null  float64
 4   title      663522 non-null  object 
 5   timeStamp  663522 non-null  object 
 6   twp        663229 non-null  object 
 7   addr       663522 non-null  object 
 8   e          663522 non-null  int64  
dtypes: float64(3), int64(1), object(5)
memory usage: 45.6+ MB


None

Unnamed: 0,lat,lng,desc,zip,title,timeStamp,twp,addr,e
0,40.297876,-75.581294,REINDEER CT & DEAD END; NEW HANOVER; Station ...,19525.0,EMS: BACK PAINS/INJURY,2015-12-10 17:10:52,NEW HANOVER,REINDEER CT & DEAD END,1
1,40.258061,-75.26468,BRIAR PATH & WHITEMARSH LN; HATFIELD TOWNSHIP...,19446.0,EMS: DIABETIC EMERGENCY,2015-12-10 17:29:21,HATFIELD TOWNSHIP,BRIAR PATH & WHITEMARSH LN,1
2,40.121182,-75.351975,HAWS AVE; NORRISTOWN; 2015-12-10 @ 14:39:21-St...,19401.0,Fire: GAS-ODOR/LEAK,2015-12-10 14:39:21,NORRISTOWN,HAWS AVE,1
3,40.116153,-75.343513,AIRY ST & SWEDE ST; NORRISTOWN; Station 308A;...,19401.0,EMS: CARDIAC EMERGENCY,2015-12-10 16:47:36,NORRISTOWN,AIRY ST & SWEDE ST,1
4,40.251492,-75.60335,CHERRYWOOD CT & DEAD END; LOWER POTTSGROVE; S...,,EMS: DIZZINESS,2015-12-10 16:56:52,LOWER POTTSGROVE,CHERRYWOOD CT & DEAD END,1


In [12]:
# Inspect the twp feature: summary statistics first, then the null / empty breakdown
print("### twp ###")
twp_summary = df_911['twp'].describe()
display(twp_summary)
report_null_empty(df=df_911, feature='twp')

### twp ###


count           663229
unique              68
top       LOWER MERION
freq             55490
Name: twp, dtype: object


twp: Null / Empty Report
	Row count: 663522
	Null count: 293
	Null percent: 0.044158%
	Empty count: 0
	Empty precent: 0.0%
	False count: 0
	False precent: 0.0%
	NAN count: 293
	NAN precent: 0.044158%


In [14]:
# NOTE: There are 293 nulls / 0.044158% of total
# NOTE: Possibly impute null 'twp' from similar lat/lng if lat/lng is good (engineer lat/lng first)
# What do the rows with a missing 'twp' look like?

twp_is_missing = df_911['twp'].isna()
df_911.loc[twp_is_missing]

Unnamed: 0,lat,lng,desc,zip,title,timeStamp,twp,addr,e
1635,40.162804,-75.097848,TURNPIKE OVERPASS; ; 2015-12-14 @ 21:36:52-Sta...,19040.0,Fire: VEHICLE ACCIDENT,2015-12-14 21:36:52,,TURNPIKE OVERPASS,1
1821,40.099265,-75.175706,CHURCH RD; ; Station 322; 2015-12-15 @ 11:31:36;,,EMS: UNKNOWN MEDICAL EMERGENCY,2015-12-15 11:31:36,,CHURCH RD,1
5455,40.222272,-75.138302,GIANT; ; 2015-12-24 @ 17:30:07-Station:STA98;,18976.0,Fire: VEHICLE ACCIDENT,2015-12-24 17:30:07,,GIANT,1
7281,40.113517,-75.332257,HIGH ST; ; Station 329; 2015-12-30 @ 03:32:49;,19401.0,EMS: VEHICLE ACCIDENT,2015-12-30 03:32:49,,HIGH ST,1
7282,40.113517,-75.332257,HIGH ST; ; 2015-12-30 @ 03:32:28-Station:STA58;,19401.0,Fire: VEHICLE ACCIDENT,2015-12-30 03:32:28,,HIGH ST,1
...,...,...,...,...,...,...,...,...,...
659226,40.229008,-75.387852,NO LOCATION - NEIGHBORING COUNTY; ; Station 3...,,EMS: UNKNOWN MEDICAL EMERGENCY,2020-07-17 12:33:06,,NO LOCATION - NEIGHBORING COUNTY,1
660100,40.157506,-75.072525,RAILS TO TRAILS CONNECTOR TRL; ; 2020-07-19 @ ...,19006.0,Fire: RESCUE - WATER,2020-07-19 21:12:52,,RAILS TO TRAILS CONNECTOR TRL,1
660102,40.157506,-75.072525,RAILS TO TRAILS CONNECTOR TRL; ; Station 322A...,19006.0,EMS: RESCUE - WATER,2020-07-19 21:13:08,,RAILS TO TRAILS CONNECTOR TRL,1
660154,40.229008,-75.387852,NO LOCATION - NEIGHBORING COUNTY; ; Station 3...,,EMS: RESPIRATORY EMERGENCY,2020-07-20 03:34:03,,NO LOCATION - NEIGHBORING COUNTY,1


In [13]:
# NOTE: There are 68 unique 'twp' values
# List every value in alphabetical order together with its call count
twp_counts_by_name = df_911['twp'].value_counts().sort_index()
print(twp_counts_by_name.to_string())

twp
ABINGTON             39947
AMBLER                4454
BERKS COUNTY          1930
BRIDGEPORT            3695
BRYN ATHYN            1254
BUCKS COUNTY          1982
CHELTENHAM           30574
CHESTER COUNTY        7362
COLLEGEVILLE          2916
CONSHOHOCKEN          5655
DELAWARE COUNTY       1802
DOUGLASS              5550
EAST GREENVILLE       1316
EAST NORRITON        13963
FRANCONIA             9297
GREEN LANE             385
HATBORO               5448
HATFIELD BORO         1370
HATFIELD TOWNSHIP    11641
HORSHAM              18380
JENKINTOWN            4150
LANSDALE             11963
LEHIGH COUNTY          190
LIMERICK             14338
LOWER FREDERICK       2081
LOWER GWYNEDD        11139
LOWER MERION         55490
LOWER MORELAND       10988
LOWER POTTSGROVE     10775
LOWER PROVIDENCE     22476
LOWER SALFORD         9218
MARLBOROUGH           2144
MONTGOMERY           17315
NARBERTH              1751
NEW HANOVER           5207
NORRISTOWN           37633
NORTH WALES           21

In [15]:
# NOTE: Some records list the name of a county abutting MCPA as 'twp'. Assume there is at times some resource sharing or call routing from center to center.
# NOTE: https://www.pa.gov/agencies/penndot/maps/county-type-10.html

# Reference lists used to find valid MCPA calls
# Township validation data https://www.montgomerycountypa.gov/850/Municipal-Websites
# Hatfield is the only name that appears in both the township and borough lists; it is not a duplicate
township_names = [
    'Abington', 'Cheltenham', 'Douglass', 'East Norriton', 'Franconia',
    'Hatfield', 'Horsham', 'Limerick', 'Lower Frederick', 'Lower Gwynedd',
    'Lower Merion', 'Lower Moreland', 'Lower Pottsgrove', 'Lower Providence',
    'Lower Salford', 'Marlborough', 'Montgomery', 'New Hanover', 'Perkiomen',
    'Plymouth', 'Salford', 'Skippack', 'Springfield', 'Towamencin',
    'Upper Dublin', 'Upper Frederick', 'Upper Gwynedd', 'Upper Hanover',
    'Upper Merion', 'Upper Moreland', 'Upper Pottsgrove', 'Upper Providence',
    'Upper Salford', 'West Norriton', 'West Pottsgrove', 'Whitemarsh',
    'Whitpain', 'Worcester',
]

borough_names = [
    'Ambler', 'Bridgeport', 'Bryn Athyn', 'Collegeville', 'Conshohocken',
    'East Greenville', 'Green Lane', 'Hatboro', 'Hatfield', 'Jenkintown',
    'Lansdale', 'Narberth', 'Norristown', 'North Wales', 'Pennsburg',
    'Pottstown', 'Red Hill', 'Rockledge', 'Royersford', 'Schwenksville',
    'Souderton', 'Telford', 'Trappe', 'West Conshohocken',
]

# Upper-case every name so it can be compared against the upper-case 'twp' values
mc_townships = [name.upper() for name in township_names]
mc_boroughs = [name.upper() for name in borough_names]

In [26]:
# What are the values in the 'twp' feature that do not match official Montgomery county names?
# NOTE: Non-MCPA: 13,533 over 6 adjacent counties. Delete, the sample size is too small in comparison to the MCPA population size to characterize those counties
# Rows with a null 'twp' are excluded here: a null never matches either list, so without the
# notna() guard those rows inflate the printed total while value_counts() silently skips them.
has_twp = df_911['twp'].notna()
# Not in MCPA townships or boroughs (suffixes are stripped as literal text before the lookup)
criteria_twp = ~df_911['twp'].str.replace(' TOWNSHIP', '', regex=False).isin(mc_townships)
criteria_bor = ~df_911['twp'].str.replace(' BOROUGH', '', regex=False).str.replace(' BORO', '', regex=False).isin(mc_boroughs)
non_twp = df_911[has_twp & criteria_twp & criteria_bor]

print(f'Calls not in MCPA: {len(non_twp)}')
print('')
display(non_twp['twp'].value_counts())


twp
CHESTER COUNTY     7362
BUCKS COUNTY       1982
BERKS COUNTY       1930
DELAWARE COUNTY    1802
PHILA COUNTY        267
LEHIGH COUNTY       190
Name: count, dtype: int64

## Accumulated Notes
- Starting count: 663,522
- There are 293 nulls / 0.044158% of total
- There are 68 unique 'twp' values
- Some records list the name of a county abutting MCPA as 'twp'. Assume there is at times some resource sharing or call routing from center to center.
- https://www.pa.gov/agencies/penndot/maps/county-type-10.html
- Non-MCPA: 13,533 calls over 6 adjacent counties (13,826 rows are removed in total once the 293 null 'twp' records are included).


**Actions**
- Delete records with null twp
- Delete Non-MCPA calls, the sample size is too small in comparison to the MCPA call population size to characterize those counties
- Create a 'twp_type' feature tracking township or borough
- Normalize 'twp' feature by adding 'TOWNSHIP' or 'BOROUGH' to every name

# Clean / Engineer 'twp' Features

In [22]:
# Delete records with null twp
null_twp_indexes = df_911.index[df_911['twp'].isna()]
df_911.drop(index=null_twp_indexes, inplace=True)


In [28]:
# Delete Non-MCPA calls
# The township / borough masks are rebuilt here against the current state of df_911
# A row is Non-MCPA when its name matches neither the township list nor the borough list
twp_without_township = df_911['twp'].str.replace(' TOWNSHIP', '')
twp_without_borough = df_911['twp'].str.replace(' BOROUGH', '').str.replace(' BORO', '')
criteria_twp = ~twp_without_township.isin(mc_townships)
criteria_bor = ~twp_without_borough.isin(mc_boroughs)
non_twp_index = df_911.index[criteria_twp & criteria_bor]

df_911.drop(index=non_twp_index, inplace=True)

In [None]:
# Create a 'twp_type' feature tracking township or borough
bare_township_name = df_911['twp'].str.replace(' TOWNSHIP', '')
bare_borough_name = df_911['twp'].str.replace(' BOROUGH', '').str.replace(' BORO', '')
criteria_twp = bare_township_name.isin(mc_townships)
criteria_bor = bare_borough_name.isin(mc_boroughs)

# Boroughs are written second, so a row matching both masks ends up labelled 'borough'
for twp_type_label, criteria in (('township', criteria_twp), ('borough', criteria_bor)):
    df_911.loc[criteria, 'twp_type'] = twp_type_label

In [None]:
# Normalize 'twp' feature by adding 'TOWNSHIP' or 'BOROUGH' to every name

# The masks are derived from the 'twp_type' column rather than from the criteria_twp /
# criteria_bor variables: those names are also assigned negated ("not in list") masks by the
# Non-MCPA cells, so relying on whichever cell ran last could append the wrong suffix.
is_township = df_911['twp_type'].eq('township')
is_borough = df_911['twp_type'].eq('borough')

# 1. Remove any 'TOWNSHIP' or 'BOROUGH' so we don't have duplicates
df_911['twp'] = (
    df_911['twp']
    .str.replace(' TOWNSHIP', '', regex=False)
    .str.replace(' BOROUGH', '', regex=False)
    .str.replace(' BORO', '', regex=False)
)

# 2. Append the suffix that matches each row's 'twp_type'
df_911.loc[is_township, 'twp'] += ' TOWNSHIP'
df_911.loc[is_borough, 'twp'] += ' BOROUGH'


In [45]:
# Inspect the twp feature again: summary statistics first, then the null / empty breakdown
print("### twp ###")
twp_summary_clean = df_911['twp'].describe()
display(twp_summary_clean)
report_null_empty(df=df_911, feature='twp')

### twp ###


count                    649696
unique                       62
top       LOWER MERION TOWNSHIP
freq                      55490
Name: twp, dtype: object


twp: Null / Empty Report
	Row count: 649696
	Null count: 0
	Null percent: 0.0%
	Empty count: 0
	Empty precent: 0.0%
	False count: 0
	False precent: 0.0%
	NAN count: 0
	NAN precent: 0.0%


In [48]:
# NOTE: There should now be 62 unique 'twp' values
twp_counts_after_cleaning = df_911['twp'].value_counts()
twp_counts_after_cleaning

twp
LOWER MERION TOWNSHIP      55490
ABINGTON TOWNSHIP          39947
NORRISTOWN BOROUGH         37633
UPPER MERION TOWNSHIP      36010
CHELTENHAM TOWNSHIP        30574
                           ...  
HATFIELD BOROUGH            1370
SCHWENKSVILLE BOROUGH       1337
EAST GREENVILLE BOROUGH     1316
BRYN ATHYN BOROUGH          1254
GREEN LANE BOROUGH           385
Name: count, Length: 62, dtype: int64

In [50]:
# Summarize the structure of df_911: row count, columns, non-null counts and dtypes
df_911.info()

<class 'pandas.core.frame.DataFrame'>
Index: 649696 entries, 0 to 663521
Data columns (total 10 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   lat        649696 non-null  float64
 1   lng        649696 non-null  float64
 2   desc       649696 non-null  object 
 3   zip        574587 non-null  float64
 4   title      649696 non-null  object 
 5   timeStamp  649696 non-null  object 
 6   twp        649696 non-null  object 
 7   addr       649696 non-null  object 
 8   e          649696 non-null  int64  
 9   twp_type   649696 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 70.6+ MB


In [4]:
# Update the file
# After updating
# 1. Manually download locally
# 2. Manually upload to a new version of the Kaggle Dataset

file_path = '/kaggle/working/911.csv'

# Remove any copy left over from an earlier run and report what was found
if not os.path.exists(file_path):
    print(f"The file '{file_path}' does not exist.")
else:
    os.remove(file_path)
    print(f"File '{file_path}' has been deleted.")

# Write the data without the index column
df_911.to_csv(file_path, index=False)

The file '/kaggle/working/911.csv' does not exist.
