In [4]:
# Notebook: Feature Engineering - zip - 1
# Author: Thomas Purk
# Date: 2025-03-21
# Reference: https://www.kaggle.com/datasets/mchirico/montcoalert

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/emergency-911-calls-mcpa/911.csv


# zip - Data Exploration

In [5]:
# Notebook setup steps

import warnings
# Input (read-only Kaggle dataset) and output (working directory) CSV locations
df_in_path = '/kaggle/input/emergency-911-calls-mcpa/911.csv'
df_out_path = '/kaggle/working/911.csv'

# Suppress every warning for the rest of the session
warnings.filterwarnings(action='ignore')

# Read the 911 calls CSV into a dataframe
df_911 = pd.read_csv(filepath_or_buffer=df_in_path)

def _percent_of_rows(count, total):
    ''' Returns count as a percent of total rounded to 6 places, or 0.0 when total is 0. '''
    if total == 0:
        # An empty dataframe has no rows to take a share of; avoid a 0 / 0 division.
        return 0.0
    return round(count / total * 100, 6)

def report_null_empty(df, feature):
    ''' Prints the count and percent of total of null, empty, False and NaN feature (column) values.
    
    Parameters: 
        df (dataframe): A Pandas dataframe which contains the feature.
        feature (string): The name of the feature in the dataframe to report on.

    Returns:
        None. The report is written to stdout.
    '''
    column = df[feature]
    row_count = len(df)

    null_count = column.isnull().sum()
    empty_count = (column == "").sum()
    # Element-wise equality with False also matches numeric zeros (0 == False),
    # so for numeric features this is effectively a count of zeros.
    false_count = (column == False).sum()
    # isna() is the same check as isnull(); both lines are kept so the report layout is unchanged.
    nan_count = column.isna().sum()

    print('')
    print(f'{feature}: Null / Empty Report')
    print(f'\tRow count: {row_count}')
    print(f'\tNull count: {null_count}')
    print(f'\tNull percent: {_percent_of_rows(null_count, row_count)}%')
    print(f'\tEmpty count: {empty_count}')
    print(f'\tEmpty percent: {_percent_of_rows(empty_count, row_count)}%')
    print(f'\tFalse count: {false_count}')
    print(f'\tFalse percent: {_percent_of_rows(false_count, row_count)}%')
    print(f'\tNAN count: {nan_count}')
    print(f'\tNAN percent: {_percent_of_rows(nan_count, row_count)}%')

def get_one_offs(df, feature):
    ''' Returns the rows of df whose value in feature occurs exactly once in that column.

    Parameters:
        df (dataframe): A Pandas dataframe which contains the feature.
        feature (string): The name of the feature (column) to check for one-off values.

    Returns:
        dataframe: The subset of df rows holding a value seen only once in feature.
    '''
    column = df[feature]

    # Tally how many times each distinct value shows up
    frequency = column.value_counts()

    # Keep just the values whose tally is exactly one
    singletons = frequency.loc[frequency == 1].index

    # Select the rows that carry one of those single-occurrence values
    row_mask = column.isin(singletons)
    return df[row_mask]

In [6]:
# Display basic information
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
df_911.info()
display(df_911.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649696 entries, 0 to 649695
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   zip           574587 non-null  float64
 1   twp           649696 non-null  object 
 2   e             649696 non-null  int64  
 3   twp_type      649696 non-null  object 
 4   road_type     649696 non-null  object 
 5   serivce_type  649696 non-null  object 
 6   serivce_desc  649696 non-null  object 
 7   month         649696 non-null  int64  
 8   day_of_week   649696 non-null  int64  
 9   day_night     649696 non-null  object 
dtypes: float64(1), int64(3), object(6)
memory usage: 49.6+ MB


None

Unnamed: 0,zip,twp,e,twp_type,road_type,serivce_type,serivce_desc,month,day_of_week,day_night
0,19525.0,NEW HANOVER TOWNSHIP,1,township,minor,EMS,BACK PAINS/INJURY,12,3,day
1,19446.0,HATFIELD TOWNSHIP,1,township,minor,EMS,DIABETIC EMERGENCY,12,3,day
2,19401.0,NORRISTOWN BOROUGH,1,borough,medium,Fire,GAS-ODOR/LEAK,12,3,day
3,19401.0,NORRISTOWN BOROUGH,1,borough,minor,EMS,CARDIAC EMERGENCY,12,3,day
4,,LOWER POTTSGROVE TOWNSHIP,1,township,minor,EMS,DIZZINESS,12,3,day


In [15]:
# Inspect the zip feature: summary statistics, null / empty report, and rows
# whose zip value occurs only once in the dataset
print("### zip ###")
display(df_911['zip'].describe())
report_null_empty(df_911,'zip')
display(get_one_offs(df_911,'zip'))

### month ###


count    574587.000000
mean      19235.567550
std         296.516027
min        1104.000000
25%       19038.000000
50%       19401.000000
75%       19446.000000
max       77316.000000
Name: zip, dtype: float64


zip: Null / Empty Report
	Row count: 649696
	Null count: 75109
	Null percent: 11.560638%
	Empty count: 0
	Empty precent: 0.0%
	False count: 0
	False precent: 0.0%
	NAN count: 75109
	NAN precent: 11.560638%


Unnamed: 0,zip,twp,e,twp_type,road_type,serivce_type,serivce_desc,month,day_of_week,day_night
24920,36107.0,MONTGOMERY TOWNSHIP,1,township,minor,Traffic,DISABLED VEHICLE,2,4,day
65246,77316.0,MONTGOMERY TOWNSHIP,1,township,minor,EMS,CARDIAC EMERGENCY,6,3,day
152194,17555.0,NEW HANOVER TOWNSHIP,1,township,minor,Traffic,DISABLED VEHICLE,1,5,day
234049,19018.0,LOWER MERION TOWNSHIP,1,township,minor,Traffic,ROAD OBSTRUCTION,8,2,night
269448,18051.0,HATFIELD TOWNSHIP,1,township,minor,Traffic,DISABLED VEHICLE,11,4,day
278019,18049.0,LOWER MERION TOWNSHIP,1,township,minor,Traffic,VEHICLE ACCIDENT,12,4,day
292943,19144.0,HATFIELD TOWNSHIP,1,township,minor,Fire,BUILDING FIRE,1,4,day
298726,19607.0,WHITPAIN TOWNSHIP,1,township,minor,Traffic,DISABLED VEHICLE,2,6,day
299274,19450.0,TELFORD BOROUGH,1,borough,minor,Traffic,VEHICLE ACCIDENT,2,0,day
304303,17506.0,AMBLER BOROUGH,1,borough,minor,Traffic,ROAD OBSTRUCTION,2,6,day


In [12]:
# Frequency of each zip value, followed by the number of distinct zip values.
# The frequency table is part of this cell's recorded output, so the call that
# produces it is kept in the cell to make the output reproducible on re-run.
display(df_911['zip'].value_counts())
print(df_911['zip'].nunique())

zip
19401.0    45575
19464.0    43851
19403.0    34866
19446.0    32253
19406.0    22441
           ...  
17901.0        1
19134.0        1
19135.0        1
8502.0         1
19312.0        1
Name: count, Length: 164, dtype: int64

164


**NOTES**
- 12% null values
- 164 unique values
- twp is also a geographic identifier like zipcode, but twp is 0% null
- zip has greater geographic resolution than twp, but may not be as strong a proxy as twp for other factors such as population density and demographics.
- Also zip codes could cross county political boundaries
- Decision is to drop zip in favor of twp as a geographic identifier.

In [16]:
# Drop zip in favor of twp as the geographic identifier.
# Reassign instead of dropping in place, and tolerate an already-removed
# column, so re-running this cell does not raise a KeyError.
df_911 = df_911.drop(columns='zip', errors='ignore')

In [17]:
# Update the file
# After Updating
# 1. Manually Download locally
# 2. Manually upload to a new version of the Kaggle Dataset


# Check if file exists
# Remove any previous copy of the output file, reporting which case applied
previous_copy_present = os.path.exists(df_out_path)
if not previous_copy_present:
    print(f"The file '{df_out_path}' does not exist.")
else:
    os.remove(df_out_path)
    print(f"File '{df_out_path}' has been deleted.")

# Write the updated dataframe to the output path without the index column
df_911.to_csv(df_out_path, index=False)

The file '/kaggle/working/911.csv' does not exist.
