In [1]:
# Notebook: Feature Engineering - addr - 1
# Author: Thomas Purk
# Date: 2025-03-18
# Reference: https://www.kaggle.com/datasets/mchirico/montcoalert

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/emergency-911-calls-mcpa/911.csv


# addr - Data Explortation

In [3]:
# Notebook Step up steps
import re
import warnings
warnings.filterwarnings('ignore')

df_in_path = '/kaggle/input/emergency-911-calls-mcpa/911.csv'
df_out_path = '/kaggle/working/911.csv'

# Load the data 
df_911 = pd.read_csv(df_in_path)

def report_null_empty(df, feature):
    ''' Prints the count and percent of total of null or empty feature (column) values.
    
    Parameters: 
        df (dataframe): A Pandas dataframe which contains the fature.
        feature (string): The name of the feature in the dataframe to report on.
    '''
    null_count = df[feature].isnull().sum()
    empty_count = (df[feature] == "").sum()
    false_count = (df[feature] == False).sum()
    nan_count = (df[feature].isna()).sum()
    print('')
    print(f'{feature}: Null / Empty Report')
    print(f'\tRow count: {len(df)}')
    print(f'\tNull count: {null_count}')
    print(f'\tNull percent: {round(null_count / len(df) * 100,6)}%')
    print(f'\tEmpty count: {empty_count}')
    print(f'\tEmpty precent: {round(empty_count / len(df) * 100,6)}%')
    print(f'\tFalse count: {false_count}')
    print(f'\tFalse precent: {round(false_count / len(df) * 100,6)}%')
    print(f'\tNAN count: {nan_count}')
    print(f'\tNAN precent: {round(nan_count / len(df) * 100,6)}%')

def get_one_offs(df, feature):

    # Count occurrences of each value in feature
    value_counts = df[feature].value_counts()
    
    # Identify values that appear only once
    unique_values = value_counts[value_counts == 1].index
    
    # Filter the DataFrame to include only rows with unique values in feature
    return df[df[feature].isin(unique_values)]

In [4]:
# Display basic information
display(df_911.info())
display(df_911.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649696 entries, 0 to 649695
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   zip        574587 non-null  float64
 1   title      649696 non-null  object 
 2   timeStamp  649696 non-null  object 
 3   twp        649696 non-null  object 
 4   addr       649696 non-null  object 
 5   e          649696 non-null  int64  
 6   twp_type   649696 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 34.7+ MB


None

Unnamed: 0,zip,title,timeStamp,twp,addr,e,twp_type
0,19525.0,EMS: BACK PAINS/INJURY,2015-12-10 17:10:52,NEW HANOVER TOWNSHIP,REINDEER CT & DEAD END,1,township
1,19446.0,EMS: DIABETIC EMERGENCY,2015-12-10 17:29:21,HATFIELD TOWNSHIP,BRIAR PATH & WHITEMARSH LN,1,township
2,19401.0,Fire: GAS-ODOR/LEAK,2015-12-10 14:39:21,NORRISTOWN BOROUGH,HAWS AVE,1,borough
3,19401.0,EMS: CARDIAC EMERGENCY,2015-12-10 16:47:36,NORRISTOWN BOROUGH,AIRY ST & SWEDE ST,1,borough
4,,EMS: DIZZINESS,2015-12-10 16:56:52,LOWER POTTSGROVE TOWNSHIP,CHERRYWOOD CT & DEAD END,1,township


In [5]:
# NOTE: addr column has no nulls 

# Inspect the desc feature
print("### addr ###")
display(df_911['addr'].describe())
report_null_empty(df_911,'addr')

### addr ###


count                                649696
unique                                38860
top       SHANNONDELL DR & SHANNONDELL BLVD
freq                                   7285
Name: addr, dtype: object


addr: Null / Empty Report
	Row count: 649696
	Null count: 0
	Null percent: 0.0%
	Empty count: 0
	Empty precent: 0.0%
	False count: 0
	False precent: 0.0%
	NAN count: 0
	NAN precent: 0.0%


In [25]:
# NOTE: The unique counts are possibily exaggerated due to reversed streets, Main St.& Elm St. vs Elm St. & Main St.
vc = df_911['addr'].value_counts()
print(vc)
print('')
print(vc[vc < 10])
print('')
print(vc[vc == 1])
print('')
print("Value counts for combination of 'addr' and 'twp':")
print(df_911.value_counts(subset=['twp', 'addr']).sort_index())

addr
SHANNONDELL DR & SHANNONDELL BLVD    7285
MAIN ST & OLD SUMNEYTOWN PIKE        2576
THE FAIRWAY  & RYDAL RD              1986
EAGLEVILLE RD & SUNDERLAND DR        1618
EVERGREEN RD & W LIGHTCAP RD         1591
                                     ... 
FOX LN & WESTAWAY DR                    1
N SCHOOL LN & HEMSING CIR               1
PRINCE ST & GLASGOW ST                  1
PERKIOMEN TRL & ARCOLA RD               1
IVY LN & WIDENER RD                     1
Name: count, Length: 38860, dtype: int64

addr
MENNONITE RD & RIDGEVIEW DR             9
OLD ORCHARD RD & SKIPPACK PIKE          9
MORRIS RD & BRENTWOOD DR                9
RAMP RT422 EB TO EGYPT RD & EGYPT RD    9
POTTSTOWN AVE & PERKIOMEN ST            9
                                       ..
FOX LN & WESTAWAY DR                    1
N SCHOOL LN & HEMSING CIR               1
PRINCE ST & GLASGOW ST                  1
PERKIOMEN TRL & ARCOLA RD               1
IVY LN & WIDENER RD                     1
Name: count, Length: 257

In [7]:
# count the unique number of townships per road
# In other words, does a road name repeat accross townships?
# NOTE: There are several addr values that appear in more than on twp

# Group by 'group' and count unique values in 'value'
unique_counts = df_911.groupby('addr')['twp'].nunique()
print(unique_counts[unique_counts > 1])

addr
.                          4
10TH AVE                   3
12TH AVE & HALLOWELL ST    2
12TH AVE & MAPLE ST        2
12TH AVE & UNNAMED ALY     2
                          ..
YORK RD & WASHINGTON LN    2
YOST RD                    2
ZEPP RD & ROSTKOWSKI RD    2
ZIEGLER RD                 3
ZVARICK RD & 11TH AVE      2
Name: twp, Length: 2559, dtype: int64


In [8]:
get_one_offs(df_911,'addr')

Unnamed: 0,zip,title,timeStamp,twp,addr,e,twp_type
0,19525.0,EMS: BACK PAINS/INJURY,2015-12-10 17:10:52,NEW HANOVER TOWNSHIP,REINDEER CT & DEAD END,1,township
46,19468.0,Traffic: VEHICLE ACCIDENT -,2015-12-10 18:51:01,LIMERICK TOWNSHIP,LINFIELD TRAPPE RD & RAMP N LEWIS RD TO RT422 EB,1,township
52,19446.0,Traffic: ROAD OBSTRUCTION -,2015-12-10 19:08:43,LANSDALE BOROUGH,W MT VERNON ST & S MITCHELL AVE,1,borough
137,,Traffic: VEHICLE ACCIDENT -,2015-12-11 05:19:35,WHITPAIN TOWNSHIP,NORTHGATE BLVD & QUAKERTOWN RD,1,township
184,,Traffic: DISABLED VEHICLE -,2015-12-11 08:50:55,AMBLER BOROUGH,PARK ON ALLENTOWN,1,borough
...,...,...,...,...,...,...,...
649361,19401.0,Traffic: VEHICLE ACCIDENT -,2020-07-28 16:03:28,WEST NORRITON TOWNSHIP,NSH GATE 4 & STERIGERE ST,1,township
649462,19002.0,EMS: BACK PAINS/INJURY,2020-07-28 22:20:43,LOWER GWYNEDD TOWNSHIP,FAIR LAND DR & SEVEREN CT,1,township
649472,19464.0,Traffic: VEHICLE ACCIDENT -,2020-07-28 23:11:15,WEST POTTSGROVE TOWNSHIP,E VINE ST & JEFFERSON ST,1,township
649557,,Traffic: ROAD OBSTRUCTION -,2020-07-29 08:30:57,TELFORD BOROUGH,WAS E BROA,1,borough


# Clean / Engineer addr Features

In [8]:
# Make up a list of road suffixes that can be used to categorize the address
import itertools
from collections import Counter

# Create a list of all road words
word_list = df_911['addr'].str.split(' ').values
flat_word_list = list(itertools.chain(*word_list))

# Create a data frame of road words and their frequency counts
df_addr_suff = pd.DataFrame.from_dict(Counter(flat_word_list), orient='index', columns=['Count'])
df_addr_suff = df_addr_suff.reset_index().rename(columns={'index': 'Value'})

# Display
list(df_addr_suff[(df_addr_suff['Count'] > 7000) & (df_addr_suff['Value'].str.len() < 6)]['Value'])

['CT',
 '&',
 'DEAD',
 'END',
 'LN',
 'AVE',
 'ST',
 'W',
 'RD',
 'MAIN',
 'OLD',
 'PIKE',
 '',
 'RAMP',
 'NB',
 'TO',
 'S',
 'YORK',
 'EXPY',
 'DR',
 'LINE',
 'RT422',
 'RT309',
 'BLVD',
 'CIR',
 'E',
 'WELSH',
 'STATE',
 'HILL',
 'WAY',
 'PARK',
 'N',
 'EB',
 'RIDGE',
 'HIGH',
 'ALY',
 'MILL',
 'GULPH',
 'SB',
 'FORGE',
 'WB',
 'I76',
 'TPKE']

In [9]:
minor_road_suf = ['CT','LN','ST','RD','DR', 'CIR','WAY','ALY', 'PL','TRL', 'PK', 'TER', 'GDNS','PASS']
medium_road_suf = ['AVE', 'BLVD', 'LINE' 'PARK', 'PKWY' ]
major_road_suf = ['PIKE', 'RAMP','EXPY','TPKE', 'HWY']

#df_911['road_type'] = ''

# Loop through rows using iterrows()
for index, row in df_911.iterrows():
    road_type = ''
    addr_list = row['addr'].split(' ')
    for suf in minor_road_suf:
        if(suf in addr_list):
            road_type = 'minor'
    for suf in medium_road_suf:
        if(suf in addr_list):
            road_type = 'medium'
    for suf in major_road_suf:
        if(suf in addr_list):
            road_type = 'major'
    
    # Routes - > "RT#"
    if re.search('RT\d', row['addr']):
        road_type = 'medium'

    # Write the remaining string as a new feature
    df_911.at[index, 'road_type'] = road_type

In [20]:
# Inspect the desc feature
print("### road_type ###")
display(df_911['road_type'].describe())
report_null_empty(df_911,'road_type')

### road_type ###


count     649696
unique         3
top        minor
freq      348508
Name: road_type, dtype: object


road_type: Null / Empty Report
	Row count: 649696
	Null count: 0
	Null percent: 0.0%
	Empty count: 0
	Empty precent: 0.0%
	False count: 0
	False precent: 0.0%
	NAN count: 0
	NAN precent: 0.0%


In [18]:
# Set remaining nulls to minre
# Verify
print(list(df_911[df_911['road_type'] == '' ]['addr'].unique()))

df_911.loc[df_911['road_type'] == '' ,'road_type'] = 'minor'

In [26]:
# Inspect the desc feature
print("### road_type ###")
display(df_911['road_type'].describe())
report_null_empty(df_911,'road_type')

### road_type ###


count     649696
unique         3
top        minor
freq      348508
Name: road_type, dtype: object


road_type: Null / Empty Report
	Row count: 649696
	Null count: 0
	Null percent: 0.0%
	Empty count: 0
	Empty precent: 0.0%
	False count: 0
	False precent: 0.0%
	NAN count: 0
	NAN precent: 0.0%


In [27]:
# Update the file
# After Updating
# 1. Manually Download locally
# 2. Manually Updload to a new version of the Kaggle Dataset


# Check if file exists
if os.path.exists(df_out_path):
    os.remove(df_out_path)
    print(f"File '{df_out_path}' has been deleted.")
else:
    print(f"The file '{df_out_path}' does not exist.")

df_911.to_csv(df_out_path, index=False)

The file '/kaggle/working/911.csv' does not exist.
