In [None]:
# Notebook: Feature Engineering - lat/lng - 1
# Author: Thomas Purk
# Date: 2025-03-17
# Reference: https://www.kaggle.com/datasets/mchirico/montcoalert

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/emergency-911-calls-mcpa/911.csv


In [58]:
# Notebook setup steps

import warnings
# Silence every Python warning from here on; this keeps cell output uncluttered,
# but it also hides deprecation notices.
warnings.filterwarnings('ignore')

# Source CSV (read-only Kaggle input) and the location the cleaned CSV is written to
df_in_path = '/kaggle/input/emergency-911-calls-mcpa/911.csv'
df_out_path = '/kaggle/working/911.csv'

# Load the data
# This time start from the account-owned copy of the dataset
df_911 = pd.read_csv(df_in_path)

def report_null_empty(df, feature):
    ''' Prints the count and percent of total of null, empty, and False feature (column) values.

    Notes:
        - isnull() and isna() are aliases in pandas, so the Null and NAN figures always match.
        - The False count uses an equality comparison, so numeric zeros (0 == False)
          are counted as False as well.
        - For a dataframe with no rows every percent is reported as 0.0 instead of nan.

    Parameters: 
        df (dataframe): A Pandas dataframe which contains the feature.
        feature (string): The name of the feature in the dataframe to report on.
    '''
    row_count = len(df)
    column = df[feature]

    def _percent(count):
        # Avoid a divide-by-zero (nan) result when the dataframe has no rows
        if row_count == 0:
            return 0.0
        return round(count / row_count * 100, 6)

    null_count = column.isnull().sum()
    empty_count = (column == "").sum()
    # Equality (not identity) comparison: also matches numeric 0 values
    false_count = (column == False).sum()
    nan_count = column.isna().sum()

    print('')
    print(f'{feature}: Null / Empty Report')
    print(f'\tRow count: {row_count}')
    print(f'\tNull count: {null_count}')
    print(f'\tNull percent: {_percent(null_count)}%')
    print(f'\tEmpty count: {empty_count}')
    print(f'\tEmpty percent: {_percent(empty_count)}%')
    print(f'\tFalse count: {false_count}')
    print(f'\tFalse percent: {_percent(false_count)}%')
    print(f'\tNAN count: {nan_count}')
    print(f'\tNAN percent: {_percent(nan_count)}%')

def get_one_offs(df, feature):
    ''' Returns the rows whose feature (column) value occurs exactly once in the dataframe.

    Parameters:
        df (dataframe): A Pandas dataframe which contains the feature.
        feature (string): The name of the feature in the dataframe to inspect.

    Returns:
        dataframe: The subset of df rows holding a value of feature that is seen only once.
    '''
    # Tally how often each distinct value of the feature occurs
    frequencies = df[feature].value_counts()

    # Keep just the values that were seen a single time
    singletons = frequencies.loc[frequencies.eq(1)].index

    # Select the rows carrying one of those single-occurrence values
    is_one_off = df[feature].isin(singletons)
    return df[is_one_off]

In [5]:
# Display basic information
# info() prints its summary itself and returns None, so it is called directly
# rather than wrapped in display(), which would also render a stray "None".
df_911.info()
display(df_911.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649696 entries, 0 to 649695
Data columns (total 10 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   lat        649696 non-null  float64
 1   lng        649696 non-null  float64
 2   desc       649696 non-null  object 
 3   zip        574587 non-null  float64
 4   title      649696 non-null  object 
 5   timeStamp  649696 non-null  object 
 6   twp        649696 non-null  object 
 7   addr       649696 non-null  object 
 8   e          649696 non-null  int64  
 9   twp_type   649696 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 49.6+ MB


None

Unnamed: 0,lat,lng,desc,zip,title,timeStamp,twp,addr,e,twp_type
0,40.297876,-75.581294,REINDEER CT & DEAD END; NEW HANOVER; Station ...,19525.0,EMS: BACK PAINS/INJURY,2015-12-10 17:10:52,NEW HANOVER TOWNSHIP,REINDEER CT & DEAD END,1,township
1,40.258061,-75.26468,BRIAR PATH & WHITEMARSH LN; HATFIELD TOWNSHIP...,19446.0,EMS: DIABETIC EMERGENCY,2015-12-10 17:29:21,HATFIELD TOWNSHIP,BRIAR PATH & WHITEMARSH LN,1,township
2,40.121182,-75.351975,HAWS AVE; NORRISTOWN; 2015-12-10 @ 14:39:21-St...,19401.0,Fire: GAS-ODOR/LEAK,2015-12-10 14:39:21,NORRISTOWN BOROUGH,HAWS AVE,1,borough
3,40.116153,-75.343513,AIRY ST & SWEDE ST; NORRISTOWN; Station 308A;...,19401.0,EMS: CARDIAC EMERGENCY,2015-12-10 16:47:36,NORRISTOWN BOROUGH,AIRY ST & SWEDE ST,1,borough
4,40.251492,-75.60335,CHERRYWOOD CT & DEAD END; LOWER POTTSGROVE; S...,,EMS: DIZZINESS,2015-12-10 16:56:52,LOWER POTTSGROVE TOWNSHIP,CHERRYWOOD CT & DEAD END,1,township


## lat/lng - Data Exploration

In [7]:
# Summarize the lat feature: descriptive statistics first, then the null / empty report
print("### lat ###")
display(df_911.loc[:, 'lat'].describe())
report_null_empty(df=df_911, feature='lat')

### lat ###


count    649696.000000
mean         40.157112
std           0.222260
min           0.000000
25%          40.099784
50%          40.142480
75%          40.228899
max          51.335390
Name: lat, dtype: float64


lat: Null / Empty Report
	Row count: 649696
	Null count: 0
	Null percent: 0.0%
	Empty count: 0
	Empty precent: 0.0%
	False count: 2
	False precent: 0.000308%
	NAN count: 0
	NAN precent: 0.0%


In [None]:
# NOTE: Assume decimal degree coordinates are in WGS84, but could be other
# NOTE: WGS84 latitude ranges for Montgomery County are about
#    - lat : 39.92845686753457,  40.49685656154363,
# NOTE: Zero degree latitude is the equator. 
# NOTE: Positive 51 degrees latitude is in Quebec Canada
# NOTE: The mean around 40 degrees is about right for Montgomery County PA
# NOTE: Data contains values outside the expected range

In [8]:
# Summarize the lng feature: descriptive statistics first, then the null / empty report
print("### lng ###")
display(df_911.loc[:, 'lng'].describe())
report_null_empty(df=df_911, feature='lng')

### lng ###


count    649696.000000
mean        -75.296522
std           1.689107
min        -119.698206
25%         -75.391220
50%         -75.302575
75%         -75.207651
max          87.854975
Name: lng, dtype: float64


lng: Null / Empty Report
	Row count: 649696
	Null count: 0
	Null percent: 0.0%
	Empty count: 0
	Empty precent: 0.0%
	False count: 2
	False precent: 0.000308%
	NAN count: 0
	NAN precent: 0.0%


In [None]:
# NOTE: Assume decimal degree coordinates are in WGS84, but could be other
# NOTE: WGS84 longitude ranges for Montgomery County are about
#    - lng : -75.72962906989237, -74.99629164995669
# NOTE: Western Hemisphere always has negative longitudes
# NOTE: So the positive max (87.854975) and the min far below the range (-119.698206) are incorrect values in this data

In [21]:
# There are 2 records where lat/lng is 0 (the null / empty report counts these as "False", since 0 == False)

# Compare against 0 explicitly so the intent (a 0,0 placeholder coordinate) is clear
df_false_lat_lng = df_911[(df_911['lat'] == 0) | (df_911['lng'] == 0)]

display(df_false_lat_lng)
display(df_false_lat_lng['addr'])

# The addr = 'RAMP EGYPT RD TO RT422  & EGYPT RD'
# Get all lat/lngs from records with this addr (single .loc call instead of chained indexing)
lat_lngs = df_911.loc[df_911['addr'] == 'RAMP EGYPT RD TO RT422  & EGYPT RD', ['lat', 'lng']]
display(lat_lngs.value_counts())

# NOTE: The lat/lng values for all the other records with the same addr as the 0,0 records are identical
# lat        lng       
# 40.136973  -75.472723    30
# 0.000000    0.000000      2

lat        lng       
40.136973  -75.472723    30
0.000000    0.000000      2
Name: count, dtype: int64

In [30]:
# Inspect the records with out of bounds lat/lng

# Define Min/Max Geospatial bounds (the approximate Montgomery County ranges noted earlier)
min_lat = 39.92845686753457
max_lat = 40.49685656154363
min_lng = -75.72962906989237
max_lng = -74.99629164995669

# Boolean mask: True where the lat or the lng (or both) is out of range
bad_lat_lng_indexes = (min_lat > df_911['lat']) | (df_911['lat'] > max_lat) | (min_lng > df_911['lng']) | (df_911['lng'] > max_lng)

# Filter once and reuse the result instead of re-applying the mask for every display
df_bad_lat_lng = df_911[bad_lat_lng_indexes]
display(df_bad_lat_lng.head())
# info() prints its summary itself and returns None, so it is not wrapped in display()
df_bad_lat_lng.info()

# Show count of addr
print('')
print(df_bad_lat_lng['addr'].value_counts().to_string())

Unnamed: 0,lat,lng,desc,zip,title,timeStamp,twp,addr,e,twp_type
5324,40.160004,-77.686813,GIANT ; HORSHAM; 2015-12-24 @ 17:23:04;,,Traffic: VEHICLE ACCIDENT -,2015-12-24 17:23:04,HORSHAM TOWNSHIP,GIANT,1,township
16305,40.160004,-77.686813,EDGEHILL ; EAST NORRITON; 2016-01-23 @ 08:18:18;,,Traffic: ROAD OBSTRUCTION -,2016-01-23 08:18:18,EAST NORRITON TOWNSHIP,EDGEHILL,1,township
24920,32.38709,-86.276106,600 ; MONTGOMERY; 2016-02-12 @ 18:44:37;,36107.0,Traffic: DISABLED VEHICLE -,2016-02-12 18:44:37,MONTGOMERY TOWNSHIP,600,1,township
55631,39.745533,-84.395256,MAIN ST; UPPER PROVIDENCE; 2016-05-07 @ 16:08:...,19475.0,Fire: VEHICLE FIRE,2016-05-07 16:08:25,UPPER PROVIDENCE TOWNSHIP,MAIN ST,1,township
65246,30.333596,-95.595595,8931; MONTGOMERY; Station 311; 2016-06-02 @ 1...,77316.0,EMS: CARDIAC EMERGENCY,2016-06-02 13:31:21,MONTGOMERY TOWNSHIP,8931,1,township


<class 'pandas.core.frame.DataFrame'>
Index: 585 entries, 5324 to 649613
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   lat        585 non-null    float64
 1   lng        585 non-null    float64
 2   desc       585 non-null    object 
 3   zip        94 non-null     float64
 4   title      585 non-null    object 
 5   timeStamp  585 non-null    object 
 6   twp        585 non-null    object 
 7   addr       585 non-null    object 
 8   e          585 non-null    int64  
 9   twp_type   585 non-null    object 
dtypes: float64(3), int64(1), object(6)
memory usage: 50.3+ KB


None


addr
PENNSYLVANIA TPKE & RAMP I476 NB TO I276 EB                            98
PENNSYLVANIA TPKE & RAMP I276 WB TO I476                               94
PENNSYLVANIA TPKE & RAMP I276 WB TO VIRGINIA DR                        88
EXTENSION PENNSYLVANIA TPKE & RAMP I276 WB TO I476 NB                  28
EXTENSION PENNSYLVANIA TPKE & RAMP I476 SB TO I276 WB                  20
PENNSYLVANIA TPKE & RAMP RT309 TO I276 EB                              18
RAMP I76 WB TO RT202  & RAMP I76 WB TO I76 WB                          17
PENNSYLVANIA TPKE & RAMP I76 EB TO SCHUYLKILL EXPY EB                  11
RAMP RT422 EB TO EVERGREEN RD & RT422 EB                               10
RAMP I76 WB TO I276  & SCHUYLKILL EXPY WB                               8
EXTENSION PENNSYLVANIA TPKE & RAMP I476 NB TO I276 EB                   7
SCHUYLKILL EXPY & RAMP RT202 TO I76 WB                                  7
RT422 BYP & BENJAMIN FRANKLIN HWY E                                     6
RAMP MATSONSFORD RD TO I476 SB  

In [44]:
# Validating with Google Maps

# lat        lng       
# 40.160007  -77.686817 
# Seems to be a default location at the midpoint of the PENNSYLVANIA TPKE in PA

# lat        lng    
# 32.387090	-86.276106
# Seems to be the centroid for Montgomery Alabama. Maybe the system matched the record text "Montgomery" incorrectly


# Count how many records share each distinct lat/lng pair
lat_lngs = df_911.loc[:, ['lat', 'lng']]
display(lat_lngs.value_counts())



lat        lng       
40.097222  -75.376195    9262
40.133037  -75.408463    7285
40.024967  -75.282905    5385
40.172314  -75.492728    3106
40.108267  -75.306233    3096
                         ... 
40.264475  -75.365678       1
40.146295  -75.445748       1
40.146316  -75.376522       1
40.146388  -75.228814       1
40.153483  -75.386306       1
Name: count, Length: 24338, dtype: int64

In [53]:
# Show the records at the most frequently repeated latitude (tolerance-based float match)
df_911[np.isclose(df_911['lat'], 40.097222, rtol=1e-7)]

Unnamed: 0,lat,lng,desc,zip,title,timeStamp,twp,addr,e,twp_type
14,40.097222,-75.376195,SCHUYLKILL EXPY & CROTON RD UNDERPASS; UPPER M...,,Traffic: VEHICLE ACCIDENT -,2015-12-10 17:09:49,UPPER MERION TOWNSHIP,SCHUYLKILL EXPY & CROTON RD UNDERPASS,1,township
27,40.097222,-75.376195,SCHUYLKILL EXPY & WEADLEY RD OVERPASS; UPPER M...,,Traffic: VEHICLE ACCIDENT -,2015-12-10 18:05:39,UPPER MERION TOWNSHIP,SCHUYLKILL EXPY & WEADLEY RD OVERPASS,1,township
29,40.097222,-75.376195,SCHUYLKILL EXPY & WEADLEY RD OVERPASS; UPPER M...,,Traffic: VEHICLE ACCIDENT -,2015-12-10 18:07:01,UPPER MERION TOWNSHIP,SCHUYLKILL EXPY & WEADLEY RD OVERPASS,1,township
317,40.097222,-75.376195,SCHUYLKILL EXPY & RR OVERPASS; UPPER MERION; 2...,,Traffic: VEHICLE ACCIDENT -,2015-12-11 15:41:06,UPPER MERION TOWNSHIP,SCHUYLKILL EXPY & RR OVERPASS,1,township
373,40.097222,-75.376195,SCHUYLKILL EXPY & RT202 OVERPASS; UPPER MERION...,,Traffic: DISABLED VEHICLE -,2015-12-11 17:24:30,UPPER MERION TOWNSHIP,SCHUYLKILL EXPY & RT202 OVERPASS,1,township
...,...,...,...,...,...,...,...,...,...,...
649585,40.097222,-75.376195,SCHUYLKILL EXPY & RAMP S GULPH RD TO I76 EB; U...,,Traffic: DISABLED VEHICLE -,2020-07-29 10:08:13,UPPER MERION TOWNSHIP,SCHUYLKILL EXPY & RAMP S GULPH RD TO I76 EB,1,township
649587,40.097222,-75.376195,SCHUYLKILL EXPY & RAMP S GULPH RD TO I76 EB; U...,,Traffic: DISABLED VEHICLE -,2020-07-29 10:11:04,UPPER MERION TOWNSHIP,SCHUYLKILL EXPY & RAMP S GULPH RD TO I76 EB,1,township
649684,40.097222,-75.376195,RAMP RT202 NB TO RT422 & RT202 NB; UPPER MERI...,,Fire: VEHICLE FIRE,2020-07-29 15:32:04,UPPER MERION TOWNSHIP,RAMP RT202 NB TO RT422 & RT202 NB,1,township
649687,40.097222,-75.376195,RAMP RT202 NB TO RT422 & RT202 NB; UPPER MERI...,,Traffic: VEHICLE FIRE -,2020-07-29 15:34:11,UPPER MERION TOWNSHIP,RAMP RT202 NB TO RT422 & RT202 NB,1,township


## Accumulated Notes
- lat / lng features appear to be very unreliable
- Many lat/lng values are repeated a very large number of times, yet show different location descriptions in the 'addr' column
- These high frequency repeated values could be the centroid of another data item, such as township
- Some lat/lng seem to be defaults, such as using the centroid of Montgomery, Alabama, if the word "Montgomery" appears in the record
- Some lat/lng seem to be a default point representing the center of a road, such as the PENNSYLVANIA TPKE
- values like lat/lng are not good inputs for ML
- and the quality issues associated with the data in these columns would not make them good candidates for input into imputation algorithms to fill in other missing values
- One option would be to identify records that possess default lat/lngs, then delete those records.
- Another option would be to delete the lat/lng columns since they will not likely be used for imputation or ML model input
  
**Actions**
- Decision is to delete the lat/lng columns


# Clean / Engineer lat/lng Features

In [54]:
# Drop the lat and lng columns in a single call.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and the drop becomes a no-op instead of raising a KeyError.
df_911 = df_911.drop(columns=['lat', 'lng'], errors='ignore')

df_911.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649696 entries, 0 to 649695
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   desc       649696 non-null  object 
 1   zip        574587 non-null  float64
 2   title      649696 non-null  object 
 3   timeStamp  649696 non-null  object 
 4   twp        649696 non-null  object 
 5   addr       649696 non-null  object 
 6   e          649696 non-null  int64  
 7   twp_type   649696 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 39.7+ MB


In [59]:
# Update the file
# After updating:
# 1. Manually download locally
# 2. Manually upload to a new version of the Kaggle Dataset


# Remove any previous output file first, reporting which case applied
if not os.path.exists(df_out_path):
    print(f"The file '{df_out_path}' does not exist.")
else:
    os.remove(df_out_path)
    print(f"File '{df_out_path}' has been deleted.")

# Write the cleaned dataframe without the index column
df_911.to_csv(df_out_path, index=False)

The file '/kaggle/working/911.csv' does not exist.
