In [None]:
# Notebook: Feature Engineering - lat/lng - 1
# Author: Thomas Purk
# Date: 2025-03-17
# Reference: https://www.kaggle.com/datasets/mchirico/montcoalert

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Notebook setup steps

# Install a blanket "ignore" filter so no warnings are shown from here on
import warnings
warnings.filterwarnings(action='ignore')

# Source CSV (Kaggle input directory) and destination CSV (Kaggle working directory)
df_in_path = '/kaggle/input/emergency-911-calls-mcpa/911.csv'
df_out_path = '/kaggle/working/911.csv'

# Load the data
# This time start with the account-owned copy of the dataset
df_911 = pd.read_csv(filepath_or_buffer=df_in_path)

def report_null_empty(df, feature):
    ''' Prints the count and percent of total of null, empty, False and NaN feature (column) values.

    Parameters:
        df (dataframe): A Pandas dataframe which contains the feature.
        feature (string): The name of the feature in the dataframe to report on.

    Returns:
        None. The report is written to standard output.
    '''
    column = df[feature]
    row_count = len(df)

    def _percent(count):
        # An empty dataframe would otherwise compute 0 / 0 and print "nan%".
        if row_count == 0:
            return 0.0
        return round(count / row_count * 100, 6)

    # NOTE: isnull() is an alias of isna(), so the Null and NAN rows always agree;
    #       both are kept so the report layout is unchanged.
    # NOTE: the element-wise `== False` also matches numeric zeros (0 == False),
    #       which is how 0.0 coordinates show up in the False count.
    counts = (
        ('Null', column.isnull().sum()),
        ('Empty', (column == "").sum()),
        ('False', (column == False).sum()),
        ('NAN', column.isna().sum()),
    )

    print('')
    print(f'{feature}: Null / Empty Report')
    print(f'\tRow count: {row_count}')
    for label, count in counts:
        print(f'\t{label} count: {count}')
        print(f'\t{label} percent: {_percent(count)}%')

def get_one_offs(df, feature):
    ''' Returns the rows whose feature (column) value occurs exactly once in the dataframe.

    Parameters:
        df (dataframe): A Pandas dataframe which contains the feature.
        feature (string): The name of the feature in the dataframe to inspect.

    Returns:
        dataframe: The subset of df rows whose feature value appears only one time.
    '''
    column = df[feature]

    # Tally how often each distinct value shows up
    frequencies = column.value_counts()

    # Keep just the values that were seen a single time
    singletons = frequencies.loc[frequencies.eq(1)].index

    # Select the rows carrying one of those values
    keep_mask = column.isin(singletons)
    return df[keep_mask]

In [None]:
# Display basic information
# NOTE: DataFrame.info() prints its report and returns None, so it is called
#       directly; wrapping it in display() would also render a stray "None".
df_911.info()
display(df_911.head())

## lat/lng - Data Exploration

In [None]:
# Inspect the lat feature: summary statistics first, then the null / empty report
print("### lat ###")
lat_summary = df_911['lat'].describe()
display(lat_summary)
report_null_empty(df=df_911, feature='lat')

In [None]:
# NOTE: Assume decimal degree coordinates are in WGS84, but could be other
# NOTE: WGS84 latitude ranges for Montgomery County are about
#    - lat : 39.92845686753457,  40.49685656154363,
# NOTE: Zero degree latitude is the equator. 
# NOTE: Positive 51 degrees latitude is in Quebec, Canada
# NOTE: The mean around 40 degrees is about right for Montgomery County PA
# NOTE: Data contains values outside the expected range

In [None]:
# Inspect the lng feature: summary statistics first, then the null / empty report
print("### lng ###")
lng_summary = df_911['lng'].describe()
display(lng_summary)
report_null_empty(df=df_911, feature='lng')

In [None]:
# NOTE: Assume decimal degree coordinates are in WGS84, but could be other
# NOTE: WGS84 longitude ranges for Montgomery County are about
#    - lng : -75.72962906989237, -74.99629164995669
# NOTE: The Western Hemisphere (which contains Pennsylvania) always has negative longitudes
# NOTE: So there are some incorrect values in this data

In [None]:
# There are 2 records where lat/lng is reported as False (the 0,0 records)

lat_is_false = df_911['lat'].eq(False)
lng_is_false = df_911['lng'].eq(False)
df_false_lat_lng = df_911.loc[lat_is_false | lng_is_false]

display(df_false_lat_lng)
display(df_false_lat_lng['addr'])

# The addr = 'RAMP EGYPT RD TO RT422  & EGYPT RD'
# Get all lat/lngs from records with this addr
same_addr = df_911['addr'].eq('RAMP EGYPT RD TO RT422  & EGYPT RD')
lat_lngs = df_911.loc[same_addr, ['lat', 'lng']]
display(lat_lngs.value_counts())

# NOTE: All other records with the same addr as the two 0,0 records share one identical lat/lng value
# lat        lng       
# 40.136973  -75.472723    30
# 0.000000    0.000000      2

In [None]:
# Inspect the records with out of bounds lat/lng

# Define Min/Max Geospatial bounds
min_lat = 39.92845686753457
max_lat = 40.49685656154363
min_lng = -75.72962906989237
max_lng = -74.99629164995669

# Boolean mask: True when either coordinate falls outside its range
# NOTE: comparisons against NaN evaluate to False, so rows with a missing
#       lat or lng are not flagged by this mask.
bad_lat_lng_indexes = (min_lat > df_911['lat']) | (df_911['lat'] > max_lat) | (min_lng > df_911['lng']) | (df_911['lng'] > max_lng)

# Filter once and reuse the result instead of re-filtering for every display
df_bad_lat_lng = df_911[bad_lat_lng_indexes]
display(df_bad_lat_lng.head())
# info() prints its report and returns None, so do not wrap it in display()
df_bad_lat_lng.info()

# Show count of addr
print('')
print(df_bad_lat_lng['addr'].value_counts().to_string())

In [None]:
# Validating with Google Maps

# lat        lng       
# 40.160007  -77.686817 
# Seems to be a default location on PENNSYLVANIA TPKE midpoint in PA

# lat        lng    
# 32.387090	-86.276106
# Seems to be the centroid for Montgomery, Alabama. Maybe the system matched the record text "Montgomery" incorrectly


# Tally how many records share each exact lat/lng pair
lat_lngs = df_911.loc[:, ['lat', 'lng']]
lat_lng_counts = lat_lngs.value_counts()
display(lat_lng_counts)



In [None]:
# Show the records whose lat matches 40.097222 within a tight relative tolerance
lat_matches = np.isclose(df_911['lat'], 40.097222, rtol=1e-7)
df_911[lat_matches]

## Accumulated Notes
- lat / lng features appear to be very unreliable
- Many lat/lng values are repeated, very many times but show different location descriptions in the 'addr' column
- These high frequency repeated values could be the centroid of another data item, such as township
- Some lat/lng values seem to be defaults, such as the centroid of Montgomery, Alabama, possibly assigned when the record text contains the word "Montgomery"
- Some lat/lng values seem to be a default point representing the center of a road, such as the PENNSYLVANIA TPKE
- values like lat/lng are not good inputs for ML
- and the quality issues associated with the data in these columns would not make them good candidates for input into imputation algorithms to fill in other missing values
- One option would be to identify records that possess default lat/lngs, then delete those records.
- Another option would be to delete the lat/lng columns since they will not likely be used for imputation or ML model input
  
**Actions**
- Decision is to delete the lat/lng columns


# Clean / Engineer lat/lng Features

In [None]:
# Drop the lat and lng columns.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it no longer raises a KeyError once
# the columns have already been removed.
df_911 = df_911.drop(columns=['lat', 'lng'], errors='ignore')

df_911.info()

In [None]:
# Update the file
# After Updating
# 1. Manually Download locally
# 2. Manually upload to a new version of the Kaggle Dataset


# Remove any file already present at the output path, reporting what happened
output_already_present = os.path.exists(df_out_path)
if not output_already_present:
    print(f"The file '{df_out_path}' does not exist.")
else:
    os.remove(df_out_path)
    print(f"File '{df_out_path}' has been deleted.")

# Write the dataframe to the output path, omitting the index
df_911.to_csv(path_or_buf=df_out_path, index=False)