In [1]:
# Notebook: Feature Engineering - title - 1
# Author: Thomas Purk
# Date: 2025-03-20
# Reference: https://www.kaggle.com/datasets/mchirico/montcoalert

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/emergency-911-calls-mcpa/911.csv


# title - Data Engineering

In [3]:
# Notebook setup steps

import warnings
# Install a blanket "ignore" filter: no warning category is shown after this point.
warnings.filterwarnings('ignore')

# Source CSV (from the attached Kaggle dataset) and the destination CSV in the working directory
df_in_path = '/kaggle/input/emergency-911-calls-mcpa/911.csv'
df_out_path = '/kaggle/working/911.csv'

# Load the data
df_911 = pd.read_csv(df_in_path)

def report_null_empty(df, feature):
    ''' Prints the count and percent of total of null, empty-string and False feature (column) values.

    Parameters:
        df (dataframe): A Pandas dataframe which contains the feature.
        feature (string): The name of the feature in the dataframe to report on.

    Returns:
        None. The report is written to standard output.
    '''
    values = df[feature]
    row_count = len(df)

    def percent_of_rows(count):
        # An empty dataframe has no meaningful ratio; report 0.0 instead of dividing by zero
        # (which previously produced "nan%" plus a RuntimeWarning).
        if row_count == 0:
            return 0.0
        return round(count / row_count * 100, 6)

    # isnull() is an alias of isna(), so a single computation backs both the
    # "Null" and the "NAN" lines of the report.
    null_count = values.isna().sum()
    nan_count = null_count
    empty_count = (values == "").sum()
    # Element-wise equality with False: numeric zeros also compare equal to False.
    false_count = (values == False).sum()

    print('')
    print(f'{feature}: Null / Empty Report')
    print(f'\tRow count: {row_count}')
    print(f'\tNull count: {null_count}')
    print(f'\tNull percent: {percent_of_rows(null_count)}%')
    print(f'\tEmpty count: {empty_count}')
    print(f'\tEmpty precent: {percent_of_rows(empty_count)}%')
    print(f'\tFalse count: {false_count}')
    print(f'\tFalse precent: {percent_of_rows(false_count)}%')
    print(f'\tNAN count: {nan_count}')
    print(f'\tNAN precent: {percent_of_rows(nan_count)}%')

def get_one_offs(df, feature):
    ''' Returns the rows whose feature (column) value occurs exactly once in the dataframe.

    Parameters:
        df (dataframe): A Pandas dataframe which contains the feature.
        feature (string): The name of the feature in the dataframe to inspect.

    Returns:
        dataframe: The subset of df rows holding a value seen only once. Missing
        values are never selected because value_counts() leaves them out.
    '''
    column = df[feature]

    # Tally how often each distinct value occurs
    occurrences = column.value_counts()

    # Keep only the values with a tally of exactly one
    singletons = occurrences.loc[occurrences.eq(1)].index

    # Select the rows carrying one of those single-occurrence values
    return df.loc[column.isin(singletons)]

In [4]:
# Display basic information
# DataFrame.info() prints its summary itself and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the cell output.
df_911.info()
display(df_911.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649696 entries, 0 to 649695
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   zip        574587 non-null  float64
 1   title      649696 non-null  object 
 2   timeStamp  649696 non-null  object 
 3   twp        649696 non-null  object 
 4   e          649696 non-null  int64  
 5   twp_type   649696 non-null  object 
 6   road_type  649696 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 34.7+ MB


None

Unnamed: 0,zip,title,timeStamp,twp,e,twp_type,road_type
0,19525.0,EMS: BACK PAINS/INJURY,2015-12-10 17:10:52,NEW HANOVER TOWNSHIP,1,township,minor
1,19446.0,EMS: DIABETIC EMERGENCY,2015-12-10 17:29:21,HATFIELD TOWNSHIP,1,township,minor
2,19401.0,Fire: GAS-ODOR/LEAK,2015-12-10 14:39:21,NORRISTOWN BOROUGH,1,borough,medium
3,19401.0,EMS: CARDIAC EMERGENCY,2015-12-10 16:47:36,NORRISTOWN BOROUGH,1,borough,minor
4,,EMS: DIZZINESS,2015-12-10 16:56:52,LOWER POTTSGROVE TOWNSHIP,1,township,minor


In [5]:
# Inspect the title feature
print("### title ###")
display(df_911['title'].describe())
report_null_empty(df_911,'title')

### title ###


count                          649696
unique                            147
top       Traffic: VEHICLE ACCIDENT -
freq                           145850
Name: title, dtype: object


title: Null / Empty Report
	Row count: 649696
	Null count: 0
	Null percent: 0.0%
	Empty count: 0
	Empty precent: 0.0%
	False count: 0
	False precent: 0.0%
	NAN count: 0
	NAN precent: 0.0%


In [7]:
# Count how often each distinct title value occurs
vc = df_911['title'].value_counts()
print(vc)

title
Traffic: VEHICLE ACCIDENT -    145850
Traffic: DISABLED VEHICLE -     46663
Fire: FIRE ALARM                37729
EMS: FALL VICTIM                33730
EMS: RESPIRATORY EMERGENCY      33409
                                ...  
EMS: DISABLED VEHICLE               1
Fire: ANIMAL COMPLAINT              1
EMS: HIT + RUN                      1
Fire: DIABETIC EMERGENCY            1
Fire: BARRICADED SUBJECT            1
Name: count, Length: 147, dtype: int64


In [25]:
# NOTE: There seem to be two features separated by a ":"
# NOTE: Some values have " - " at the end. Removing it merges them into the group without the suffix.

# n=1 splits on the first ":" only, so the result has at most two columns
# (service type, description) and a description containing its own ":" is kept whole.
title_split = df_911['title'].str.split(':', n=1, expand=True)
df_911['serivce_type'] = title_split[0].str.strip()
# rstrip(' -') removes any run of trailing spaces and hyphens, not only the literal " -" suffix.
df_911['serivce_desc'] = title_split[1].str.strip().str.rstrip(' -')

In [26]:
# Inspect the serivce_type feature: summary statistics, null / empty report, then per-value counts
print("### serivce_type ###")
display(df_911['serivce_type'].describe())
report_null_empty(df_911,'serivce_type')
print('')
display(df_911['serivce_type'].value_counts())

### serivce_type ###


count     649696
unique         3
top          EMS
freq      325044
Name: serivce_type, dtype: object


serivce_type: Null / Empty Report
	Row count: 649696
	Null count: 0
	Null percent: 0.0%
	Empty count: 0
	Empty precent: 0.0%
	False count: 0
	False precent: 0.0%
	NAN count: 0
	NAN precent: 0.0%



serivce_type
EMS        325044
Traffic    225686
Fire        98966
Name: count, dtype: int64

In [27]:
# Inspect the serivce_desc feature
print("### serivce_desc ###")
display(df_911['serivce_desc'].describe())
report_null_empty(df_911,'serivce_desc')
print('')
# Order the counts by description (the index) and print them as one plain-text string
print(df_911['serivce_desc'].value_counts().sort_index().to_string())

### serivce_desc ###


count               649696
unique                  88
top       VEHICLE ACCIDENT
freq                181034
Name: serivce_desc, dtype: object


serivce_desc: Null / Empty Report
	Row count: 649696
	Null count: 0
	Null percent: 0.0%
	Empty count: 0
	Empty precent: 0.0%
	False count: 0
	False precent: 0.0%
	NAN count: 0
	NAN precent: 0.0%

serivce_desc
ABDOMINAL PAINS                   8802
ACTIVE SHOOTER                       3
ALLERGIC REACTION                 2815
ALTERED MENTAL STATUS             9896
AMPUTATION                          93
ANIMAL BITE                        572
ANIMAL COMPLAINT                     1
APPLIANCE FIRE                    1240
ARMED SUBJECT                        2
ASSAULT VICTIM                    4112
BACK PAINS/INJURY                 4775
BARRICADED SUBJECT                   3
BOMB DEVICE FOUND                   10
BOMB THREAT                          2
BUILDING FIRE                     5875
BURN VICTIM                        493
CARBON MONOXIDE DETECTOR          4334
CARDIAC ARREST                    6587
CARDIAC EMERGENCY                31708
CHOKING                           1205
CVA/STROKE

In [28]:
# Drop the raw title column now that it has been split into serivce_type / serivce_desc.
# Rebinding (instead of inplace=True) with errors='ignore' keeps this cell safe to re-run:
# a second execution finds no 'title' column and leaves the frame unchanged instead of raising KeyError.
df_911 = df_911.drop(columns='title', errors='ignore')

In [29]:
# Display basic information
# DataFrame.info() prints its summary itself and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the cell output.
df_911.info()
display(df_911.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649696 entries, 0 to 649695
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   zip           574587 non-null  float64
 1   timeStamp     649696 non-null  object 
 2   twp           649696 non-null  object 
 3   e             649696 non-null  int64  
 4   twp_type      649696 non-null  object 
 5   road_type     649696 non-null  object 
 6   serivce_type  649696 non-null  object 
 7   serivce_desc  649696 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 39.7+ MB


None

Unnamed: 0,zip,timeStamp,twp,e,twp_type,road_type,serivce_type,serivce_desc
0,19525.0,2015-12-10 17:10:52,NEW HANOVER TOWNSHIP,1,township,minor,EMS,BACK PAINS/INJURY
1,19446.0,2015-12-10 17:29:21,HATFIELD TOWNSHIP,1,township,minor,EMS,DIABETIC EMERGENCY
2,19401.0,2015-12-10 14:39:21,NORRISTOWN BOROUGH,1,borough,medium,Fire,GAS-ODOR/LEAK
3,19401.0,2015-12-10 16:47:36,NORRISTOWN BOROUGH,1,borough,minor,EMS,CARDIAC EMERGENCY
4,,2015-12-10 16:56:52,LOWER POTTSGROVE TOWNSHIP,1,township,minor,EMS,DIZZINESS


In [30]:
# Update the file
# After updating:
# 1. Manually download the file locally
# 2. Manually upload it as a new version of the Kaggle Dataset


# Clear out any previous export, reporting which case applied
if not os.path.exists(df_out_path):
    print(f"The file '{df_out_path}' does not exist.")
else:
    os.remove(df_out_path)
    print(f"File '{df_out_path}' has been deleted.")

# Write the engineered frame without the row index column
df_911.to_csv(df_out_path, index=False)

The file '/kaggle/working/911.csv' does not exist.
