In [2]:
# Notebook: Feature Engineering - timeStamp - 1
# Author: Thomas Purk
# Date: 2025-03-20
# Reference: https://www.kaggle.com/datasets/mchirico/montcoalert

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/emergency-911-calls-mcpa/911.csv


## timeStamp - Data Explortation

In [2]:
# Notebook Step up steps
import re
import warnings
warnings.filterwarnings('ignore')

df_in_path = '/kaggle/input/emergency-911-calls-mcpa/911.csv'
df_out_path = '/kaggle/working/911.csv'

# Load the data 
df_911 = pd.read_csv(df_in_path)

def report_null_empty(df, feature):
    ''' Prints the count and percent of total of null or empty feature (column) values.
    
    Parameters: 
        df (dataframe): A Pandas dataframe which contains the fature.
        feature (string): The name of the feature in the dataframe to report on.
    '''
    null_count = df[feature].isnull().sum()
    empty_count = (df[feature] == "").sum()
    false_count = (df[feature] == False).sum()
    nan_count = (df[feature].isna()).sum()
    print('')
    print(f'{feature}: Null / Empty Report')
    print(f'\tRow count: {len(df)}')
    print(f'\tNull count: {null_count}')
    print(f'\tNull percent: {round(null_count / len(df) * 100,6)}%')
    print(f'\tEmpty count: {empty_count}')
    print(f'\tEmpty precent: {round(empty_count / len(df) * 100,6)}%')
    print(f'\tFalse count: {false_count}')
    print(f'\tFalse precent: {round(false_count / len(df) * 100,6)}%')
    print(f'\tNAN count: {nan_count}')
    print(f'\tNAN precent: {round(nan_count / len(df) * 100,6)}%')

def get_one_offs(df, feature):

    # Count occurrences of each value in feature
    value_counts = df[feature].value_counts()
    
    # Identify values that appear only once
    unique_values = value_counts[value_counts == 1].index
    
    # Filter the DataFrame to include only rows with unique values in feature
    return df[df[feature].isin(unique_values)]

In [3]:
# Display basic information
display(df_911.info())
display(df_911.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649696 entries, 0 to 649695
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   zip           574587 non-null  float64
 1   timeStamp     649696 non-null  object 
 2   twp           649696 non-null  object 
 3   e             649696 non-null  int64  
 4   twp_type      649696 non-null  object 
 5   road_type     649696 non-null  object 
 6   serivce_type  649696 non-null  object 
 7   serivce_desc  649696 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 39.7+ MB


None

Unnamed: 0,zip,timeStamp,twp,e,twp_type,road_type,serivce_type,serivce_desc
0,19525.0,2015-12-10 17:10:52,NEW HANOVER TOWNSHIP,1,township,minor,EMS,BACK PAINS/INJURY
1,19446.0,2015-12-10 17:29:21,HATFIELD TOWNSHIP,1,township,minor,EMS,DIABETIC EMERGENCY
2,19401.0,2015-12-10 14:39:21,NORRISTOWN BOROUGH,1,borough,medium,Fire,GAS-ODOR/LEAK
3,19401.0,2015-12-10 16:47:36,NORRISTOWN BOROUGH,1,borough,minor,EMS,CARDIAC EMERGENCY
4,,2015-12-10 16:56:52,LOWER POTTSGROVE TOWNSHIP,1,township,minor,EMS,DIZZINESS


In [4]:
# Inspect the timeStamp feature
print("### timeStamp ###")
display(df_911['timeStamp'].describe())
report_null_empty(df_911,'timeStamp')

### timeStamp ###


count                  649696
unique                 627920
top       2018-07-09 13:23:16
freq                        8
Name: timeStamp, dtype: object


timeStamp: Null / Empty Report
	Row count: 649696
	Null count: 0
	Null percent: 0.0%
	Empty count: 0
	Empty precent: 0.0%
	False count: 0
	False precent: 0.0%
	NAN count: 0
	NAN precent: 0.0%


In [5]:
df_911['timeStamp'].value_counts()

timeStamp
2018-07-09 13:23:16    8
2018-10-06 19:26:38    8
2019-11-08 20:52:00    7
2019-01-09 21:01:46    7
2020-01-12 01:41:13    7
                      ..
2017-07-16 05:59:22    1
2017-07-16 06:10:41    1
2017-07-16 06:08:28    1
2017-07-16 06:14:08    1
2020-07-29 15:52:46    1
Name: count, Length: 627920, dtype: int64

In [7]:
# NOTE: Create a new datetime column to store value as a datetime data type
df_911['datetime'] = pd.to_datetime(df_911['timeStamp'])


In [8]:
# NOTE: Create a month column to store the month portion of the date
df_911['month'] = df_911['datetime'].dt.month

In [13]:
# Inspect the month feature
print("### month ###")
display(df_911['month'].describe())
report_null_empty(df_911,'month')
display(df_911['month'].value_counts())

### month ###


count    649696.000000
mean          6.334701
std           3.474764
min           1.000000
25%           3.000000
50%           6.000000
75%           9.000000
max          12.000000
Name: month, dtype: float64


month: Null / Empty Report
	Row count: 649696
	Null count: 0
	Null percent: 0.0%
	Empty count: 0
	Empty precent: 0.0%
	False count: 0
	False precent: 0.0%
	NAN count: 0
	NAN precent: 0.0%


month
1     60998
3     58877
6     58824
7     58554
12    56500
5     56299
2     54195
4     52556
10    50195
11    49093
8     47348
9     46257
Name: count, dtype: int64

In [10]:
# NOTE: Create a day of week column to store this aspect of the date
df_911['day_of_week'] = df_911['datetime'].dt.dayofweek

In [14]:
# Inspect the day_of_week feature
print("### day_of_week ###")
display(df_911['day_of_week'].describe())
report_null_empty(df_911,'day_of_week')
display(df_911['day_of_week'].value_counts())

### day_of_week ###


count    649696.000000
mean          2.898568
std           1.950245
min           0.000000
25%           1.000000
50%           3.000000
75%           5.000000
max           6.000000
Name: day_of_week, dtype: float64


day_of_week: Null / Empty Report
	Row count: 649696
	Null count: 0
	Null percent: 0.0%
	Empty count: 0
	Empty precent: 0.0%
	False count: 94820
	False precent: 14.594518%
	NAN count: 0
	NAN precent: 0.0%


day_of_week
4    100847
2     97131
3     96630
1     95690
0     94820
5     87510
6     77068
Name: count, dtype: int64

In [11]:
# NOTE: Create a day and night column to classify the time as day or night
def get_day_night(datetime):
    ''' Determine day time or night time based on the hour of the day.'''
    return 'day' if 5 <= datetime.hour < 21 else 'night'
    
df_911['day_night'] = df_911['datetime'].apply(get_day_night)

In [12]:
# Inspect the day_night feature
print("### day_night ###")
display(df_911['day_night'].describe())
report_null_empty(df_911,'day_night')
display(df_911['day_night'].value_counts())

### day_night ###


count     649696
unique         2
top          day
freq      535582
Name: day_night, dtype: object


day_night: Null / Empty Report
	Row count: 649696
	Null count: 0
	Null percent: 0.0%
	Empty count: 0
	Empty precent: 0.0%
	False count: 0
	False precent: 0.0%
	NAN count: 0
	NAN precent: 0.0%


day_night
day      535582
night    114114
Name: count, dtype: int64

** NOTES **
- Assumption, season of the year, for which month is a proxy could impact the type and frequency of 911 calls
- Assumption, day time or night time could impact the type and frequency of 911 calls
- Assumption, day of week could impact the type and frequency of 911 calls
- So extracted month, day of week, and day/night from timeStamp to create new columns

In [15]:
# Display basic information
display(df_911.info())
display(df_911.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649696 entries, 0 to 649695
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   zip           574587 non-null  float64       
 1   timeStamp     649696 non-null  object        
 2   twp           649696 non-null  object        
 3   e             649696 non-null  int64         
 4   twp_type      649696 non-null  object        
 5   road_type     649696 non-null  object        
 6   serivce_type  649696 non-null  object        
 7   serivce_desc  649696 non-null  object        
 8   datetime      649696 non-null  datetime64[ns]
 9   month         649696 non-null  int32         
 10  day_of_week   649696 non-null  int32         
 11  day_night     649696 non-null  object        
dtypes: datetime64[ns](1), float64(1), int32(2), int64(1), object(7)
memory usage: 54.5+ MB


None

Unnamed: 0,zip,timeStamp,twp,e,twp_type,road_type,serivce_type,serivce_desc,datetime,month,day_of_week,day_night
0,19525.0,2015-12-10 17:10:52,NEW HANOVER TOWNSHIP,1,township,minor,EMS,BACK PAINS/INJURY,2015-12-10 17:10:52,12,3,day
1,19446.0,2015-12-10 17:29:21,HATFIELD TOWNSHIP,1,township,minor,EMS,DIABETIC EMERGENCY,2015-12-10 17:29:21,12,3,day
2,19401.0,2015-12-10 14:39:21,NORRISTOWN BOROUGH,1,borough,medium,Fire,GAS-ODOR/LEAK,2015-12-10 14:39:21,12,3,day
3,19401.0,2015-12-10 16:47:36,NORRISTOWN BOROUGH,1,borough,minor,EMS,CARDIAC EMERGENCY,2015-12-10 16:47:36,12,3,day
4,,2015-12-10 16:56:52,LOWER POTTSGROVE TOWNSHIP,1,township,minor,EMS,DIZZINESS,2015-12-10 16:56:52,12,3,day


In [16]:
# drop the timeStamp & datetime columns
df_911.drop('timeStamp', axis=1, inplace=True)
df_911.drop('datetime', axis=1, inplace=True)

In [17]:
# Update the file
# After Updating
# 1. Manually Download locally
# 2. Manually Updload to a new version of the Kaggle Dataset


# Check if file exists
if os.path.exists(df_out_path):
    os.remove(df_out_path)
    print(f"File '{df_out_path}' has been deleted.")
else:
    print(f"The file '{df_out_path}' does not exist.")

df_911.to_csv(df_out_path, index=False)

The file '/kaggle/working/911.csv' does not exist.
