In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

%matplotlib inline

In [64]:
# Load the raw 911 call records; the path is named so it is easy to change.
DATA_PATH = '911.csv'
df = pd.read_csv(DATA_PATH)

In [65]:
# Preview the first five rows to see the column layout.
df.head(n=5)

Unnamed: 0,lat,lng,desc,zip,title,timeStamp,twp,addr,e
0,40.297876,-75.581294,REINDEER CT & DEAD END; NEW HANOVER; Station ...,19525.0,EMS: BACK PAINS/INJURY,2015-12-10 17:10:52,NEW HANOVER,REINDEER CT & DEAD END,1
1,40.258061,-75.26468,BRIAR PATH & WHITEMARSH LN; HATFIELD TOWNSHIP...,19446.0,EMS: DIABETIC EMERGENCY,2015-12-10 17:29:21,HATFIELD TOWNSHIP,BRIAR PATH & WHITEMARSH LN,1
2,40.121182,-75.351975,HAWS AVE; NORRISTOWN; 2015-12-10 @ 14:39:21-St...,19401.0,Fire: GAS-ODOR/LEAK,2015-12-10 14:39:21,NORRISTOWN,HAWS AVE,1
3,40.116153,-75.343513,AIRY ST & SWEDE ST; NORRISTOWN; Station 308A;...,19401.0,EMS: CARDIAC EMERGENCY,2015-12-10 16:47:36,NORRISTOWN,AIRY ST & SWEDE ST,1
4,40.251492,-75.60335,CHERRYWOOD CT & DEAD END; LOWER POTTSGROVE; S...,,EMS: DIZZINESS,2015-12-10 16:56:52,LOWER POTTSGROVE,CHERRYWOOD CT & DEAD END,1


In [66]:
# Inspect dtypes and non-null counts. In the output below, timeStamp is
# still an object (string) column, and zip and twp have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 663522 entries, 0 to 663521
Data columns (total 9 columns):
lat          663522 non-null float64
lng          663522 non-null float64
desc         663522 non-null object
zip          583323 non-null float64
title        663522 non-null object
timeStamp    663522 non-null object
twp          663229 non-null object
addr         663522 non-null object
e            663522 non-null int64
dtypes: float64(3), int64(1), object(5)
memory usage: 45.6+ MB


In [67]:
# Summary statistics for the numeric columns. In the output below, `e` has
# mean 1 and std 0, i.e. it is constant.
# NOTE(review): lat min 0.0, lng min -119.7 / max 87.85 and zip min 1104 /
# max 77316 sit far from the quartiles -- presumably bad or out-of-area
# records; confirm before using coordinates or zip codes.
df.describe()

Unnamed: 0,lat,lng,zip,e
count,663522.0,663522.0,583323.0,663522.0
mean,40.158162,-75.300105,19236.055791,1.0
std,0.220641,1.672884,298.222637,0.0
min,0.0,-119.698206,1104.0,1.0
25%,40.100344,-75.392735,19038.0,1.0
50%,40.143927,-75.305143,19401.0,1.0
75%,40.229008,-75.211865,19446.0,1.0
max,51.33539,87.854975,77316.0,1.0


In [68]:
# Column `e` is constant (mean 1, std 0 in the summary above), so drop it.
# Rebinding `df` with errors='ignore' instead of inplace=True keeps this
# cell safe to re-run: a second run no longer raises KeyError once the
# column is already gone.
df = df.drop(columns=['e'], errors='ignore')

In [69]:
# Parse the timestamp strings into datetime64[ns].
# pd.to_datetime is used instead of astype('datetime64') because casting to
# a unit-less datetime64 dtype is rejected by pandas 2.x.
df['timeStamp'] = pd.to_datetime(df['timeStamp'])

In [70]:
# Re-check the schema: in the output below, timeStamp is now datetime64[ns]
# and the frame is down to 8 columns after dropping `e`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 663522 entries, 0 to 663521
Data columns (total 8 columns):
lat          663522 non-null float64
lng          663522 non-null float64
desc         663522 non-null object
zip          583323 non-null float64
title        663522 non-null object
timeStamp    663522 non-null datetime64[ns]
twp          663229 non-null object
addr         663522 non-null object
dtypes: datetime64[ns](1), float64(3), object(4)
memory usage: 40.5+ MB


In [71]:
def station_def(x):
    """Extract the station identifier from a call description string.

    The text between the first ``'Station'`` marker and the next ``';'`` is
    taken; if it contains a ``':'``, only the part after the first ``':'``
    is kept; surrounding whitespace is then stripped. This covers forms
    such as ``'...; Station 332; ...'`` and ``'...-Station:STA27;'``.

    Parameters
    ----------
    x : str
        Raw description text. Non-string values (e.g. NaN or None for a
        missing description) are treated as having no station.

    Returns
    -------
    str or float
        The station identifier, or ``np.nan`` when the input is not a
        string, has no ``'Station'`` marker, or the identifier is blank.
    """
    # Guard against missing descriptions: `'Station' in x` would raise
    # TypeError on a float NaN.
    if not isinstance(x, str) or 'Station' not in x:
        return np.nan
    # Text after the first 'Station' marker, up to the next ';'.
    stn = x.split('Station')[1].split(';')[0]
    if ':' in stn:
        stn = stn.split(':')[1]
    stn = stn.strip()
    # A blank identifier carries no information; report it as missing.
    return stn if stn else np.nan
            
# Derive the responding station for every call from its description text.
df['Station'] = df['desc'].map(station_def)

In [72]:
# Treat empty or whitespace-only station strings as missing values.
df['Station'] = df['Station'].replace(to_replace=r'^\s*$', value=np.nan, regex=True)
# A replacement of the literal '0' station with NaN is left disabled here:
# df.Station = df.Station.replace('0', np.nan, regex=False)
df['Station'].unique()

array(['332', '345', 'STA27', '308A', '329', '352', '336', '344', nan,
       '345B', 'STA80', '322A', '385', 'STA23', '339', '313A', 'STA49',
       '308', '313', '324A', 'STA51', 'STA33', 'STA24', '381', '358A',
       '318', '317', '351', '345A', '3A84', 'STA28', 'STA38', '311',
       'STA69', 'STA9', '322', '308B', 'STA98', 'STA88', 'STA26', '331',
       '383', 'STA48', '382', '325', 'STA17', 'STA7', 'STA65', 'STA46',
       '344A', 'STA12', 'STA72', 'STA47', '369', 'STA1', 'STA11', 'STA8',
       '384', 'STA14', '3A85', 'STA99', '324', 'STA59', '355', 'STA53',
       'STA29', 'STA200', 'STA43', 'STA500', 'STA400', 'STA45', 'STA74',
       'STA6', '358', 'STA5', 'STA18', 'STA42', 'STA83', 'STA62', 'STA54',
       '4A114', 'STA82', 'STA58', 'STA76', 'STA67', 'STA36', 'EMS',
       'STA100', 'STA73', 'STA15', 'STA95', 'STA77', 'STA25', 'STA44',
       'STA61', 'STA89', 'STA31', 'STA700', 'STA57', 'STA22', 'STA75',
       'STA79', 'STA2', 'STA87', 'STA37', 'STA4', '6A15', 'STA86',
 

In [73]:
def category_splitter(x):
    """Return the part of a call title before the first ': ' separator.

    A title that contains no ': ' is returned unchanged.
    """
    category, _sep, _rest = x.partition(': ')
    return category
def title_splitter(x):
    """Return the description that follows the first ': ' in a call title.

    Only the segment between the first and second ': ' is returned.
    Raises IndexError when the title contains no ': ' separator.
    """
    pieces = x.split(': ')
    # pieces[0] is the category prefix; pieces[1] is the text after it.
    return pieces[1]

# Split the raw 'CATEGORY: DESCRIPTION' titles into two columns.
# `title` is overwritten with its description part, so an unguarded second
# run would find no ': ' left: Category would be overwritten with the
# description and title_splitter would raise IndexError. The guard makes
# the cell safe to re-run.
if 'Category' not in df.columns:
    # Compute both pieces before assigning, so a failure in either split
    # leaves the frame untouched.
    categories = df['title'].apply(category_splitter)
    descriptions = df['title'].apply(title_splitter)
    df['Category'] = categories
    df['title'] = descriptions
df.head()

Unnamed: 0,lat,lng,desc,zip,title,timeStamp,twp,addr,Station,Category
0,40.297876,-75.581294,REINDEER CT & DEAD END; NEW HANOVER; Station ...,19525.0,BACK PAINS/INJURY,2015-12-10 17:10:52,NEW HANOVER,REINDEER CT & DEAD END,332,EMS
1,40.258061,-75.26468,BRIAR PATH & WHITEMARSH LN; HATFIELD TOWNSHIP...,19446.0,DIABETIC EMERGENCY,2015-12-10 17:29:21,HATFIELD TOWNSHIP,BRIAR PATH & WHITEMARSH LN,345,EMS
2,40.121182,-75.351975,HAWS AVE; NORRISTOWN; 2015-12-10 @ 14:39:21-St...,19401.0,GAS-ODOR/LEAK,2015-12-10 14:39:21,NORRISTOWN,HAWS AVE,STA27,Fire
3,40.116153,-75.343513,AIRY ST & SWEDE ST; NORRISTOWN; Station 308A;...,19401.0,CARDIAC EMERGENCY,2015-12-10 16:47:36,NORRISTOWN,AIRY ST & SWEDE ST,308A,EMS
4,40.251492,-75.60335,CHERRYWOOD CT & DEAD END; LOWER POTTSGROVE; S...,,DIZZINESS,2015-12-10 16:56:52,LOWER POTTSGROVE,CHERRYWOOD CT & DEAD END,329,EMS


In [74]:
# Number of calls in each category.
df['Category'].value_counts()

EMS        332692
Traffic    230208
Fire       100622
Name: Category, dtype: int64

In [76]:
# The station has already been extracted from `desc`, so drop the raw text.
# Rebinding `df` with errors='ignore' instead of inplace=True keeps this
# cell safe to re-run: a second run no longer raises KeyError once the
# column is already gone.
df = df.drop(columns='desc', errors='ignore')
df.head()

Unnamed: 0,lat,lng,zip,title,timeStamp,twp,addr,Station,Category
0,40.297876,-75.581294,19525.0,BACK PAINS/INJURY,2015-12-10 17:10:52,NEW HANOVER,REINDEER CT & DEAD END,332,EMS
1,40.258061,-75.26468,19446.0,DIABETIC EMERGENCY,2015-12-10 17:29:21,HATFIELD TOWNSHIP,BRIAR PATH & WHITEMARSH LN,345,EMS
2,40.121182,-75.351975,19401.0,GAS-ODOR/LEAK,2015-12-10 14:39:21,NORRISTOWN,HAWS AVE,STA27,Fire
3,40.116153,-75.343513,19401.0,CARDIAC EMERGENCY,2015-12-10 16:47:36,NORRISTOWN,AIRY ST & SWEDE ST,308A,EMS
4,40.251492,-75.60335,,DIZZINESS,2015-12-10 16:56:52,LOWER POTTSGROVE,CHERRYWOOD CT & DEAD END,329,EMS


In [77]:
# Count the missing values in each column.
df.isna().sum()

lat               0
lng               0
zip           80199
title             0
timeStamp         0
twp             293
addr              0
Station      230217
Category          0
dtype: int64