In [42]:
import pandas as pd
import numpy as np
import datetime as dt
from pandas import Series
from time import sleep
from geopy.geocoders import Nominatim
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

#Load the 911 call records from the CSV file that sits next to this notebook
CSV_PATH = "./911.csv"
df = pd.read_csv(CSV_PATH)

#Preview the first five rows of the raw data
df.head(5)

Unnamed: 0,lat,lng,desc,zip,title,timeStamp,twp,addr,e
0,40.297876,-75.581294,REINDEER CT & DEAD END; NEW HANOVER; Station ...,19525.0,EMS: BACK PAINS/INJURY,2015-12-10 17:10:52,NEW HANOVER,REINDEER CT & DEAD END,1
1,40.258061,-75.26468,BRIAR PATH & WHITEMARSH LN; HATFIELD TOWNSHIP...,19446.0,EMS: DIABETIC EMERGENCY,2015-12-10 17:29:21,HATFIELD TOWNSHIP,BRIAR PATH & WHITEMARSH LN,1
2,40.121182,-75.351975,HAWS AVE; NORRISTOWN; 2015-12-10 @ 14:39:21-St...,19401.0,Fire: GAS-ODOR/LEAK,2015-12-10 14:39:21,NORRISTOWN,HAWS AVE,1
3,40.116153,-75.343513,AIRY ST & SWEDE ST; NORRISTOWN; Station 308A;...,19401.0,EMS: CARDIAC EMERGENCY,2015-12-10 16:47:36,NORRISTOWN,AIRY ST & SWEDE ST,1
4,40.251492,-75.60335,CHERRYWOOD CT & DEAD END; LOWER POTTSGROVE; S...,,EMS: DIZZINESS,2015-12-10 16:56:52,LOWER POTTSGROVE,CHERRYWOOD CT & DEAD END,1


In [43]:
#Split timeStamp ("YYYY-MM-DD HH:MM:SS") into separate date and time columns.
#The date is exactly the first 10 characters; slicing 11 would also capture the
#space that separates the date from the time.
df['date'] = df.timeStamp.str[0:10]
df['time'] = df.timeStamp.str[-8:]

#Get rid of dummy 'e' column and the now-redundant 'timeStamp' column
del df['e']
del df['timeStamp']
#'desc' is dropped as well. Author's open idea (not implemented): when desc ends
#with a time, try to extract the station number from it and impute.
del df['desc']

#sample of dataset
df.head(10)

Unnamed: 0,lat,lng,zip,title,twp,addr,date,time
0,40.297876,-75.581294,19525.0,EMS: BACK PAINS/INJURY,NEW HANOVER,REINDEER CT & DEAD END,2015-12-10,17:10:52
1,40.258061,-75.26468,19446.0,EMS: DIABETIC EMERGENCY,HATFIELD TOWNSHIP,BRIAR PATH & WHITEMARSH LN,2015-12-10,17:29:21
2,40.121182,-75.351975,19401.0,Fire: GAS-ODOR/LEAK,NORRISTOWN,HAWS AVE,2015-12-10,14:39:21
3,40.116153,-75.343513,19401.0,EMS: CARDIAC EMERGENCY,NORRISTOWN,AIRY ST & SWEDE ST,2015-12-10,16:47:36
4,40.251492,-75.60335,,EMS: DIZZINESS,LOWER POTTSGROVE,CHERRYWOOD CT & DEAD END,2015-12-10,16:56:52
5,40.253473,-75.283245,19446.0,EMS: HEAD INJURY,LANSDALE,CANNON AVE & W 9TH ST,2015-12-10,15:39:04
6,40.182111,-75.127795,19044.0,EMS: NAUSEA/VOMITING,HORSHAM,LAUREL AVE & OAKDALE AVE,2015-12-10,16:46:48
7,40.217286,-75.405182,19426.0,EMS: RESPIRATORY EMERGENCY,SKIPPACK,COLLEGEVILLE RD & LYWISKI RD,2015-12-10,16:17:05
8,40.289027,-75.39959,19438.0,EMS: SYNCOPAL EPISODE,LOWER SALFORD,MAIN ST & OLD SUMNEYTOWN PIKE,2015-12-10,16:51:42
9,40.102398,-75.291458,19462.0,Traffic: VEHICLE ACCIDENT -,PLYMOUTH,BLUEROUTE & RAMP I476 NB TO CHEMICAL RD,2015-12-10,17:35:41


In [44]:
#count number of non-null values in each column;
#a column with fewer entries than the others contains missing data
df.count()

lat      177755
lng      177755
zip      155957
title    177755
twp      177694
addr     177755
date     177755
time     177755
dtype: int64

In [45]:
#Number of rows that contain at least one missing value
#(a row with several missing values is still counted only once)
rows_with_missing = df.isnull().any(axis=1)
rows_with_missing.sum()

21822

In [46]:
#Drop rows with missing zipcode values and convert the remaining float zipcodes
#to integer type. Both steps are chained so the column is never assigned on the
#filtered frame returned by dropna (the pattern that triggers SettingWithCopyWarning).
#Note: the surviving rows keep their original index labels, so the index is no
#longer contiguous after this cell.
df = df.dropna(subset=['zip']).astype({'zip': int})
df.count()

lat      155957
lng      155957
zip      155957
title    155957
twp      155933
addr     155957
date     155957
time     155957
dtype: int64

In [47]:
#Locate every missing cell as parallel arrays of (row position, column position)
empty = np.where(pd.isnull(df))

#Nominatim requires an identifying user agent (mandatory from geopy 2.0 on)
geolocator = Nominatim(user_agent="911-calls-notebook")

#Look up column positions by name instead of hard-coding 0, 1 and 4
lat_col = df.columns.get_loc('lat')
lng_col = df.columns.get_loc('lng')
twp_col = df.columns.get_loc('twp')

#Impute the missing township values by reverse geocoding each call's coordinates.
#zip() over the position arrays simply does nothing when no cell is missing, so the
#cell can be re-run safely (np.nditer raises on zero-sized operands).
for row, column in zip(*empty):

    #Only townships are imputed. Imputing zipcodes the same way was judged too slow
    #(approx. 4-6 hrs.) by the author; rows without a zipcode are dropped earlier instead.
    #Checking the column first avoids a network request for cells that are not imputed.
    if column != twp_col:
        continue

    #Pass plain floats: repr() of a numpy float is not a bare number on every numpy version
    coords = (float(df.iloc[row, lat_col]), float(df.iloc[row, lng_col]))
    location = geolocator.reverse(coords, timeout = 60)

    #Stay within the public Nominatim limit of one request per second
    sleep(1)

    #extract township value from the location's address dictionary;
    #'city' is tried first, smaller place types serve as fallbacks when it is absent
    address = location.raw.get('address', {}) if location is not None else {}
    town = address.get('city') or address.get('town') or address.get('village')

    if town is None:
        #leave the cell missing rather than abort the whole loop
        print("No township found for row position {}".format(row))
        continue

    #remove 'Township' ending (and the space before it) from name of town
    #NOTE(review): existing labels such as 'HATFIELD TOWNSHIP' keep the suffix, so an
    #imputed 'HATFIELD' becomes a separate category - confirm this is intended
    if town.endswith("Township"):
        town = town[:-len("Township")].rstrip()

    #convert to uppercase to maintain township format in dataframe
    town = town.upper()

    #put imputed township name into corresponding missing cell of dataframe
    df.iloc[row, column] = town

    print(df.iloc[row, column])

UPPER MORELAND
WARRINGTON
NORRISTOWN
NORRISTOWN
MONTGOMERY
UPPER MERION
TOWAMENCIN
HATBORO
HORSHAM
HATFIELD
JENKINTOWN
LOWER MERION
FRANCONIA
HATBORO
POTTSTOWN
SKIPPACK
SKIPPACK
UPPER MORELAND
NORRISTOWN
NORRISTOWN
UPPER SALFORD
SPRINGFIELD
UPPER MERION
UPPER PROVIDENCE


In [48]:
#Re-count the non-null values per column to check the result of the township imputation
df.count()

lat      155957
lng      155957
zip      155957
title    155957
twp      155957
addr     155957
date     155957
time     155957
dtype: int64

In [49]:
#Hour of the call (0-23), parsed from the leading "HH" of the time string
hour2 = pd.to_numeric(df.time.str[0:2])

#If time of call is between 6PM and 6AM then it is classified as 'night', otherwise it is classified as 'day'
#Computed in one vectorised step instead of a Python-level loop over every row.
is_night = (hour2 >= 18) | (hour2 < 6)

#Reuse df's index so the labels line up with the rows when assigned back as a column
hour = Series(np.where(is_night, 'night', 'day'), index=df.index)


In [50]:
#Swap the military-time column for the 'night'/'day' label computed above
df = df.drop('time', axis=1)
df['time_of_day'] = hour

In [51]:
#Change date format to weekdays format (e.g. 'Thursday')
#Series.dt.weekday_name was removed in pandas 1.0; dt.day_name() is its replacement.
#Parsing inline also avoids the temporary 'dates' column.
df['weekday'] = pd.to_datetime(df['date']).dt.day_name()

del df['date']

In [52]:
#Separate first part of 911 call classification from rest of title:
#keep only the text before the first ':' (e.g. 'EMS', 'Fire', 'Traffic').
#Taking element 0 of the split replaces tuple-unpacking the .str accessor, and n is
#passed by keyword; neither the unpacking nor a positional n works in current pandas.
df['class'] = df['title'].str.split(':', n=1).str[0]
del df['title']


In [53]:
#Preview the frame after the time_of_day, weekday and class transformations
df.head(10)

Unnamed: 0,lat,lng,zip,twp,addr,time_of_day,weekday,class
0,40.297876,-75.581294,19525,NEW HANOVER,REINDEER CT & DEAD END,day,Thursday,EMS
1,40.258061,-75.26468,19446,HATFIELD TOWNSHIP,BRIAR PATH & WHITEMARSH LN,day,Thursday,EMS
2,40.121182,-75.351975,19401,NORRISTOWN,HAWS AVE,day,Thursday,Fire
3,40.116153,-75.343513,19401,NORRISTOWN,AIRY ST & SWEDE ST,day,Thursday,EMS
5,40.253473,-75.283245,19446,LANSDALE,CANNON AVE & W 9TH ST,day,Thursday,EMS
6,40.182111,-75.127795,19044,HORSHAM,LAUREL AVE & OAKDALE AVE,day,Thursday,EMS
7,40.217286,-75.405182,19426,SKIPPACK,COLLEGEVILLE RD & LYWISKI RD,day,Thursday,EMS
8,40.289027,-75.39959,19438,LOWER SALFORD,MAIN ST & OLD SUMNEYTOWN PIKE,day,Thursday,EMS
9,40.102398,-75.291458,19462,PLYMOUTH,BLUEROUTE & RAMP I476 NB TO CHEMICAL RD,day,Thursday,Traffic
11,40.084161,-75.308386,19428,PLYMOUTH,BROOK RD & COLWELL LN,day,Thursday,Traffic


In [54]:
#List the distinct zipcodes that occur in the dataset
df['zip'].unique()

array([19525, 19446, 19401, 19044, 19426, 19438, 19462, 19428, 19040,
       19027, 18936, 18974, 19031, 19403, 19422, 19085, 18964, 19038,
       19406, 19468, 19010, 19095, 19464, 19444, 19041, 19440, 19405,
       19002, 19096, 19454, 19465, 19004, 19066, 19072, 18041, 19046,
       19090, 19012, 19025, 19473, 18073, 18969, 18074, 19460, 19001,
       18054, 19009, 19006, 19035, 19150, 19075, 19034, 19151, 19453,
       19003, 18914, 19512, 18976, 19120, 18915, 18076, 19477, 19087,
       18966, 19131, 19128, 19083, 19053, 19475, 18960, 19504, 18070,
       19492, 18932, 19118, 18092, 19490, 19518, 18056, 19119, 19107,
       17752, 19111, 18927, 19435, 18951, 19472, 19503, 19126, 19505,
       19423, 19138, 36107, 18036, 19116, 19139, 19129, 19115, 19355,
       77316, 19457, 19082, 19127, 19443, 17555, 19520, 19063, 19020, 19404], dtype=int64)

In [55]:
#How many distinct zipcodes are there?
s = df['zip']
zip_unique = len(s.unique())
print(zip_unique)

109


In [56]:
#List the distinct township names that occur in the dataset
df['twp'].unique()

array(['NEW HANOVER', 'HATFIELD TOWNSHIP', 'NORRISTOWN', 'LANSDALE',
       'HORSHAM', 'SKIPPACK', 'LOWER SALFORD', 'PLYMOUTH',
       'UPPER MORELAND', 'CHELTENHAM', 'MONTGOMERY', 'WHITEMARSH',
       'UPPER GWYNEDD', 'LOWER PROVIDENCE', 'WHITPAIN', 'DELAWARE COUNTY',
       'FRANCONIA', 'WEST CONSHOHOCKEN', 'UPPER MERION', 'LIMERICK',
       'DOUGLASS', 'LOWER MERION', 'POTTSTOWN', 'BRIDGEPORT', 'TOWAMENCIN',
       'AMBLER', 'LOWER POTTSGROVE', 'CHESTER COUNTY', 'UPPER HANOVER',
       'SPRINGFIELD', 'ROCKLEDGE', 'ABINGTON', 'WEST NORRITON',
       'ROYERSFORD', 'UPPER DUBLIN', 'UPPER SALFORD', 'CONSHOHOCKEN',
       'PENNSBURG', 'TELFORD', 'EAST NORRITON', 'UPPER FREDERICK',
       'UPPER PROVIDENCE', 'SALFORD', 'LEHIGH COUNTY', 'MARLBOROUGH',
       'BRYN ATHYN', 'LOWER MORELAND', 'HATBORO', 'LOWER GWYNEDD',
       'WORCESTER', 'COLLEGEVILLE', 'SCHWENKSVILLE', 'SOUDERTON',
       'PERKIOMEN', 'LOWER FREDERICK', 'BUCKS COUNTY', 'RED HILL',
       'WEST POTTSGROVE', 'UPPER POTTSGROV

In [57]:
#How many distinct townships are there?
s2 = df['twp']
twp_unique = len(s2.unique())
print(twp_unique)

70


In [58]:
#How many distinct address locations are there?
s4 = df['addr']
len(s4.unique())

25162

In [59]:
#How many distinct latitude values are there?
sLat = df['lat']
len(sLat.unique())

16968

In [60]:
#How many distinct longitude values are there?
sLong = df['lng']
len(sLong.unique())

16992

In [61]:
#find max, min latitudes and longitudes
#sLat.max()
#sLat.min()
#sLong.max()
#sLong.min()

In [62]:
#Change string data to label encoded integers for the BaggingClassifier().
#Each fitted encoder is kept under its own enc_* name so the integer codes can be
#mapped back to the original strings with inverse_transform(); previously the
#encoder objects were overwritten by their own output and lost.
enc_dow = preprocessing.LabelEncoder()
enc_tod = preprocessing.LabelEncoder()
enc_addr = preprocessing.LabelEncoder()
enc_twp = preprocessing.LabelEncoder()
enc_class = preprocessing.LabelEncoder()

#The le_* names hold the encoded integer arrays (one value per row of df)
le_dow = enc_dow.fit_transform(df['weekday'])
le_tod = enc_tod.fit_transform(df['time_of_day'])
le_addr = enc_addr.fit_transform(df['addr'])
le_twp = enc_twp.fit_transform(df['twp'])
le_class = enc_class.fit_transform(df['class'])

In [63]:
#df2 is a second name for the same DataFrame (no copy is made),
#so every change below is also visible through df
df2 = df

#String columns paired with their label encoded replacements
encoded_columns = [
    ('weekday', le_dow),
    ('time_of_day', le_tod),
    ('addr', le_addr),
    ('twp', le_twp),
    ('class', le_class),
]

#Remove all of the old string columns first ...
for name, _ in encoded_columns:
    del df2[name]

#... then append the encoded versions, which places them at the end of the frame
for name, codes in encoded_columns:
    df2[name] = codes

df2.head(10)

Unnamed: 0,lat,lng,zip,weekday,time_of_day,addr,twp,class
0,40.297876,-75.581294,19525,4,0,18187,35,0
1,40.258061,-75.26468,19446,4,0,2398,19,0
2,40.121182,-75.351975,19401,4,0,9768,36,1
3,40.116153,-75.343513,19401,4,0,615,36,0
5,40.253473,-75.283245,19446,4,0,3245,22,0
6,40.182111,-75.127795,19044,4,0,12164,20,0
7,40.217286,-75.405182,19426,4,0,4540,48,0
8,40.289027,-75.39959,19438,4,0,13272,31,0
9,40.102398,-75.291458,19462,4,0,2201,41,2
11,40.084161,-75.308386,19428,4,0,2606,41,2


In [70]:
#Feature columns used as model input; 'class' is the prediction target
feature_cols = ['lat', 'lng', 'zip', 'twp', 'addr', 'time_of_day', 'weekday']
x = df[feature_cols].copy()
y = df[['class']].copy()

#Stratified split so both parts keep the class proportions; fixed seed for repeatability
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, random_state=30)

In [71]:
#Preview the first rows of the feature matrix
x.head()

Unnamed: 0,lat,lng,zip,twp,addr,time_of_day,weekday
0,40.297876,-75.581294,19525,35,18187,0,4
1,40.258061,-75.26468,19446,19,2398,0,4
2,40.121182,-75.351975,19401,36,9768,0,4
3,40.116153,-75.343513,19401,36,615,0,4
5,40.253473,-75.283245,19446,22,3245,0,4


In [79]:
#Preview the first rows of the target column
y.head()

Unnamed: 0,class
0,0
1,0
2,1
3,0
5,0


In [84]:
#Bagging ensemble of 100 estimators, each allowed to draw on all 7 feature columns.
#The base_estimator keyword is omitted: None was its default, and the keyword was renamed
#to 'estimator' in scikit-learn 1.2 and removed in 1.4, so leaving it out works on every version.
bag = BaggingClassifier(max_features=7, n_estimators=100, random_state=0)

#fit() expects a 1-D target, so the single-column y frame is flattened
bag.fit(x_train, y_train.values.ravel())

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=7, max_samples=1.0,
         n_estimators=100, n_jobs=1, oob_score=False, random_state=0,
         verbose=0, warm_start=False)

In [85]:
#Mean accuracy on the data the model was fitted on and on the held-out data
train_accuracy = bag.score(x_train, y_train)
test_accuracy = bag.score(x_test, y_test)

print("Accuracy on training set: {:.3f}".format(train_accuracy))
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 0.877
Accuracy on test set: 0.650
