In [62]:
import pandas as pd
import numpy as np
import datetime as dt
from pandas import Series
from time import sleep
from geopy.geocoders import Nominatim

#importing the dataset using pandas
df = pd.read_csv("./911.csv")

#sample of original dataset
df.head(5)

#title label

Unnamed: 0,lat,lng,desc,zip,title,timeStamp,twp,addr,e
0,40.297876,-75.581294,REINDEER CT & DEAD END; NEW HANOVER; Station ...,19525.0,EMS: BACK PAINS/INJURY,2015-12-10 17:10:52,NEW HANOVER,REINDEER CT & DEAD END,1
1,40.258061,-75.26468,BRIAR PATH & WHITEMARSH LN; HATFIELD TOWNSHIP...,19446.0,EMS: DIABETIC EMERGENCY,2015-12-10 17:29:21,HATFIELD TOWNSHIP,BRIAR PATH & WHITEMARSH LN,1
2,40.121182,-75.351975,HAWS AVE; NORRISTOWN; 2015-12-10 @ 14:39:21-St...,19401.0,Fire: GAS-ODOR/LEAK,2015-12-10 14:39:21,NORRISTOWN,HAWS AVE,1
3,40.116153,-75.343513,AIRY ST & SWEDE ST; NORRISTOWN; Station 308A;...,19401.0,EMS: CARDIAC EMERGENCY,2015-12-10 16:47:36,NORRISTOWN,AIRY ST & SWEDE ST,1
4,40.251492,-75.60335,CHERRYWOOD CT & DEAD END; LOWER POTTSGROVE; S...,,EMS: DIZZINESS,2015-12-10 16:56:52,LOWER POTTSGROVE,CHERRYWOOD CT & DEAD END,1


In [63]:
#separate timeStamp data into 2 new columns
df['date'] = df.timeStamp.str[0:11]
df['time'] = df.timeStamp.str[-8:]

#Get rid of dummy 'e' column and 'timeStamp' column
del df['e']
del df['timeStamp']
#If time at end, then try to extract station number and impute
del df['desc']

#sample of dataset
#df.head(10)

In [64]:
#count number of values in each column
df.count()

lat      177755
lng      177755
zip      155957
title    177755
twp      177694
addr     177755
date     177755
time     177755
dtype: int64

In [65]:
#Finds total number of rows with missing values 
#Rows with more than one missing value only counted once
df.isnull().any(axis=1).sum()

21822

In [66]:
#Drop rows with missing zipcode values
df = df.dropna(subset=['zip'])

#df[['twp', 'zip']].groupby(['twp'], as_index = False).mean().sort_values(by = 'twp', ascending=False)

#Convert float values for zipcodes to integer type
df['zip'] = df['zip'].astype(int)
df.count()

lat      155957
lng      155957
zip      155957
title    155957
twp      155933
addr     155957
date     155957
time     155957
dtype: int64

In [67]:
empty = np.where(pd.isnull(df))
geolocator = Nominatim()
index = 0


#Impute 24 missing township values
for i in np.nditer(empty):
    
    #row of missing township cell
    row = empty[0][index] 
    #column of missing township cell
    column = empty[1][index]
    
    
    temp_lat = repr(df.iloc[row,0])
    temp_long = repr(df.iloc[row,1])
    
    
    location = geolocator.reverse([temp_lat, temp_long],timeout=None)
    
    #extract township value from location dictionary
    town = location.raw['address']['city'] 
    
    
    #remove 'Township' ending from name of town    
    if town.endswith("Township"):
        town = town[0:-9]
            
    else:
        pass
                
    #convert to uppercase to maintain township format in dataframe    
    town = town.upper()
    
    #put imputed township name into corresponding missing cell of dataframe
    df.iloc[row, column] = town
    
    print(df.iloc[row, column])
             
    
    #required waiting period to prevent timeout exception from being thrown
    #this requriement would cause imputing the 21,798 missing zip codes to take approx. 12-20 hours to impute
    
        
    #increment index to get to next set of index values for empty township cell
    index += 1

UPPER MORELAND
WARRINGTON
NORRISTOWN
NORRISTOWN
MONTGOMERY
UPPER MERION
TOWAMENCIN
HATBORO
HORSHAM
HATFIELD
JENKINTOWN
LOWER MERION
FRANCONIA
HATBORO
POTTSTOWN
SKIPPACK
SKIPPACK
UPPER MORELAND
NORRISTOWN
NORRISTOWN
UPPER SALFORD
SPRINGFIELD
UPPER MERION
UPPER PROVIDENCE


In [68]:
df.count()

lat      155957
lng      155957
zip      155957
title    155957
twp      155957
addr     155957
date     155957
time     155957
dtype: int64

In [69]:
hour = df.time.str[0:2]
hour2 = pd.to_numeric(hour)

#If time of call is between 6PM and 6AM then it is classified as 'night', otherwise it is classified as 'day'

for i, row in df.iterrows():
    if(hour2.loc[i] >= 18 or hour2.loc[i] < 6):
        hour.at[i] = 'night'
    else:
        hour.at[i] = 'day'


In [70]:
#Replace military time with either 'night' or 'day'
del df['time']

df['time_of_day'] = hour

#df.head(10)   

In [71]:
#Change date format to weekdays format
df['dates'] = pd.to_datetime(df['date'])
df['weekday'] = df['dates'].dt.weekday_name

del df['date']
del df['dates']

In [72]:
#Separate first part of 911 call classification from rest of title
df['class'], df['title2'] = df['title'].str.split(':', 1).str
del df['title']
del df['title2']


In [73]:
df.head(10)

Unnamed: 0,lat,lng,zip,twp,addr,time_of_day,weekday,class
0,40.297876,-75.581294,19525,NEW HANOVER,REINDEER CT & DEAD END,day,Thursday,EMS
1,40.258061,-75.26468,19446,HATFIELD TOWNSHIP,BRIAR PATH & WHITEMARSH LN,day,Thursday,EMS
2,40.121182,-75.351975,19401,NORRISTOWN,HAWS AVE,day,Thursday,Fire
3,40.116153,-75.343513,19401,NORRISTOWN,AIRY ST & SWEDE ST,day,Thursday,EMS
5,40.253473,-75.283245,19446,LANSDALE,CANNON AVE & W 9TH ST,day,Thursday,EMS
6,40.182111,-75.127795,19044,HORSHAM,LAUREL AVE & OAKDALE AVE,day,Thursday,EMS
7,40.217286,-75.405182,19426,SKIPPACK,COLLEGEVILLE RD & LYWISKI RD,day,Thursday,EMS
8,40.289027,-75.39959,19438,LOWER SALFORD,MAIN ST & OLD SUMNEYTOWN PIKE,day,Thursday,EMS
9,40.102398,-75.291458,19462,PLYMOUTH,BLUEROUTE & RAMP I476 NB TO CHEMICAL RD,day,Thursday,Traffic
11,40.084161,-75.308386,19428,PLYMOUTH,BROOK RD & COLWELL LN,day,Thursday,Traffic


In [74]:
#unique zipcodes in the dataset
df.zip.unique()

array([19525, 19446, 19401, 19044, 19426, 19438, 19462, 19428, 19040,
       19027, 18936, 18974, 19031, 19403, 19422, 19085, 18964, 19038,
       19406, 19468, 19010, 19095, 19464, 19444, 19041, 19440, 19405,
       19002, 19096, 19454, 19465, 19004, 19066, 19072, 18041, 19046,
       19090, 19012, 19025, 19473, 18073, 18969, 18074, 19460, 19001,
       18054, 19009, 19006, 19035, 19150, 19075, 19034, 19151, 19453,
       19003, 18914, 19512, 18976, 19120, 18915, 18076, 19477, 19087,
       18966, 19131, 19128, 19083, 19053, 19475, 18960, 19504, 18070,
       19492, 18932, 19118, 18092, 19490, 19518, 18056, 19119, 19107,
       17752, 19111, 18927, 19435, 18951, 19472, 19503, 19126, 19505,
       19423, 19138, 36107, 18036, 19116, 19139, 19129, 19115, 19355,
       77316, 19457, 19082, 19127, 19443, 17555, 19520, 19063, 19020, 19404])

In [75]:
#find the number of unique zipcodes
s = Series(df.zip)
zip_unique = s.unique().size
print(zip_unique)

#delete addrs attribute
del df['addr']

109


In [76]:
df.head(5)

Unnamed: 0,lat,lng,zip,twp,time_of_day,weekday,class
0,40.297876,-75.581294,19525,NEW HANOVER,day,Thursday,EMS
1,40.258061,-75.26468,19446,HATFIELD TOWNSHIP,day,Thursday,EMS
2,40.121182,-75.351975,19401,NORRISTOWN,day,Thursday,Fire
3,40.116153,-75.343513,19401,NORRISTOWN,day,Thursday,EMS
5,40.253473,-75.283245,19446,LANSDALE,day,Thursday,EMS


In [82]:
#make the training and test 
x_train = df

x_test = df

#print our test and training data
print (x_train.head(5))
x_test.head(5)

#look at the attributes and the instances
print x_train.shape, x_test.shape

         lat        lng    zip                twp  time_of_day   weekday class
0  40.297876 -75.581294  19525        NEW HANOVER          NaN  Thursday   EMS
1  40.258061 -75.264680  19446  HATFIELD TOWNSHIP          NaN  Thursday   EMS
2  40.121182 -75.351975  19401         NORRISTOWN          NaN  Thursday  Fire
3  40.116153 -75.343513  19401         NORRISTOWN          NaN  Thursday   EMS
5  40.253473 -75.283245  19446           LANSDALE          NaN  Thursday   EMS
(155957, 7) (155957, 7)


In [83]:
for dataset in comb:
    dataset['class'] = dataset['class'].map( {'Traffic': 0,'EMS': 1, 'Fire': 2} ).astype(float)

TypeError: string indices must be integers, not str

In [84]:
comb = [x_train,x_test]

In [85]:
for dataset in comb:
    dataset['time_of_day'] = dataset['time_of_day'].map( {'day': 0,'night': 1} ).astype(float)

In [81]:
dataset.head(5)

Unnamed: 0,lat,lng,zip,twp,time_of_day,weekday,class
0,40.297876,-75.581294,19525,NEW HANOVER,,Thursday,EMS
1,40.258061,-75.26468,19446,HATFIELD TOWNSHIP,,Thursday,EMS
2,40.121182,-75.351975,19401,NORRISTOWN,,Thursday,Fire
3,40.116153,-75.343513,19401,NORRISTOWN,,Thursday,EMS
5,40.253473,-75.283245,19446,LANSDALE,,Thursday,EMS


In [None]:
for dataset in comb:
    dataset['class'] = dataset['class'].map( {'Traffic': 0,'EMS': 1, 'Fire': 2} ).astype(float)

In [None]:
x_train.head(5)

In [None]:
for dataset in comb:
     dataset['weekday'] = dataset['weekday'].map( {'Monday': 1,'Tuesday': 2,'Wednesday': 3,'Thursday': 4,'Friday': 5,'Saturday': 6, 'Sunday': 7} ).astype(float) 

In [None]:
'''for dataset in comb:
    dataset['twp'] = dataset['twp'].map({'NEW HANOVER' :10, 'HATFIELD TOWNSHIP': 11, 'NORRISTOWN': 12, 'LANSDALE': 13,
       'HORSHAM': 14, 'SKIPPACK': 15, 'LOWER SALFORD': 16, 'PLYMOUTH': 17,
       'UPPER MORELAND': 18, 'CHELTENHAM': 19, 'MONTGOMERY': 20, 'WHITEMARSH': 21,
       'UPPER GWYNEDD': 22, 'LOWER PROVIDENCE': 23, 'WHITPAIN': 24, 'DELAWARE COUNTY': 25,
       'FRANCONIA': 76, 'WEST CONSHOHOCKEN': 77, 'UPPER MERION': 78, 'LIMERICK': 79,
       'DOUGLASS': 26, 'LOWER MERION': 27, 'POTTSTOWN': 28, 'BRIDGEPORT': 29, 'TOWAMENCIN': 30,
       'AMBLER': 31, 'LOWER POTTSGROVE': 32, 'CHESTER COUNTY': 33, 'UPPER HANOVER': 34,
       'SPRINGFIELD': 35, 'ROCKLEDGE': 36, 'ABINGTON': 37, 'WEST NORRITON': 38,
       'ROYERSFORD': 39, 'UPPER DUBLIN': 40, 'UPPER SALFORD': 41, 'CONSHOHOCKEN': 42,
       'PENNSBURG': 43, 'TELFORD': 44, 'EAST NORRITON': 45, 'UPPER FREDERICK': 46,
       'UPPER PROVIDENCE': 47, 'SALFORD': 48, 'LEHIGH COUNTY': 49, 'MARLBOROUGH': 50,
       'BRYN ATHYN': 51, 'LOWER MORELAND': 52, 'HATBORO': 53, 'LOWER GWYNEDD': 54,
       'WORCESTER': 55, 'COLLEGEVILLE': 56, 'SCHWENKSVILLE': 57, 'SOUDERTON': 58,
       'PERKIOMEN': 59, 'LOWER FREDERICK': 60, 'BUCKS COUNTY': 61, 'RED HILL': 62,
       'WEST POTTSGROVE': 63, 'UPPER POTTSGROVE': 64, 'EAST GREENVILLE': 65,
       'NORTH WALES': 66,'JENKINTOWN': 67,'TRAPPE': 68, 'NARBERTH': 69, 'BERKS COUNTY': 70,
       'GREEN LANE': 71, 'WARRINGTON': 72, 'PHILA COUNTY': 73, 'HATFIELD': 74,
       'HATFIELD BORO': 75}).astype(float)
       '''

In [None]:
x_train.head(5)

In [None]:
#unique townships in the dataset
#df.twp.unique()

from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

model = LinearSVC()

model.fit(x_train, x_test)

#OneVsRestClassifier(LinearSVC(random_state=0)).fit(x_train, x_test)

In [None]:
df[['twp', 'zip']].groupby(['twp'], as_index = False).mean().sort_values(by = 'twp', ascending=False)

In [None]:
#find the number of unique townships
s2 = Series(df.twp)
twp_unique = s2.unique().size
print(twp_unique)

In [None]:
#find the number of unique address locations
s4 = Series(df.addr)
s4.unique().size

In [None]:
sLat = Series(df.lat)
sLat.unique().size

In [None]:
sLong = Series(df.lng)
sLong.unique().size

In [None]:
sLat.max()

In [None]:
sLat.min()

In [None]:
sLong.max()

In [None]:
sLong.min()

In [None]:
'''
from sklearn.model_selection import train_test_split
X, y = df['twp'], df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .8, random_state = 42)
target_class = ['EMS', 'Fire', 'Traffic']
drop long, lat for preprocess tweak
'''
#del df['lat']
#del df['lng']

del

df.head(10)

df[]

#df.weekday.shape

In [None]:
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split

from sklearn.svm import SVC

X = df[:-20]
Y = df[:20]

X.head(10) 

clf = SVC(gamma=0.001, C =100)

#clf.fit(X,Y)

df2 = np.array(df)


#x_train, x_test = train_test_split

