In [1]:
import csv
import pandas as pd
import numpy as np
import re
import datetime as dt
import time

In [2]:
df = pd.read_csv("Seattle_Police_Department_911_Incident_Response.csv", parse_dates=[0], infer_datetime_format=True)

In [3]:
df['fullDate'] = df['Event Clearance Date'].apply(lambda x: str(x).split(' ')[0])
df['ECG'] = df['Event Clearance Group']
df['dtDate'] = pd.DatetimeIndex(df['Event Clearance Date'])
df['eventClearanceSeconds'] = df['dtDate'].apply(lambda x: x.hour * 3600 + x.minute*60)

In [4]:
df['weekday'] = df['dtDate'].apply(lambda x: x.weekday())
df['weekdayName'] = df['dtDate'].apply(lambda x: x.weekday_name)

weekDummies = pd.get_dummies(df['weekdayName'])
df = pd.concat([df,weekDummies], axis = 1)

zoneBeatDummies = pd.get_dummies(df['Zone/Beat'])
df = pd.concat([df,zoneBeatDummies], axis = 1)

In [5]:
import calendar
weather = pd.read_csv("WEATHER.csv")

months_map = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr':'04', 'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct':'10', 'Nov': '11', 'Dec': '12'}
weather['m'] = weather['month'].apply(lambda x: months_map[x])
weather['day'] = weather['date'].apply(lambda x: ('0' + str(x)) if x < 10 else str(x) )
weather['fullDate'] = weather.apply(lambda x: str(x['m']) + '/' + str(x['day']) + '/' + str(x['\ufeffyear']), axis=1 )
weather['clear'] = weather['events'].apply(lambda x: 1 if ('Rain' in str(x) or 'Snow' in str(x) or 'Fog' in str(x) or 'Thunderstorm' in str(x)) else 0)
weather['notClear'] = weather['clear'].apply(lambda x: 1 if x is 0 else 0)

In [10]:
merged = pd.merge(df, weather, on = 'fullDate', how = 'left')

In [14]:
#list(merged.columns.values)

In [12]:
merged.shape

(1335033, 151)

In [13]:
merged = merged.dropna(subset=["Event Clearance Date", "Event Clearance Group","date", "tempAvg"])
merged.shape

(1311452, 151)

In [18]:
#merged.groupby('ECG').count()

In [59]:
sample = []
foo = merged.groupby('ECG')
for name, group in foo:
    sample.append(group.sample(n=10000, replace = True, random_state = 14))
new_df = pd.concat(sample)
new_df['secondsInDay'] = new_df['eventClearanceSeconds'].apply(lambda x: int(x))

In [60]:
#new_df.groupby('ECG').count()

In [61]:
train=new_df.sample(frac=0.7,random_state=14)
test=new_df.sample(frac = 0.3, random_state = 16)

In [62]:
train.shape

(308000, 152)

In [63]:
test.shape

(132000, 152)

In [75]:
features = ['secondsInDay', 'clear', 'Latitude', 'Longitude', 'm', 'day', 'tempHigh',
 'tempAvg',
 'tempLow',
 'dewHigh',
 'dewAvg',
 'dewLow',
 'humidityHigh',
 'humidityAvg',
 'humidityLow',
 'seaLvlHigh',
 'seaLvlAvg',
 'seaLvlLow',
 'visibilityHigh',
 'visibilityAvg',
 'visibilityLow',
 'windHigh',
 'windAvg','Friday',
 'Monday',
 'Saturday',
 'Sunday',
 'Thursday',
 'Tuesday',
 'Wednesday','99',
 'B1',
 'B2',
 'B3',
 'BS',
 'C1',
 'C2',
 'C3',
 'CCD',
 'COMM',
 'CS',
 'CTY',
 'D1',
 'D2',
 'D3',
 'DET',
 'DS',
 'E',
 'E1',
 'E2',
 'E3',
 'EDD',
 'EP',
 'ES',
 'F1',
 'F2',
 'F3',
 'FS',
 'G1',
 'G2',
 'G3',
 'GS',
 'H2',
 'H3',
 'INV',
 'J1',
 'J2',
 'J3',
 'JS',
 'K1',
 'K2',
 'K3',
 'KCIO07',
 'KS',
 'L1',
 'L2',
 'L3',
 'LAPT',
 'LS',
 'M1',
 'M2',
 'M3',
 'MS',
 'N',
 'N1',
 'N2',
 'N3',
 'NP',
 'NS',
 'O1',
 'O2',
 'O3',
 'OS',
 'Q1',
 'Q2',
 'Q3',
 'QS',
 'R1',
 'R2',
 'R3',
 'RS',
 'S',
 'S1',
 'S2',
 'S3',
 'SCTR1',
 'SP',
 'SS',
 'TAC3',
 'TRF',
 'U1',
 'U2',
 'U3',
 'US',
 'W',
 'W1',
 'W2',
 'W3',
 'WP',
 'WS']
X = train[features]
X2 = test[features]
X.head()

Unnamed: 0,secondsInDay,clear,Latitude,Longitude,m,day,tempHigh,tempAvg,tempLow,dewHigh,...,U1,U2,U3,US,W,W1,W2,W3,WP,WS
416055,8880,1.0,47.588294,-122.379822,4,20,56.0,50.0,44.0,50.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1235977,7980,0.0,47.717174,-122.344896,8,9,76.0,67.0,58.0,55.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1174653,36000,1.0,47.616875,-122.338544,10,31,55.0,51.0,47.0,52.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
364724,62580,0.0,47.625355,-122.318066,2,3,58.0,47.0,36.0,33.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
480179,80280,1.0,47.614684,-122.316845,7,15,66.0,61.0,56.0,56.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
y = train.ECG
y2 = test.ECG
print(X.shape)
print(y.shape)

(308000, 120)
(308000,)


In [78]:
from sklearn import linear_model
lr = linear_model.LogisticRegression(solver='newton-cg', multi_class = 'multinomial')
fit = lr.fit(X, y)

ImportError: cannot import name 'rankdata'

In [None]:
preds = lr.predict(X2)