In [10]:
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cross_validation as cv
import scipy.stats as stats
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV as gs
from sklearn import metrics as metrics
from sklearn.preprocessing import StandardScaler



In [11]:
# Locate the "Data" folder relative to the current working directory and
# build the full path of the weather-augmented CSV that the notebook loads.
PATH = os.path.abspath("Data")
FINAL_DATA = os.path.join(PATH, "finaldataframecopy_wWeather.csv")

# Echo the resolved path so the reader can see which file is used.
FINAL_DATA

'/Users/wrangamujadidi/Desktop/Capstone/capstone_opioid/Data/finaldataframecopy_wWeather.csv'

In [13]:
# Read the CSV at FINAL_DATA into a DataFrame with pandas' default parsing.
data = pd.read_csv(filepath_or_buffer=FINAL_DATA)

In [14]:
# Preview the first rows of the loaded frame to sanity-check the columns.
data.head()

Unnamed: 0,HexagonNumber,HexandMonth,Month,Under20,20to29,30to39,40to49,50to59,60to69,70andabove,...,MiscCals,OverdoseCalls,PropertyCalls,QOLCalls,TrafficCalls,ViolentCalls,AverageTemperature,AverageHumidity,AveragePrecipitation,Unemployment
0,1,1-1,1,0,0,0,0,0,0,0,...,0,0,0,5,5,0,37.4075,0.541429,0.00025,0.274
1,1,1-2,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,37.4075,0.541429,0.00025,0.274
2,1,1-3,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,37.4075,0.541429,0.00025,0.274
3,1,1-4,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,37.4075,0.541429,0.00025,0.274
4,1,1-5,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,37.4075,0.541429,0.00025,0.274


In [15]:
# Columns used as predictors. 'MiscCals' is spelled exactly as it appears in
# the CSV header.
# NOTE(review): the scores of 1.0 reported further down suggest one of these
# columns may effectively encode the target -- TODO confirm there is no leakage.
FEATURE_COLUMNS = [
    'HexagonNumber', 'Month',
    'Under20', '20to29', '30to39', '40to49', '50to59', '60to69', '70andabove',
    'Male', 'Female',
    'MPDCalls', 'FEMSCalls', 'MedicalCalls', 'MiscCals', 'OverdoseCalls',
    'PropertyCalls', 'QOLCalls', 'TrafficCalls', 'ViolentCalls',
    'AverageTemperature', 'AverageHumidity', 'AveragePrecipitation',
    'Unemployment',
]

# Predictor matrix (X is an alias of features) and target vector.
features = data[FEATURE_COLUMNS]
X = features
y = data['NarcanAdministered']

# Report the dimensions of X, then y.
for frame in (X, y):
    print(frame.shape)

(9444, 24)
(9444,)


In [16]:
# Remove HexandMonth and TotalNarcan from the frame. HexandMonth is not needed,
# and TotalNarcan may bias the model because NarcanAdministered already
# accounts for the month's Narcan total.
# X and y were extracted from `data` in the cell above and contain neither
# column, so this only tidies `data` itself.
data = data.drop(['HexandMonth', 'TotalNarcan'], axis=1)



In [18]:

# Split X and y into train and test datasets.
# Only 3% of the rows are held out (test_size=.03). The split is seeded so
# that Restart Kernel -> Run All reproduces the same partition, and therefore
# the same downstream scores; without a seed every run shuffles differently.
SPLIT_SEED = 42

(X_train, X_test, y_train, y_test) = cv.train_test_split(
    X, y, test_size=.03, random_state=SPLIT_SEED
)

In [19]:
# Build a logistic regression classifier with scikit-learn's default settings
# and train it on the training split. The fit call is left as the last
# expression so the notebook displays the fitted estimator.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [20]:
# Class-membership probabilities for every held-out row: one row per sample,
# one column per class (the displayed output has two columns).
model.predict_proba(X_test)

array([[  9.94289690e-01,   5.71030986e-03],
       [  9.96229258e-01,   3.77074193e-03],
       [  9.98014557e-01,   1.98544326e-03],
       [  9.96859673e-01,   3.14032738e-03],
       [  1.08320234e-02,   9.89167977e-01],
       [  9.98440103e-01,   1.55989690e-03],
       [  9.95703425e-01,   4.29657498e-03],
       [  9.97907997e-01,   2.09200288e-03],
       [  9.97524939e-01,   2.47506142e-03],
       [  7.67413098e-03,   9.92325869e-01],
       [  9.97679489e-01,   2.32051132e-03],
       [  9.95206167e-01,   4.79383312e-03],
       [  1.90903397e-02,   9.80909660e-01],
       [  9.95939240e-01,   4.06076016e-03],
       [  7.24018800e-03,   9.92759812e-01],
       [  9.96763647e-01,   3.23635256e-03],
       [  9.97199656e-01,   2.80034382e-03],
       [  9.99297307e-01,   7.02693278e-04],
       [  9.98780204e-01,   1.21979574e-03],
       [  9.97612027e-01,   2.38797269e-03],
       [  9.95617161e-01,   4.38283854e-03],
       [  9.96132379e-01,   3.86762132e-03],
       [  

In [21]:
# Shape of the probability matrix for the training split: (rows, classes).
model.predict_proba(X_train).shape

(9160, 2)

In [22]:
# Mean accuracy on the held-out test data and labels. The test split is only
# 3% of the rows (test_size=.03 above), so this figure rests on few samples.
model.score(X_test, y_test)

1.0

In [23]:
# Confidence scores for the training rows. For this two-class model the
# displayed result is a 1-D array with one signed score per sample.
model.decision_function(X_train)

array([-6.0047795 , -5.30655822,  4.71765514, ..., -5.37438289,
       -6.71169744, -6.05475435])

In [24]:
# Cross-validate the model on the full X and y (not just the training split);
# returns one score per fold (three folds in the displayed output).
cross_val_score(model, X, y)

array([ 0.99968244,  0.99777637,  0.99872895])

In [25]:
# Grid-search the regularisation strength C of the logistic regression over
# 50 log-spaced candidates between 1e-5 and 1e5, using the training split.
c_candidates = {'C': np.logspace(-5, 5, 50)}
grid = gs(model, c_candidates)
grid.fit(X_train, y_train)

# Show the best-scoring value of C.
grid.best_params_

{'C': 2.0235896477251556}

In [26]:
# Cross-validate the tuned estimator from the grid search on the full X and y.
# X includes the rows the grid search was fit on (X_train is a subset of X).
cv.cross_val_score(grid.best_estimator_, X, y)

array([ 0.99968244,  0.99777637,  0.99968224])

In [27]:
# Predicted class label for each held-out row, from the logistic regression.
model.predict(X_test)

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0])

In [29]:
# Evaluate the logistic regression on the held-out rows.
y_pred = model.predict(X_test)

# (label, scoring function, extra keyword arguments), reported in this order.
for label, scorer, options in (
    ('Precision: ', metrics.precision_score, {}),
    ('Recall: ', metrics.recall_score, {'average': 'binary'}),
    ('Accuracy', metrics.accuracy_score, {}),
    ('F1', metrics.f1_score, {'average': 'binary'}),
):
    print(label, scorer(y_test, y_pred, **options))

Precision:  1.0
Recall:  1.0
Accuracy 1.0
F1 1.0


In [30]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest of 100 trees on the same training split used for the
# logistic regression. The fit call is left as the last expression so the
# notebook displays the fitted estimator.
n_trees = 100
rfc = RandomForestClassifier(n_estimators=n_trees)
rfc.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [31]:
# Evaluate the random forest on the held-out rows.
# The metrics are computed from the forest's own predictions (yrfc_pred).
# Scoring y_pred here would silently re-report the logistic regression's
# results under the random forest heading.
yrfc_pred = rfc.predict(X_test)

print ('Precision: ', metrics.precision_score(y_test, yrfc_pred))
print ('Recall: ', metrics.recall_score(y_test, yrfc_pred, average='binary'))
print ('Accuracy', metrics.accuracy_score(y_test, yrfc_pred))
print ('F1', metrics.f1_score(y_test, yrfc_pred, average='binary'))

Precision:  1.0
Recall:  1.0
Accuracy 1.0
F1 1.0
