In [1]:
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cross_validation as cv
import scipy.stats as stats
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV as gs
from sklearn import metrics as metrics
from sklearn.preprocessing import StandardScaler



In [2]:
PATH = os.path.abspath(os.path.join(os.path.dirname("."), "Data"))
FINAL_DATA = os.path.join(PATH, "Sample_Size_1661.csv")

FINAL_DATA

'/Users/wrangamujadidi/Desktop/Capstone/capstone_opioid/Data/Sample_Size_1661.csv'

In [3]:
data = pd.read_csv(FINAL_DATA)

In [4]:
data.head()

Unnamed: 0,HexagonNumber,HexandMonth,Month,Under20,20to29,30to39,40to49,50to59,60to69,70andabove,...,MiscCals,OverdoseCalls,PropertyCalls,QOLCalls,TrafficCalls,ViolentCalls,AverageTemperature,AverageHumidity,AveragePrecipitation,Unemployment
0,32,32-1,1,0,0,1,1,0,0,0,...,21,2,23,71,47,18,37.4075,0.541429,0.00025,0.071667
1,33,33-1,1,0,0,0,2,0,0,0,...,30,2,23,83,35,29,37.4075,0.541429,0.00025,0.255
2,35,35-1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,37.4075,0.541429,0.00025,0.1395
3,36,36-1,1,0,0,0,0,0,0,0,...,3,0,3,10,4,4,37.4075,0.541429,0.00025,0.192
4,43,43-1,1,0,0,0,0,0,0,1,...,25,0,40,76,44,3,37.4075,0.541429,0.00025,0.07875


In [5]:
features = data [[
    'Month','MPDCalls', 'FEMSCalls', 'MedicalCalls', 'MiscCals',
       'OverdoseCalls', 'PropertyCalls', 'QOLCalls', 'TrafficCalls',
       'ViolentCalls'
]]

X = features

y = data['NarcanAdministered']

print(X.shape)
print(y.shape)

(3322, 10)
(3322,)


In [6]:
#Drop the columns for HexandMonth and TotalNarcan.  HEXandMonth is not needed and TotalNarcan may cause bias because
#the Narcan Administered already accounts for the TotalNarcan for the month. 

data = data.drop(['HexandMonth'], axis = 1)
data = data.drop(['TotalNarcan'], axis = 1)



In [7]:

# split X and y into train and test datasets

(X_train, X_test, y_train, y_test) = cv.train_test_split(X, y, test_size=.02)

In [8]:
# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
#Returns the probability of the sample for each class in the model
model.predict_proba(X_test)

array([[ 0.48675656,  0.51324344],
       [ 0.49431476,  0.50568524],
       [ 0.49145511,  0.50854489],
       [ 0.54391221,  0.45608779],
       [ 0.52990524,  0.47009476],
       [ 0.48955298,  0.51044702],
       [ 0.48675656,  0.51324344],
       [ 0.49260877,  0.50739123],
       [ 0.49491327,  0.50508673],
       [ 0.51170464,  0.48829536],
       [ 0.48236833,  0.51763167],
       [ 0.4982819 ,  0.5017181 ],
       [ 0.47529407,  0.52470593],
       [ 0.47914107,  0.52085893],
       [ 0.49816974,  0.50183026],
       [ 0.5074233 ,  0.4925767 ],
       [ 0.52579798,  0.47420202],
       [ 0.48951478,  0.51048522],
       [ 0.51487494,  0.48512506],
       [ 0.51411735,  0.48588265],
       [ 0.51487494,  0.48512506],
       [ 0.47371446,  0.52628554],
       [ 0.64723529,  0.35276471],
       [ 0.52462872,  0.47537128],
       [ 0.48566232,  0.51433768],
       [ 0.4773936 ,  0.5226064 ],
       [ 0.50915731,  0.49084269],
       [ 0.47983214,  0.52016786],
       [ 0.51487494,

In [10]:
model.predict_proba(X_train).shape

(3255, 2)

In [11]:
#mean accuracy on the given test data and labels
model.score(X_test, y_test)

0.46268656716417911

In [12]:
#Confidence scores per (sample, class) combination.
model.decision_function(X_train)

array([ 0.09736546,  0.04440917,  0.02014535, ..., -0.08108989,
       -0.11217516,  0.06060245])

In [13]:
#For each pair of train and test set, a prediction score
cross_val_score(model, X, y)

array([ 0.49819495,  0.50361011,  0.5       ])

In [14]:
grid = gs(model, {'C': np.logspace(-5, 5, 50)})
grid.fit(X_train, y_train)
grid.best_params_

{'C': 0.047148663634573942}

In [15]:
cv.cross_val_score(grid.best_estimator_, X, y)

array([ 0.48826715,  0.50270758,  0.5       ])

In [16]:
model.predict(X_test)

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0])

In [17]:
y_pred = model.predict(X_test)
print ('Precision: ', metrics.precision_score(y_test, y_pred))
print ('Recall: ', metrics.recall_score(y_test, y_pred, average='binary'))
print ('Accuracy', metrics.accuracy_score(y_test, y_pred))
print ('F1', metrics.f1_score(y_test, y_pred, average='binary'))

Precision:  0.5
Recall:  0.472222222222
Accuracy 0.462686567164
F1 0.485714285714


In [18]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [19]:
yrfc_pred = rfc.predict(X_test)

print ('Precision: ', metrics.precision_score(y_test, y_pred))
print ('Recall: ', metrics.recall_score(y_test, y_pred, average='binary'))
print ('Accuracy', metrics.accuracy_score(y_test, y_pred))
print ('F1', metrics.f1_score(y_test, y_pred, average='binary'))

Precision:  0.5
Recall:  0.472222222222
Accuracy 0.462686567164
F1 0.485714285714
