In [1]:
%matplotlib inline
%matplotlib notebook

In [2]:
import os 
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
PATH = os.path.abspath(os.path.join(os.path.dirname("."), "Data"))
FINAL_DATA = os.path.join(PATH, "finaldataframecopy_wWeather.csv")

FINAL_DATA

'/Users/wrangamujadidi/Desktop/Capstone/capstone_opioid/Data/finaldataframecopy_wWeather.csv'

In [4]:
data = pd.read_csv(FINAL_DATA)

In [5]:
#Drop the columns for HexandMonth and TotalNarcan.  HEXandMonth is not needed and TotalNarcan may cause bias because
#the Narcan Administered already accounts for the TotalNarcan for the month. 

data = data.drop(['HexandMonth'], axis=1)
data = data.drop(['TotalNarcan'], axis=1)

In [9]:
features = data [[
    'HexagonNumber', 'Month', 'Under20', '20to29', '30to39', '40to49',
       '50to59', '60to69', '70andabove', 'Male',
       'Female', 'MPDCalls', 'FEMSCalls', 'MedicalCalls', 'MiscCals',
       'OverdoseCalls', 'PropertyCalls', 'QOLCalls', 'TrafficCalls',
       'ViolentCalls', 'AverageTemperature', 'AverageHumidity',
       'AveragePrecipitation', 'Unemployment'
]]

X = features

y = data['NarcanAdministered']

print(X.shape)
print(y.shape)

(9444, 24)
(9444,)


In [10]:
from sklearn.model_selection import train_test_split as tts 

In [11]:
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [12]:
#Regularization using Lasso

model = Lasso()
model.fit(X, y)
print(list(zip(X, model.coef_.tolist())))

[('HexagonNumber', -0.00014975301215764032), ('Month', 0.0), ('Under20', 0.0), ('20to29', 0.0), ('30to39', 0.0), ('40to49', 0.0), ('50to59', 0.0), ('60to69', 0.0), ('70andabove', 0.0), ('Male', 0.0), ('Female', 0.0), ('MPDCalls', 0.0), ('FEMSCalls', 0.0), ('MedicalCalls', 0.0), ('MiscCals', 0.0), ('OverdoseCalls', 0.0), ('PropertyCalls', 0.0), ('QOLCalls', 0.0), ('TrafficCalls', 0.0), ('ViolentCalls', 0.0), ('AverageTemperature', 0.0), ('AverageHumidity', 0.0), ('AveragePrecipitation', 0.0), ('Unemployment', -0.0)]


In [13]:
model = Lasso(alpha=0.1)
sfm = SelectFromModel(model)
sfm.fit(X, y)
print(list(X[sfm.get_support(indices=True)]))

['HexagonNumber', 'Male', 'QOLCalls', 'AverageTemperature']


In [14]:
model = Ridge()
model.fit(X, y)
print(list(zip(X, model.coef_.tolist())))

[('HexagonNumber', -7.035975470650724e-05), ('Month', -0.00048588334991467194), ('Under20', 0.17699829327180044), ('20to29', 0.31450972852877673), ('30to39', 0.38754014952591936), ('40to49', 0.29065616590264903), ('50to59', 0.2992330279421151), ('60to69', 0.23791173471570093), ('70andabove', 0.4127958009872702), ('Male', 0.020209295204038327), ('Female', 0.061564979533682285), ('MPDCalls', 2.3995658594103203e-05), ('FEMSCalls', 0.0005992412362449308), ('MedicalCalls', -0.0005124795681971912), ('MiscCals', -2.9182487954286124e-05), ('OverdoseCalls', 0.0012958969213464446), ('PropertyCalls', -0.00040837220703219336), ('QOLCalls', 0.0002773742587651918), ('TrafficCalls', -0.0004236028385107806), ('ViolentCalls', 0.00010896963324936956), ('AverageTemperature', 0.0008942058185973536), ('AverageHumidity', 0.07705526559508703), ('AveragePrecipitation', 0.16498848626989374), ('Unemployment', -0.05973883944601209)]


In [15]:
model = Ridge()
sfm = SelectFromModel(model)
sfm.fit(X, y)
print(list(X[sfm.get_support(indices=True)]))

['Under20', '20to29', '30to39', '40to49', '50to59', '60to69', '70andabove', 'AveragePrecipitation']


In [16]:
model = ElasticNet(l1_ratio=0.10)
model.fit(X, y)
print(list(zip(X, model.coef_.tolist())))

[('HexagonNumber', -0.00021219814131857737), ('Month', 0.0), ('Under20', 0.0), ('20to29', 0.0), ('30to39', 0.0), ('40to49', 0.0), ('50to59', 0.0), ('60to69', 0.0), ('70andabove', 0.0), ('Male', 0.06509161810561387), ('Female', 0.0), ('MPDCalls', 0.0), ('FEMSCalls', 0.0), ('MedicalCalls', 0.0), ('MiscCals', -0.0), ('OverdoseCalls', 0.0), ('PropertyCalls', -0.0), ('QOLCalls', 4.8852339877658284e-05), ('TrafficCalls', 0.0), ('ViolentCalls', 0.0), ('AverageTemperature', 0.002674242826469309), ('AverageHumidity', 0.0), ('AveragePrecipitation', 0.0), ('Unemployment', -0.0)]


In [17]:
model = ElasticNet()
sfm = SelectFromModel(model)
sfm.fit(X, y)
print(list(X[sfm.get_support(indices=True)]))

['HexagonNumber', 'AverageTemperature']


In [23]:
features = data [['HexagonNumber', 'Male', 'QOLCalls', 'AverageTemperature']]

X = features

y = data['NarcanAdministered']

print(X.shape)
print(y.shape)

(9444, 4)
(9444,)


In [24]:
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)

In [25]:
from sklearn.metrics import classification_report

In [26]:
# Import the model form 
from sklearn.tree import DecisionTreeClassifier 

model = DecisionTreeClassifier() 
model.fit(X_test, y_test)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      1570
          1       1.00      0.98      0.99       319

avg / total       1.00      1.00      1.00      1889



In [27]:
model = DecisionTreeClassifier(max_depth=5) 
model.fit(X_test, y_test)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.97      1.00      0.98      1570
          1       1.00      0.85      0.92       319

avg / total       0.97      0.97      0.97      1889



In [28]:
from sklearn.ensemble import RandomForestClassifier

# We'll select 50 trees and opt for 'out-of-bag' samples to estimate the generalization error.
rf = RandomForestClassifier(n_estimators=50, oob_score=True)

# Next split up the data with the 'train test split' method in the Cross Validation module
X_train, X_test, y_train, y_test = tts(X,y, test_size=0.2)

# ...and then run the 'fit' method to build a forest of trees
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=False)

In [29]:
rf.score(X_test, y_test)

0.96559025939650611

In [30]:
expected   = y_test
predicted  = rf.predict(X_test)
classificationReport = classification_report(expected, predicted, target_names=["NarcanAdministered",y_pred])
print(classificationReport)

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [31]:
from sklearn.svm import SVC

model = SVC() 
model.fit(X_test, y_test)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.97      1.00      0.99      1551
          1       1.00      0.87      0.93       338

avg / total       0.98      0.98      0.98      1889

