In [136]:
%matplotlib inline
%matplotlib notebook

In [137]:
import os 
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Reading In Data

In [138]:
PATH = os.path.abspath(os.path.join(os.path.dirname("."), "Data"))
FINAL_DATA = os.path.join(PATH, "finaldataframecopy_wWeather.csv")

FINAL_DATA

'/Users/wrangamujadidi/Desktop/Capstone/capstone_opioid/Data/finaldataframecopy_wWeather.csv'

In [139]:
data = pd.read_csv(FINAL_DATA)

In [140]:
data.head()

Unnamed: 0,HexagonNumber,HexandMonth,Month,Under20,20to29,30to39,40to49,50to59,60to69,70andabove,...,MiscCals,OverdoseCalls,PropertyCalls,QOLCalls,TrafficCalls,ViolentCalls,AverageTemperature,AverageHumidity,AveragePrecipitation,Unemployment
0,1,1-1,1,0,0,0,0,0,0,0,...,0,0,0,5,5,0,37.4075,0.541429,0.00025,0.274
1,1,1-2,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,37.4075,0.541429,0.00025,0.274
2,1,1-3,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,37.4075,0.541429,0.00025,0.274
3,1,1-4,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,37.4075,0.541429,0.00025,0.274
4,1,1-5,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,37.4075,0.541429,0.00025,0.274


In [141]:
#Drop the columns for HexandMonth and TotalNarcan.  HEXandMonth is not needed and TotalNarcan may cause bias because
#the Narcan Administered already accounts for the TotalNarcan for the month. 

data = data.drop(['HexandMonth'], axis = 1)
data = data.drop(['TotalNarcan'], axis = 1)


In [142]:
data.columns

Index(['HexagonNumber', 'Month', 'Under20', '20to29', '30to39', '40to49',
       '50to59', '60to69', '70andabove', 'NarcanAdministered', 'Male',
       'Female', 'MPDCalls', 'FEMSCalls', 'MedicalCalls', 'MiscCals',
       'OverdoseCalls', 'PropertyCalls', 'QOLCalls', 'TrafficCalls',
       'ViolentCalls', 'AverageTemperature', 'AverageHumidity',
       'AveragePrecipitation', 'Unemployment'],
      dtype='object')

# Test Train Split

In [143]:
from sklearn.model_selection import train_test_split as tts 

features = data [[
     'HexagonNumber', 'Month', 'Under20', '20to29', '30to39', '40to49',
       '50to59', '60to69', '70andabove', 'NarcanAdministered', 'Male',
       'Female', 'MPDCalls', 'FEMSCalls', 'MedicalCalls', 'MiscCals',
       'OverdoseCalls', 'PropertyCalls', 'QOLCalls', 'TrafficCalls',
       'ViolentCalls', 'AverageTemperature', 'AverageHumidity',
       'AveragePrecipitation', 'Unemployment'
]]

X = features

y = data['NarcanAdministered']

print(X.shape)
print(y.shape)

(9444, 25)
(9444,)


In [144]:
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.5)

# Decision Tree

In [145]:
from sklearn.metrics import classification_report

In [146]:
# Import the model form 
from sklearn.tree import DecisionTreeClassifier 

model = DecisionTreeClassifier() 
model.fit(X_test, y_test)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      3871
          1       1.00      1.00      1.00       851

avg / total       1.00      1.00      1.00      4722



In [147]:
model = DecisionTreeClassifier(max_depth=5) 
model.fit(X_test, y_test)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      3871
          1       1.00      1.00      1.00       851

avg / total       1.00      1.00      1.00      4722



# Random Forest 

In [148]:
from sklearn.ensemble import RandomForestClassifier

In [149]:
# We'll select 50 trees and opt for 'out-of-bag' samples to estimate the generalization error.
rf = RandomForestClassifier(n_estimators=50, oob_score=True)

In [150]:
# Next split up the data with the 'train test split' method in the Cross Validation module
X_train, X_test, y_train, y_test = tts(X,y, test_size=0.2)

# ...and then run the 'fit' method to build a forest of trees
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=False)

In [151]:
rf.score(X_test, y_test)

1.0

In [152]:
expected   = y_test
predicted  = rf.predict(X_test)
classificationReport = classification_report(expected, predicted, target_names=["Perished","Survived"])
print(classificationReport)

             precision    recall  f1-score   support

   Perished       1.00      1.00      1.00      1566
   Survived       1.00      1.00      1.00       323

avg / total       1.00      1.00      1.00      1889



# SVC

In [153]:
from sklearn.svm import SVC

model = SVC() 
model.fit(X_test, y_test)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.98      1.00      0.99      1566
          1       1.00      0.91      0.95       323

avg / total       0.98      0.98      0.98      1889



# K Means Clustering 

In [157]:
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
K = range(1,10)
meandistortions = []

for k in K:

    elbow = KMeans(n_clusters=k, n_jobs=-1, random_state=1)
    elbow.fit(X)
    meandistortions.append(sum(np.min(euclidean_distances(X, elbow.cluster_centers_), axis=1)) / X.shape[0])

    
plt.plot(K, meandistortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method')
plt.show()