# Predicting Quality of Care

### 1. Import the tools we'll need

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

### 2. Important the dataset we'll need, clean a little and preview

In [152]:
quality = pd.read_csv('quality_of_care.csv')
quality['StartedOnCombination'] = np.where(quality['StartedOnCombination']==True, 1,0)
quality['Pain'] = np.where(quality['Pain']>15, 1,0)
quality = quality.drop('MemberID', axis=1)
quality = quality.drop('TotalVisits', axis=1)
quality.head(100)

Unnamed: 0,InpatientDays,ERVisits,OfficeVisits,Narcotics,DaysSinceLastERVisit,Pain,ProviderCount,MedicalClaims,ClaimLines,StartedOnCombination,AcuteDrugGapSmall,PoorCare
0,0,0,18,1,731.000000,0,21,93,222,0,0,0
1,1,1,6,1,411.000000,0,27,19,115,0,1,0
2,0,0,5,3,731.000000,0,16,27,148,0,5,0
3,0,1,19,0,158.000000,1,14,59,242,0,0,0
4,8,2,19,3,449.000000,0,24,51,204,0,0,0
5,2,0,9,2,731.000000,0,40,53,156,0,4,1
6,16,1,8,1,173.958333,0,19,40,261,0,0,0
7,2,0,8,0,731.000000,0,11,28,87,0,0,0
8,2,1,4,3,45.000000,0,28,20,98,0,0,1
9,4,2,0,2,104.000000,0,21,17,66,0,0,0


In [144]:
quality.describe()

Unnamed: 0,InpatientDays,ERVisits,OfficeVisits,Narcotics,DaysSinceLastERVisit,Pain,ProviderCount,MedicalClaims,ClaimLines,StartedOnCombination,AcuteDrugGapSmall,PoorCare
count,131.0,131.0,131.0,131.0,131.0,131.0,131.0,131.0,131.0,131.0,131.0,131.0
mean,2.717557,1.496183,13.229008,4.572519,480.567112,15.557252,23.984733,43.244275,142.862595,0.045802,2.694656,0.251908
std,5.099434,2.167501,9.077076,9.713615,274.76724,20.418001,13.670285,28.88654,91.629921,0.209857,6.983379,0.435775
min,0.0,0.0,0.0,0.0,6.0,0.0,5.0,11.0,20.0,0.0,0.0,0.0
25%,0.0,0.0,7.0,0.0,206.958333,1.0,15.0,25.5,83.5,0.0,0.0,0.0
50%,0.0,1.0,12.0,1.0,640.958333,8.0,20.0,37.0,120.0,0.0,1.0,0.0
75%,3.0,2.0,18.5,3.0,731.0,23.0,30.0,49.5,185.0,0.0,3.0,0.5
max,30.0,11.0,46.0,59.0,731.0,104.0,82.0,194.0,577.0,1.0,71.0,1.0


### 3. Split the dataset into training and test, predictor and outcome

In [153]:
training_x = quality[0:100].drop('PoorCare', axis=1)
training_y = quality[['PoorCare']][0:100]
test_x = quality[100:].drop('PoorCare', axis=1)
test_y = quality[['PoorCare']][100:]

### 4. Estimate the model on training data

In [154]:
logisticRegr = LogisticRegression(solver='lbfgs', max_iter=1000)
logisticRegr.fit(training_x, training_y.values.ravel())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

### 5. Use that estimated model on test data

In [155]:
score = logisticRegr.score(test_x, test_y.values.ravel())
print(score)

0.774193548387


In [159]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import fbeta_score, accuracy_score
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(training_x)
X_test_scaled = scaler.transform(test_x)

clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), random_state = 42)
parameters = {
    'base_estimator__max_depth':[1,2,4,8],
    'base_estimator__min_samples_split':[10, 100, 200, 500, 1000],
    'n_estimators': [50, 100, 200, 500],
    'learning_rate':[0.01, 0.1, 1]
}


scorer = make_scorer(accuracy_score)

grid_obj = GridSearchCV(clf, parameters, scoring=scorer)
grid_fit = grid_obj.fit(X_train_scaled, training_y.values.ravel())
best_clf = grid_fit.best_estimator_

best_clf.fit(X_train_scaled, training_y.values.ravel())
y_pred = best_clf.predict(X_test_scaled)
accuracy_score(y_pred, test_y)

0.54838709677419351

In [158]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(training_x)
X_test_scaled = scaler.transform(test_x)

clf = AdaBoostClassifier(dt, random_state = 17, n_estimators = 100, learning_rate = 0.2)
clf.fit(X_train_scaled, training_y.values.ravel())
y_pred = clf.predict(X_test_scaled)
accuracy_score(y_pred, test_y)

0.74193548387096775

In [91]:
lr = LogisticRegression(random_state=2, solver='lbfgs', max_iter=1000)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
Cs = np.logspace(-3, 1, 10)
reg_params = {'C': Cs}
lrg = GridSearchCV(lr, reg_params, n_jobs=-1, scoring =scorer, cv=skf)
lrg.fit(X_train_scaled, training_y.values.ravel())
best_lrg = grid_fit.best_estimator_
y_pred = best_lrg.predict(X_test_scaled)
accuracy_score(y_pred, test_y)

0.61290322580645162