In [1]:
import pandas as pd
import numpy as np

In [2]:
DiabetesAllDummy = pd.read_csv('DiabetesAllDummy.csv', index_col=0)
#DiabetesOrdMed = pd.read_csv('DiabetesOrdMed.csv', index_col=0)

In [117]:
#Set a random index of 20% of the values in our dataset. This is the test dataset and will *only* be used for testing our models (sparingly!)
np.random.seed(100)
length = len(DiabetesAllDummy)
testIdx = np.random.choice(range(length), size=int(round(0.2*length)), replace=False)
trainIdx = list(set(range(length))-set(testIdx))

In [118]:
#Let's just start by looking at a simple linear regression:
#Yes, we have a categorical variable as our output. This will not be valid, but will show right away variables which could be important:

DiabetesTrain = DiabetesAllDummy[DiabetesAllDummy['IsTrain']==1]
DiabetesTrain.index = list(range(len(DiabetesTrain)))

DiabetesTest = DiabetesAllDummy[DiabetesAllDummy['IsTrain']==0]
DiabetesTest.index = list(range(len(DiabetesTest)))

In [119]:
#Try simple K-fold (5) random forest classifier using default conditions:

from sklearn.ensemble import RandomForestClassifier as rfc

rfc = rfc()

import sklearn.model_selection as ms
ms_k5 = ms.KFold(n_splits=5, shuffle=True)

#First try with all features:

#Create X and Y variables (replace 2 to 1 and 1 to 0)
trainX = DiabetesTrain.drop('readmitted', axis=1)
trainY = DiabetesTrain['readmitted'].replace([2, 1], [1, 0])

testX = DiabetesTest.drop('readmitted', axis=1)
testY = DiabetesTest['readmitted'].replace([2, 1], [1, 0])


#Instantiate the K-fold generator object:
np.random.seed(0)
DiabetesAD5Fold = ms_k5.split(trainX, trainY)

#Produce list of test R^2 scores, and Actual vs Predicted lists for the individual runs
ScoreList = []
PredictList = []
PureTestScore = []
TrainScores = []
ActualValues = []
XXX = []

for train, test in DiabetesAD5Fold:

    
    #Run the fit using the train data for each K
    rfc.fit(trainX.iloc[train,], trainY[train])
    #Run your predicion for the "missing" K-part
    pxxx = rfc.predict_proba(trainX.iloc[train,])
    p = rfc.predict(trainX.iloc[test,])
    actual = trainY[train].values
    #Check your schore for the missing K-part
    R2 = rfc.score(trainX.iloc[test,], trainY[test])
    #Run a test on the completely untouched test 20%
    TestR2 = rfc.score(testX, testY)
    TrainScore = rfc.score(trainX.iloc[train,], trainY[train])
    
    #Append these scores to the lists above
    ScoreList.append(R2)
    PureTestScore.append(TestR2)
    PredictList.append(p)
    TrainScores.append(TrainScore)
    ActualValues.append(actual)
    XXX.append(pxxx)
    
    #Make predictions for the completely untouched 20%
    PredictionsTest = rfc.predict(testX)
    
    #Use these predictions to calculate RMSLE for the untouched 20% and append
    #RMSLEvalue = np.sqrt(np.mean(np.power(np.log1p(testY)-np.log1p(PredictionsTest), 2)))
    #RMSLE.append(RMSLEvalue)

    


In [120]:
#These are the internal train scores, based on the logistic regression
TrainScores

[0.9803486974573493,
 0.9801926108605054,
 0.9794277865359702,
 0.9797555683893424,
 0.9798027096210277]

In [121]:
#These are the K-test scores, based on the logistic regression
ScoreList

[0.883373915215084,
 0.8875569707186115,
 0.8841231191858651,
 0.8844352875070238,
 0.885551948051948]

In [122]:
#These are the scores for each of the 5 ensemble models, compared to the untouched 20% test index
PureTestScore

[0.8849725411882177,
 0.8853220169745382,
 0.8845232151772342,
 0.8845731402895657,
 0.8846729905142287]

In [123]:
#These are the prediction lists from the train, compared to actual Y values for the train:
PredictList

[array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64)]

In [124]:
#These values may sound impressive, but they absolutely aren't. 11.1 percent of patients return under 30 days, so simply guessing "No" will get you 88.8% accuracy:
1-np.sum(trainY)/len(trainY)

0.8865441286649018

In [125]:
#The accuracy with reduced features was no better than the overall accuracy with all features. Let's check what might be going on:

In [126]:
np.sum(PredictList[0])

78

In [127]:
np.sum(trainY)

9086

In [128]:
len(PredictList[0])

16017

In [129]:
'''This seems to be the biggest problem. Out of ~16,000 observations (in the first K group, as an example), only 78 of them
were predicted to return within 30 days. We know that 11% should, or rougly 1,800. We need to tune this model to be more
generous in predicting a positive outcome'''

'This seems to be the biggest problem. Out of ~16,000 observations (in the first K group, as an example), only 78 of them\nwere predicted to return within 30 days. We know that 11% should, or rougly 1,800. We need to tune this model to be more\ngenerous in predicting a positive outcome'

In [130]:
#This is commented out because it took a VERY long time
'''from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier as rfc

#We will perform a grid search to find the optimal hyperparameters for our RF algorithm and test using our K-fold data:

# C value is the one most important for tuning a logistic regression. Let's see how varying this value affects the score:
min_samples_split = [2, 3, 5, 10]
min_samples_leaf = [1, 2, 4]
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
max_features = ['sqrt', 'log2']


# Create the random grid
random_grid = {'min_samples_split': min_samples_split, 'n_estimators':n_estimators,
              'max_depth':max_depth, 'max_features':max_features, 'min_samples_leaf':min_samples_leaf}

# Use the random grid to search for best C hyperparameter:

# First create the base model to tune
rftune = rfc()

# Random search of parameters, using 5-fold cross validation, 
rf_random = RandomizedSearchCV(estimator = rftune, scoring='f1', param_distributions = random_grid, n_iter = 100, cv = 5, 
                               verbose=2, random_state=42, n_jobs = 3)

# Fit the random search model
rf_random.fit(trainX, trainY)

#Then print the best parameters using best_params_
rf_random.best_params_'''

"from sklearn.model_selection import RandomizedSearchCV\nfrom sklearn.ensemble import RandomForestClassifier as rfc\n\n#We will perform a grid search to find the optimal hyperparameters for our RF algorithm and test using our K-fold data:\n\n# C value is the one most important for tuning a logistic regression. Let's see how varying this value affects the score:\nmin_samples_split = [2, 3, 5, 10]\nmin_samples_leaf = [1, 2, 4]\nn_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]\nmax_depth = [int(x) for x in np.linspace(10, 110, num = 11)]\nmax_depth.append(None)\nmax_features = ['sqrt', 'log2']\n\n\n# Create the random grid\nrandom_grid = {'min_samples_split': min_samples_split, 'n_estimators':n_estimators,\n              'max_depth':max_depth, 'max_features':max_features, 'min_samples_leaf':min_samples_leaf}\n\n# Use the random grid to search for best C hyperparameter:\n\n# First create the base model to tune\nrftune = rfc()\n\n# Random search of parameters

In [161]:
from sklearn.ensemble import RandomForestClassifier as rfc

rfc = rfc()
rfc.set_params(n_estimators=200, min_samples_split=5, min_samples_leaf=1, max_features='sqrt', max_depth=60)

rfc.fit(trainX, trainY)
predictprobs = rfc.predict_proba(trainX)
predictvalues = rfc.predict(trainX)
actual = trainY.values

R2 = rfc.score(trainX, trainY)
#Run a test on the completely untouched test 20%
TestR2 = rfc.score(testX, testY)

In [164]:
predicttest = rfc.predict(testX)
predicttestprobs = rfc.predict_proba(testX)

In [165]:
from sklearn.metrics import roc_auc_score as AUC

AUC(testY, predicttestprobs[:,1])

0.6566596221552277

In [133]:
print(rfc.score(trainX, trainY))

0.9329079466560112


In [134]:
print(rfc.score(testX, testY))

0.8870693959061408


In [135]:
print(len(predicttestprobs[:,1][predicttestprobs[:,1]>0.20]))
#This predictor estimates that there is an 11.6 percent chance in the train set
#np.sum(predictvalues)
print(np.sum(predicttest))
print(len(predicttest))
#The test classifier predicts 20/20030 hits (less than 1%), despite >11% being positive
#We would need to set the threshhold to about 20% to get 11% hits

2257
21
20030


In [136]:
#I'm curious how effective that cutoff (20%) would be:
predictedY = testY[predicttestprobs[:,1]>0.20]
predictedN = testY[predicttestprobs[:,1]<=0.2]

print(len(predictedY))
print(len(predictedN))
print(np.sum(predictedY))
print(np.sum(predictedN))

print(np.sum((1-predictedY)**2) + np.sum(predictedN**2))

#At this cutoff, we made 20030 predictions. 2281 were yes (577 right, 1704 wrong); 17749 were no (16055 right, 1694 wrong). 
#This is only 83.0 percent accurate, worse than simply guessing no for everything

2257
17773
559
1712
3410


In [137]:
#I'm curious how effective that cutoff (20%) would be:
predictedY = testY[predicttestprobs[:,1]>0.50]
predictedN = testY[predicttestprobs[:,1]<=0.5]

print(len(predictedY))
print(len(predictedN))
print(np.sum(predictedY))
print(np.sum(predictedN))

print(np.sum((1-predictedY)**2) + np.sum(predictedN**2))

#Here, we predict 20 positives (yet still only 12 of them are actually positive... what??) and a slew of negatives, which have a standard 11% error rate.

21
20009
15
2256
2262


In [138]:
np.sum((predicttestprobs[:,1]-testY)**2)/len(testY)
#This prediction is an actual 9.6% error rate, better than simply predicting "No" for everything. How do we capture this?

0.0964159970920667

In [139]:

'''#Let's try the RF regression with c=0.001 and see how that works:

from sklearn.ensemble import RandomForestClassifier as rfc

rfc()
rfc.set_params(n_estimators=200, min_samples_split=5, min_samples_leaf=1, max_features='sqrt', max_depth=60)

import sklearn.model_selection as ms
ms_k5 = ms.KFold(n_splits=5, shuffle=True)

#First try with all features:

#Create X and Y variables (replace 2 to 1 and 1 to 0)
trainX = DiabetesTrain.drop('readmitted', axis=1)
trainY = DiabetesTrain['readmitted'].replace([2, 1], [1, 0])

testX = DiabetesTest.drop('readmitted', axis=1)
testY = DiabetesTest['readmitted'].replace([2, 1], [1, 0])


#Instantiate the K-fold generator object:
np.random.seed(0)
DiabetesAD5Fold = ms_k5.split(trainX, trainY)

#Produce list of test R^2 scores, and Actual vs Predicted lists for the individual runs
ScoreList = []
PredictList = []
PureTestScore = []
TrainScores = []
ActualValues = []

for train, test in DiabetesAD5Fold:

    
    #Run the fit using the train data for each K
    lgr.fit(trainX.iloc[train,], trainY[train])
    #Run your predicion for the "missing" K-part
    p = lgr.predict(trainX.iloc[test,])
    actual = trainY[train].values
    #Check your schore for the missing K-part
    R2 = lgr.score(trainX.iloc[test,], trainY[test])
    #Run a test on the completely untouched test 20%
    TestR2 = lgr.score(testX, testY)
    TrainScore = lgr.score(trainX.iloc[train,], trainY[train])
    
    #Append these scores to the lists above
    ScoreList.append(R2)
    PureTestScore.append(TestR2)
    PredictList.append(p)
    TrainScores.append(TrainScore)
    ActualValues.append(actual)
    
    #Make predictions for the completely untouched 20%
    PredictionsTest = lgr.predict(testX)
    
    #Use these predictions to calculate RMSLE for the untouched 20% and append
    #RMSLEvalue = np.sqrt(np.mean(np.power(np.log1p(testY)-np.log1p(PredictionsTest), 2)))
    #RMSLE.append(RMSLEvalue)'''

'#Let\'s try the RF regression with c=0.001 and see how that works:\n\nfrom sklearn.ensemble import RandomForestClassifier as rfc\n\nrfc()\nrfc.set_params(n_estimators=200, min_samples_split=5, min_samples_leaf=1, max_features=\'sqrt\', max_depth=60)\n\nimport sklearn.model_selection as ms\nms_k5 = ms.KFold(n_splits=5, shuffle=True)\n\n#First try with all features:\n\n#Create X and Y variables (replace 2 to 1 and 1 to 0)\ntrainX = DiabetesTrain.drop(\'readmitted\', axis=1)\ntrainY = DiabetesTrain[\'readmitted\'].replace([2, 1], [1, 0])\n\ntestX = DiabetesTest.drop(\'readmitted\', axis=1)\ntestY = DiabetesTest[\'readmitted\'].replace([2, 1], [1, 0])\n\n\n#Instantiate the K-fold generator object:\nnp.random.seed(0)\nDiabetesAD5Fold = ms_k5.split(trainX, trainY)\n\n#Produce list of test R^2 scores, and Actual vs Predicted lists for the individual runs\nScoreList = []\nPredictList = []\nPureTestScore = []\nTrainScores = []\nActualValues = []\n\nfor train, test in DiabetesAD5Fold:\n\n    \n

In [140]:
#PureTestScore

In [141]:
#Either logistic regression does not work for this data, or something is going wrong here. The predicted scores are up by the slightest amount

In [142]:
'''#Finally, try the logistic regression with c=0.001 and reduced features and see how that works:

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.001)

import sklearn.model_selection as ms
ms_k5 = ms.KFold(n_splits=5, shuffle=True)

#First try with all features:



#Create X and Y variables (replace 2 to 1 and 1 to 0)
trainX = DiabetesTrainM.drop('readmitted', axis=1)
trainY = DiabetesTrainM['readmitted'].replace([2, 1], [1, 0])

testX = DiabetesTestM.drop('readmitted', axis=1)
testY = DiabetesTestM['readmitted'].replace([2, 1], [1, 0])


#Instantiate the K-fold generator object:
np.random.seed(0)
DiabetesAD5Fold = ms_k5.split(trainX, trainY)

#Produce list of test R^2 scores, and Actual vs Predicted lists for the individual runs
ScoreList = []
PredictList = []
PureTestScore = []
TrainScores = []
ActualValues = []

for train, test in DiabetesAD5Fold:

    
    #Run the fit using the train data for each K
    lgr.fit(trainX.iloc[train,], trainY[train])
    #Run your predicion for the "missing" K-part
    p = lgr.predict(trainX.iloc[test,])
    actual = trainY[train].values
    #Check your schore for the missing K-part
    R2 = lgr.score(trainX.iloc[test,], trainY[test])
    #Run a test on the completely untouched test 20%
    TestR2 = lgr.score(testX, testY)
    TrainScore = lgr.score(trainX.iloc[train,], trainY[train])
    
    #Append these scores to the lists above
    ScoreList.append(R2)
    PureTestScore.append(TestR2)
    PredictList.append(p)
    TrainScores.append(TrainScore)
    ActualValues.append(actual)
    
    #Make predictions for the completely untouched 20%
    PredictionsTest = lgr.predict(testX)
    
    #Use these predictions to calculate RMSLE for the untouched 20% and append
    #RMSLEvalue = np.sqrt(np.mean(np.power(np.log1p(testY)-np.log1p(PredictionsTest), 2)))
    #RMSLE.append(RMSLEvalue)'''

'#Finally, try the logistic regression with c=0.001 and reduced features and see how that works:\n\nfrom sklearn.linear_model import LogisticRegression as lgr\n\nlgr = lgr()\nlgr.set_params(C=0.001)\n\nimport sklearn.model_selection as ms\nms_k5 = ms.KFold(n_splits=5, shuffle=True)\n\n#First try with all features:\n\n\n\n#Create X and Y variables (replace 2 to 1 and 1 to 0)\ntrainX = DiabetesTrainM.drop(\'readmitted\', axis=1)\ntrainY = DiabetesTrainM[\'readmitted\'].replace([2, 1], [1, 0])\n\ntestX = DiabetesTestM.drop(\'readmitted\', axis=1)\ntestY = DiabetesTestM[\'readmitted\'].replace([2, 1], [1, 0])\n\n\n#Instantiate the K-fold generator object:\nnp.random.seed(0)\nDiabetesAD5Fold = ms_k5.split(trainX, trainY)\n\n#Produce list of test R^2 scores, and Actual vs Predicted lists for the individual runs\nScoreList = []\nPredictList = []\nPureTestScore = []\nTrainScores = []\nActualValues = []\n\nfor train, test in DiabetesAD5Fold:\n\n    \n    #Run the fit using the train data for ea

In [143]:
#PureTestScore

In [144]:
#Finally, try the logistic regression with c=0.001 and reduced features and see how that works:

'''from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=1000)

import sklearn.model_selection as ms
ms_k5 = ms.KFold(n_splits=5, shuffle=True)

#First try with all features:



#Create X and Y variables (replace 2 to 1 and 1 to 0)
trainX = DiabetesTrainM.drop('readmitted', axis=1)
trainY = DiabetesTrainM['readmitted'].replace([2, 1], [1, 0])

testX = DiabetesTestM.drop('readmitted', axis=1)
testY = DiabetesTestM['readmitted'].replace([2, 1], [1, 0])


#Instantiate the K-fold generator object:
np.random.seed(0)
DiabetesAD5Fold = ms_k5.split(trainX, trainY)

#Produce list of test R^2 scores, and Actual vs Predicted lists for the individual runs
ScoreList = []
PredictList = []
PureTestScore = []
TrainScores = []
ActualValues = []

for train, test in DiabetesAD5Fold:

    
    #Run the fit using the train data for each K
    lgr.fit(trainX.iloc[train,], trainY[train])
    #Run your predicion for the "missing" K-part
    p = lgr.predict(trainX.iloc[test,])
    actual = trainY[train].values
    #Check your schore for the missing K-part
    R2 = lgr.score(trainX.iloc[test,], trainY[test])
    #Run a test on the completely untouched test 20%
    TestR2 = lgr.score(testX, testY)
    TrainScore = lgr.score(trainX.iloc[train,], trainY[train])
    
    #Append these scores to the lists above
    ScoreList.append(R2)
    PureTestScore.append(TestR2)
    PredictList.append(p)
    TrainScores.append(TrainScore)
    ActualValues.append(actual)
    
    #Make predictions for the completely untouched 20%
    PredictionsTest = lgr.predict(testX)
    
    #Use these predictions to calculate RMSLE for the untouched 20% and append
    #RMSLEvalue = np.sqrt(np.mean(np.power(np.log1p(testY)-np.log1p(PredictionsTest), 2)))
    #RMSLE.append(RMSLEvalue)'''

'from sklearn.linear_model import LogisticRegression as lgr\n\nlgr = lgr()\nlgr.set_params(C=1000)\n\nimport sklearn.model_selection as ms\nms_k5 = ms.KFold(n_splits=5, shuffle=True)\n\n#First try with all features:\n\n\n\n#Create X and Y variables (replace 2 to 1 and 1 to 0)\ntrainX = DiabetesTrainM.drop(\'readmitted\', axis=1)\ntrainY = DiabetesTrainM[\'readmitted\'].replace([2, 1], [1, 0])\n\ntestX = DiabetesTestM.drop(\'readmitted\', axis=1)\ntestY = DiabetesTestM[\'readmitted\'].replace([2, 1], [1, 0])\n\n\n#Instantiate the K-fold generator object:\nnp.random.seed(0)\nDiabetesAD5Fold = ms_k5.split(trainX, trainY)\n\n#Produce list of test R^2 scores, and Actual vs Predicted lists for the individual runs\nScoreList = []\nPredictList = []\nPureTestScore = []\nTrainScores = []\nActualValues = []\n\nfor train, test in DiabetesAD5Fold:\n\n    \n    #Run the fit using the train data for each K\n    lgr.fit(trainX.iloc[train,], trainY[train])\n    #Run your predicion for the "missing" K-p

In [145]:
#PureTestScore

In [146]:
#np.sum(PredictList[0])

In [166]:
#Let's just start by looking at a simple linear regression:
#Yes, we have a categorical variable as our output. This will not be valid, but will show right away variables which could be important:


DiabetesAllDummy = pd.read_csv('DiabetesAllDummy.csv', index_col=0)

DiabetesTrain = DiabetesAllDummy[DiabetesAllDummy['IsTrain']==1].drop('IsTrain', axis=1)
DiabetesTrain.index = list(range(len(DiabetesTrain)))

DiabetesTest = DiabetesAllDummy[DiabetesAllDummy['IsTrain']==0].drop('IsTrain', axis=1)
DiabetesTest.index = list(range(len(DiabetesTest)))

trainX = DiabetesTrain.drop('readmitted', axis=1)
trainY = DiabetesTrain['readmitted'].replace([2, 1], [1, 0])

testX = DiabetesTest.drop('readmitted', axis=1)
testY = DiabetesTest['readmitted'].replace([2, 1], [1, 0])

#Import and run the model:
from sklearn.ensemble import RandomForestClassifier as rfc

rfc = rfc()
rfc.set_params(n_estimators=200, min_samples_split=5, min_samples_leaf=1, max_features='sqrt', max_depth=60)

rfc.fit(trainX, trainY)
predictprobs = rfc.predict_proba(trainX)
predictvalues = rfc.predict(trainX)
actual = trainY.values

R2 = rfc.score(trainX, trainY)
#Run a test on the completely untouched test 20%
TestR2 = rfc.score(testX, testY)

predicttest = rfc.predict(testX)
predicttestprobs = rfc.predict_proba(testX)

In [167]:
from sklearn.metrics import roc_auc_score as AUC

AUC(testY, predicttestprobs[:,1])

0.6565731991337912

In [148]:
print(R2)
print(TestR2)

0.9323460366615054
0.8866699950074888


In [149]:
#I'm curious how effective that cutoff (20%) would be:
predictedY = testY[predicttestprobs[:,1]>0.50]
predictedN = testY[predicttestprobs[:,1]<=0.5]

print(len(predictedY))
print(len(predictedN))
print(np.sum(predictedY))
print(np.sum(predictedN))

print('SquaredScore: ' + str(np.sum((1-predictedY)**2) + np.sum(predictedN**2)))

#Here, we predict 17 positives (yet still only 10 of them are actually positive... what??) and a slew of negatives, which have a standard 11% error rate.
#SquaredScore would have been 2271, guessing everything as "No"

17
20013
9
2262
SquaredScore: 2270


In [150]:
#I'm curious how effective that cutoff (20%) would be:
predictedY = testY[predicttestprobs[:,1]>0.20]
predictedN = testY[predicttestprobs[:,1]<=0.2]

print(len(predictedY))
print(len(predictedN))
print(np.sum(predictedY))
print(np.sum(predictedN))

print('SquaredScore: ' + str(np.sum((1-predictedY)**2) + np.sum(predictedN**2)))

#At this cutoff, we made 20030 predictions. 2227 were yes (576 right, 1651 wrong); 17803 were no (16055 right, 1694 wrong). 
#This is only 83.0 percent accurate, worse than simply guessing no for everything

2231
17799
573
1698
SquaredScore: 3356


In [151]:
print(np.sum(trainY)/len(trainY))

0.11345587133509814


In [152]:
#Is your actual prediction more accurate??  What would the squared score be just guessing the trained P (0.1134)?
print('SquaredScore: ' + str(np.sum((0.11345587-testY)**2)))
#You could get down to a score of 2013.5

SquaredScore: 2013.5142942422617


In [153]:
#Now what would the squared score be for our actual prediction?
print('SquaredScore: ' + str(np.sum((predicttestprobs[:,1]-testY)**2)))
#The predicted probabilities are significantly better than guessing the train probability for every value.

SquaredScore: 1928.0117989782798


In [168]:
#Now do this same analysis for dataset2 (meds reduced from up down steady no to change steady no)
#Set2


DiabetesAnyChange = pd.read_csv('DiabetesAnyChange.csv', index_col=0)

DiabetesTrain2 = DiabetesAnyChange[DiabetesAnyChange['IsTrain']==1].drop('IsTrain', axis=1)
DiabetesTrain2.index = list(range(len(DiabetesTrain2)))

DiabetesTest2 = DiabetesAnyChange[DiabetesAnyChange['IsTrain']==0].drop('IsTrain', axis=1)
DiabetesTest2.index = list(range(len(DiabetesTest2)))

trainX2 = DiabetesTrain2.drop('readmitted', axis=1)
trainY2 = DiabetesTrain2['readmitted'].replace([2, 1], [1, 0])

testX2 = DiabetesTest2.drop('readmitted', axis=1)
testY2 = DiabetesTest2['readmitted'].replace([2, 1], [1, 0])

#Import and run the model:
from sklearn.ensemble import RandomForestClassifier as rfc

rfc = rfc()
rfc.set_params(n_estimators=200, min_samples_split=5, min_samples_leaf=1, max_features='sqrt', max_depth=60)

rfc.fit(trainX2, trainY2)
predictprobs = rfc.predict_proba(trainX2)
predictvalues = rfc.predict(trainX2)
actual = trainY2.values

R2 = rfc.score(trainX2, trainY2)
#Run a test on the completely untouched test 20%
TestR2 = rfc.score(testX2, testY2)

predicttest = rfc.predict(testX2)
predicttestprobs = rfc.predict_proba(testX2)

In [169]:
from sklearn.metrics import roc_auc_score as AUC

AUC(testY2, predicttestprobs[:,1])

0.6585450970103685

In [155]:
#Now what would the squared score be for our actual prediction?
print('SquaredScore: ' + str(np.sum((predicttestprobs[:,1]-testY)**2)))
#The predicted probabilities are significantly better than guessing the train probability for every value.
#Virtually the same (slightest bit better) than the AllDummy model

SquaredScore: 1933.9284406748222


In [156]:
#Now do this same analysis for dataset2 (meds reduced from up down steady no to yes no)
#Set2


DiabetesTakingMed = pd.read_csv('DiabetesTakingMed.csv', index_col=0)

DiabetesTrain3 = DiabetesTakingMed[DiabetesTakingMed['IsTrain']==1].drop('IsTrain', axis=1)
DiabetesTrain3.index = list(range(len(DiabetesTrain3)))

DiabetesTest3 = DiabetesTakingMed[DiabetesTakingMed['IsTrain']==0].drop('IsTrain', axis=1)
DiabetesTest3.index = list(range(len(DiabetesTest3)))

trainX3 = DiabetesTrain3.drop('readmitted', axis=1)
trainY3 = DiabetesTrain3['readmitted'].replace([2, 1], [1, 0])

testX3 = DiabetesTest3.drop('readmitted', axis=1)
testY3 = DiabetesTest3['readmitted'].replace([2, 1], [1, 0])

#Import and run the model:
from sklearn.ensemble import RandomForestClassifier as rfc

rfc = rfc()
rfc.set_params(n_estimators=200, min_samples_split=5, min_samples_leaf=1, max_features='sqrt', max_depth=60)

rfc.fit(trainX3, trainY3)
predictprobs = rfc.predict_proba(trainX3)
predictvalues = rfc.predict(trainX3)
actual = trainY3.values

R2 = rfc.score(trainX3, trainY3)
#Run a test on the completely untouched test 20%
TestR2 = rfc.score(testX3, testY3)

predicttest = rfc.predict(testX3)
predicttestprobs = rfc.predict_proba(testX3)

In [157]:
#Now what would the squared score be for our actual prediction?
print('SquaredScore: ' + str(np.sum((predicttestprobs[:,1]-testY)**2)))
#The predicted probabilities are significantly better than guessing the train probability for every value.
#Virtually the same (slightest bit better) than the AllDummy model

SquaredScore: 1930.9910475381848


In [159]:
from sklearn.metrics import roc_auc_score as AUC

AUC(testY3, predicttestprobs[:,1])

0.6571001799646914

In [160]:
AUC(testY3, predicttest)

0.5024167700184838

In [158]:
#Test the "AnyChange" DF using other models, to see if they can give better scores.