In [3]:
import pandas as pd
import numpy as np

## We know (from Model_Parameter_Testing_F) that removing middle observations (came back >30 days) with linear regression score >0.75 (look strongly like those who returned <30 days) helps the model. We need to do this to create our final stacked model

In [5]:
DiabetesTakingMed = pd.read_csv('DiabetesTakingMedF.csv', index_col=0)

DiabetesTrain = DiabetesTakingMed[DiabetesTakingMed['IsTrain']==1].drop('IsTrain', axis=1)
DiabetesTrain.index = list(range(len(DiabetesTrain)))

DiabetesTest = DiabetesTakingMed[DiabetesTakingMed['IsTrain']==0].drop('IsTrain', axis=1)
DiabetesTest.index = list(range(len(DiabetesTest)))

#Start with a train to predict the placement of the middle group:

DiabetesTrainHL = DiabetesTrain[DiabetesTrain['readmitted']!=1]
trainX01 = DiabetesTrainHL.drop('readmitted', axis=1)
trainY01 = DiabetesTrainHL['readmitted'].replace([2], [1])

testX01 = DiabetesTest.drop('readmitted', axis=1)
testY01 = DiabetesTest['readmitted'].replace([2], [1])

In [7]:
from sklearn.linear_model import LinearRegression as lm

lm = lm()

lm.fit(trainX01, trainY01)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [8]:
middledf = DiabetesTrain[DiabetesTrain['readmitted']==1]
middledfX = middledf.drop('readmitted', axis=1)
middledfY = middledf['readmitted']

predictarray = lm.predict(middledfX)

In [9]:
middledf75 = middledf.loc[predictarray<0.75]

In [12]:
#Make the complete dataset with intermediate observations >0.75 removed:

train75 = pd.concat([DiabetesTrainHL, middledf75], axis=0)
train75.index = list(range(len(train75)))

In [30]:
#Split this now-complete DF into X (features) and Y (output) objects:

trainX = train75.drop('readmitted', axis=1)
trainY = train75['readmitted'].replace([2, 1], [1, 0])

In [36]:
#As seen in other runs, trimming based on AIC does not improve the model. (0.6646 vs 0.66735)
#Based on this, the best score we have is 0.66735, based on keeping >0.75 predictions of the >30 return train group. 

In [None]:
#Remove specific features (found in Model_Parameter_Testing_F) that improve the logistic regression:

TrainLR = train75.drop(['diabfeat_neurologic', 'race_AfricanAmerican', 'A1Cresult_>7', 'primarydiag_injury', 'number_diagnoses', 
    'med_glimepiride', 'med_insulin', 'diag_infection', 'medical_specialty_Orthopedics', 'med_nateglinide', 'discharge_disposition_leftAMA', 
    'admission_source_id_3', 'change_Ch', 'diag_circulatory', 'medical_specialty_Gastroenterology', 'medical_specialty_Surgery',
    'primarydiag_infection', 'primarydiag_mentaldis'], axis=1)
TrainLRX = TrainLR.drop('readmitted', axis=1)
TrainLRY = TrainLR['readmitted'].replace([2,1], [1,0])

In [None]:
# Perform the logistic regression with the optimal parameteres

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.1, class_weight={0:.2, 1:.8})

lgr.fit(TrainLRX, TrainLRY)
predictprobsLRW = lgr.predict_proba(testX01)

from sklearn.metrics import roc_auc_score as AUC

AUC(testY03, predictprobsLRW[:,1])

In [31]:
#Perform logistic regression with optimal conditions:
from sklearn.ensemble import RandomForestClassifier as rfc

rfc = rfc()
rfc.set_params(n_estimators=1000, min_samples_split=5, min_samples_leaf=1, max_features='sqrt', 
               max_depth=60, random_state=42, class_weight={0:.2, 1:.8})

rfc.fit(trainX, trainY)

predictRFW = rfc.predict(testX03)
predictprobsRFW5 = rfc.predict_proba(testX03)

from sklearn.metrics import roc_auc_score as AUC

AUC(testY03, predictprobsRFW[:,1])

  return f(*args, **kwds)


0.6706926578913641

In [37]:
#Using class weights, we very slightly increased the output from LR to 0.66774

# AUC of 0.66774 for class-weighted logistic regression
# AUC of 0.68011 for parameter-optimized XGBoost
# AUC of 0.67069 for class-weighted hyperparameter-optimized random forest

In [23]:
#Now let's try enhaning the XGB score using scale_pos_weight (the # of negative samples/# positive)

from xgboost.sklearn import XGBClassifier as xgb

xgb = xgb()
xgb.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5,
              random_state=42, scale_pos_weight=4)

xgb.fit(trainX, trainY)

predictXBW = xgb.predict(testX03)
predictprobsXBW = xgb.predict_proba(testX03)

AUC(testY03, predictprobsXBW[:,1])

  if diff:


In [29]:
#Import the XGB model with optimal parameters:

from xgboost.sklearn import XGBClassifier as xgb

xgb = xgb()
xgb.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5,
              random_state=42, scale_pos_weight=1)

xgb.fit(trainX, trainY)

predictXBW = xgb.predict(testX03)
predictprobsXBW = xgb.predict_proba(testX03)

AUC(testY03, predictprobsXBW[:,1])

  if diff:


0.6801164864800598

In [31]:
#These are our targets. Let's make sure we have the correct prediction arrays:

# AUC of 0.6725 for class-weighted logistic regression
# AUC of 0.68011 for parameter-optimized XGBoost
# AUC of 0.67069 for class-weighted hyperparameter-optimized random forest

#let's make sure we have the correct 3 prediction arrays:
print(AUC(testY03, predictprobsXBW[:,1]))
print(AUC(testY03, predictprobsLRW[:,1]))
print(AUC(testY03, predictprobsRFW[:,1]))

0.6801164864800598
0.6725013797805438
0.6706926578913641


In [32]:
StackDFScores = []
RFPercents = []
LGPercents = []
XGPercents = []

AUC(testY03, predictprobsRFW5[:,1])
AUC(testY03, predictprobsvartest[:,1])
AUC(testY03, predictprobsXBW[:,1])

for i in range(101):
    for j in range(101):
        for k in range(101):
            if i + j + k == 100:
                StackPredict = AUC(testY03, (k*predictprobsRFW5[:,1] + i*predictprobsvartest[:,1] + 
                                j*predictprobsXBW[:,1])/100)
                StackDFScores.append(StackPredict)
                RFPercents.append(k)
                LGPercents.append(i)
                XGPercents.append(j)

StackDF = pd.DataFrame({'Score':pd.Series(StackDFScores), 'LogRegPct':pd.Series(LGPercents),
                       'RFPercents':pd.Series(RFPercents), 'XGPercents':pd.Series(XGPercents)})

In [33]:
StackDF.sort_values('Score', ascending=False)

Unnamed: 0,Score,LogRegPct,RFPercents,XGPercents
1967,0.684145,21,23,56
1887,0.684141,20,23,57
1966,0.684141,21,24,55
1885,0.684140,20,25,55
1886,0.684138,20,24,56
1806,0.684137,19,23,58
1965,0.684136,21,25,54
2046,0.684135,22,23,55
1805,0.684135,19,24,57
2045,0.684135,22,24,54


In [35]:
predictprob = (56*predictprobsXBW[:,1] + 23*predictprobsRFW5[:,1] + 21*predictprobsvartest[:,1])/100


TestValues = pd.DataFrame({'num_lab_procedures':testX03['num_lab_procedures'], 'predict_prob':pd.Series(predictprob)})

In [36]:
TestValues['predict_actual'] = 0
TestValues['predict_actual'][TestValues['predict_prob']>0.5]=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [39]:
AUC(testY03, TestValues['predict_prob'])

0.6841445976784577

In [40]:
TestValues.to_csv('TestPredictions.csv')