In [None]:
import pandas as pd
import numpy as np

## The purpose of this notebook is to tune hyperparameters and perform other tests to optimize our individual machine learning models. The first section tests whether removing small subsections of data (from the middle group, who returned to the hospital after 30 days) from the train set improves the model

In [5]:
DiabetesTakingMed = pd.read_csv('DiabetesTakingMedF.csv', index_col=0)

DiabetesTrain = DiabetesTakingMed[DiabetesTakingMed['IsTrain']==1].drop('IsTrain', axis=1)
DiabetesTrain.index = list(range(len(DiabetesTrain)))

DiabetesTest = DiabetesTakingMed[DiabetesTakingMed['IsTrain']==0].drop('IsTrain', axis=1)
DiabetesTest.index = list(range(len(DiabetesTest)))

#Start with a train to predict the placement of the middle group:

DiabetesTrainHL = DiabetesTrain[DiabetesTrain['readmitted']!=1]
trainX01 = DiabetesTrainHL.drop('readmitted', axis=1)
trainY01 = DiabetesTrainHL['readmitted'].replace([2], [1])

testX01 = DiabetesTest.drop('readmitted', axis=1)
testY01 = DiabetesTest['readmitted'].replace([2], [1])

In [7]:
from sklearn.linear_model import LinearRegression as lm

lm = lm()

lm.fit(trainX01, trainY01)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [8]:
middledf = DiabetesTrain[DiabetesTrain['readmitted']==1]
middledfX = middledf.drop('readmitted', axis=1)
middledfY = middledf['readmitted']

predictarray = lm.predict(middledfX)

In [9]:
middledf100 = middledf.loc[predictarray<1]
middledf75 = middledf.loc[predictarray<0.75]
middledf50 = middledf.loc[predictarray<0.50]
middledf25 = middledf.loc[predictarray<0.25]

In [10]:
trainX01.shape

(51648, 112)

In [11]:
#Create possible train indices with portions of the middle data removed:
DiabetesTrainHL = DiabetesTakingMed[DiabetesTakingMed['IsTrain']==1].drop('IsTrain', axis=1)
DiabetesTrainHL.index = list(range(len(DiabetesTrainHL)))
DiabetesTrainHL = DiabetesTrainHL[DiabetesTrain3['readmitted']!=1]

train25 = pd.concat([DiabetesTrainHL, middledf25], axis=0)
train25.index = list(range(len(train25)))

In [12]:
train50 = pd.concat([DiabetesTrain3, middledf50], axis=0)
train50.index = list(range(len(train50)))

train75 = pd.concat([DiabetesTrain3, middledf75], axis=0)
train75.index = list(range(len(train75)))

train100 = pd.concat([DiabetesTrain3, middledf100], axis=0)
train100.index = list(range(len(train100)))

In [13]:
#First, try logistic regression with all observations:

DiabetesTrain3 = DiabetesTakingMed[DiabetesTakingMed['IsTrain']==1].drop('IsTrain', axis=1)
DiabetesTrain3.index = list(range(len(DiabetesTrain3)))

trainX02 = DiabetesTrain3.drop('readmitted', axis=1)
trainY02 = DiabetesTrain3['readmitted'].replace([2, 1], [1, 0])

testX02 = DiabetesTest3.drop('readmitted', axis=1)
testY02 = DiabetesTest3['readmitted'].replace([2, 1], [1, 0])

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.1)

lgr.fit(trainX02, trainY02)
predictprobs02 = lgr.predict_proba(trainX02)
predictvalues02 = lgr.predict(trainX02)
actual02 = trainY02.values

predicttest02 = lgr.predict(testX02)
predicttestprobs02 = lgr.predict_proba(testX02)

In [14]:
from sklearn.metrics import roc_auc_score as AUC

AUC(testY02, predicttestprobs02[:,1])

0.6664455199364435

In [15]:
#Now, let's try shaving off a few of the observations at a time: (Any >30 with linear P prediction over 1.0)

trainX03 = train100.drop('readmitted', axis=1)
trainY03 = train100['readmitted'].replace([2, 1], [1, 0])

testX03 = DiabetesTest3.drop('readmitted', axis=1)
testY03 = DiabetesTest3['readmitted'].replace([2, 1], [1, 0])

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.1)

lgr.fit(trainX03, trainY03)
predictprobs03 = lgr.predict_proba(trainX03)
predictvalues03 = lgr.predict(trainX03)
actual03 = trainY03.values

predicttest03 = lgr.predict(testX03)
predicttestprobs03 = lgr.predict_proba(testX03)

In [16]:
from sklearn.metrics import roc_auc_score as AUC

AUC(testY03, predicttestprobs03[:,1])

0.6671563681939577

In [17]:
#Now, let's try shaving off a few of the observations at a time: (Any >30 with linear P prediction over 0.75)

trainX04 = train75.drop('readmitted', axis=1)
trainY04 = train75['readmitted'].replace([2, 1], [1, 0])

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.1)

lgr.fit(trainX04, trainY04)
predictprobs04 = lgr.predict_proba(trainX04)
predictvalues04 = lgr.predict(trainX04)
actual04 = trainY04.values

predicttest04 = lgr.predict(testX03)
predicttestprobs04 = lgr.predict_proba(testX03)

In [18]:
from sklearn.metrics import roc_auc_score as AUC

AUC(testY03, predicttestprobs04[:,1])

0.667349050248063

In [19]:
#Now, let's try shaving off a few of the observations at a time: (Any >30 with linear P prediction over 0.50)

trainX05 = train50.drop('readmitted', axis=1)
trainY05 = train50['readmitted'].replace([2, 1], [1, 0])

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.1)

lgr.fit(trainX05, trainY05)
predictprobs05 = lgr.predict_proba(trainX05)
predictvalues05 = lgr.predict(trainX05)
actual05 = trainY05.values

predicttest05 = lgr.predict(testX03)
predicttestprobs05 = lgr.predict_proba(testX03)

In [20]:
from sklearn.metrics import roc_auc_score as AUC

AUC(testY03, predicttestprobs05[:,1])

0.6664180222658731

In [21]:
#We see that 0.75 was better then 1.0 and much better than 0.5. Let's try 0.7, 0.8, 0.9:
middledf90 = middledf.loc[predictarray<0.9]
middledf80 = middledf.loc[predictarray<0.8]
middledf70 = middledf.loc[predictarray<0.7]

In [22]:
DiabetesTakingMed = pd.read_csv('DiabetesTakingMedF.csv', index_col=0)

DiabetesTrain3 = DiabetesTakingMed[DiabetesTakingMed['IsTrain']==1].drop('IsTrain', axis=1)
DiabetesTrain3.index = list(range(len(DiabetesTrain3)))
DiabetesTrain3 = DiabetesTrain3[DiabetesTrain3['readmitted']!=1]

train90 = pd.concat([DiabetesTrain3, middledf90], axis=0)
train90.index = list(range(len(train90)))

train80 = pd.concat([DiabetesTrain3, middledf80], axis=0)
train80.index = list(range(len(train80)))

train70 = pd.concat([DiabetesTrain3, middledf70], axis=0)
train70.index = list(range(len(train70)))

In [23]:
#Now, let's try shaving off a few of the observations at a time: (Any >30 with linear P prediction over 0.90)

trainX06 = train90.drop('readmitted', axis=1)
trainY06 = train90['readmitted'].replace([2, 1], [1, 0])

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.1)

lgr.fit(trainX06, trainY06)

predicttestprobs06 = lgr.predict_proba(testX03)

In [24]:
from sklearn.metrics import roc_auc_score as AUC

AUC(testY03, predicttestprobs06[:,1])

0.6671855767204969

In [25]:
#Now, let's try shaving off a few of the observations at a time: (Any >30 with linear P prediction over 0.80)

trainX07 = train80.drop('readmitted', axis=1)
trainY07 = train80['readmitted'].replace([2, 1], [1, 0])

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.1)

lgr.fit(trainX07, trainY07)

predicttestprobs07 = lgr.predict_proba(testX03)

In [26]:
from sklearn.metrics import roc_auc_score as AUC

AUC(testY03, predicttestprobs07[:,1])

0.6672197194548299

In [27]:
#Now, let's try shaving off a few of the observations at a time: (Any >30 with linear P prediction over 0.80)

trainX08 = train70.drop('readmitted', axis=1)
trainY08 = train70['readmitted'].replace([2, 1], [1, 0])

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.1)

lgr.fit(trainX08, trainY08)

predicttestprobs08 = lgr.predict_proba(testX03)

In [28]:
from sklearn.metrics import roc_auc_score as AUC

AUC(testY03, predicttestprobs08[:,1])

0.6672467212251197

In [1]:
#Based on this, the best score we have is 0.66735 with cutoff of >0.75. 
# (If we remove the returned>30 observations with linear-predcted likelihood of returning >0.75)

In [30]:
#Call Train75, and Train75 our new training DF (trainX and trainY):

trainX = train75.drop('readmitted', axis=1)
trainY = train75['readmitted'].replace([2, 1], [1, 0])

## Next, we tried two different methods to find removable features for our logistic regression model -- first using AIC optimization and secondly by removing individual features and testing their K-fold AUC:

In [31]:
#We can write a function to find a candidate feature list using AIC engineering:

def FindLowestAICNonLogBackward(df, dependent):
    '''Input: DF to AIC-modify and the dependent variable. WILL RETURN: A tuple: [0] is the modified DF (with dependent)
    and tuple[1] will give you the summary DF'''
    df2 = df.copy()
    df2X = df2.drop(dependent, axis=1)
    df2Y = df2[dependent]
    FeatureList = list(df2X.columns)
    X2 = sm.add_constant(df2X)
    est = sm.OLS(df2Y, X2)
    CurrentAIC = est.fit().aic
    CanBeBetter = True
    ModList = []
    AddedSubtracted = []
    AIC = []
    TriesSinceReset = 0
    
    tempColumnList = list(df2X.columns)
    tempDF2X = df2X[tempColumnList]
    
    while CanBeBetter == True:
        Choice = np.random.choice(list(df2X.columns))
        
        HeadsTails = np.random.randint(2)
        if Choice in tempColumnList:
            HeadsTails = 0
        if Choice not in tempColumnList:
            HeadsTails = 1
        
        if HeadsTails == 1:
            tempColumnList.append(Choice)
            tempDF2X[Choice] = df2X[Choice]
        
        if HeadsTails == 0:

            tempColumnList.remove(Choice)
            tempDF2X = tempDF2X[tempColumnList]
            
        est = sm.OLS(df2Y, sm.add_constant(tempDF2X))
        NewAIC = est.fit().aic
        
        if NewAIC < CurrentAIC:
            TriesSinceReset = 0
            CurrentAIC = NewAIC
            
            if HeadsTails == 1:
                print(Choice + " added: New AIC = " + str(CurrentAIC))
                ModList.append(Choice)
                AddedSubtracted.append('Added')
                AIC.append(CurrentAIC)
                
            if HeadsTails == 0:
                print(Choice + " removed: New AIC = " + str(CurrentAIC))
                ModList.append(Choice)
                AddedSubtracted.append('Subtracted')
                AIC.append(CurrentAIC)
            continue
            
        else:
            TriesSinceReset += 1
            
            if HeadsTails == 1:
                tempColumnList.remove(Choice)
                tempDF2X = tempDF2X[tempColumnList]
                
            if HeadsTails == 0:
                tempColumnList.append(Choice)
                tempDF2X[Choice] = df2X[Choice]
                
            if TriesSinceReset > 100:
                CanBeBetter = False
                
    SummaryDF = pd.DataFrame({'Feature': ModList, 'AddOrSubtract': AddedSubtracted, 'AIC': AIC})
    NewDF = pd.concat([tempDF2X, df2[[dependent]]], axis=1)
    
    return NewDF, SummaryDF

In [32]:
import statsmodels.api as sm
trainXAIC = FindLowestAICNonLogBackward(train75, 'readmitted')[0]

diag_mentaldis removed: New AIC = 159023.43156495487
med_glyburide.metformin removed: New AIC = 159021.71547449904
diag_pregnancy removed: New AIC = 159020.0776425167
diag_blooddis removed: New AIC = 159018.1782232389
med_tolbutamide removed: New AIC = 159016.46157787263
diabfeat_hyperosmolarity removed: New AIC = 159014.61752138872
diag_infection removed: New AIC = 159012.66088537028
max_glu_serum_>200 removed: New AIC = 159010.66130931786
med_miglitol removed: New AIC = 159009.6224451469
diag_injury removed: New AIC = 159007.76959854743
diag_musculoskeletal removed: New AIC = 159006.03307917976
primarydiag_mentaldis removed: New AIC = 159004.2020932989
admission_type_id_4 removed: New AIC = 159002.49988734938
med_pioglitazone removed: New AIC = 159000.87160404486
med_glipizide.metformin removed: New AIC = 158999.00522946296
primarydiag_nervous removed: New AIC = 158998.5314527838
discharge_disposition_unknown removed: New AIC = 158996.8467899012
primarydiag_blooddis removed: New AIC 

In [34]:
train75Trim = train75[list(trainXAIC.columns)]
trainX75Trim = train75Trim.drop('readmitted', axis=1)
trainY75Trim = train75Trim['readmitted'].replace([2, 1], [1, 0])

DiabetesTakingMed = pd.read_csv('DiabetesTakingMedF.csv', index_col=0)

DiabetesTrain3 = DiabetesTakingMed[DiabetesTakingMed['IsTrain']==1].drop('IsTrain', axis=1)
DiabetesTrain3.index = list(range(len(DiabetesTrain3)))

DiabetesTest3 = DiabetesTakingMed[DiabetesTakingMed['IsTrain']==0].drop('IsTrain', axis=1)
DiabetesTest3.index = list(range(len(DiabetesTest3)))

testX75Trim = DiabetesTest3[list(trainXAIC.columns)].drop('readmitted', axis=1)

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.1)

lgr.fit(trainX75Trim, trainY75Trim)

predicttestprobs75Trim = lgr.predict_proba(testX75Trim)

In [35]:
#Removing these features reduces -- does not improve -- AUC:

from sklearn.metrics import roc_auc_score as AUC

AUC(testY03, predicttestprobs75Trim[:,1])

0.664640046194103

In [36]:
#As seen in other runs, trimming based on AIC does not improve the model. (0.6646 vs 0.66735)
#Based on this, the best score we have is 0.66735, based on keeping >0.75 predictions of the >30 return train group. 

### Changing class weights can improve the logistic regression model:

In [None]:
#Let's try to slightly improve our linear regression by assigning weights to the classes:

testX01 = testX03.copy()

trainX01 = trainX.copy()


from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.1, class_weight={0:.2, 1:.8})

lgr.fit(trainX, trainY)
predictprobsLRW = lgr.predict_proba(testX01)

AUC(testY03, predictprobsLRW[:,1])

#Class_weights 0.2/0.8 was the strongest weighting parameter

## Cut here

In [19]:
from xgboost.sklearn import XGBClassifier as xgb

xgb = xgb()
xgb.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5,
              random_state=42)

xgb.fit(trainX, trainY)

predictXB = xgb.predict(testX03)
predictprobsXB = xgb.predict_proba(testX03)

  if diff:


In [20]:
from sklearn.metrics import roc_auc_score as AUC

AUC(testY03, predictprobsXB[:,1])

0.6801164864800598

In [None]:
# AUC of 0.66735 for Logistic
# AUC of 0.68011 for XGBoost

In [50]:
from sklearn.ensemble import RandomForestClassifier as rfc

rfc = rfc()
rfc.set_params(n_estimators=1000, min_samples_split=5, min_samples_leaf=1, max_features='sqrt', max_depth=60, random_state=42)

rfc.fit(trainX, trainY)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=60, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [51]:
predictRF = rfc.predict(testX03)
predictprobsRF = rfc.predict_proba(testX03)

In [52]:
from sklearn.metrics import roc_auc_score as AUC

AUC(testY03, predictprobsRF[:,1])

0.66353068007343

In [36]:
# AUC of 0.66735 for logistic regression
# AUC of 0.68011 for parameter-optimized XGBoost
# AUC of 0.66353 for random forest

In [None]:
'''from sklearn.neighbors import KNeighborsClassifier as knn

knn = knn()
knn.set_params(n_neighbors=10)

knn.fit(trainX, trainY)

predictKN = knn.predict(testX03)
predictprobsKN = knn.predict_proba(testX03)'''

In [None]:
#KNN is atrocious
'''from sklearn.metrics import roc_auc_score as AUC

AUC(testY03, predictprobsKN[:,1])'''

## Cut here

In [46]:
#Test removal of individual features, and determine which ones improve test AUC:
RemoveListByLogR = []


for var in list(trainX.columns):
    
    from sklearn.linear_model import LogisticRegression as lgr

    trainXtemp = trainX.drop(var, axis=1)
    testXtemp = testX03.drop(var, axis=1)

    lgr = lgr()
    lgr.set_params(C=0.1, class_weight={0:.2, 1:.8})

    lgr.fit(trainXtemp, trainY)
    predictprobsvartest = lgr.predict_proba(testXtemp)
    tempAUC = AUC(testY03, predictprobsvartest[:,1])
    
    print(var + ": " + str(tempAUC))
    
    if tempAUC > 0.66774:
        RemoveListByLogR.append(var)

age: 0.6675590640169822
time_in_hospital: 0.6676144560783477
num_lab_procedures: 0.6676007568330905
num_procedures: 0.6677144295749571
num_medications: 0.6677314637496026
number_outpatient: 0.6677647138634304
number_emergency: 0.6661429959701408
number_inpatient: 0.6410418626867496
number_diagnoses: 0.6680921568188434
med_metformin: 0.6670021184116145
med_repaglinide: 0.667665484217242
med_nateglinide: 0.6679256087095362
med_chlorpropamide: 0.667694568768711
med_glimepiride: 0.6679471307817231
med_glipizide: 0.6677229590597868
med_glyburide: 0.66773476148647
med_tolbutamide: 0.6677395221291657
med_pioglitazone: 0.6677041892341588
med_rosiglitazone: 0.6676607979595887
med_acarbose: 0.6676774602090234
med_miglitol: 0.6677396461042359
med_tolazamide: 0.6677408610599239
med_insulin: 0.6679312867677515
med_glyburide.metformin: 0.6677782023510681
med_glipizide.metformin: 0.6677449026472124
diag_blooddis: 0.667602901601805
diag_circulatory: 0.6679086985099609
diag_digestive: 0.667626729610297

In [24]:
trainXtemp = trainX.drop(['diabfeat_neurologic', 'race_AfricanAmerican', 'A1Cresult_>7', 'primarydiag_injury'], axis=1)
testXtemp = testX03.drop(['diabfeat_neurologic', 'race_AfricanAmerican', 'A1Cresult_>7', 'primarydiag_injury'], axis=1)

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.1, class_weight={0:.2, 1:.8})

lgr.fit(trainXtemp, trainY)
predictprobsvartest = lgr.predict_proba(testXtemp)
AUC(testY03, predictprobsvartest[:,1])

0.6692817472074429

In [50]:
import sklearn.model_selection as ms
ms_k5 = ms.KFold(n_splits=5, shuffle=True)

np.random.seed(0)
DiabetesAD5Fold = ms_k5.split(trainX, trainY)

RemoveListByLogR = []


for var in list(trainXtemp.columns):
    
    from sklearn.linear_model import LogisticRegression as lgr

    trainXtemp2 = trainXtemp.drop(var, axis=1)
    testXtemp2 = testXtemp.drop(var, axis=1)

    lgr = lgr()
    lgr.set_params(C=0.1, class_weight={0:.2, 1:.8})

    lgr.fit(trainXtemp2, trainY)
    predictprobsvartest = lgr.predict_proba(testXtemp2)
    tempAUC = AUC(testY03, predictprobsvartest[:,1])
    
    print(var + ": " + str(tempAUC))
    
    if tempAUC > 0.66928:
        RemoveListByLogR.append(var)

age: 0.6691470111011494
time_in_hospital: 0.6691637973256545
num_lab_procedures: 0.6691722400279351
num_procedures: 0.669300963343324
num_medications: 0.6694101109951283
number_outpatient: 0.6693141047007652
number_emergency: 0.6675891403690127
number_inpatient: 0.6413768185314166
number_diagnoses: 0.6697863753331861
med_metformin: 0.6684715949186983
med_repaglinide: 0.6692245699050665
med_nateglinide: 0.6695010095165992
med_chlorpropamide: 0.6692282643621585
med_glimepiride: 0.6694497582225782
med_glipizide: 0.6692925082435364
med_glyburide: 0.6692234045394067
med_tolbutamide: 0.6692647378278115
med_pioglitazone: 0.6692587126393998
med_rosiglitazone: 0.6692266278912319
med_acarbose: 0.669211131007457
med_miglitol: 0.6692803586866567
med_tolazamide: 0.6692893096867252
med_insulin: 0.6694838761618974
med_glyburide.metformin: 0.6693132616702878
med_glipizide.metformin: 0.6692838547836364
diag_blooddis: 0.669099801394417
diag_circulatory: 0.6693698934823553
diag_digestive: 0.6691151990981

In [25]:
trainXtemp2 = trainXtemp.drop(['number_diagnoses', 'med_glimepiride', 'med_insulin', 'diag_infection', 'medical_specialty_Orthopedics'], axis=1)
testXtemp2 = testXtemp.drop(['number_diagnoses', 'med_glimepiride', 'med_insulin', 'diag_infection', 'medical_specialty_Orthopedics'], axis=1)

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.1, class_weight={0:.2, 1:.8})

lgr.fit(trainXtemp2, trainY)
predictprobsvartest = lgr.predict_proba(testXtemp2)
AUC(testY03, predictprobsvartest[:,1])

0.6707366690412851

In [None]:
trainXtemp3 = trainXtemp2.drop(['number_diagnoses', 'med_glimepiride', 'med_insulin', 'diag_infection', 'medical_specialty_Orthopedics'], axis=1)
testXtemp3 = testXtemp2.drop(['number_diagnoses', 'med_glimepiride', 'med_insulin', 'diag_infection', 'medical_specialty_Orthopedics'], axis=1)

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.1, class_weight={0:.2, 1:.8})

lgr.fit(trainXtemp2, trainY)
predictprobsvartest = lgr.predict_proba(testXtemp2)
AUC(testY03, predictprobsvartest[:,1])

In [53]:
import sklearn.model_selection as ms
ms_k5 = ms.KFold(n_splits=5, shuffle=True)

np.random.seed(0)
DiabetesAD5Fold = ms_k5.split(trainX, trainY)

RemoveListByLogR = []


for var in list(trainXtemp2.columns):
    
    from sklearn.linear_model import LogisticRegression as lgr

    trainXtemp3 = trainXtemp2.drop(var, axis=1)
    testXtemp3 = testXtemp2.drop(var, axis=1)

    lgr = lgr()
    lgr.set_params(C=0.1, class_weight={0:.2, 1:.8})

    lgr.fit(trainXtemp3, trainY)
    predictprobsvartest = lgr.predict_proba(testXtemp3)
    tempAUC = AUC(testY03, predictprobsvartest[:,1])
    
    print(var + ": " + str(tempAUC))
    
    if tempAUC > 0.67073:
        RemoveListByLogR.append(var)

age: 0.6705542769180066
time_in_hospital: 0.6705122493692087
num_lab_procedures: 0.6705888411675784
num_procedures: 0.670670962254079
num_medications: 0.670780444638573
number_outpatient: 0.6707485458530104
number_emergency: 0.6690716342584677
number_inpatient: 0.6422194026985257
med_metformin: 0.6696719711384054
med_repaglinide: 0.6706221408714341
med_nateglinide: 0.6708890591975754
med_chlorpropamide: 0.6706787478884876
med_glipizide: 0.6707905734018083
med_glyburide: 0.6706826902957199
med_tolbutamide: 0.6707203539220469
med_pioglitazone: 0.6705812786882961
med_rosiglitazone: 0.6706213970210131
med_acarbose: 0.6706367699297178
med_miglitol: 0.6707290569719748
med_tolazamide: 0.6707286354567361
med_glyburide.metformin: 0.6707369913764677
med_glipizide.metformin: 0.6707054769136227
diag_blooddis: 0.670580658812945
diag_circulatory: 0.6709646096053554
diag_digestive: 0.6705269776075484
diag_injury: 0.670630075275927
diag_mentaldis: 0.6707226102683245
diag_metabolic: 0.6706460928549968


In [26]:
trainXtemp3 = trainXtemp2.drop(['med_nateglinide', 'discharge_disposition_leftAMA', 'admission_source_id_3', 'change_Ch'], axis=1)
testXtemp3 = testXtemp2.drop(['med_nateglinide', 'discharge_disposition_leftAMA', 'admission_source_id_3', 'change_Ch'], axis=1)

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.1, class_weight={0:.2, 1:.8})

lgr.fit(trainXtemp3, trainY)
predictprobsvartest = lgr.predict_proba(testXtemp3)
AUC(testY03, predictprobsvartest[:,1])

0.6716049904329677

In [57]:
import sklearn.model_selection as ms
ms_k5 = ms.KFold(n_splits=5, shuffle=True)

np.random.seed(0)
DiabetesAD5Fold = ms_k5.split(trainX, trainY)

RemoveListByLogR = []


for var in list(trainXtemp3.columns):
    
    from sklearn.linear_model import LogisticRegression as lgr

    trainXtemp4 = trainXtemp3.drop(var, axis=1)
    testXtemp4 = testXtemp3.drop(var, axis=1)

    lgr = lgr()
    lgr.set_params(C=0.1, class_weight={0:.2, 1:.8})

    lgr.fit(trainXtemp4, trainY)
    predictprobsvartest = lgr.predict_proba(testXtemp4)
    tempAUC = AUC(testY03, predictprobsvartest[:,1])
    
    print(var + ": " + str(tempAUC))
    
    if tempAUC > 0.67160:
        RemoveListByLogR.append(var)

age: 0.6714334833208527
time_in_hospital: 0.6713912574119425
num_lab_procedures: 0.6714426574760475
num_procedures: 0.6715457799394402
num_medications: 0.6716539729832038
number_outpatient: 0.6716147100784715
number_emergency: 0.6699343767719912
number_inpatient: 0.6431721263180007
med_metformin: 0.6704804125711813
med_repaglinide: 0.6715212328755404
med_chlorpropamide: 0.6715371016845262
med_glipizide: 0.6716246528791014
med_glyburide: 0.671519670789656
med_tolbutamide: 0.6715941054218042
med_pioglitazone: 0.6715399283161267
med_rosiglitazone: 0.6714789573766022
med_acarbose: 0.6715260679032784
med_miglitol: 0.6715891960090243
med_tolazamide: 0.6715962129979977
med_glyburide.metformin: 0.6716230659982031
med_glipizide.metformin: 0.6715923449758074
diag_blooddis: 0.6714461287780131
diag_circulatory: 0.6718420059721766
diag_digestive: 0.6714292929634801
diag_injury: 0.6715492016513777
diag_mentaldis: 0.6715693104077642
diag_metabolic: 0.6715157779724517
diag_musculoskeletal: 0.671618950

In [27]:
trainXtemp4 = trainXtemp3.drop(['diag_circulatory', 'medical_specialty_Gastroenterology', 'medical_specialty_Surgery',
                               'primarydiag_infection', 'primarydiag_mentaldis'], axis=1)
testXtemp4 = testXtemp3.drop(['diag_circulatory', 'medical_specialty_Gastroenterology', 'medical_specialty_Surgery',
                               'primarydiag_infection', 'primarydiag_mentaldis'], axis=1)
from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.1, class_weight={0:.2, 1:.8})

lgr.fit(trainXtemp4, trainY)
predictprobsvartest = lgr.predict_proba(testXtemp4)
AUC(testY03, predictprobsvartest[:,1])

0.6725013797805438

In [2]:
#This work gave us a list of approximately 20 features to remove from our logistic regression model

In [21]:
#Let's try to slightly improve our linear regression by assigning weights to the classes:

testX01 = testX03.copy()

trainX01 = trainX.copy()


from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.1, class_weight={0:.2, 1:.8})

lgr.fit(trainX, trainY)
predictprobsLRW = lgr.predict_proba(testX01)

AUC(testY03, predictprobsLRW[:,1])

0.6677426958909628

In [78]:
#Let's start by checking 40/60:
from sklearn.ensemble import RandomForestClassifier as rfc

rfc = rfc()
rfc.set_params(n_estimators=1000, min_samples_split=5, min_samples_leaf=1, max_features='sqrt', 
               max_depth=60, random_state=42, class_weight={0:.4, 1:.6})

rfc.fit(trainX, trainY)

predictRFW = rfc.predict(testX03)
predictprobsRFW = rfc.predict_proba(testX03)

from sklearn.metrics import roc_auc_score as AUC

AUC(testY03, predictprobsRFW[:,1])
#0.66563 is significant improvement. Let's see if we can increase this some more:

0.6656335824066879

In [79]:
#0.66563 is significant improvement. Let's see if we can increase this some more (35/65):
from sklearn.ensemble import RandomForestClassifier as rfc

rfc = rfc()
rfc.set_params(n_estimators=1000, min_samples_split=5, min_samples_leaf=1, max_features='sqrt', 
               max_depth=60, random_state=42, class_weight={0:.35, 1:.65})

rfc.fit(trainX, trainY)

predictRFW = rfc.predict(testX03)
predictprobsRFW2 = rfc.predict_proba(testX03)

from sklearn.metrics import roc_auc_score as AUC

AUC(testY03, predictprobsRFW2[:,1])
#0.6663 is evern better. Let's keep scaling down and see when the improvement stops:

0.6663424966531071

In [80]:
#0.66563 is significant improvement. Let's see if we can increase this some more:
from sklearn.ensemble import RandomForestClassifier as rfc

rfc = rfc()
rfc.set_params(n_estimators=1000, min_samples_split=5, min_samples_leaf=1, max_features='sqrt', 
               max_depth=60, random_state=42, class_weight={0:.3, 1:.7})

rfc.fit(trainX, trainY)

predictRFW = rfc.predict(testX03)
predictprobsRFW3 = rfc.predict_proba(testX03)

from sklearn.metrics import roc_auc_score as AUC

AUC(testY03, predictprobsRFW3[:,1])
#0.66719. Let's go to 25/75 next:

0.6671912299836981

In [81]:
#0.66563 is significant improvement. Let's see if we can increase this some more:
from sklearn.ensemble import RandomForestClassifier as rfc

rfc = rfc()
rfc.set_params(n_estimators=1000, min_samples_split=5, min_samples_leaf=1, max_features='sqrt', 
               max_depth=60, random_state=42, class_weight={0:.25, 1:.75})

rfc.fit(trainX, trainY)

predictRFW = rfc.predict(testX03)
predictprobsRFW4 = rfc.predict_proba(testX03)

from sklearn.metrics import roc_auc_score as AUC

AUC(testY03, predictprobsRFW4[:,1])
#0.66974. Let's go to 20/80:

0.6697473975711151

In [31]:
#Try 20/80 weights (best choice for logistic regression):
from sklearn.ensemble import RandomForestClassifier as rfc

rfc = rfc()
rfc.set_params(n_estimators=1000, min_samples_split=5, min_samples_leaf=1, max_features='sqrt', 
               max_depth=60, random_state=42, class_weight={0:.2, 1:.8})

rfc.fit(trainX, trainY)

predictRFW = rfc.predict(testX03)
predictprobsRFW5 = rfc.predict_proba(testX03)

from sklearn.metrics import roc_auc_score as AUC

AUC(testY03, predictprobsRFW5[:,1])
#0.6707. Let's check if 15/85 improves (it may not):

  return f(*args, **kwds)


0.6706926578913641

In [32]:
import pickle
with open('rfc.pickle', 'wb') as f:
    pickle.dump(rfc, f, -1)

In [33]:
from sklearn.externals import joblib
joblib.dump(rfc,  'newrfc.pkl', compress=9)

['newrfc.pkl']

In [83]:
#Try 15/85 weights (stronger choice than logistic regression):
from sklearn.ensemble import RandomForestClassifier as rfc

rfc = rfc()
rfc.set_params(n_estimators=1000, min_samples_split=5, min_samples_leaf=1, max_features='sqrt', 
               max_depth=60, random_state=42, class_weight={0:.15, 1:.85})

rfc.fit(trainX, trainY)

predictRFW = rfc.predict(testX03)
predictprobsRFW6 = rfc.predict_proba(testX03)

from sklearn.metrics import roc_auc_score as AUC

AUC(testY03, predictprobsRFW6[:,1])
#0.6707. Let's check if 15/85 improves (it may not):
#0.6701. So it dropped. So 20/80 was the ideal split for both logistic regression and random forest.

0.6701178102858595

In [37]:
#Using class weights, we very slightly increased the output from LR to 0.66774

# AUC of 0.66774 for class-weighted logistic regression
# AUC of 0.68011 for parameter-optimized XGBoost
# AUC of 0.67069 for class-weighted hyperparameter-optimized random forest

In [23]:
#Now let's try enhaning the XGB score using scale_pos_weight (the # of negative samples/# positive)

from xgboost.sklearn import XGBClassifier as xgb

xgb = xgb()
xgb.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5,
              random_state=42, scale_pos_weight=4)

xgb.fit(trainX, trainY)

predictXBW = xgb.predict(testX03)
predictprobsXBW = xgb.predict_proba(testX03)

AUC(testY03, predictprobsXBW[:,1])

  if diff:


In [25]:
from xgboost.sklearn import XGBClassifier as xgb

xgb = xgb()
xgb.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5,
              random_state=42, scale_pos_weight=5)

xgb.fit(trainX, trainY)

predictXBW = xgb.predict(testX03)
predictprobsXBW = xgb.predict_proba(testX03)

AUC(testY03, predictprobsXBW[:,1])

  if diff:


0.671829769632748

In [26]:
from xgboost.sklearn import XGBClassifier as xgb

xgb = xgb()
xgb.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5,
              random_state=42, scale_pos_weight=3)

xgb.fit(trainX, trainY)

predictXBW = xgb.predict(testX03)
predictprobsXBW = xgb.predict_proba(testX03)

AUC(testY03, predictprobsXBW[:,1])

  if diff:


0.6763980774045293

In [27]:
from xgboost.sklearn import XGBClassifier as xgb

xgb = xgb()
xgb.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5,
              random_state=42, scale_pos_weight=2)

xgb.fit(trainX, trainY)

predictXBW = xgb.predict(testX03)
predictprobsXBW = xgb.predict_proba(testX03)

AUC(testY03, predictprobsXBW[:,1])

  if diff:


0.6766497096045148

In [28]:
from xgboost.sklearn import XGBClassifier as xgb

xgb = xgb()
xgb.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5,
              random_state=42, scale_pos_weight=1.5)

xgb.fit(trainX, trainY)

predictXBW = xgb.predict(testX03)
predictprobsXBW = xgb.predict_proba(testX03)

AUC(testY03, predictprobsXBW[:,1])

  if diff:


0.6796362194556111

In [29]:
from xgboost.sklearn import XGBClassifier as xgb

xgb = xgb()
xgb.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5,
              random_state=42, scale_pos_weight=1.2)

xgb.fit(trainX, trainY)

predictXBW = xgb.predict(testX03)
predictprobsXBW = xgb.predict_proba(testX03)

AUC(testY03, predictprobsXBW[:,1])

  if diff:


0.6789578402689822

In [29]:
from xgboost.sklearn import XGBClassifier as xgb

xgb = xgb()
xgb.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5,
              random_state=42, scale_pos_weight=1)

xgb.fit(trainX, trainY)

predictXBW = xgb.predict(testX03)
predictprobsXBW = xgb.predict_proba(testX03)

AUC(testY03, predictprobsXBW[:,1])

  if diff:


0.6801164864800598

In [31]:
from xgboost.sklearn import XGBClassifier as xgb

xgb = xgb()
xgb.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5,
              random_state=42, scale_pos_weight=0.8)

xgb.fit(trainX, trainY)

predictXBW = xgb.predict(testX03)
predictprobsXBW = xgb.predict_proba(testX03)

AUC(testY03, predictprobsXBW[:,1])

  if diff:


0.6784103291664568

In [32]:
from xgboost.sklearn import XGBClassifier as xgb

xgb = xgb()
xgb.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5,
              random_state=42, scale_pos_weight=0.6)

xgb.fit(trainX, trainY)

predictXBW = xgb.predict(testX03)
predictprobsXBW = xgb.predict_proba(testX03)

AUC(testY03, predictprobsXBW[:,1])

  if diff:


0.6769804378992882

In [33]:
from xgboost.sklearn import XGBClassifier as xgb

xgb = xgb()
xgb.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5,
              random_state=42, scale_pos_weight=0.4)

xgb.fit(trainX, trainY)

predictXBW = xgb.predict(testX03)
predictprobsXBW = xgb.predict_proba(testX03)

AUC(testY03, predictprobsXBW[:,1])

  if diff:


0.6694922444791359

In [34]:
from xgboost.sklearn import XGBClassifier as xgb

xgb = xgb()
xgb.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5,
              random_state=42, scale_pos_weight=0.2)

xgb.fit(trainX, trainY)

predictXBW = xgb.predict(testX03)
predictprobsXBW = xgb.predict_proba(testX03)

AUC(testY03, predictprobsXBW[:,1])

  if diff:


0.65860795237096

In [35]:
from xgboost.sklearn import XGBClassifier as xgb

xgb = xgb()
xgb.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5,
              random_state=42, scale_pos_weight=0.1)

xgb.fit(trainX, trainY)

predictXBW = xgb.predict(testX03)
predictprobsXBW = xgb.predict_proba(testX03)

AUC(testY03, predictprobsXBW[:,1])

  if diff:


0.6508026679137567

In [66]:
#Try 20/80 weights (best choice for logistic regression):
from sklearn.ensemble import RandomForestClassifier as rfc

rfc = rfc()
rfc.set_params(n_estimators=1000, min_samples_split=5, min_samples_leaf=1, max_features='sqrt', 
               max_depth=60, random_state=42, class_weight={0:.2, 1:.8})

rfc.fit(trainXtemp4, trainY)

predictRFW = rfc.predict(testXtemp4)
predictprobsRFW5T = rfc.predict_proba(testXtemp4)

from sklearn.metrics import roc_auc_score as AUC

AUC(testY03, predictprobsRFW5T[:,1])
#0.6707. Let's check if 15/85 improves (it may not):
#Reduced feature list did not improve the RF model

0.6685348222045004

In [67]:
from xgboost.sklearn import XGBClassifier as xgb

xgb = xgb()
xgb.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5,
              random_state=42, scale_pos_weight=1)

xgb.fit(trainXtemp4, trainY)

predictXBW = xgb.predict(testXtemp4)
predictprobsXBWT = xgb.predict_proba(testXtemp4)

AUC(testY03, predictprobsXBWT[:,1])
#Reduced feature list did not improve the XGB model

  if diff:


0.6783329439276378

In [31]:
#These are our targets. Let's make sure we have the correct prediction arrays:

# AUC of 0.66774 for class-weighted logistic regression
# AUC of 0.68011 for parameter-optimized XGBoost
# AUC of 0.67069 for class-weighted hyperparameter-optimized random forest

#let's make sure we have the correct 3 prediction arrays:
print(AUC(testY03, predictprobsXBW[:,1]))
print(AUC(testY03, predictprobsvartest[:,1]))
print(AUC(testY03, predictprobsRFW5[:,1]))

0.6801164864800598
0.6725013797805438
0.6706926578913641


In [None]:
#The target AUC is 0.6603, taken from a 5-k fold of the train data. A number of features, when removed, 

In [32]:
StackDFScores = []
RFPercents = []
LGPercents = []
XGPercents = []

AUC(testY03, predictprobsRFW5[:,1])
AUC(testY03, predictprobsvartest[:,1])
AUC(testY03, predictprobsXBW[:,1])

for i in range(101):
    for j in range(101):
        for k in range(101):
            if i + j + k == 100:
                StackPredict = AUC(testY03, (k*predictprobsRFW5[:,1] + i*predictprobsvartest[:,1] + 
                                j*predictprobsXBW[:,1])/100)
                StackDFScores.append(StackPredict)
                RFPercents.append(k)
                LGPercents.append(i)
                XGPercents.append(j)

StackDF = pd.DataFrame({'Score':pd.Series(StackDFScores), 'LogRegPct':pd.Series(LGPercents),
                       'RFPercents':pd.Series(RFPercents), 'XGPercents':pd.Series(XGPercents)})

In [33]:
StackDF.sort_values('Score', ascending=False)

Unnamed: 0,Score,LogRegPct,RFPercents,XGPercents
1967,0.684145,21,23,56
1887,0.684141,20,23,57
1966,0.684141,21,24,55
1885,0.684140,20,25,55
1886,0.684138,20,24,56
1806,0.684137,19,23,58
1965,0.684136,21,25,54
2046,0.684135,22,23,55
1805,0.684135,19,24,57
2045,0.684135,22,24,54


In [35]:
predictprob = (56*predictprobsXBW[:,1] + 23*predictprobsRFW5[:,1] + 21*predictprobsvartest[:,1])/100


TestValues = pd.DataFrame({'num_lab_procedures':testX03['num_lab_procedures'], 'predict_prob':pd.Series(predictprob)})

In [36]:
TestValues['predict_actual'] = 0
TestValues['predict_actual'][TestValues['predict_prob']>0.5]=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [39]:
AUC(testY03, TestValues['predict_prob'])

0.6841445976784577

In [40]:
TestValues.to_csv('TestPredictions.csv')

In [68]:
trainX.head()

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,med_metformin,...,primarydiag_metabolic,primarydiag_musculoskeletal,primarydiag_neoplasm,primarydiag_nervous,primarydiag_other,primarydiag_pregnancy,primarydiag_respiratory,primarydiag_skin,primarydiag_urogenital,diabchange
0,1,1,41,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,4,2,44,1,16,0,0,0,7,0,...,0,0,0,0,0,0,0,0,0,1
2,5,1,51,0,8,0,0,0,5,0,...,0,0,1,0,0,0,0,0,0,0
3,7,4,70,1,21,0,0,0,7,1,...,0,0,0,0,0,0,0,0,0,0
4,10,12,33,3,18,0,0,0,8,0,...,0,0,0,0,0,0,0,0,0,0


In [70]:
testX03.shape

(20030, 112)

In [44]:
len(predictarray[predictarray>1])

60

In [45]:
len(predictarray[predictarray<0])

8

In [46]:
#We have a predictarray essentially bounded by 0 and 1, with very few exceptions

In [47]:
#We also have an average of approximately 20%
np.mean(predictarray)

0.21584492882744952

In [48]:
predictarrayF = predictarray.copy()

predictarrayF[predictarrayF<0] = 0
predictarrayF[predictarrayF>1] = 1

In [54]:
#What if we shuffled our data to reflect differences in probability of these events occurring?

DiabetesAltered = pd.read_csv('DiabetesTakingMed.csv', index_col=0)
DiabetesAlteredTrain = DiabetesAltered[DiabetesAltered['IsTrain']==1]

In [58]:
DiabetesAlteredTrain['readmittedprob'] = DiabetesAlteredTrain['readmitted'].copy()
DiabetesAlteredTrain.loc[DiabetesAlteredTrain['readmittedprob']==1,'readmittedprob'] = predictarrayF

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [66]:
#Set 'readmitted' column to the correct values (2 to 1, 1 to 0):
DiabetesAlteredTrain['readmitted'] = DiabetesAlteredTrain['readmitted'].replace([2, 1], [1, 0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [61]:
'''Right now, we have all returns <30 days coded as 2; all returns >30 days coded between 0 and 1 based on probability taken
from a linear regression trained on only the never-returning group and the <30 returning group.  We need to save a copy of
this DF because we will adjust the numbers based on how we want to tune them.'''

'Right now, we have all returns <30 days coded as 2; all returns >30 days coded between 0 and 1 based on probability taken\nfrom a linear regression trained on only the never-returning group and the <30 returning group.  We need to save a copy of\nthis DF because we will adjust the numbers based on how we want to tune them.'

In [76]:
#First, make a dataset where 1.0 values from the >30 group get equal weight to the <30 returning group:
DiabetesAlteredTrain100 = DiabetesAlteredTrain.copy()
DiabetesAlteredTrain100['readmittedprob'] = DiabetesAlteredTrain100['readmittedprob'].replace([2], [1])

In [77]:
#Make readmittedprob adjust to the total size of readmitted:
DiabetesAlteredTrain100['readmittedprob'] = (DiabetesAlteredTrain100['readmittedprob'] * 
                                             np.sum(DiabetesAlteredTrain100['readmitted'])/np.sum(DiabetesAlteredTrain100['readmittedprob']))

In [78]:
#This process has forced all positive values to shrink to ~60% probability, which may not be ideal. Let's find out:
DiabetesAlteredTrain100['readmittedprob'].describe()

count    80084.000000
mean         0.113456
std          0.189136
min          0.000000
25%          0.000000
50%          0.000000
75%          0.127133
max          0.597408
Name: readmittedprob, dtype: float64

In [82]:
problist = DiabetesAlteredTrain100['readmittedprob'].copy()

Diabetes100SetOf20 = DiabetesAlteredTrain100.copy()
Diabetes100SetOf20 = Diabetes100SetOf20.drop(['readmittedprob', 'IsTrain'], axis=1)
PredictedYesProbs = np.zeros(20030)

for i in range(20):
    newreadmitted = np.zeros(80084)
    for j in range(80084):
        newreadmitted[j] = np.random.choice([0,1], size=1, p=[1-problist[j], problist[j]])
    Diabetes100SetOf20['readmitted'] = pd.Series(newreadmitted)
    
    temptrainX = Diabetes100SetOf20.drop('readmitted', axis=1)
    temptrainY = Diabetes100SetOf20['readmitted']
    
    from sklearn.linear_model import LogisticRegression as lgr

    lgr = lgr()
    lgr.set_params(C=0.1)

    lgr.fit(temptrainX, temptrainY)

    PredictedYesProbs += lgr.predict_proba(testX03)[:,1]/20

In [83]:
#This is interesting. It worked, in the sense that we got an AUC, and it was only slightly lower than our normal LR AUC (0.66)
#Would toning it down give us a better score? (Assigning 1.0 in the midle category to a value less than the actual <30)
AUC(testY03, PredictedYesProbs)

0.6555740964405543

In [104]:
DiabetesAlteredTrain80 = DiabetesAlteredTrain.copy()
DiabetesAlteredTrain80['readmittedprob'] = 0.8*DiabetesAlteredTrain80['readmittedprob']
DiabetesAlteredTrain80['readmittedprob'] = DiabetesAlteredTrain80['readmittedprob'].replace([1.6], [1])

In [105]:
#Make readmittedprob adjust to the total size of readmitted:
DiabetesAlteredTrain80['readmittedprob'] = (DiabetesAlteredTrain80['readmittedprob'] * 
                                             np.sum(DiabetesAlteredTrain80['readmitted'])/np.sum(DiabetesAlteredTrain80['readmittedprob']))

DiabetesAlteredTrain80['readmittedprob'].describe()
#Now, the original <30's are only penalized down to 0.65

count    80084.000000
mean         0.113456
std          0.202962
min          0.000000
25%          0.000000
50%          0.000000
75%          0.110613
max          0.649722
Name: readmittedprob, dtype: float64

In [106]:
problist = DiabetesAlteredTrain80['readmittedprob'].copy()

Diabetes80SetOf20 = DiabetesAlteredTrain80.copy()
Diabetes80SetOf20 = Diabetes80SetOf20.drop(['readmittedprob', 'IsTrain'], axis=1)
PredictedYesProbs = np.zeros(20030)

for i in range(20):
    newreadmitted = np.zeros(80084)
    for j in range(80084):
        newreadmitted[j] = np.random.choice([0,1], size=1, p=[1-problist[j], problist[j]])
    Diabetes80SetOf20['readmitted'] = pd.Series(newreadmitted)
    
    temptrainX = Diabetes80SetOf20.drop('readmitted', axis=1)
    temptrainY = Diabetes80SetOf20['readmitted']
    
    from sklearn.linear_model import LogisticRegression as lgr

    lgr = lgr()
    lgr.set_params(C=0.1)

    lgr.fit(temptrainX, temptrainY)

    PredictedYesProbs += lgr.predict_proba(testX03)[:,1]/20

In [107]:
#Giving 80% of the value is still not ideal
AUC(testY03, PredictedYesProbs)

0.6571373724857514

In [108]:
DiabetesAlteredTrain60 = DiabetesAlteredTrain.copy()
DiabetesAlteredTrain60['readmittedprob'] = 0.6*DiabetesAlteredTrain60['readmittedprob']
DiabetesAlteredTrain60['readmittedprob'] = DiabetesAlteredTrain60['readmittedprob'].replace([1.2], [1])

#Make readmittedprob adjust to the total size of readmitted:
DiabetesAlteredTrain60['readmittedprob'] = (DiabetesAlteredTrain60['readmittedprob'] * 
                                             np.sum(DiabetesAlteredTrain60['readmitted'])/np.sum(DiabetesAlteredTrain60['readmittedprob']))

DiabetesAlteredTrain60['readmittedprob'].describe()
#At 60%, 1.0 penalized down to 71%

count    80084.000000
mean         0.113456
std          0.220967
min          0.000000
25%          0.000000
50%          0.000000
75%          0.090922
max          0.712079
Name: readmittedprob, dtype: float64

In [113]:
problist = DiabetesAlteredTrain60['readmittedprob'].copy()

Diabetes60SetOf20 = DiabetesAlteredTrain60.copy()
Diabetes60SetOf20 = Diabetes60SetOf20.drop(['readmittedprob', 'IsTrain'], axis=1)
PredictedYesProbs = np.zeros(20030)

for i in range(20):
    newreadmitted = np.zeros(80084)
    for j in range(80084):
        newreadmitted[j] = np.random.choice([0,1], size=1, p=[1-problist[j], problist[j]])
    Diabetes60SetOf20['readmitted'] = pd.Series(newreadmitted)
    
    temptrainX = Diabetes60SetOf20.drop('readmitted', axis=1)
    temptrainY = Diabetes60SetOf20['readmitted']
    
    from sklearn.linear_model import LogisticRegression as lgr

    lgr = lgr()
    lgr.set_params(C=0.1)

    lgr.fit(temptrainX, temptrainY)

    PredictedYesProbs += lgr.predict_proba(testX03)[:,1]/20
    
AUC(testY03, PredictedYesProbs)

0.6593965453949968

In [114]:
DiabetesAlteredTrain50 = DiabetesAlteredTrain.copy()
DiabetesAlteredTrain50['readmittedprob'] = 0.5*DiabetesAlteredTrain50['readmittedprob']
DiabetesAlteredTrain50['readmittedprob'] = DiabetesAlteredTrain50['readmittedprob'].replace([1.0], [1])

#Make readmittedprob adjust to the total size of readmitted:
DiabetesAlteredTrain50['readmittedprob'] = (DiabetesAlteredTrain50['readmittedprob'] * 
                                             np.sum(DiabetesAlteredTrain50['readmitted'])/np.sum(DiabetesAlteredTrain50['readmittedprob']))

DiabetesAlteredTrain50['readmittedprob'].describe()
#At 50%, 1.0 only penalized to 0.75

count    80084.000000
mean         0.113456
std          0.231943
min          0.000000
25%          0.000000
50%          0.000000
75%          0.079587
max          0.747971
Name: readmittedprob, dtype: float64

In [115]:
problist = DiabetesAlteredTrain50['readmittedprob'].copy()

Diabetes50SetOf20 = DiabetesAlteredTrain50.copy()
Diabetes50SetOf20 = Diabetes50SetOf20.drop(['readmittedprob', 'IsTrain'], axis=1)
PredictedYesProbs = np.zeros(20030)

for i in range(20):
    newreadmitted = np.zeros(80084)
    for j in range(80084):
        newreadmitted[j] = np.random.choice([0,1], size=1, p=[1-problist[j], problist[j]])
    Diabetes50SetOf20['readmitted'] = pd.Series(newreadmitted)
    
    temptrainX = Diabetes50SetOf20.drop('readmitted', axis=1)
    temptrainY = Diabetes50SetOf20['readmitted']
    
    from sklearn.linear_model import LogisticRegression as lgr

    lgr = lgr()
    lgr.set_params(C=0.1)

    lgr.fit(temptrainX, temptrainY)

    PredictedYesProbs += lgr.predict_proba(testX03)[:,1]/20
    
AUC(testY03, PredictedYesProbs)

0.6599108683712296

In [116]:
DiabetesAlteredTrain40 = DiabetesAlteredTrain.copy()
DiabetesAlteredTrain40['readmittedprob'] = 0.4*DiabetesAlteredTrain40['readmittedprob']
DiabetesAlteredTrain40['readmittedprob'] = DiabetesAlteredTrain40['readmittedprob'].replace([0.8], [1])

#Make readmittedprob adjust to the total size of readmitted:
DiabetesAlteredTrain40['readmittedprob'] = (DiabetesAlteredTrain40['readmittedprob'] * 
                                             np.sum(DiabetesAlteredTrain40['readmitted'])/np.sum(DiabetesAlteredTrain40['readmittedprob']))

DiabetesAlteredTrain40['readmittedprob'].describe()

count    80084.000000
mean         0.113456
std          0.244515
min          0.000000
25%          0.000000
50%          0.000000
75%          0.067050
max          0.787675
Name: readmittedprob, dtype: float64

In [118]:
problist = DiabetesAlteredTrain40['readmittedprob'].copy()

Diabetes40SetOf20 = DiabetesAlteredTrain40.copy()
Diabetes40SetOf20 = Diabetes40SetOf20.drop(['readmittedprob', 'IsTrain'], axis=1)
PredictedYesProbs = np.zeros(20030)

for i in range(20):
    newreadmitted = np.zeros(80084)
    for j in range(80084):
        newreadmitted[j] = np.random.choice([0,1], size=1, p=[1-problist[j], problist[j]])
    Diabetes40SetOf20['readmitted'] = pd.Series(newreadmitted)
    
    temptrainX = Diabetes40SetOf20.drop('readmitted', axis=1)
    temptrainY = Diabetes40SetOf20['readmitted']
    
    from sklearn.linear_model import LogisticRegression as lgr

    lgr = lgr()
    lgr.set_params(C=0.1)

    lgr.fit(temptrainX, temptrainY)

    PredictedYesProbs += lgr.predict_proba(testX03)[:,1]/20
    
AUC(testY03, PredictedYesProbs)

0.6609319766394272

In [123]:
DiabetesAlteredTrain30 = DiabetesAlteredTrain.copy()
DiabetesAlteredTrain30['readmittedprob'] = 0.3*DiabetesAlteredTrain30['readmittedprob']
DiabetesAlteredTrain30['readmittedprob'] = DiabetesAlteredTrain30['readmittedprob'].replace([0.6], [1])

#Make readmittedprob adjust to the total size of readmitted:
DiabetesAlteredTrain30['readmittedprob'] = (DiabetesAlteredTrain30['readmittedprob'] * 
                                             np.sum(DiabetesAlteredTrain30['readmitted'])/np.sum(DiabetesAlteredTrain30['readmittedprob']))

DiabetesAlteredTrain30['readmittedprob'].describe()

count    80084.000000
mean         0.113456
std          0.258951
min          0.000000
25%          0.000000
50%          0.000000
75%          0.053106
max          0.831829
Name: readmittedprob, dtype: float64

In [124]:
problist = DiabetesAlteredTrain30['readmittedprob'].copy()

Diabetes30SetOf20 = DiabetesAlteredTrain30.copy()
Diabetes30SetOf20 = Diabetes30SetOf20.drop(['readmittedprob', 'IsTrain'], axis=1)
PredictedYesProbs = np.zeros(20030)

for i in range(20):
    newreadmitted = np.zeros(80084)
    for j in range(80084):
        newreadmitted[j] = np.random.choice([0,1], size=1, p=[1-problist[j], problist[j]])
    Diabetes30SetOf20['readmitted'] = pd.Series(newreadmitted)
    
    temptrainX = Diabetes30SetOf20.drop('readmitted', axis=1)
    temptrainY = Diabetes30SetOf20['readmitted']
    
    from sklearn.linear_model import LogisticRegression as lgr

    lgr = lgr()
    lgr.set_params(C=0.1)

    lgr.fit(temptrainX, temptrainY)

    PredictedYesProbs += lgr.predict_proba(testX03)[:,1]/20
    
AUC(testY03, PredictedYesProbs)

0.6618649634277263

In [125]:
DiabetesAlteredTrain20 = DiabetesAlteredTrain.copy()
DiabetesAlteredTrain20['readmittedprob'] = 0.2*DiabetesAlteredTrain20['readmittedprob']
DiabetesAlteredTrain20['readmittedprob'] = DiabetesAlteredTrain20['readmittedprob'].replace([0.4], [1])

#Make readmittedprob adjust to the total size of readmitted:
DiabetesAlteredTrain20['readmittedprob'] = (DiabetesAlteredTrain20['readmittedprob'] * 
                                             np.sum(DiabetesAlteredTrain20['readmitted'])/np.sum(DiabetesAlteredTrain20['readmittedprob']))

DiabetesAlteredTrain20['readmittedprob'].describe()

count    80084.000000
mean         0.113456
std          0.275579
min          0.000000
25%          0.000000
50%          0.000000
75%          0.037507
max          0.881228
Name: readmittedprob, dtype: float64

In [127]:
problist = DiabetesAlteredTrain20['readmittedprob'].copy()

Diabetes20SetOf20 = DiabetesAlteredTrain20.copy()
Diabetes20SetOf20 = Diabetes20SetOf20.drop(['readmittedprob', 'IsTrain'], axis=1)
PredictedYesProbs = np.zeros(20030)

for i in range(20):
    newreadmitted = np.zeros(80084)
    for j in range(80084):
        newreadmitted[j] = np.random.choice([0,1], size=1, p=[1-problist[j], problist[j]])
    Diabetes20SetOf20['readmitted'] = pd.Series(newreadmitted)
    
    temptrainX = Diabetes20SetOf20.drop('readmitted', axis=1)
    temptrainY = Diabetes20SetOf20['readmitted']
    
    from sklearn.linear_model import LogisticRegression as lgr

    lgr = lgr()
    lgr.set_params(C=0.1)

    lgr.fit(temptrainX, temptrainY)

    PredictedYesProbs += lgr.predict_proba(testX03)[:,1]/20
    
AUC(testY03, PredictedYesProbs)

0.6627061590740491

In [129]:
#Let's try a 40% XGB and see if that is any better than a base 0% xgb:

from xgboost.sklearn import XGBClassifier as xgb2

problist = DiabetesAlteredTrain40['readmittedprob'].copy()

Diabetes40SetOf20 = DiabetesAlteredTrain40.copy()
Diabetes40SetOf20 = Diabetes40SetOf20.drop(['readmittedprob', 'IsTrain'], axis=1)
PredictedYesProbs = np.zeros(20030)

for i in range(20):
    newreadmitted = np.zeros(80084)
    for j in range(80084):
        newreadmitted[j] = np.random.choice([0,1], size=1, p=[1-problist[j], problist[j]])
    Diabetes40SetOf20['readmitted'] = pd.Series(newreadmitted)
    
    temptrainX = Diabetes40SetOf20.drop('readmitted', axis=1)
    temptrainY = Diabetes40SetOf20['readmitted']
    
    xgb = xgb2()
    xgb.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5)

    xgb.fit(temptrainX, temptrainY)

    PredictedYesProbs += xgb.predict_proba(testX03)[:,1]/20
    
AUC(testY03, PredictedYesProbs)

0.6745317195052136

In [130]:
#Let's try a 20% XGB and see if that is any better than a base 0% xgb:

from xgboost.sklearn import XGBClassifier as xgb2

problist = DiabetesAlteredTrain20['readmittedprob'].copy()

Diabetes20SetOf20 = DiabetesAlteredTrain20.copy()
Diabetes20SetOf20 = Diabetes20SetOf20.drop(['readmittedprob', 'IsTrain'], axis=1)
PredictedYesProbs = np.zeros(20030)

for i in range(20):
    newreadmitted = np.zeros(80084)
    for j in range(80084):
        newreadmitted[j] = np.random.choice([0,1], size=1, p=[1-problist[j], problist[j]])
    Diabetes40SetOf20['readmitted'] = pd.Series(newreadmitted)
    
    temptrainX = Diabetes20SetOf20.drop('readmitted', axis=1)
    temptrainY = Diabetes20SetOf20['readmitted']
    
    xgb = xgb2()
    xgb.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5)

    xgb.fit(temptrainX, temptrainY)

    PredictedYesProbs += xgb.predict_proba(testX03)[:,1]/20
    
AUC(testY03, PredictedYesProbs)

0.6753665800254492

In [158]:
devontrain = pd.read_csv('db_train.csv')
devontest = pd.read_csv('db_test.csv')

devontrain['IsTrain'] = 1
devontest['IsTrain'] = 0

devonDF = pd.concat([devontrain, devontest], axis=0)

In [160]:
devonDF = devonDF.astype('str')

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for item in ['race', 'gender', 'age', 'A1Cresult', 'metformin', 'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 
             'insulin', 'change', 'diabetesMed', 'diag_1_cat', 'diag_Circulatory', 'diag_Diabetes', 'diag_Digestive', 
             'diag_Genitourinary', 'diag_Injury', 'diag_Musculoskeletal', 'diag_Neoplasms', 'diag_Other', 
             'diag_Respiratory', 'admission_type_id_cat', 'discharge_disposition_id_cat', 'admission_source_id_cat', 
             'glimepiride.bi']:
    

    devonDF[item] = le.fit_transform(devonDF[item])

In [162]:
for var in list(devonDF.columns):
    devonDF[var] = pd.to_numeric(devonDF[var])

In [172]:
devontrainX = devonDF[devonDF['IsTrain']==1].copy().drop('readmitted', axis=1)
devontrainY = devonDF[devonDF['IsTrain']==1].copy()['readmitted']
devontrainY[devontrainY<0] = 0
devontrainY[devontrainY>0] = 1
devontestX = devonDF[devonDF['IsTrain']==0].copy().drop('readmitted', axis=1)
devontestY = devonDF[devonDF['IsTrain']==0].copy()['readmitted']
devontestY[devontestY<0] = 0
devontestY[devontestY>0] = 1

from xgboost.sklearn import XGBClassifier as xgb

xgb2 = xgb()
xgb2.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5)

xgb2.fit(devontrainX, devontrainY)

predictDXB = xgb2.predict(devontestX)
predictprobsDXB = xgb2.predict_proba(devontestX)

  if diff:


In [174]:
AUC(devontestY, predictprobsDXB[:,1])

0.6785622482174802

In [176]:
from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.1)

lgr.fit(devontrainX, devontrainY)

predicttestprobsLGRD = lgr.predict_proba(devontestX)

AUC(devontestY, predicttestprobsLGRD[:,1])

0.6576307436751204

In [177]:
from sklearn.ensemble import RandomForestClassifier as rfc

rfc = rfc()
rfc.set_params(n_estimators=1000, min_samples_split=5, min_samples_leaf=1, max_features='sqrt', max_depth=60)

rfc.fit(devontrainX, devontrainY)

predicttestprobsRFD = rfc.predict_proba(devontestX)

AUC(devontestY, predicttestprobsRFD[:,1])

Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x0000021893225710>>
Traceback (most recent call last):
  File "C:\Users\Dave\Anaconda3\lib\site-packages\xgboost\core.py", line 482, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'


0.6602773262812346

In [179]:
###Feature names mismatch with Tim's data
timtrain = pd.read_csv('../Tim/multi_tim_12.1.csv')
timtest = pd.read_csv('../Tim/test_tim_12.5.csv')

timtrainX = timtrain.drop('readmitted_<30', axis=1)
timtrainY = timtrain['readmitted_<30']

timtestX = timtest.drop(['readmitted_>30', 'readmitted_<30'], axis=1)
timtestY = timtest['readmitted_<30']

from xgboost.sklearn import XGBClassifier as xgb

xgb2 = xgb()
xgb2.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5)

xgb2.fit(timtrainX, timtrainY)

predictTXB = xgb2.predict(timtestX)
predictprobsTXB = xgb2.predict_proba(timtestX)

ValueError: feature_names mismatch: ['Unnamed: 0', 'age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'race_Caucasian', 'race_AfricanAmerican', 'race_Other', 'race_Asian', 'race_Hispanic', 'gender_Male', 'gender_Unknown/Invalid', 'admission_type_id_Unknown', 'admission_type_id_Emergency', 'admission_type_id_Urgent', 'admission_type_id_Elective', 'discharge_disposition_id_Unknown', 'discharge_disposition_id_Discharged to home', 'discharge_disposition_id_Discharged/transferred to SNF', 'discharge_disposition_id_Discharged/transferred to home with home health service', 'discharge_disposition_id_Discharged/transferred to another short term hospital', 'discharge_disposition_id_Discharged/transferred to another type of inpatient care institution', 'discharge_disposition_id_Discharged/transferred to ICF', 'discharge_disposition_id_Discharged/transferred/referred to this institution for outpatient services', 'discharge_disposition_id_Discharged/transferred to another rehab fac including rehab units of a hospital.', 'discharge_disposition_id_Discharged/transferred/referred to a psychiatric hospital of psychiatric distinct part unit of a hospital', 'admission_source_id_Physician Referral', 'admission_source_id_Emergency Room', 'admission_source_id_Clinic Referral', 'admission_source_id_Transfer from a hospital', 'admission_source_id_Unknown', 'admission_source_id_Transfer from another health care facility', 'admission_source_id_HMO Referral', 'admission_source_id_Court/Law Enforcement', 'medical_specialty_InternalMedicine', 'medical_specialty_Family/GeneralPractice', 'medical_specialty_Cardiology', 'medical_specialty_Surgery-General', 'medical_specialty_Surgery-Cardiovascular/Thoracic', 'medical_specialty_Nephrology', 'medical_specialty_Orthopedics-Reconstructive', 'medical_specialty_Gastroenterology', 'medical_specialty_Psychiatry', 'medical_specialty_Pulmonology', 'medical_specialty_Emergency/Trauma', 'medical_specialty_Pediatrics-CriticalCare', 'medical_specialty_Surgery-Plastic', 'medical_specialty_Orthopedics', 'medical_specialty_Radiologist', 'diag_1_Diabetes', 'diag_1_Neoplasms', 'diag_1_Circulatory', 'diag_1_Respiratory', 'diag_1_Injury', 'diag_1_Musculoskeletal', 'diag_1_Digestive', 'diag_1_Genitourinary', 'diag_1_Mental Disorders', 'diag_2_Diabetes', 'diag_2_Neoplasms', 'diag_2_Circulatory', 'diag_2_Respiratory', 'diag_2_Injury', 'diag_2_Musculoskeletal', 'diag_2_Genitourinary', 'diag_2_Digestive', 'diag_2_Mental Disorders', 'diag_3_Neoplasms', 'diag_3_Circulatory', 'diag_3_Diabetes', 'diag_3_Respiratory', 'diag_3_Genitourinary', 'diag_3_Injury', 'diag_3_Musculoskeletal', 'diag_3_Digestive', 'max_glu_serum_None', 'max_glu_serum_>300', 'max_glu_serum_Norm', 'max_glu_serum_>200', 'A1Cresult_>7', 'A1Cresult_>8', 'A1Cresult_Norm', 'metformin_Steady', 'metformin_Up', 'metformin_Down', 'repaglinide_Steady', 'repaglinide_Down', 'repaglinide_Up', 'glimepiride_Steady', 'glimepiride_Down', 'glimepiride_Up', 'glipizide_Steady', 'glipizide_Up', 'glipizide_Down', 'glyburide_Steady', 'glyburide_Up', 'glyburide_Down', 'pioglitazone_Up', 'pioglitazone_Steady', 'pioglitazone_Down', 'rosiglitazone_Steady', 'rosiglitazone_Up', 'rosiglitazone_Down', 'insulin_Up', 'insulin_Steady', 'insulin_Down', 'change_Ch', 'diabetesMed_Yes'] ['Unnamed: 0', 'age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'race_AfricanAmerican', 'race_Caucasian', 'race_Hispanic', 'race_Asian', 'gender_Male', 'admission_type_id_Emergency', 'admission_type_id_Urgent', 'admission_type_id_Elective', 'admission_type_id_Unknown', 'admission_type_id_Trauma Center', 'discharge_disposition_id_Discharged to home', 'discharge_disposition_id_Discharged/transferred to SNF', 'discharge_disposition_id_Discharged/transferred to another type of inpatient care institution', 'discharge_disposition_id_Discharged/transferred to another short term hospital', 'discharge_disposition_id_Discharged/transferred to home with home health service', 'discharge_disposition_id_Unknown', 'discharge_disposition_id_Left AMA', 'discharge_disposition_id_Discharged/transferred to ICF', 'discharge_disposition_id_Discharged/transferred to home under care of Home IV provider', 'discharge_disposition_id_Hospice / medical facility', 'discharge_disposition_id_Hospice / home', 'discharge_disposition_id_Discharged/transferred to another rehab fac including rehab units of a hospital.', 'discharge_disposition_id_Admitted as an inpatient to this hospital', 'discharge_disposition_id_Discharged/transferred to a long term care hospital.', 'discharge_disposition_id_Discharged/transferred/referred another institution for outpatient services', 'discharge_disposition_id_Discharged/transferred/referred to this institution for outpatient services', 'discharge_disposition_id_Discharged/transferred within this institution to Medicare approved swing bed', 'discharge_disposition_id_Discharged/transferred/referred to a psychiatric hospital of psychiatric distinct part unit of a hospital', 'discharge_disposition_id_Discharged/transferred to a federal health care facility.', 'discharge_disposition_id_Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare.', 'admission_source_id_Emergency Room', 'admission_source_id_Transfer from a hospital', 'admission_source_id_Clinic Referral', 'admission_source_id_Physician Referral', 'admission_source_id_Unknown', 'admission_source_id_Transfer from a Skilled Nursing Facility (SNF)', 'admission_source_id_HMO Referral', 'admission_source_id_Transfer from another health care facility', 'admission_source_id_Extramural Birth', 'admission_source_id_Transfer from critical access hospital', 'admission_source_id_Court/Law Enforcement', 'admission_source_id_Transfer from hospital inpt/same fac reslt in a sep claim', 'admission_source_id_Transfer from Ambulatory Surgery Center', 'medical_specialty_Family/GeneralPractice', 'medical_specialty_Cardiology', 'medical_specialty_Orthopedics', 'medical_specialty_Gastroenterology', 'medical_specialty_InternalMedicine', 'medical_specialty_Emergency/Trauma', 'medical_specialty_Surgery-General', 'medical_specialty_Psychiatry', 'medical_specialty_Orthopedics-Reconstructive', 'medical_specialty_ObstetricsandGynecology', 'medical_specialty_Surgery-Neuro', 'medical_specialty_Hematology/Oncology', 'medical_specialty_Pulmonology', 'medical_specialty_Surgery-Cardiovascular/Thoracic', 'medical_specialty_Nephrology', 'medical_specialty_Pediatrics-CriticalCare', 'medical_specialty_Endocrinology', 'medical_specialty_Pediatrics-Endocrinology', 'medical_specialty_Urology', 'medical_specialty_Radiology', 'medical_specialty_Anesthesiology-Pediatric', 'medical_specialty_Psychiatry-Child/Adolescent', 'medical_specialty_Pediatrics', 'medical_specialty_Pediatrics-Pulmonology', 'medical_specialty_Gynecology', 'medical_specialty_Obsterics&Gynecology-GynecologicOnco', 'medical_specialty_Surgery-Plastic', 'medical_specialty_Otolaryngology', 'medical_specialty_PhysicalMedicineandRehabilitation', 'medical_specialty_InfectiousDiseases', 'medical_specialty_Neurology', 'medical_specialty_Oncology', 'medical_specialty_Psychology', 'medical_specialty_Pediatrics-Hematology-Oncology', 'medical_specialty_Rheumatology', 'medical_specialty_Surgery-Colon&Rectal', 'medical_specialty_Dentistry', 'medical_specialty_Pediatrics-AllergyandImmunology', 'medical_specialty_Pediatrics-Neurology', 'medical_specialty_AllergyandImmunology', 'medical_specialty_Surgery-Vascular', 'medical_specialty_Surgery-Thoracic', 'medical_specialty_Surgery-Pediatric', 'medical_specialty_Surgery-Cardiovascular', 'medical_specialty_Osteopath', 'medical_specialty_Anesthesiology', 'medical_specialty_Podiatry', 'medical_specialty_Surgeon', 'medical_specialty_Hematology', 'medical_specialty_Obstetrics', 'medical_specialty_Surgery-Maxillofacial', 'medical_specialty_Radiologist', 'medical_specialty_Speech', 'medical_specialty_Pathology', 'medical_specialty_Ophthalmology', 'medical_specialty_Cardiology-Pediatric', 'medical_specialty_SurgicalSpecialty', 'medical_specialty_Hospitalist', 'medical_specialty_PhysicianNotFound', 'medical_specialty_OutreachServices', 'diag_1_Circulatory', 'diag_1_Diabetes', 'diag_1_Neoplasms', 'diag_1_Respiratory', 'diag_1_Injury', 'diag_1_Musculoskeletal', 'diag_1_Digestive', 'diag_1_Genitourinary', 'diag_1_Mental Disorders', 'diag_2_Diabetes', 'diag_2_Circulatory', 'diag_2_Respiratory', 'diag_2_Neoplasms', 'diag_2_Digestive', 'diag_2_Injury', 'diag_2_Mental Disorders', 'diag_2_Genitourinary', 'diag_2_Musculoskeletal', 'diag_3_Injury', 'diag_3_Diabetes', 'diag_3_Circulatory', 'diag_3_Neoplasms', 'diag_3_Digestive', 'diag_3_Respiratory', 'diag_3_Musculoskeletal', 'diag_3_Genitourinary', 'max_glu_serum_None', 'max_glu_serum_>300', 'max_glu_serum_>200', 'max_glu_serum_Norm', 'A1Cresult_Norm', 'A1Cresult_>8', 'A1Cresult_>7', 'metformin_Steady', 'metformin_Up', 'metformin_Down', 'repaglinide_Up', 'repaglinide_Steady', 'repaglinide_Down', 'glimepiride_Steady', 'glimepiride_Up', 'glimepiride_Down', 'glipizide_Steady', 'glipizide_Up', 'glipizide_Down', 'glyburide_Steady', 'glyburide_Up', 'glyburide_Down', 'pioglitazone_Steady', 'pioglitazone_Down', 'pioglitazone_Up', 'rosiglitazone_Steady', 'rosiglitazone_Down', 'rosiglitazone_Up', 'insulin_Steady', 'insulin_Down', 'insulin_Up', 'change_Ch', 'diabetesMed_Yes']
expected gender_Unknown/Invalid, race_Other in input data
training data did not have the following fields: medical_specialty_Surgery-Cardiovascular, medical_specialty_Rheumatology, admission_source_id_Transfer from critical access hospital, medical_specialty_Pediatrics-Endocrinology, medical_specialty_Psychiatry-Child/Adolescent, medical_specialty_Pediatrics-Neurology, medical_specialty_Surgeon, medical_specialty_Cardiology-Pediatric, discharge_disposition_id_Left AMA, discharge_disposition_id_Discharged/transferred to home under care of Home IV provider, medical_specialty_Urology, discharge_disposition_id_Discharged/transferred to a federal health care facility., medical_specialty_Ophthalmology, medical_specialty_Surgery-Maxillofacial, medical_specialty_Anesthesiology, medical_specialty_Pediatrics, medical_specialty_Surgery-Neuro, discharge_disposition_id_Discharged/transferred/referred another institution for outpatient services, admission_source_id_Transfer from hospital inpt/same fac reslt in a sep claim, medical_specialty_Pediatrics-Hematology-Oncology, admission_source_id_Transfer from a Skilled Nursing Facility (SNF), discharge_disposition_id_Hospice / home, admission_source_id_Extramural Birth, medical_specialty_Obsterics&Gynecology-GynecologicOnco, medical_specialty_Endocrinology, admission_type_id_Trauma Center, discharge_disposition_id_Hospice / medical facility, medical_specialty_Surgery-Vascular, medical_specialty_AllergyandImmunology, discharge_disposition_id_Discharged/transferred within this institution to Medicare approved swing bed, medical_specialty_Osteopath, medical_specialty_Hospitalist, medical_specialty_Pathology, medical_specialty_Gynecology, medical_specialty_OutreachServices, medical_specialty_ObstetricsandGynecology, medical_specialty_Pediatrics-AllergyandImmunology, medical_specialty_Oncology, medical_specialty_Speech, medical_specialty_Surgery-Pediatric, medical_specialty_InfectiousDiseases, medical_specialty_Radiology, medical_specialty_PhysicalMedicineandRehabilitation, discharge_disposition_id_Admitted as an inpatient to this hospital, medical_specialty_Surgery-Thoracic, discharge_disposition_id_Discharged/transferred to a long term care hospital., medical_specialty_Psychology, medical_specialty_PhysicianNotFound, medical_specialty_Otolaryngology, medical_specialty_Hematology, medical_specialty_SurgicalSpecialty, medical_specialty_Podiatry, medical_specialty_Anesthesiology-Pediatric, admission_source_id_Transfer from Ambulatory Surgery Center, medical_specialty_Hematology/Oncology, medical_specialty_Pediatrics-Pulmonology, medical_specialty_Neurology, medical_specialty_Surgery-Colon&Rectal, medical_specialty_Obstetrics, medical_specialty_Dentistry, discharge_disposition_id_Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare.

In [183]:
###Feature names mismatch with Anisha's data; column name issues
anishatrain = pd.read_csv('../anisha/dataset_diabetes/new_train.csv', index_col=0)
anishatest = pd.read_csv('../anisha/dataset_diabetes/new_test.csv', index_col=0)

anishatrainX = anishatrain.drop('readmitted_Yes', axis=1)
anishatrainY = anishatrain['readmitted_Yes']

anishatestX = anishatest.drop('readmitted_Yes', axis=1)
anishatestY = anishatest['readmitted_Yes']

from xgboost.sklearn import XGBClassifier as xgb

xgb2 = xgb()
xgb2.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5)

xgb2.fit(anishatrainX, anishatrainY)

predictAXB = xgb2.predict(anishatestX)
predictprobsAXB = xgb2.predict_proba(anishatestX)

ValueError: feature_names may not contain [, ] or <

In [184]:
anishacols = list(anishatrain.columns)

In [195]:
anishacols[9] = 'Age2'
anishacols[10] = 'Age3'
anishacols[11] = 'Age4'
anishacols[12] = 'Age5'
anishacols[13] = 'Age6'
anishacols[14] = 'Age7'
anishacols[15] = 'Age8'
anishacols[16] = 'Age9'

In [196]:
anishatrain.columns = anishacols
anishatest.columns = anishacols

In [198]:
anishatrainX = anishatrain.drop('readmitted_Yes', axis=1)
anishatrainX = anishatrainX.drop('encounter_id', axis=1)
anishatrainY = anishatrain['readmitted_Yes']

anishatestX = anishatest.drop('readmitted_Yes', axis=1)
anishatestX = anishatestX.drop('encounter_id', axis=1)
anishatestY = anishatest['readmitted_Yes']

from xgboost.sklearn import XGBClassifier as xgb

xgb2 = xgb()
xgb2.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5)

xgb2.fit(anishatrainX, anishatrainY)

predictAXB = xgb2.predict(anishatestX)
predictprobsAXB = xgb2.predict_proba(anishatestX)

  if diff:


In [200]:
AUC(anishatestY, predictprobsAXB[:,1])

0.675882142752384

In [2]:
#Try importing the "combined" DF with inputs from multiple team members
DiabetesTakingMedF = pd.read_csv('DiabetesTakingMedF.csv', index_col=0)

DiabetesTrainF = DiabetesTakingMedF[DiabetesTakingMedF['IsTrain']==1].drop('IsTrain', axis=1)
DiabetesTrainF.index = list(range(len(DiabetesTrainF)))

DiabetesTestF = DiabetesTakingMedF[DiabetesTakingMedF['IsTrain']==0].drop('IsTrain', axis=1)
DiabetesTestF.index = list(range(len(DiabetesTestF)))

In [5]:
DiabetesTrainLMF = DiabetesTrainF[DiabetesTrainF['readmitted']!=1]
trainXF = DiabetesTrainLMF.drop('readmitted', axis=1)
trainYF = DiabetesTrainLMF['readmitted'].replace([2], [1])

testXF = DiabetesTestF.drop('readmitted', axis=1)
testYF = DiabetesTestF['readmitted'].replace([2], [1])

from sklearn.linear_model import LinearRegression as lm

lm = lm()

lm.fit(trainXF, trainYF)

middleFdf = DiabetesTrainF[DiabetesTrainF['readmitted']==1]
middleFdfX = middleFdf.drop('readmitted', axis=1)
middleFdfY = middleFdf['readmitted']

predictarray = lm.predict(middleFdfX)

In [9]:
#Cutting out intermediate values with linear score >0.75 (close to the <30 group) helps the analysis (above). Let's do that:
middleFdf75 = middleFdf.loc[predictarray<0.75]

In [10]:
train75F = pd.concat([DiabetesTrainLMF, middleFdf75], axis=0)
train75F.index = list(range(len(train75F)))

In [11]:
#Let's illustrate again that training with the shaved group is better than using all entries:
#This is all entries:

trainXFALL = DiabetesTrainF.drop('readmitted', axis=1)
trainYFALL = DiabetesTrainF['readmitted'].replace([2, 1], [1, 0])

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.1)

lgr.fit(trainXFALL, trainYFALL)

predicttestprobsLRFALL = lgr.predict_proba(testXF)

In [12]:
from sklearn.metrics import roc_auc_score as AUC

AUC(testYF, predicttestprobsLRFALL[:,1])

0.6330516932440414

In [13]:
#Let's illustrate again that training with the shaved group is better than using all entries:
#This is all entries:

trainXF75 = train75F.drop('readmitted', axis=1)
trainYF75 = train75F['readmitted'].replace([2, 1], [1, 0])

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.1)

lgr.fit(trainXF75, trainYF75)

predicttestprobsLRF75 = lgr.predict_proba(testXF)

In [15]:
# The score is better; but the large size of this new DF suggests we need to trim features (at least for linear). Let's do that:

from sklearn.metrics import roc_auc_score as AUC

AUC(testYF, predicttestprobsLRF75[:,1])

0.6371546693093887

In [18]:
#Function to trim DF (do twice and collect common features):
'''import statsmodels.api as sm 
Run1 = FindLowestAICNonLogBackward(train75F, 'readmitted')'''

med_nateglinide removed: New AIC = 159022.97340646473
med_chlorpropamide removed: New AIC = 159020.97354285518
diag_infection removed: New AIC = 159019.03224256058
diabfeat_hyperosmolarity removed: New AIC = 159017.18859571678
med_tolbutamide removed: New AIC = 159015.4778184043
admission_type_id_3 removed: New AIC = 159014.41605810364
medical_specialty_Psychiatry removed: New AIC = 159014.07179113367
medical_specialty_Pulmonology removed: New AIC = 159012.7247548651
med_glipizide.metformin removed: New AIC = 159010.8726569487
diag_blooddis removed: New AIC = 159008.94804254323
diag_musculoskeletal removed: New AIC = 159007.20981926465
primarydiag_mentaldis removed: New AIC = 159005.21272287925
primarydiag_pregnancy removed: New AIC = 159003.30533395364
diag_nervous removed: New AIC = 159002.4152113428
medical_specialty_Radiologist removed: New AIC = 159000.94486805185
medical_specialty_Urology removed: New AIC = 158999.0876989959
diag_digestive removed: New AIC = 158998.99626857534
me

In [19]:
#Run2 = FindLowestAICNonLogBackward(train75F, 'readmitted')

primarydiag_nervous removed: New AIC = 159024.31445527272
admission_type_id_4 removed: New AIC = 159022.61935314458
diag_injury removed: New AIC = 159020.71663459612
medical_specialty_Other removed: New AIC = 159019.21176693996
med_repaglinide removed: New AIC = 159018.96491419908
diag_digestive removed: New AIC = 159018.6084849541
primarydiag_blooddis removed: New AIC = 159016.91388182575
medical_specialty_Urology removed: New AIC = 159015.02156218514
admission_type_id_3 removed: New AIC = 159013.93904026374
diag_blooddis removed: New AIC = 159012.1320679473
medical_specialty_Radiologist removed: New AIC = 159010.66827630345
diag_pregnancy removed: New AIC = 159009.04142420294
medical_specialty_Emergency/Trauma removed: New AIC = 159007.51470509195
med_miglitol removed: New AIC = 159006.49139291834
diag_mentaldis removed: New AIC = 159005.26901764824
med_chlorpropamide removed: New AIC = 159003.26962715155
max_glu_serum_>200 removed: New AIC = 159001.27088966971
primarydiag_mentaldis 

In [22]:
#We now create the joint list and print it, and recreate it (can then comment this out later)
FeatureList = list(set(list(Run1[0].columns)) & set(list(Run2[0].columns)))
FeatureList

['diag_neoplasm',
 'diag_urogenital',
 'medical_specialty_Nephrology',
 'primarydiag_other',
 'number_inpatient',
 'med_insulin',
 'admission_source_id_6',
 'max_glu_serum_>300',
 'discharge_disposition_psych',
 'primarydiag_Nothing',
 'A1Cresult_Norm',
 'diag_circulatory',
 'A1Cresult_>7',
 'admission_source_id_8',
 'diabfeat_ketoacidosis',
 'admission_source_id_5',
 'readmitted',
 'primarydiag_neoplasm',
 'admission_source_id_2',
 'discharge_disposition_hospice',
 'med_any_No',
 'admission_type_id_5',
 'race_Asian',
 'medical_specialty_OBGYN',
 'number_emergency',
 'primarydiag_digestive',
 'diag_skin',
 'medical_specialty_Oncology',
 'med_metformin',
 'gender_Male',
 'diabfeat_circulatory',
 'admission_type_id_2',
 'primarydiag_injury',
 'race_Hispanic',
 'medical_specialty_Surgery',
 'race_Other',
 'diabchange',
 'admission_source_id_4',
 'discharge_disposition_leftAMA',
 'primarydiag_infection',
 'medical_specialty_Gastroenterology',
 'time_in_hospital',
 'diabfeat_renal',
 'numbe

In [23]:
FeatureList = ['diag_neoplasm',
 'diag_urogenital',
 'medical_specialty_Nephrology',
 'primarydiag_other',
 'number_inpatient',
 'med_insulin',
 'admission_source_id_6',
 'max_glu_serum_>300',
 'discharge_disposition_psych',
 'primarydiag_Nothing',
 'A1Cresult_Norm',
 'diag_circulatory',
 'A1Cresult_>7',
 'admission_source_id_8',
 'diabfeat_ketoacidosis',
 'admission_source_id_5',
 'readmitted',
 'primarydiag_neoplasm',
 'admission_source_id_2',
 'discharge_disposition_hospice',
 'med_any_No',
 'admission_type_id_5',
 'race_Asian',
 'medical_specialty_OBGYN',
 'number_emergency',
 'primarydiag_digestive',
 'diag_skin',
 'medical_specialty_Oncology',
 'med_metformin',
 'gender_Male',
 'diabfeat_circulatory',
 'admission_type_id_2',
 'primarydiag_injury',
 'race_Hispanic',
 'medical_specialty_Surgery',
 'race_Other',
 'diabchange',
 'admission_source_id_4',
 'discharge_disposition_leftAMA',
 'primarydiag_infection',
 'medical_specialty_Gastroenterology',
 'time_in_hospital',
 'diabfeat_renal',
 'number_outpatient',
 'num_medications',
 'diag_metabolic',
 'medical_specialty_Orthopedics',
 'race_AfricanAmerican',
 'primarydiag_respiratory',
 'primarydiag_skin',
 'diabfeat_ophthalmic',
 'age',
 'medical_specialty_Cardiology',
 'med_glipizide',
 'admission_type_id_7',
 'num_lab_procedures',
 'num_procedures',
 'number_diagnoses',
 'discharge_disposition_nursing',
 'diabfeat_neurologic',
 'medical_specialty_Pediatrics',
 'discharge_disposition_outpatient',
 'admission_source_id_1',
 'max_glu_serum_Norm',
 'primarydiag_metabolic',
 'discharge_disposition_hhealth',
 'diabfeat_other',
 'diag_respiratory',
 'medical_specialty_Family/GeneralPractice',
 'discharge_disposition_hospital',
 'A1Cresult_>8',
 'primarydiag_urogenital',
 'admission_source_id_3']

In [26]:
#This is all entries:
DiabetesFTrim = DiabetesTrainF[FeatureList]
DiabetesFTestTrim = DiabetesTestF[FeatureList]

trainXFTrimALL = DiabetesFTrim.drop('readmitted', axis=1)
trainYFTrimALL = DiabetesFTrim['readmitted'].replace([2, 1], [1, 0])

testXTrim = DiabetesFTestTrim.drop('readmitted', axis=1)

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.1)

lgr.fit(trainXFTrimALL, trainYFTrimALL)

predicttestprobsLRFTrimALL = lgr.predict_proba(testXTrim)

In [27]:
from sklearn.metrics import roc_auc_score as AUC

AUC(testYF, predicttestprobsLRFTrimALL[:,1])

0.636494299128102

In [30]:
np.sum(DiabetesFTrim).sort_values()

admission_type_id_7                        15
discharge_disposition_psych               108
admission_source_id_3                     149
medical_specialty_Gastroenterology        429
medical_specialty_Oncology                440
diabfeat_ophthalmic                       449
medical_specialty_Pediatrics              450
race_Asian                                487
discharge_disposition_leftAMA             492
medical_specialty_OBGYN                   608
admission_source_id_5                     639
admission_source_id_2                     863
diabfeat_circulatory                      896
max_glu_serum_>300                        934
race_Other                               1167
medical_specialty_Nephrology             1208
diabfeat_renal                           1433
discharge_disposition_hospice            1575
race_Hispanic                            1615
discharge_disposition_outpatient         1624
admission_source_id_6                    1784
diabfeat_ketoacidosis             

In [32]:
from xgboost.sklearn import XGBClassifier as xgb

xgb = xgb()
xgb.set_params(n_estimators=500, min_child_weight=10, max_depth=5, gamma=5, colsample_bytree=0.6, max_delta_step=5)

xgb.fit(trainXF75, trainYF75)

predictXBF = xgb.predict(testXF)
predictprobsXBF75 = xgb.predict_proba(testXF)


  if diff:


In [33]:
AUC(testYF, predictprobsXBF75[:,1])

0.6510857983723235