In [1]:
import pandas as pd
import numpy as np

In [2]:
DiabetesAllDummy = pd.read_csv('DiabetesAllDummy.csv', index_col=0)
#DiabetesOrdMed = pd.read_csv('DiabetesOrdMed.csv', index_col=0)

In [3]:
#Let's just start by looking at a simple linear regression:
#Yes, we have a categorical variable as our output. This will not be valid, but will show right away variables which could be important:

DiabetesTrain = DiabetesAllDummy[DiabetesAllDummy['IsTrain']==1]
DiabetesTrain.index = list(range(len(DiabetesTrain)))

DiabetesTest = DiabetesAllDummy[DiabetesAllDummy['IsTrain']==0]
DiabetesTest.index = list(range(len(DiabetesTest)))

In [4]:
#Try simple linear model:

import statsmodels.api as sm

DiabetesX = DiabetesTrain.drop('readmitted', axis=1)
DiabetesY = DiabetesTrain['readmitted']

X2 = sm.add_constant(DiabetesX)

est = sm.OLS(DiabetesY, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:             readmitted   R-squared:                       0.088
Model:                            OLS   Adj. R-squared:                  0.087
Method:                 Least Squares   F-statistic:                     69.42
Date:                Mon, 03 Dec 2018   Prob (F-statistic):               0.00
Time:                        21:39:20   Log-Likelihood:                -79734.
No. Observations:               80084   AIC:                         1.597e+05
Df Residuals:                   79972   BIC:                         1.607e+05
Df Model:                         111                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
age     

In [5]:
'''Based on this analysis, we see some variables which have a positive coefficient (age, time_in_hospital, number_emergency,
diag_circulatory, diabfeat_circulatory, diabfeat_neurologic, diabfeat_renal, max_glu_serum_>300, etc). We also see significant
variables with negative coefficients (all race categories, **discharge_disposition_died**, d_d_hospice, all AIC results, med_any_No,
etc.) Variables with negative coefficients are candidaites to lower readmission rates, those with positive coefficients are
candidates to increase readmission rates'''

#Of note, 0 is not admitted, 1 is readmitted post 30 days, 2 is readmitted in less than 30 days

#This is crude, but gives us a basic indication of what may be significant.

'Based on this analysis, we see some variables which have a positive coefficient (age, time_in_hospital, number_emergency,\ndiag_circulatory, diabfeat_circulatory, diabfeat_neurologic, diabfeat_renal, max_glu_serum_>300, etc). We also see significant\nvariables with negative coefficients (all race categories, **discharge_disposition_died**, d_d_hospice, all AIC results, med_any_No,\netc.) Variables with negative coefficients are candidaites to lower readmission rates, those with positive coefficients are\ncandidates to increase readmission rates'

In [6]:
#let's write a function to see which variables are stongly correlated with each other:

def CreateCorrelationMatrix(df, dependent):
    df2 = df.drop(dependent, axis=1)
    for i in range(len(df2.columns)):
        corrarray = []
        indexarray = []
        for j in range(len(df2.columns)):
            corr12 = df2[df2.columns[i]].corr(df2[df2.columns[j]])
            corrarray.append(corr12)
            indexarray.append(df2.columns[j])
        seriesi = pd.Series(corrarray, index=indexarray)
        
        if i > 0:
            corrDF = pd.concat([corrDF, seriesi], axis=1)
        
        else:
            corrDF = pd.DataFrame(seriesi)
     
    #Rename the columns to be the same as the indices (a self matrix)
    corrDF.columns = corrDF.index
    
    #reset all self-covariances to 0
    for var in corrDF.columns:
        corrDF.loc[var, var] = 0
    
    return corrDF 

In [7]:
CM = CreateCorrelationMatrix(DiabetesAllDummy, 'readmitted')

In [8]:
#Check the matrix out, eliminate a variable then rerun the process:

np.max(CM).sort_values(ascending=False)

#No variable is more than 60% correlated with any other variable, which is encouraging.
#The most corrrelated variables are admission_type_id_3 and admission_source_id_1 (59.6%),
# followed by admission_type_id_5 and admission_source_id_8 (47.5%)

change_Ch                         0.657410
diabchange                        0.657410
med_insulin_Down                  0.603793
admission_type_id_3               0.596060
admission_source_id_1             0.596060
med_insulin_Up                    0.577175
admission_type_id_5               0.471436
admission_source_id_8             0.471436
num_medications                   0.464082
time_in_hospital                  0.464082
max_glu_serum_Norm                0.449592
admission_type_id_6               0.435850
num_procedures                    0.381644
max_glu_serum_>200                0.327865
num_lab_procedures                0.319754
med_metformin_Steady              0.289861
discharge_disposition_nursing     0.283049
age                               0.283049
number_emergency                  0.267015
number_inpatient                  0.267015
number_diagnoses                  0.261036
max_glu_serum_>300                0.220670
admission_type_id_2               0.215506
diag_circul

In [9]:
#Let's also check which rows are extremely poorly represented. These are candidates for removal:

np.sum(DiabetesAllDummy).sort_values()
#We see that 4 features (med_tolazamide_Up, med_chloropropamide_Down, med_acarbose_Down, med_troglitazone_Steady) are 
#very low. We are unlikely to glean anything from this, and they could be removed. Let's keep this in mind going forward:

med_tolazamide_Up                       1
med_chlorpropamide_Down                 1
med_miglitol_Up                         2
med_acarbose_Down                       3
med_miglitol_Down                       5
med_chlorpropamide_Up                   6
med_glyburide.metformin_Down            6
med_glyburide.metformin_Up              8
med_acarbose_Up                        10
admission_type_id_4                    10
med_nateglinide_Down                   11
med_glipizide.metformin_Steady         13
admission_type_id_7                    18
med_tolbutamide_Steady                 21
med_nateglinide_Up                     24
med_miglitol_Steady                    31
med_tolazamide_Steady                  38
med_repaglinide_Down                   45
med_chlorpropamide_Steady              78
diabfeat_coma                          78
med_rosiglitazone_Down                 86
med_repaglinide_Up                    109
med_pioglitazone_Down                 117
discharge_disposition_psych       

In [10]:
#We can write a function to find a candidate feature list using AIC engineering:

def FindLowestAICNonLogBackward(df, dependent):
    '''Input: DF to AIC-modify and the dependent variable. WILL RETURN: A tuple: [0] is the modified DF (with dependent)
    and tuple[1] will give you the summary DF'''
    df2 = df.copy()
    df2X = df2.drop(dependent, axis=1)
    df2Y = df2[dependent]
    FeatureList = list(df2X.columns)
    X2 = sm.add_constant(df2X)
    est = sm.OLS(df2Y, X2)
    CurrentAIC = est.fit().aic
    CanBeBetter = True
    ModList = []
    AddedSubtracted = []
    AIC = []
    TriesSinceReset = 0
    
    tempColumnList = list(df2X.columns)
    tempDF2X = df2X[tempColumnList]
    
    while CanBeBetter == True:
        Choice = np.random.choice(list(df2X.columns))
        
        HeadsTails = np.random.randint(2)
        if Choice in tempColumnList:
            HeadsTails = 0
        if Choice not in tempColumnList:
            HeadsTails = 1
        
        if HeadsTails == 1:
            tempColumnList.append(Choice)
            tempDF2X[Choice] = df2X[Choice]
        
        if HeadsTails == 0:

            tempColumnList.remove(Choice)
            tempDF2X = tempDF2X[tempColumnList]
            
        est = sm.OLS(df2Y, sm.add_constant(tempDF2X))
        NewAIC = est.fit().aic
        
        if NewAIC < CurrentAIC:
            TriesSinceReset = 0
            CurrentAIC = NewAIC
            
            if HeadsTails == 1:
                print(Choice + " added: New AIC = " + str(CurrentAIC))
                ModList.append(Choice)
                AddedSubtracted.append('Added')
                AIC.append(CurrentAIC)
                
            if HeadsTails == 0:
                print(Choice + " removed: New AIC = " + str(CurrentAIC))
                ModList.append(Choice)
                AddedSubtracted.append('Subtracted')
                AIC.append(CurrentAIC)
            continue
            
        else:
            TriesSinceReset += 1
            
            if HeadsTails == 1:
                tempColumnList.remove(Choice)
                tempDF2X = tempDF2X[tempColumnList]
                
            if HeadsTails == 0:
                tempColumnList.append(Choice)
                tempDF2X[Choice] = df2X[Choice]
                
            if TriesSinceReset > 100:
                CanBeBetter = False
                
    SummaryDF = pd.DataFrame({'Feature': ModList, 'AddOrSubtract': AddedSubtracted, 'AIC': AIC})
    NewDF = pd.concat([tempDF2X, df2[[dependent]]], axis=1)
    
    return NewDF, SummaryDF  

In [11]:
DiabetesAICTest01 = FindLowestAICNonLogBackward(DiabetesAllDummy, 'readmitted')

med_acarbose_Steady removed: New AIC = 199621.8079620847
diag_other removed: New AIC = 199619.83645888825
med_pioglitazone_Up removed: New AIC = 199619.46282049903
IsTrain removed: New AIC = 199617.6610630309
discharge_disposition_unknown removed: New AIC = 199616.24361564283
med_pioglitazone_Steady removed: New AIC = 199614.28107980517
med_nateglinide_Up removed: New AIC = 199613.55729298055
med_metformin_Down removed: New AIC = 199611.56148157423
med_tolazamide_Up removed: New AIC = 199609.89937179926
med_glipizide.metformin_Steady removed: New AIC = 199608.14010364644
med_rosiglitazone_Up removed: New AIC = 199607.6569934644
med_glimepiride_Down removed: New AIC = 199605.70283328445
admission_type_id_3 removed: New AIC = 199604.1089680509
change_Ch removed: New AIC = 199602.71460771534
max_glu_serum_>200 removed: New AIC = 199601.65150080743
med_repaglinide_Up removed: New AIC = 199599.92096598027
med_glyburide_Steady removed: New AIC = 199598.4132298202
diag_blooddis removed: New A

In [12]:
DiabetesAICTest01 = DiabetesAICTest01[0]

In [13]:
#With run 1, we were left with 67 feature columns:
DiabetesAICTest01.head()

Unnamed: 0,med_insulin_Down,num_medications,discharge_disposition_hospital,diag_metabolic,med_any_No,med_acarbose_Up,med_repaglinide_Steady,diag_musculoskeletal,diabfeat_renal,diabfeat_other,...,med_glipizide_Down,med_glipizide_Up,diag_respiratory,admission_type_id_5,discharge_disposition_hhealth,A1Cresult_Norm,admission_source_id_1,admission_type_id_6,admission_type_id_7,readmitted
0,0,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,1,1,0,0
1,0,18,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,16,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,16,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [14]:
DiabetesAICTest02 = FindLowestAICNonLogBackward(DiabetesAllDummy, 'readmitted')[0]

admission_type_id_3 removed: New AIC = 199620.50026743417
med_tolbutamide_Steady removed: New AIC = 199619.50626262845
diag_mentaldis removed: New AIC = 199617.51499546572
med_tolazamide_Up removed: New AIC = 199615.87332737556
diag_other removed: New AIC = 199613.90418135736
med_nateglinide_Up removed: New AIC = 199613.1826825273
med_pioglitazone_Down removed: New AIC = 199612.3219868846
med_glimepiride_Up removed: New AIC = 199610.8644640789
med_glyburide.metformin_Down removed: New AIC = 199609.07114312553
med_chlorpropamide_Up removed: New AIC = 199607.75924995966
change_Ch removed: New AIC = 199606.57330332292
med_chlorpropamide_Steady removed: New AIC = 199604.8043417394
diag_blooddis removed: New AIC = 199602.8396469733
max_glu_serum_>200 removed: New AIC = 199601.8079275792
med_pioglitazone_Steady removed: New AIC = 199599.82502573085
diabfeat_hyperosmolarity removed: New AIC = 199599.00449420762
diabchange removed: New AIC = 199597.65668861702
med_metformin_Down removed: New A

In [15]:
#This feature list (run 2) contains 66 features
DiabetesAICTest02.head()

Unnamed: 0,diag_nan,med_insulin_Steady,diag_skin,admission_type_id_6,admission_source_id_4,discharge_disposition_psych,discharge_disposition_leftAMA,num_lab_procedures,med_glipizide_Steady,admission_source_id_3,...,med_any_No,diag_pregnancy,diabfeat_ophthalmic,number_emergency,number_diagnoses,med_glipizide_Up,med_metformin_Steady,gender_Male,diag_musculoskeletal,readmitted
0,1,0,0,1,0,0,0,41,0,0,...,1,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,59,0,0,...,0,0,0,0,9,0,0,0,0,1
2,0,0,0,0,0,0,0,44,0,0,...,0,0,0,0,7,0,0,1,0,0
3,0,1,0,0,0,0,0,51,1,0,...,0,0,0,0,5,0,0,1,0,0
4,0,1,0,0,0,0,0,31,0,0,...,0,0,0,0,9,0,0,1,0,1


In [16]:
DiabetesAICTest03 = FindLowestAICNonLogBackward(DiabetesAllDummy, 'readmitted')[0]

diag_mentaldis removed: New AIC = 199620.09368676064
med_repaglinide_Up removed: New AIC = 199618.35829509218
med_chlorpropamide_Up removed: New AIC = 199617.0164006888
med_tolazamide_Steady removed: New AIC = 199616.5457007491
med_glimepiride_Up removed: New AIC = 199615.09630901844
med_glyburide_Steady removed: New AIC = 199613.14623557133
diag_injury removed: New AIC = 199612.76454685436
max_glu_serum_Norm removed: New AIC = 199610.76837289694
med_glimepiride_Down removed: New AIC = 199608.84411491294
med_pioglitazone_Up removed: New AIC = 199608.5048163894
discharge_disposition_unknown removed: New AIC = 199607.170461974
med_glyburide_Down removed: New AIC = 199605.1756907131
diag_nan removed: New AIC = 199603.23702804942
diabchange removed: New AIC = 199601.25044748577
race_AfricanAmerican removed: New AIC = 199599.46954165428
diag_infection removed: New AIC = 199597.89369355334
med_tolbutamide_Steady removed: New AIC = 199596.88695797243
med_metformin_Down removed: New AIC = 1995

In [17]:
#This results in 66 features common to all three lists:
FeatureIntersect = set(DiabetesAICTest01.columns) & set(DiabetesAICTest02.columns) & set(DiabetesAICTest03.columns)
FeatureIntersect = list(FeatureIntersect)
print(len(FeatureIntersect))
FeatureIntersect

65


['med_any_No',
 'number_emergency',
 'diabfeat_ketoacidosis',
 'readmitted',
 'med_glyburide.metformin_Up',
 'number_inpatient',
 'A1Cresult_>8',
 'diag_circulatory',
 'number_diagnoses',
 'race_Asian',
 'diabfeat_circulatory',
 'med_rosiglitazone_Steady',
 'race_Hispanic',
 'admission_source_id_5',
 'admission_source_id_2',
 'admission_source_id_8',
 'discharge_disposition_hospital',
 'med_glipizide_Up',
 'med_glipizide_Down',
 'A1Cresult_Norm',
 'admission_source_id_6',
 'admission_source_id_3',
 'diag_metabolic',
 'discharge_disposition_outpatient',
 'admission_source_id_1',
 'diag_pregnancy',
 'race_Other',
 'number_outpatient',
 'admission_type_id_6',
 'diabfeat_neurologic',
 'num_lab_procedures',
 'med_metformin_Steady',
 'diag_respiratory',
 'med_miglitol_Down',
 'med_acarbose_Up',
 'med_rosiglitazone_Down',
 'num_medications',
 'diag_skin',
 'discharge_disposition_psych',
 'discharge_disposition_nursing',
 'med_glipizide_Steady',
 'admission_type_id_2',
 'diabfeat_other',
 'med

In [18]:
#Let's try the simplest logistic regression model and see how it looks, whether using all features, or our own:

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()

import sklearn.model_selection as ms
ms_k5 = ms.KFold(n_splits=5, shuffle=True)

#First try with all features:

#Create X and Y variables (replace 2 to 1 and 1 to 0)
trainX = DiabetesTrain.drop('readmitted', axis=1)
trainY = DiabetesTrain['readmitted'].replace([2, 1], [1, 0])

testX = DiabetesTest.drop('readmitted', axis=1)
testY = DiabetesTest['readmitted'].replace([2, 1], [1, 0])


#Instantiate the K-fold generator object:
np.random.seed(0)
DiabetesAD5Fold = ms_k5.split(trainX, trainY)

#Produce list of test R^2 scores, and Actual vs Predicted lists for the individual runs
ScoreList = []
PredictList = []
PureTestScore = []
TrainScores = []
ActualValues = []

for train, test in DiabetesAD5Fold:

    
    #Run the fit using the train data for each K
    lgr.fit(trainX.iloc[train,], trainY[train])
    #Run your predicion for the "missing" K-part
    p = lgr.predict(trainX.iloc[test,])
    actual = trainY[train].values
    #Check your schore for the missing K-part
    R2 = lgr.score(trainX.iloc[test,], trainY[test])
    #Run a test on the completely untouched test 20%
    TestR2 = lgr.score(testX, testY)
    TrainScore = lgr.score(trainX.iloc[train,], trainY[train])
    
    #Append these scores to the lists above
    ScoreList.append(R2)
    PureTestScore.append(TestR2)
    PredictList.append(p)
    TrainScores.append(TrainScore)
    ActualValues.append(actual)
    
    #Make predictions for the completely untouched 20%
    PredictionsTest = lgr.predict(testX)
    
    #Use these predictions to calculate RMSLE for the untouched 20% and append
    #RMSLEvalue = np.sqrt(np.mean(np.power(np.log1p(testY)-np.log1p(PredictionsTest), 2)))
    #RMSLE.append(RMSLEvalue)

    


In [19]:
#These are the internal train scores, based on the logistic regression
TrainScores

[0.8866343047122544,
 0.8856821764715064,
 0.8867123480106763,
 0.8863845661573041,
 0.8863551226821502]

In [20]:
#These are the K-test scores, based on the logistic regression
ScoreList

[0.8855590934631954,
 0.8888680776674783,
 0.8846225884997191,
 0.8861834301055129,
 0.8863011988011988]

In [21]:
#These are the scores for each of the 5 ensemble models, compared to the untouched 20% test index
PureTestScore

[0.8421367948077884,
 0.8407388916625063,
 0.8432850723914129,
 0.8406390414378432,
 0.8429355966050923]

In [22]:
#These are the prediction lists from the train, compared to actual Y values for the train:
PredictList

[array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64)]

In [23]:
#These values may sound impressive, but they absolutely aren't. 11.1 percent of patients return under 30 days, so simply guessing "No" will get you 88.8% accuracy:
1-np.sum(trainY)/len(trainY)

0.8865441286649018

In [24]:
#Let's try the regression with our modified features:

from sklearn.linear_model import LogisticRegression as lgr

lgr2 = lgr()

import sklearn.model_selection as ms
ms_k5 = ms.KFold(n_splits=5, shuffle=True)

#Let's try it with our modified features:

DiabetesTrainM = DiabetesTrain[FeatureIntersect]
DiabetesTestM = DiabetesTest[FeatureIntersect]

#Create X and Y variables (replace 2 to 1 and 1 to 0)
trainX = DiabetesTrainM.drop('readmitted', axis=1)
trainY = DiabetesTrainM['readmitted'].replace([2, 1], [1, 0])

testX = DiabetesTestM.drop('readmitted', axis=1)
testY = DiabetesTestM['readmitted'].replace([2, 1], [1, 0])


#Instantiate the K-fold generator object:
np.random.seed(0)
DiabetesAD5Fold = ms_k5.split(trainX, trainY)

#Produce list of test R^2 scores, and Actual vs Predicted lists for the individual runs
ScoreList = []
PredictList = []
PureTestScore = []
TrainScores = []
ActualValues = []

for train, test in DiabetesAD5Fold:

    
    #Run the fit using the train data for each K
    lgr2.fit(trainX.iloc[train,], trainY[train])
    #Run your predicion for the "missing" K-part
    p = lgr2.predict(trainX.iloc[test,])
    actual = trainY[train].values
    #Check your schore for the missing K-part
    R2 = lgr2.score(trainX.iloc[test,], trainY[test])
    #Run a test on the completely untouched test 20%
    TestR2 = lgr2.score(testX, testY)
    TrainScore = lgr2.score(trainX.iloc[train,], trainY[train])
    
    #Append these scores to the lists above
    ScoreList.append(R2)
    PureTestScore.append(TestR2)
    PredictList.append(p)
    TrainScores.append(TrainScore)
    ActualValues.append(actual)
    
    #Make predictions for the completely untouched 20%
    PredictionsTest = lgr2.predict(testX)
    
    #Use these predictions to calculate RMSLE for the untouched 20% and append
    #RMSLEvalue = np.sqrt(np.mean(np.power(np.log1p(testY)-np.log1p(PredictionsTest), 2)))
    #RMSLE.append(RMSLEvalue)


In [25]:
TrainScores

[0.886540652754148,
 0.8857133937908752,
 0.8867123480106763,
 0.8863689574976197,
 0.8863239058500343]

In [26]:
ScoreList

[0.8856215271274271,
 0.8886183430105513,
 0.8848098894924143,
 0.8861209964412812,
 0.8861138861138861]

In [27]:
PureTestScore

[0.8865202196704942,
 0.8861208187718422,
 0.8861707438841737,
 0.8863205192211683,
 0.8864203694458312]

In [28]:
#The accuracy with reduced features was no better than the overall accuracy with all features. Let's check what might be going on:

In [29]:
np.sum(PredictList[0])

54

In [30]:
len(PredictList[0])

16017

In [31]:
'''This seems to be the biggest problem. Out of 16,200 observations (in the first K group, as an example), only 65 of them
were predicted to return within 30 days. We know that 11% should, or rougly 1,800. We need to tune this model to be more
generous in predicting a positive outcome'''

'This seems to be the biggest problem. Out of 16,200 observations (in the first K group, as an example), only 65 of them\nwere predicted to return within 30 days. We know that 11% should, or rougly 1,800. We need to tune this model to be more\ngenerous in predicting a positive outcome'

In [32]:
from sklearn.model_selection import RandomizedSearchCV

#We will perform a grid search to find the optimal hyperparameters for our RF algorithm and test using our K-fold data:

# C value is the one most important for tuning a logistic regression. Let's see how varying this value affects the score:
C_range = [0.001, 0.01, 0.1, 1, 10, 100, 100]


# Create the random grid
random_grid = {'C': C_range}

# Use the random grid to search for best C hyperparameter:

# First create the base model to tune
lgrCV = lgr()

# Random search of parameters, using 5-fold cross validation, 
lgr_random = RandomizedSearchCV(estimator = lgrCV, param_distributions = random_grid, n_iter = 7, cv = 5, verbose=2, random_state=42, n_jobs = 2)

# Fit the random search model
lgr_random.fit(trainX, trainY)

#Then print the best parameters using best_params_
lgr_random.best_params_

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=2)]: Done  35 out of  35 | elapsed:   25.8s finished


{'C': 0.001}

In [42]:
#Let's try the logistic regression with c=0.001 and see how that works:

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.001)

import sklearn.model_selection as ms
ms_k5 = ms.KFold(n_splits=5, shuffle=True)

#First try with all features:

#Create X and Y variables (replace 2 to 1 and 1 to 0)
trainX = DiabetesTrain.drop('readmitted', axis=1)
trainY = DiabetesTrain['readmitted'].replace([2, 1], [1, 0])

testX = DiabetesTest.drop('readmitted', axis=1)
testY = DiabetesTest['readmitted'].replace([2, 1], [1, 0])


#Instantiate the K-fold generator object:
np.random.seed(0)
DiabetesAD5Fold = ms_k5.split(trainX, trainY)

#Produce list of test R^2 scores, and Actual vs Predicted lists for the individual runs
ScoreList = []
PredictList = []
PureTestScore = []
TrainScores = []
ActualValues = []

for train, test in DiabetesAD5Fold:

    
    #Run the fit using the train data for each K
    lgr.fit(trainX.iloc[train,], trainY[train])
    #Run your predicion for the "missing" K-part
    p = lgr.predict(trainX.iloc[test,])
    actual = trainY[train].values
    #Check your schore for the missing K-part
    R2 = lgr.score(trainX.iloc[test,], trainY[test])
    #Run a test on the completely untouched test 20%
    TestR2 = lgr.score(testX, testY)
    TrainScore = lgr.score(trainX.iloc[train,], trainY[train])
    
    #Append these scores to the lists above
    ScoreList.append(R2)
    PureTestScore.append(TestR2)
    PredictList.append(p)
    TrainScores.append(TrainScore)
    ActualValues.append(actual)
    
    #Make predictions for the completely untouched 20%
    PredictionsTest = lgr.predict(testX)
    PredictionsProbs = lgr.predict_proba(testX)
    
    #Use these predictions to calculate RMSLE for the untouched 20% and append
    #RMSLEvalue = np.sqrt(np.mean(np.power(np.log1p(testY)-np.log1p(PredictionsTest), 2)))
    #RMSLE.append(RMSLEvalue)

In [48]:
from sklearn.metrics import roc_auc_score as AUC


AUC(testY, PredictionsProbs[:,1])

0.5948836629098997

In [47]:
PredictionsProbs[:,1]

array([0.26189372, 0.11925366, 0.17535673, ..., 0.14924451, 0.10639737,
       0.08913864])

In [34]:
PureTestScore

[0.8860708936595107,
 0.8857713429855217,
 0.8858711932101847,
 0.8858212680978532,
 0.8858212680978532]

In [35]:
#Either logistic regression does not work for this data, or something is going wrong here. The predicted scores are up by the slightest amount

In [36]:
#Finally, try the logistic regression with c=0.001 and reduced features and see how that works:

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.001)

import sklearn.model_selection as ms
ms_k5 = ms.KFold(n_splits=5, shuffle=True)

#First try with all features:



#Create X and Y variables (replace 2 to 1 and 1 to 0)
trainX = DiabetesTrainM.drop('readmitted', axis=1)
trainY = DiabetesTrainM['readmitted'].replace([2, 1], [1, 0])

testX = DiabetesTestM.drop('readmitted', axis=1)
testY = DiabetesTestM['readmitted'].replace([2, 1], [1, 0])


#Instantiate the K-fold generator object:
np.random.seed(0)
DiabetesAD5Fold = ms_k5.split(trainX, trainY)

#Produce list of test R^2 scores, and Actual vs Predicted lists for the individual runs
ScoreList = []
PredictList = []
PureTestScore = []
TrainScores = []
ActualValues = []

for train, test in DiabetesAD5Fold:

    
    #Run the fit using the train data for each K
    lgr.fit(trainX.iloc[train,], trainY[train])
    #Run your predicion for the "missing" K-part
    p = lgr.predict(trainX.iloc[test,])
    actual = trainY[train].values
    #Check your schore for the missing K-part
    R2 = lgr.score(trainX.iloc[test,], trainY[test])
    #Run a test on the completely untouched test 20%
    TestR2 = lgr.score(testX, testY)
    TrainScore = lgr.score(trainX.iloc[train,], trainY[train])
    
    #Append these scores to the lists above
    ScoreList.append(R2)
    PureTestScore.append(TestR2)
    PredictList.append(p)
    TrainScores.append(TrainScore)
    ActualValues.append(actual)
    
    #Make predictions for the completely untouched 20%
    PredictionsTest = lgr.predict(testX)
    
    #Use these predictions to calculate RMSLE for the untouched 20% and append
    #RMSLEvalue = np.sqrt(np.mean(np.power(np.log1p(testY)-np.log1p(PredictionsTest), 2)))
    #RMSLE.append(RMSLEvalue)

In [37]:
PureTestScore

[0.8863704443334998,
 0.8865202196704942,
 0.8865202196704942,
 0.8865701447828258,
 0.8865701447828258]

In [49]:
#Finally, try the logistic regression with c=0.001 and reduced features and see how that works:

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=1000)

import sklearn.model_selection as ms
ms_k5 = ms.KFold(n_splits=5, shuffle=True)

#First try with all features:



#Create X and Y variables (replace 2 to 1 and 1 to 0)
trainX = DiabetesTrainM.drop('readmitted', axis=1)
trainY = DiabetesTrainM['readmitted'].replace([2, 1], [1, 0])

testX = DiabetesTestM.drop('readmitted', axis=1)
testY = DiabetesTestM['readmitted'].replace([2, 1], [1, 0])


#Instantiate the K-fold generator object:
np.random.seed(0)
DiabetesAD5Fold = ms_k5.split(trainX, trainY)

#Produce list of test R^2 scores, and Actual vs Predicted lists for the individual runs
ScoreList = []
PredictList = []
PureTestScore = []
TrainScores = []
ActualValues = []

for train, test in DiabetesAD5Fold:

    
    #Run the fit using the train data for each K
    lgr.fit(trainX.iloc[train,], trainY[train])
    #Run your predicion for the "missing" K-part
    p = lgr.predict(trainX.iloc[test,])
    actual = trainY[train].values
    #Check your schore for the missing K-part
    R2 = lgr.score(trainX.iloc[test,], trainY[test])
    #Run a test on the completely untouched test 20%
    TestR2 = lgr.score(testX, testY)
    TrainScore = lgr.score(trainX.iloc[train,], trainY[train])
    
    #Append these scores to the lists above
    ScoreList.append(R2)
    PureTestScore.append(TestR2)
    PredictList.append(p)
    TrainScores.append(TrainScore)
    ActualValues.append(actual)
    
    #Make predictions for the completely untouched 20%
    PredictionsTest = lgr.predict(testX)
    PredictionsProb = lgr.predict_proba(testX)
    
    #Use these predictions to calculate RMSLE for the untouched 20% and append
    #RMSLEvalue = np.sqrt(np.mean(np.power(np.log1p(testY)-np.log1p(PredictionsTest), 2)))
    #RMSLE.append(RMSLEvalue)

In [52]:
from sklearn.metrics import roc_auc_score as AUC

print(AUC(testY, PredictionsProb[:,1]))
AUC(trainY, lgr.predict_proba(trainX)[:,1])

0.6592364935793682


0.6615979343823551

In [39]:
PureTestScore

[0.8865202196704942,
 0.8861707438841737,
 0.8862206689965052,
 0.8862705941088368,
 0.8863704443334998]

In [40]:
np.sum(PredictList[0])

54