In [1]:
import pandas as pd
import numpy as np

In [15]:
DiabetesAllDummy = pd.read_csv('DiabetesAllDummies.csv', index_col=0)
DiabetesOrdMed = pd.read_csv('DiabetesOrdMed.csv', index_col=0)

In [19]:
#Set a random index of 20% of the values in our dataset. This is the test dataset and will *only* be used for testing our models (sparingly!)
np.random.seed(100)
length = len(DiabetesAllDummy)
testIdx = np.random.choice(range(length), size=int(round(0.2*length)), replace=False)
trainIdx = list(set(range(length))-set(testIdx))

In [48]:
#Let's just start by looking at a simple linear regression:
#Yes, we have a categorical variable as our output. This will not be valid, but will show right away variables which could be important:

DiabetesTrain = DiabetesAllDummy.iloc[trainIdx,]
DiabetesTrain.index = list(range(len(DiabetesTrain)))

DiabetesTest = DiabetesAllDummy.iloc[testIdx,]
DiabetesTest.index = list(range(len(DiabetesTest)))

In [22]:
#Try simple linear model:

import statsmodels.api as sm

DiabetesX = DiabetesTrain.drop('readmitted', axis=1)
DiabetesY = DiabetesTrain['readmitted']

X2 = sm.add_constant(DiabetesX)

est = sm.OLS(DiabetesY, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:             readmitted   R-squared:                       0.097
Model:                            OLS   Adj. R-squared:                  0.095
Method:                 Least Squares   F-statistic:                     79.82
Date:                Thu, 29 Nov 2018   Prob (F-statistic):               0.00
Time:                        09:57:50   Log-Likelihood:                -80508.
No. Observations:               81410   AIC:                         1.612e+05
Df Residuals:                   81300   BIC:                         1.623e+05
Df Model:                         109                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
const   

In [None]:
'''Based on this analysis, we see some variables which have a positive coefficient (age, time_in_hospital, number_emergency,
diag_circulatory, diabfeat_circulatory, diabfeat_neurologic, diabfeat_renal, max_glu_serum_>300, etc). We also see significant
variables with negative coefficients (all race categories, **discharge_disposition_died**, d_d_hospice, all AIC results, med_any_No,
etc.) Variables with negative coefficients are candidaites to lower readmission rates, those with positive coefficients are
candidates to increase readmission rates'''

#Of note, 0 is not admitted, 1 is readmitted post 30 days, 2 is readmitted in less than 30 days

#This is crude, but gives us a basic indication of what may be significant.

In [23]:
#let's write a function to see which variables are stongly correlated with each other:

def CreateCorrelationMatrix(df, dependent):
    df2 = df.drop(dependent, axis=1)
    for i in range(len(df2.columns)):
        corrarray = []
        indexarray = []
        for j in range(len(df2.columns)):
            corr12 = df2[df2.columns[i]].corr(df2[df2.columns[j]])
            corrarray.append(corr12)
            indexarray.append(df2.columns[j])
        seriesi = pd.Series(corrarray, index=indexarray)
        
        if i > 0:
            corrDF = pd.concat([corrDF, seriesi], axis=1)
        
        else:
            corrDF = pd.DataFrame(seriesi)
     
    #Rename the columns to be the same as the indices (a self matrix)
    corrDF.columns = corrDF.index
    
    #reset all self-covariances to 0
    for var in corrDF.columns:
        corrDF.loc[var, var] = 0
    
    return corrDF 

In [24]:
CM = CreateCorrelationMatrix(DiabetesAllDummy, 'readmitted')

In [26]:
#Check the matrix out, eliminate a variable then rerun the process:

np.max(CM).sort_values(ascending=False)

#No variable is more than 60% correlated with any other variable, which is encouraging.
#The most corrrelated variables are admission_type_id_3 and admission_source_id_1 (59.6%),
# followed by admission_type_id_5 and admission_source_id_8 (47.5%)

admission_type_id_3               0.596186
admission_source_id_1             0.596186
admission_type_id_5               0.475770
admission_source_id_8             0.475770
num_medications                   0.466137
time_in_hospital                  0.466137
max_glu_serum_Norm                0.448192
admission_type_id_6               0.433858
change_Ch                         0.398655
med_insulin_Down                  0.398655
num_procedures                    0.385761
med_insulin_Up                    0.381739
max_glu_serum_>200                0.332143
num_lab_procedures                0.318429
med_metformin_Steady              0.289579
age                               0.276945
discharge_disposition_nursing     0.276945
number_inpatient                  0.266557
number_emergency                  0.266557
number_diagnoses                  0.261529
max_glu_serum_>300                0.225075
admission_type_id_2               0.216453
diag_circulatory                  0.200543
med_pioglit

In [28]:
#Let's also check which rows are extremely poorly represented. These are candidates for removal:

np.sum(DiabetesAllDummy).sort_values()
#We see that 4 features (med_tolazamide_Up, med_chloropropamide_Down, med_acarbose_Down, med_troglitazone_Steady) are 
#very low. We are unlikely to glean anything from this, and they could be removed. Let's keep this in mind going forward:

med_tolazamide_Up                       1
med_chlorpropamide_Down                 1
med_acarbose_Down                       3
med_troglitazone_Steady                 3
med_glyburide-metformin_Down            6
med_chlorpropamide_Up                   6
med_glyburide-metformin_Up              8
admission_type_id_4                    10
med_acarbose_Up                        10
med_nateglinide_Down                   11
med_glipizide-metformin_Steady         13
admission_type_id_7                    21
med_tolbutamide_Steady                 23
med_nateglinide_Up                     24
med_tolazamide_Steady                  38
med_repaglinide_Down                   45
med_chlorpropamide_Steady              79
diabfeat_coma                          86
med_rosiglitazone_Down                 87
med_repaglinide_Up                    110
med_pioglitazone_Down                 118
discharge_disposition_psych           139
med_rosiglitazone_Up                  178
admission_source_id_3             

In [36]:
#We can write a function to find a candidate feature list using AIC engineering:

def FindLowestAICNonLogBackward(df, dependent):
    '''Input: DF to AIC-modify and the dependent variable. WILL RETURN: A tuple: [0] is the modified DF (with dependent)
    and tuple[1] will give you the summary DF'''
    df2 = df.copy()
    df2X = df2.drop(dependent, axis=1)
    df2Y = df2[dependent]
    FeatureList = list(df2X.columns)
    X2 = sm.add_constant(df2X)
    est = sm.OLS(df2Y, X2)
    CurrentAIC = est.fit().aic
    CanBeBetter = True
    ModList = []
    AddedSubtracted = []
    AIC = []
    TriesSinceReset = 0
    
    tempColumnList = list(df2X.columns)
    tempDF2X = df2X[tempColumnList]
    
    while CanBeBetter == True:
        Choice = np.random.choice(list(df2X.columns))
        
        HeadsTails = np.random.randint(2)
        if Choice in tempColumnList:
            HeadsTails = 0
        if Choice not in tempColumnList:
            HeadsTails = 1
        
        if HeadsTails == 1:
            tempColumnList.append(Choice)
            tempDF2X[Choice] = df2X[Choice]
        
        if HeadsTails == 0:

            tempColumnList.remove(Choice)
            tempDF2X = tempDF2X[tempColumnList]
            
        est = sm.OLS(df2Y, sm.add_constant(tempDF2X))
        NewAIC = est.fit().aic
        
        if NewAIC < CurrentAIC:
            TriesSinceReset = 0
            CurrentAIC = NewAIC
            
            if HeadsTails == 1:
                print(Choice + " added: New AIC = " + str(CurrentAIC))
                ModList.append(Choice)
                AddedSubtracted.append('Added')
                AIC.append(CurrentAIC)
                
            if HeadsTails == 0:
                print(Choice + " removed: New AIC = " + str(CurrentAIC))
                ModList.append(Choice)
                AddedSubtracted.append('Subtracted')
                AIC.append(CurrentAIC)
            continue
            
        else:
            TriesSinceReset += 1
            
            if HeadsTails == 1:
                tempColumnList.remove(Choice)
                tempDF2X = tempDF2X[tempColumnList]
                
            if HeadsTails == 0:
                tempColumnList.append(Choice)
                tempDF2X[Choice] = df2X[Choice]
                
            if TriesSinceReset > 150:
                CanBeBetter = False
                
    SummaryDF = pd.DataFrame({'Feature': ModList, 'AddOrSubtract': AddedSubtracted, 'AIC': AIC})
    NewDF = pd.concat([tempDF2X, df2[[dependent]]], axis=1)
    
    return NewDF, SummaryDF  

In [30]:
DiabetesAICTest01 = FindLowestAICNonLogBackward(DiabetesAllDummy, 'readmitted')

med_glyburide_Up removed: New AIC = 201324.88088640256
med_chlorpropamide_Down removed: New AIC = 201323.35157176637
med_chlorpropamide_Up removed: New AIC = 201321.99795494787
med_nateglinide_Up removed: New AIC = 201321.32307940078
max_glu_serum_Norm removed: New AIC = 201319.3280722086
med_tolazamide_Up removed: New AIC = 201317.68732753897
diabfeat_coma removed: New AIC = 201315.71746481204
med_glyburide_Down removed: New AIC = 201313.7229107958
max_glu_serum_>200 removed: New AIC = 201312.94614537718
med_chlorpropamide_Steady removed: New AIC = 201311.15717062462
med_pioglitazone_Down removed: New AIC = 201310.159945262
med_glimepiride_Down removed: New AIC = 201308.18510779942
med_glyburide_Steady removed: New AIC = 201306.2141118548
diag_injury removed: New AIC = 201305.62420463594
med_repaglinide_Up removed: New AIC = 201303.84281996603
med_glyburide-metformin_Steady removed: New AIC = 201302.29692810256
med_nateglinide_Steady removed: New AIC = 201300.74869960372
med_tolazamid

In [34]:
DiabetesAICTest01 = DiabetesAICTest01[0]

In [35]:
#With run 1, we were left with 67 feature columns:
DiabetesAICTest01.head()

Unnamed: 0,admission_type_id_3,med_acarbose_Steady,age,diag_metabolic,diabfeat_renal,diabfeat_neurologic,admission_type_id_6,admission_source_id_6,admission_source_id_4,diag_pregnancy,...,discharge_disposition_leftAMA,med_insulin_Steady,med_glipizide_Steady,admission_source_id_2,med_metformin_Steady,A1Cresult_>7,med_metformin_Up,time_in_hospital,med_glipizide_Down,readmitted
0,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,1
2,0,0,3,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,2,0,0
3,0,0,4,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
4,0,0,5,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,1,0,0


In [37]:
DiabetesAICTest02 = FindLowestAICNonLogBackward(DiabetesAllDummy, 'readmitted')[0]

max_glu_serum_>200 removed: New AIC = 201325.45535367238
admission_type_id_3 removed: New AIC = 201324.12246568425
diabfeat_coma removed: New AIC = 201322.16294293612
med_glimepiride_Steady removed: New AIC = 201321.57623605744
med_chlorpropamide_Up removed: New AIC = 201320.23272505222
med_insulin_Up removed: New AIC = 201319.04285634874
diag_infection removed: New AIC = 201317.3767846704
med_chlorpropamide_Down removed: New AIC = 201315.8595374215
med_nateglinide_Steady removed: New AIC = 201314.20768775573
med_repaglinide_Down removed: New AIC = 201312.68173267634
med_tolazamide_Up removed: New AIC = 201311.0322078659
diag_mentaldis removed: New AIC = 201309.09923923356
med_acarbose_Down removed: New AIC = 201307.10177856567
med_chlorpropamide_Steady removed: New AIC = 201305.33936319884
med_tolazamide_Steady removed: New AIC = 201304.951864884
med_troglitazone_Steady removed: New AIC = 201302.9547978234
admission_type_id_4 removed: New AIC = 201300.9574205811
diag_blooddis removed:

In [38]:
#This feature list (run 2) contains 66 features
DiabetesAICTest02.head()

Unnamed: 0,discharge_disposition_died,diabfeat_ophthalmic,admission_source_id_1,time_in_hospital,admission_source_id_3,discharge_disposition_hhealth,discharge_disposition_hospital,admission_source_id_2,diag_neoplasm,med_metformin_Steady,...,diag_circulatory,med_metformin_Up,diag_skin,diag_metabolic,med_any_No,admission_source_id_4,med_insulin_Steady,diabfeat_ketoacidosis,number_outpatient,readmitted
0,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,3,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
3,0,0,0,2,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [39]:
DiabetesAICTest03 = FindLowestAICNonLogBackward(DiabetesAllDummy, 'readmitted')[0]

med_glyburide_Steady removed: New AIC = 201324.4569704103
med_tolbutamide_Steady removed: New AIC = 201323.3604938939
med_tolazamide_Steady removed: New AIC = 201322.9339368703
med_chlorpropamide_Up removed: New AIC = 201321.58116824712
admission_type_id_3 removed: New AIC = 201320.3213476225
diag_infection removed: New AIC = 201318.65988500812
med_insulin_Up removed: New AIC = 201317.27880613416
diag_other removed: New AIC = 201315.28111843416
med_pioglitazone_Down removed: New AIC = 201314.22748952504
med_glyburide_Up removed: New AIC = 201312.8541441699
med_chlorpropamide_Down removed: New AIC = 201311.33771306655
med_acarbose_Down removed: New AIC = 201309.34011246546
med_pioglitazone_Up removed: New AIC = 201308.69854131702
med_glyburide-metformin_Steady removed: New AIC = 201307.0572971382
med_troglitazone_Steady removed: New AIC = 201305.05910075258
discharge_disposition_unknown removed: New AIC = 201304.02294776068
med_glimepiride_Up removed: New AIC = 201302.85783696343
med_na

In [86]:
#This results in 66 features common to all three lists:
FeatureIntersect = set(DiabetesAICTest01.columns) & set(DiabetesAICTest02.columns) & set(DiabetesAICTest03.columns)
FeatureIntersect = list(FeatureIntersect)
print(len(FeatureIntersect))
FeatureIntersect

66


['admission_type_id_2',
 'admission_type_id_7',
 'num_lab_procedures',
 'admission_type_id_5',
 'med_metformin_Up',
 'diag_pregnancy',
 'admission_source_id_8',
 'diag_metabolic',
 'med_rosiglitazone_Steady',
 'med_any_No',
 'discharge_disposition_nursing',
 'med_insulin_Steady',
 'med_glipizide_Steady',
 'age',
 'admission_source_id_1',
 'diabfeat_renal',
 'admission_source_id_2',
 'admission_source_id_5',
 'diag_nervous',
 'number_emergency',
 'med_metformin_Steady',
 'admission_source_id_3',
 'admission_source_id_6',
 'race_Asian',
 'number_diagnoses',
 'diag_respiratory',
 'time_in_hospital',
 'diabfeat_other',
 'med_glyburide-metformin_Up',
 'med_acarbose_Up',
 'max_glu_serum_>300',
 'race_Other',
 'A1Cresult_>8',
 'discharge_disposition_leftAMA',
 'med_glipizide_Up',
 'discharge_disposition_outpatient',
 'gender_Male',
 'med_glipizide_Down',
 'number_inpatient',
 'diabfeat_ophthalmic',
 'discharge_disposition_hospital',
 'diabfeat_neurologic',
 'readmitted',
 'discharge_dispositi

In [59]:
#Let's try the simplest logistic regression model and see how it looks, whether using all features, or our own:

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()

import sklearn.model_selection as ms
ms_k5 = ms.KFold(n_splits=5, shuffle=True)

#First try with all features:

#Create X and Y variables (replace 2 to 1 and 1 to 0)
trainX = DiabetesTrain.drop('readmitted', axis=1)
trainY = DiabetesTrain['readmitted'].replace([2, 1], [1, 0])

testX = DiabetesTest.drop('readmitted', axis=1)
testY = DiabetesTest['readmitted'].replace([2, 1], [1, 0])


#Instantiate the K-fold generator object:
np.random.seed(0)
DiabetesAD5Fold = ms_k5.split(trainX, trainY)

#Produce list of test R^2 scores, and Actual vs Predicted lists for the individual runs
ScoreList = []
PredictList = []
PureTestScore = []
TrainScores = []
ActualValues = []

for train, test in DiabetesAD5Fold:

    
    #Run the fit using the train data for each K
    lgr.fit(trainX.iloc[train,], trainY[train])
    #Run your predicion for the "missing" K-part
    p = lgr.predict(trainX.iloc[test,])
    actual = trainY[train].values
    #Check your schore for the missing K-part
    R2 = lgr.score(trainX.iloc[test,], trainY[test])
    #Run a test on the completely untouched test 20%
    TestR2 = lgr.score(testX, testY)
    TrainScore = lgr.score(trainX.iloc[train,], trainY[train])
    
    #Append these scores to the lists above
    ScoreList.append(R2)
    PureTestScore.append(TestR2)
    PredictList.append(p)
    TrainScores.append(TrainScore)
    ActualValues.append(actual)
    
    #Make predictions for the completely untouched 20%
    PredictionsTest = lgr.predict(testX)
    
    #Use these predictions to calculate RMSLE for the untouched 20% and append
    #RMSLEvalue = np.sqrt(np.mean(np.power(np.log1p(testY)-np.log1p(PredictionsTest), 2)))
    #RMSLE.append(RMSLEvalue)

    


In [61]:
#These are the internal train scores, based on the logistic regression
TrainScores

[0.8886039798550547,
 0.8877134258690579,
 0.8876980714899889,
 0.888204765999263,
 0.8879130327969537]

In [62]:
#These are the K-test scores, based on the logistic regression
ScoreList

[0.8859476722761331,
 0.8890799656061908,
 0.8888342955410883,
 0.8875445276992998,
 0.8886500429922614]

In [63]:
#These are the scores for each of the 5 ensemble models, compared to the untouched 20% test index
PureTestScore

[0.889156389721417,
 0.8890581241094679,
 0.8888615928855697,
 0.8890581241094679,
 0.8891072569154425]

In [56]:
#These are the prediction lists from the train, compared to actual Y values for the train:
PredictList

[array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64)]

In [65]:
#These values may sound impressive, but they absolutely aren't. 11.1 percent of patients return under 30 days, so simply guessing "No" will get you 88.8% accuracy:
1-np.sum(trainY)/len(trainY)

0.8880604348360153

In [87]:
#Let's try the regression with our modified features:

from sklearn.linear_model import LogisticRegression as lgr

lgr2 = lgr()

import sklearn.model_selection as ms
ms_k5 = ms.KFold(n_splits=5, shuffle=True)

#Let's try it with our modified features:

DiabetesTrainM = DiabetesTrain[FeatureIntersect]
DiabetesTestM = DiabetesTest[FeatureIntersect]

#Create X and Y variables (replace 2 to 1 and 1 to 0)
trainX = DiabetesTrainM.drop('readmitted', axis=1)
trainY = DiabetesTrainM['readmitted'].replace([2, 1], [1, 0])

testX = DiabetesTestM.drop('readmitted', axis=1)
testY = DiabetesTestM['readmitted'].replace([2, 1], [1, 0])


#Instantiate the K-fold generator object:
np.random.seed(0)
DiabetesAD5Fold = ms_k5.split(trainX, trainY)

#Produce list of test R^2 scores, and Actual vs Predicted lists for the individual runs
ScoreList = []
PredictList = []
PureTestScore = []
TrainScores = []
ActualValues = []

for train, test in DiabetesAD5Fold:

    
    #Run the fit using the train data for each K
    lgr2.fit(trainX.iloc[train,], trainY[train])
    #Run your predicion for the "missing" K-part
    p = lgr2.predict(trainX.iloc[test,])
    actual = trainY[train].values
    #Check your schore for the missing K-part
    R2 = lgr2.score(trainX.iloc[test,], trainY[test])
    #Run a test on the completely untouched test 20%
    TestR2 = lgr2.score(testX, testY)
    TrainScore = lgr2.score(trainX.iloc[train,], trainY[train])
    
    #Append these scores to the lists above
    ScoreList.append(R2)
    PureTestScore.append(TestR2)
    PredictList.append(p)
    TrainScores.append(TrainScore)
    ActualValues.append(actual)
    
    #Make predictions for the completely untouched 20%
    PredictionsTest = lgr2.predict(testX)
    
    #Use these predictions to calculate RMSLE for the untouched 20% and append
    #RMSLEvalue = np.sqrt(np.mean(np.power(np.log1p(testY)-np.log1p(PredictionsTest), 2)))
    #RMSLE.append(RMSLEvalue)


In [88]:
TrainScores

[0.8885272079597101,
 0.8877134258690579,
 0.8877287802481267,
 0.8881126397248496,
 0.8878362609016092]

In [89]:
ScoreList

[0.8855791671784793,
 0.8892028006387421,
 0.8887728780248126,
 0.8874831101830242,
 0.8885272079597101]

In [90]:
PureTestScore

[0.8890581241094679,
 0.8890089913034933,
 0.8890581241094679,
 0.8891072569154425,
 0.8890089913034933]

In [None]:
#The accuracy with reduced features was no better than the overall accuracy with all features. Let's check what might be going on:

In [91]:
np.sum(PredictList[0])

65

In [92]:
len(PredictList[0])

16282

In [96]:
'''This seems to be the biggest problem. Out of 16,200 observations (in the first K group, as an example), only 65 of them
were predicted to return within 30 days. We know that 11% should, or rougly 1,800. We need to tune this model to be more
generous in predicting a positive outcome'''

'This seems to be the biggest problem. Out of 16,200 observations (in the first K group, as an example), only 65 of them\nwere predicted to return within 30 days. We know that 11% should, or rougly 1,800. We need to tune this model to be more\ngenerous in predicting a positive outcome'

In [98]:
from sklearn.model_selection import RandomizedSearchCV

#We will perform a grid search to find the optimal hyperparameters for our RF algorithm and test using our K-fold data:

# C value is the one most important for tuning a logistic regression. Let's see how varying this value affects the score:
C_range = [0.001, 0.01, 0.1, 1, 10, 100, 100]


# Create the random grid
random_grid = {'C': C_range}

# Use the random grid to search for best C hyperparameter:

# First create the base model to tune
lgrCV = lgr()

# Random search of parameters, using 5-fold cross validation, 
lgr_random = RandomizedSearchCV(estimator = lgrCV, param_distributions = random_grid, n_iter = 7, cv = 5, verbose=2, random_state=42, n_jobs = 2)

# Fit the random search model
lgr_random.fit(trainX, trainY)

#Then print the best parameters using best_params_
lgr_random.best_params_

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=2)]: Done  35 out of  35 | elapsed:   28.8s finished


{'C': 0.001}

In [101]:
#Let's try the logistic regression with c=0.001 and see how that works:

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.001)

import sklearn.model_selection as ms
ms_k5 = ms.KFold(n_splits=5, shuffle=True)

#First try with all features:

#Create X and Y variables (replace 2 to 1 and 1 to 0)
trainX = DiabetesTrain.drop('readmitted', axis=1)
trainY = DiabetesTrain['readmitted'].replace([2, 1], [1, 0])

testX = DiabetesTest.drop('readmitted', axis=1)
testY = DiabetesTest['readmitted'].replace([2, 1], [1, 0])


#Instantiate the K-fold generator object:
np.random.seed(0)
DiabetesAD5Fold = ms_k5.split(trainX, trainY)

#Produce list of test R^2 scores, and Actual vs Predicted lists for the individual runs
ScoreList = []
PredictList = []
PureTestScore = []
TrainScores = []
ActualValues = []

for train, test in DiabetesAD5Fold:

    
    #Run the fit using the train data for each K
    lgr.fit(trainX.iloc[train,], trainY[train])
    #Run your predicion for the "missing" K-part
    p = lgr.predict(trainX.iloc[test,])
    actual = trainY[train].values
    #Check your schore for the missing K-part
    R2 = lgr.score(trainX.iloc[test,], trainY[test])
    #Run a test on the completely untouched test 20%
    TestR2 = lgr.score(testX, testY)
    TrainScore = lgr.score(trainX.iloc[train,], trainY[train])
    
    #Append these scores to the lists above
    ScoreList.append(R2)
    PureTestScore.append(TestR2)
    PredictList.append(p)
    TrainScores.append(TrainScore)
    ActualValues.append(actual)
    
    #Make predictions for the completely untouched 20%
    PredictionsTest = lgr.predict(testX)
    
    #Use these predictions to calculate RMSLE for the untouched 20% and append
    #RMSLEvalue = np.sqrt(np.mean(np.power(np.log1p(testY)-np.log1p(PredictionsTest), 2)))
    #RMSLE.append(RMSLEvalue)

In [109]:
PureTestScore

[0.8897459833931116,
 0.8896477177811625,
 0.8897459833931116,
 0.889696850587137,
 0.8897459833931116]

In [110]:
#Either logistic regression does not work for this data, or something is going wrong here. The predicted scores are up by the slightest amount

In [111]:
#Finally, try the logistic regression with c=0.001 and reduced features and see how that works:

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=0.001)

import sklearn.model_selection as ms
ms_k5 = ms.KFold(n_splits=5, shuffle=True)

#First try with all features:



#Create X and Y variables (replace 2 to 1 and 1 to 0)
trainX = DiabetesTrainM.drop('readmitted', axis=1)
trainY = DiabetesTrainM['readmitted'].replace([2, 1], [1, 0])

testX = DiabetesTestM.drop('readmitted', axis=1)
testY = DiabetesTestM['readmitted'].replace([2, 1], [1, 0])


#Instantiate the K-fold generator object:
np.random.seed(0)
DiabetesAD5Fold = ms_k5.split(trainX, trainY)

#Produce list of test R^2 scores, and Actual vs Predicted lists for the individual runs
ScoreList = []
PredictList = []
PureTestScore = []
TrainScores = []
ActualValues = []

for train, test in DiabetesAD5Fold:

    
    #Run the fit using the train data for each K
    lgr.fit(trainX.iloc[train,], trainY[train])
    #Run your predicion for the "missing" K-part
    p = lgr.predict(trainX.iloc[test,])
    actual = trainY[train].values
    #Check your schore for the missing K-part
    R2 = lgr.score(trainX.iloc[test,], trainY[test])
    #Run a test on the completely untouched test 20%
    TestR2 = lgr.score(testX, testY)
    TrainScore = lgr.score(trainX.iloc[train,], trainY[train])
    
    #Append these scores to the lists above
    ScoreList.append(R2)
    PureTestScore.append(TestR2)
    PredictList.append(p)
    TrainScores.append(TrainScore)
    ActualValues.append(actual)
    
    #Make predictions for the completely untouched 20%
    PredictionsTest = lgr.predict(testX)
    
    #Use these predictions to calculate RMSLE for the untouched 20% and append
    #RMSLEvalue = np.sqrt(np.mean(np.power(np.log1p(testY)-np.log1p(PredictionsTest), 2)))
    #RMSLE.append(RMSLEvalue)

In [112]:
PureTestScore

[0.8896477177811625,
 0.889696850587137,
 0.8897951161990861,
 0.889598584975188,
 0.889696850587137]

In [116]:
#Finally, try the logistic regression with c=0.001 and reduced features and see how that works:

from sklearn.linear_model import LogisticRegression as lgr

lgr = lgr()
lgr.set_params(C=1000)

import sklearn.model_selection as ms
ms_k5 = ms.KFold(n_splits=5, shuffle=True)

#First try with all features:



#Create X and Y variables (replace 2 to 1 and 1 to 0)
trainX = DiabetesTrainM.drop('readmitted', axis=1)
trainY = DiabetesTrainM['readmitted'].replace([2, 1], [1, 0])

testX = DiabetesTestM.drop('readmitted', axis=1)
testY = DiabetesTestM['readmitted'].replace([2, 1], [1, 0])


#Instantiate the K-fold generator object:
np.random.seed(0)
DiabetesAD5Fold = ms_k5.split(trainX, trainY)

#Produce list of test R^2 scores, and Actual vs Predicted lists for the individual runs
ScoreList = []
PredictList = []
PureTestScore = []
TrainScores = []
ActualValues = []

for train, test in DiabetesAD5Fold:

    
    #Run the fit using the train data for each K
    lgr.fit(trainX.iloc[train,], trainY[train])
    #Run your predicion for the "missing" K-part
    p = lgr.predict(trainX.iloc[test,])
    actual = trainY[train].values
    #Check your schore for the missing K-part
    R2 = lgr.score(trainX.iloc[test,], trainY[test])
    #Run a test on the completely untouched test 20%
    TestR2 = lgr.score(testX, testY)
    TrainScore = lgr.score(trainX.iloc[train,], trainY[train])
    
    #Append these scores to the lists above
    ScoreList.append(R2)
    PureTestScore.append(TestR2)
    PredictList.append(p)
    TrainScores.append(TrainScore)
    ActualValues.append(actual)
    
    #Make predictions for the completely untouched 20%
    PredictionsTest = lgr.predict(testX)
    
    #Use these predictions to calculate RMSLE for the untouched 20% and append
    #RMSLEvalue = np.sqrt(np.mean(np.power(np.log1p(testY)-np.log1p(PredictionsTest), 2)))
    #RMSLE.append(RMSLEvalue)

In [118]:
PureTestScore

[0.8889598584975188,
 0.8890581241094679,
 0.8889598584975188,
 0.8890581241094679,
 0.8889598584975188]

In [119]:
np.sum(PredictList[0])

68