In [1]:
import IPython
import pandas as pd
import numpy as np
from statistics import mode
import matplotlib.pyplot as plt  
from sklearn.preprocessing import MinMaxScaler

In [28]:
# load modified file with some pre-processing completed
df = pd.read_csv("preprocessed_3.csv", index_col=0)
df.shape

(100120, 54)

In [29]:
df.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
encounter_id,2278392,149190,64410,500364,16680,35754,55842,63768,12522,15738
patient_nbr,8222157,55629189,86047875,82442376,42519267,82637451,84259809,114882984,48330783,63555939
race,Caucasian,Caucasian,AfricanAmerican,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian
gender,0,0,0,1,1,1,1,1,0,0
age,1,2,3,4,5,6,7,8,9,10
admission_type_id,6,1,1,1,1,2,3,1,2,3
discharge_disposition_id,25,1,1,1,1,1,1,1,1,3
admission_source_id,1,7,7,7,7,2,2,7,4,4
time_in_hospital,1,3,2,2,1,3,4,5,13,12
num_lab_procedures,41,59,11,44,51,31,70,73,68,33


In [30]:
# convert data type of nominal features in dataframe to 'object' type
i = ['encounter_id', 'patient_nbr', 'gender', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id',\
          'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', \
          'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose','miglitol', \
          'troglitazone', 'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin', \
          'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', \
          'age', 'A1Cresult', 'max_glu_serum', 'level1_diag1', 'level1_diag2', 'level1_diag3', 'level2_diag1', 'level2_diag2', 'level2_diag3']

df[i] = df[i].astype('object')

In [31]:
df.dtypes

encounter_id                object
patient_nbr                 object
race                        object
gender                      object
age                         object
admission_type_id           object
discharge_disposition_id    object
admission_source_id         object
time_in_hospital             int64
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride                 object
acetohexamide               object
glipizide                   object
glyburide           

In [32]:
"""
This code converts age as categorical variable to a continuous approximation by assuming mid-point of each age-category as
the actual age value. This is done to avoid having to deal with age as a dummy variable in the models which makes
interpretation very cumbersome. Also, since age category is not purely nominal but ordinal, we do not want to lose that
information by treating it as a simple categorical variable.
"""

# convert age back to integer type
df['age'] = df['age'].astype('int64')
print(df.age.value_counts())
# convert age categories to mid-point values
age_dict = {1:5, 2:15, 3:25, 4:35, 5:45, 6:55, 7:65, 8:75, 9:85, 10:95}
df['age'] = df.age.map(age_dict)
print(df.age.value_counts())

8     25564
7     22186
6     17102
9     16708
5      9626
4      3765
10     2669
3      1650
2       690
1       160
Name: age, dtype: int64
75    25564
65    22186
55    17102
85    16708
45     9626
35     3765
95     2669
25     1650
15      690
5       160
Name: age, dtype: int64


In [33]:
# get list of only numeric features
num_col = list(set(list(df._get_numeric_data().columns))- {'readmitted'})

In [34]:
num_col

['number_diagnoses',
 'number_outpatient',
 'nummed',
 'num_procedures',
 'age',
 'numchange',
 'time_in_hospital',
 'number_inpatient',
 'number_emergency',
 'num_medications',
 'num_lab_procedures',
 'service_utilization']

In [35]:
# Removing skewnewss and kurtosis using log transformation if it is above a threshold value (2)

statdataframe = pd.DataFrame()
statdataframe['numeric_column'] = num_col
skew_before = []
skew_after = []

kurt_before = []
kurt_after = []

standard_deviation_before = []
standard_deviation_after = []

log_transform_needed = []

log_type = []

for i in num_col:
    skewval = df[i].skew()
    skew_before.append(skewval)
    
    kurtval = df[i].kurtosis()
    kurt_before.append(kurtval)
    
    sdval = df[i].std()
    standard_deviation_before.append(sdval)
    
    if (abs(skewval) >2) & (abs(kurtval) >2):
        log_transform_needed.append('Yes')
        
        if len(df[df[i] == 0])/len(df) <=0.02:
            log_type.append('log')
            skewvalnew = np.log(pd.DataFrame(df[train_data[i] > 0])[i]).skew()
            skew_after.append(skewvalnew)
            
            kurtvalnew = np.log(pd.DataFrame(df[train_data[i] > 0])[i]).kurtosis()
            kurt_after.append(kurtvalnew)
            
            sdvalnew = np.log(pd.DataFrame(df[train_data[i] > 0])[i]).std()
            standard_deviation_after.append(sdvalnew)
            
        else:
            log_type.append('log1p')
            skewvalnew = np.log1p(pd.DataFrame(df[df[i] >= 0])[i]).skew()
            skew_after.append(skewvalnew)
        
            kurtvalnew = np.log1p(pd.DataFrame(df[df[i] >= 0])[i]).kurtosis()
            kurt_after.append(kurtvalnew)
            
            sdvalnew = np.log1p(pd.DataFrame(df[df[i] >= 0])[i]).std()
            standard_deviation_after.append(sdvalnew)
            
    else:
        log_type.append('NA')
        log_transform_needed.append('No')
        
        skew_after.append(skewval)
        kurt_after.append(kurtval)
        standard_deviation_after.append(sdval)

statdataframe['skew_before'] = skew_before
statdataframe['kurtosis_before'] = kurt_before
statdataframe['standard_deviation_before'] = standard_deviation_before
statdataframe['log_transform_needed'] = log_transform_needed
statdataframe['log_type'] = log_type
statdataframe['skew_after'] = skew_after
statdataframe['kurtosis_after'] = kurt_after
statdataframe['standard_deviation_after'] = standard_deviation_after

In [36]:
statdataframe

Unnamed: 0,numeric_column,skew_before,kurtosis_before,standard_deviation_before,log_transform_needed,log_type,skew_after,kurtosis_after,standard_deviation_after
0,number_diagnoses,-0.867785,-0.109005,1.938211,No,,-0.867785,-0.109005,1.938211
1,number_outpatient,8.818291,148.558544,1.263973,Yes,log1p,2.733914,7.804218,0.429394
2,nummed,0.675116,0.277244,0.92162,No,,0.675116,0.277244,0.92162
3,num_procedures,1.32602,0.890773,1.700335,No,,1.32602,0.890773,1.700335
4,age,-0.626715,0.274302,15.947496,No,,-0.626715,0.274302,15.947496
5,numchange,1.42531,1.433619,0.487858,No,,1.42531,1.433619,0.487858
6,time_in_hospital,1.137931,0.871189,2.974528,No,,1.137931,0.871189,2.974528
7,number_inpatient,3.626402,20.833542,1.261825,Yes,log1p,1.450492,1.405364,0.510433
8,number_emergency,22.842251,1185.246374,0.935517,Yes,log1p,3.661064,16.27661,0.315405
9,num_medications,1.333039,3.523472,8.092612,No,,1.333039,3.523472,8.092612


In [37]:
# performing the log transformation for the columns determined to be needing it above.

for i in range(len(statdataframe)):
    if statdataframe['log_transform_needed'][i] == 'Yes':
        colname = str(statdataframe['numeric_column'][i])
        
        if statdataframe['log_type'][i] == 'log':
            df = df[df[colname] > 0]
            df[colname + "_log"] = np.log(df[colname])
            
        elif statdataframe['log_type'][i] == 'log1p':
            df = df[df[colname] >= 0]
            df[colname + "_log1p"] = np.log1p(df[colname])

In [38]:
df = df.drop(['number_outpatient', 'number_inpatient', 'number_emergency','service_utilization'], axis = 1)

In [39]:
df.shape

(100120, 54)

In [40]:
# get list of only numeric features
numerics = list(set(list(df._get_numeric_data().columns))- {'readmitted'})

In [41]:
numerics

['number_emergency_log1p',
 'number_diagnoses',
 'nummed',
 'num_procedures',
 'number_inpatient_log1p',
 'age',
 'numchange',
 'time_in_hospital',
 'num_medications',
 'num_lab_procedures',
 'number_outpatient_log1p',
 'service_utilization_log1p']

In [42]:
# show list of features that are categorical
df.encounter_id = df.encounter_id.astype('int64')
df.patient_nbr = df.patient_nbr.astype('int64')
df.diabetesMed = df.diabetesMed.astype('int64')
df.change = df.change.astype('int64')

# convert data type of nominal features in dataframe to 'object' type for aggregating
i = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', \
          'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose','miglitol', \
          'troglitazone', 'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin', \
          'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone','A1Cresult']
df[i] = df[i].astype('int64')

df.dtypes

encounter_id                   int64
patient_nbr                    int64
race                          object
gender                        object
age                            int64
admission_type_id             object
discharge_disposition_id      object
admission_source_id           object
time_in_hospital               int64
num_lab_procedures             int64
num_procedures                 int64
num_medications                int64
diag_1                        object
diag_2                        object
diag_3                        object
number_diagnoses               int64
max_glu_serum                 object
A1Cresult                      int64
metformin                      int64
repaglinide                    int64
nateglinide                    int64
chlorpropamide                 int64
glimepiride                    int64
acetohexamide                  int64
glipizide                      int64
glyburide                      int64
tolbutamide                    int64
p

In [43]:
df.A1Cresult.value_counts()

-99    83243
 1     11935
 0      4942
Name: A1Cresult, dtype: int64

In [44]:
dfcopy = df.copy(deep = True)

In [45]:
df = dfcopy.copy(deep = True)

In [46]:
df['readmitted'] = df['readmitted'].apply(lambda x: 0 if x == 2 else x)

In [47]:
# drop individual diagnosis columns that have too granular disease information
# also drop level 2 categorization (which was not comparable with any reference)
# also drop level 1 secondary and tertiary diagnoses
df.drop(['diag_1', 'diag_2', 'diag_3', 'level2_diag1', 'level1_diag2', 'level2_diag2', 'level1_diag3',
         'level2_diag3'], axis=1, inplace=True)

In [52]:
df.head(2)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,change,diabetesMed,readmitted,numchange,nummed,level1_diag1,number_outpatient_log1p,number_inpatient_log1p,number_emergency_log1p,service_utilization_log1p
0,2278392,8222157,Caucasian,0,5,6,25,1,1,41,...,0,0,0,0,0,4,0.0,0.0,0.0,0.0
1,149190,55629189,Caucasian,0,15,1,1,7,3,59,...,1,1,0,1,1,0,0.0,0.0,0.0,0.0


In [56]:
# apply minimax scaling to all numeric features
for feature in numerics:
    scaler = MinMaxScaler()
    df[numerics] = scaler.fit_transform(df[numerics])

### Scaling applied to df; using df2 from here

In [59]:
# dropping multiple encounters while keeping either first or last encounter of these patients
df2 = df.drop_duplicates(subset= ['patient_nbr'], keep = 'first')
df2.shape

(70442, 46)

In [66]:
import seaborn as sns
from matplotlib.colors import ListedColormap
# my_cmap = ListedColormap(sns.color_palette("RdYlGn", n_colors=15).as_hex())
# my_cmap = ListedColormap(sns.diverging_palette(150, 250, sep=120, n=28, center="light").as_hex())

my_cmap = ListedColormap(sns.light_palette((250, 100, 50), input="husl", n_colors=50).as_hex())
# drop some columns due to their means is round to 0
# table = df.drop(['acetohexamide','tolbutamide', 'troglitazone', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone','patient_nbr', 'encounter_id', 'service_utilization_log1p'], axis = 1).corr(method='pearson')
table = df2.drop(['patient_nbr', 'encounter_id'], axis=1).corr(method='pearson')
table.style.background_gradient(cmap=my_cmap, axis = 0)

  cbook._putmask(xa, xa < 0.0, -1)


Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_diagnoses,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,numchange,nummed,number_outpatient_log1p,number_inpatient_log1p,number_emergency_log1p,service_utilization_log1p
age,1.0,0.128561,0.0266174,-0.02434,0.0544389,0.254252,-0.119782,-0.0579647,0.0414484,0.0161402,0.0152096,0.0377133,0.00223594,0.0545017,0.0815643,0.00864805,0.0164393,0.00433687,0.010257,0.00841861,-0.00157337,0.00535901,-0.0897403,-0.000501737,0.00235053,,0.00316212,-0.000122294,-0.0352598,-0.0225514,0.0457358,-0.067106,-0.00361263,0.032686,0.0262302,-0.0443524,0.0237037
time_in_hospital,0.128561,1.0,0.332129,0.187856,0.467931,0.23437,0.0730612,-0.00341142,0.0316879,0.00702791,0.00150421,0.017376,0.0124541,0.0223775,0.0316636,0.00285512,0.00630963,0.00580346,0.00661367,-0.00248417,0.00455109,-0.00550029,0.106829,-0.00257176,0.0014615,,-0.000513566,0.0022003,0.111446,0.068631,0.0546615,0.16797,0.085771,-0.021992,0.0704713,-0.00759536,0.0169746
num_lab_procedures,0.0266174,0.332129,1.0,0.0437017,0.259065,0.154574,0.270411,-0.043375,0.00680715,-0.00734694,-0.00321878,-0.00163993,0.00474876,0.0255972,0.00638916,0.000523227,-0.0111864,-0.00693091,0.000153428,-0.00408003,0.00450863,-0.00174741,0.107558,-0.0141136,-0.00382334,,0.00149459,-0.00377108,0.0766929,0.0452145,0.0317229,0.12901,0.0428562,-0.0196605,0.0849385,0.017373,0.0303425
num_procedures,-0.02434,0.187856,0.0437017,1.0,0.400298,0.0855402,-0.0273474,-0.044993,0.00424909,-0.00321778,0.00612367,0.00797309,0.0055344,0.00907378,0.0104035,-0.000613463,0.0150408,0.0126282,0.00168803,0.000266634,-0.00528889,0.000914676,0.0156393,-0.00400785,-0.00321006,,0.00479057,-0.000906525,0.00623448,-0.00812359,7.41584e-05,0.0187962,0.00506419,-0.0228547,-0.023825,-0.0404431,-0.0442299
num_medications,0.0544389,0.467931,0.259065,0.400298,1.0,0.2586,0.0190428,0.0821979,0.0251063,0.0254037,0.00187453,0.0459042,0.00834018,0.0646736,0.0497511,0.00195039,0.0723863,0.0582677,0.0177404,0.00393478,0.0028895,-8.17504e-05,0.203417,0.0111631,0.00263714,,0.0095432,0.0019715,0.246424,0.187527,0.0360049,0.2273,0.237304,0.0328306,0.0429493,0.00835637,0.045897
number_diagnoses,0.254252,0.23437,0.154574,0.0855402,0.2586,1.0,-0.00953908,-0.0614565,0.0313363,0.0183025,-0.0135243,0.0152679,0.00333302,-0.0014698,-0.01698,-0.000427626,0.0117765,-0.00423499,0.00631924,-0.00237288,0.00468492,-0.0123708,0.0757276,-0.00338175,0.00169509,,0.00471363,-0.00609025,0.0480764,0.0200636,0.0443419,0.0654594,0.0178828,0.0903418,0.0793302,0.0648761,0.123683
A1Cresult,-0.119782,0.0730612,0.270411,-0.0273474,0.0190428,-0.00953908,1.0,0.0434001,0.0182952,0.000374328,-0.00346282,0.017214,-0.00178218,0.0115492,0.00375512,-0.00260625,0.000247278,0.00729526,0.00501895,0.000774054,-0.00308687,-0.000855015,0.104236,-0.00423321,0.00267502,,0.00439246,-0.00178218,0.0970424,0.0759254,-0.00916522,0.124851,0.0875953,-0.0398983,-0.0471473,-0.00780217,-0.0558532
metformin,-0.0579647,-0.00341142,-0.043375,-0.044993,0.0821979,-0.0614565,0.0434001,1.0,0.00596062,0.0151929,-0.0110045,0.0403678,-0.00195449,0.0764801,0.145164,-0.0080595,0.0574363,0.0943838,0.0126194,0.0139341,-0.00338533,-0.000607633,-0.0216575,-0.0254067,-0.00168714,,0.00375399,0.0072634,0.351968,0.290631,-0.0118034,0.0690069,0.548881,0.0130275,-0.0407038,-0.00515867,-0.012172
repaglinide,0.0414484,0.0316879,0.00680715,0.00424909,0.0251063,0.0313363,0.0182952,0.00596062,1.0,-0.000678382,-0.00365799,-0.00704232,-0.000433908,-0.0195836,-0.0215672,-0.00178925,0.019871,0.0174588,0.0195974,0.0203007,-0.000751562,-0.0023771,0.00801543,-0.00521636,-0.00114806,,-0.000613643,-0.000433908,0.075719,0.0645215,0.013832,0.0460792,0.122584,-0.00218385,0.00936368,0.00944679,0.00754135
nateglinide,0.0161402,0.00702791,-0.00734694,-0.00321778,0.0254037,0.0183025,0.000374328,0.0151929,-0.000678382,1.0,-0.00266664,0.00763138,-0.000316315,-0.015442,-0.0209651,-0.00130435,0.0297787,0.0145524,0.00189398,0.0188004,-0.000547882,-0.00173289,0.000407088,-0.00299645,-0.000836927,,-0.00044734,-0.000316315,0.0562767,0.0470356,-0.00361935,0.0046126,0.0967559,0.00740182,-0.00524577,0.00209036,0.00419324


In [115]:
pd.options.display.max_rows = 400

c = df2.corr().abs()
s = c.unstack()
print(s.shape)
so = s.sort_values(ascending=False)

so[38:120]

(1521,)


number_outpatient_log1p    service_utilization_log1p    0.753627
service_utilization_log1p  number_outpatient_log1p      0.753627
nummed                     change                       0.731185
change                     nummed                       0.731185
diabetesMed                nummed                       0.706872
nummed                     diabetesMed                  0.706872
numchange                  change                       0.609089
change                     numchange                    0.609089
number_inpatient_log1p     service_utilization_log1p    0.605494
service_utilization_log1p  number_inpatient_log1p       0.605494
insulin                    diabetesMed                  0.572456
diabetesMed                insulin                      0.572456
nummed                     metformin                    0.548881
metformin                  nummed                       0.548881
diabetesMed                change                       0.506272
change                   

In [369]:
df_pd = pd.get_dummies(df_p, columns=['race', 'gender', 'admission_type_id', 'discharge_disposition_id',
                                      'admission_source_id', 'max_glu_serum', 'A1Cresult', ])

In [370]:
non_num_cols = ['race', 'gender', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'max_glu_serum', 'A1Cresult', ]

In [371]:
num_cols = list(set(list(df._get_numeric_data().columns))- {'readmitted', 'change'})
num_cols

['age',
 'number_diagnoses',
 'glyburide',
 'metformin-pioglitazone',
 'rosiglitazone',
 'glimepiride-pioglitazone',
 'glipizide',
 'num_lab_procedures',
 'tolazamide',
 'glipizide-metformin',
 'pioglitazone',
 'service_utilization_log1p',
 'num_procedures',
 'number_inpatient_log1p',
 'insulin',
 'chlorpropamide',
 'number_emergency_log1p',
 'num_medications',
 'glimepiride',
 'metformin',
 'encounter_id',
 'number_outpatient_log1p',
 'acarbose',
 'metformin-rosiglitazone',
 'numchange',
 'miglitol',
 'A1Cresult',
 'repaglinide',
 'troglitazone',
 'glyburide-metformin',
 'diabetesMed',
 'patient_nbr',
 'acetohexamide',
 'nateglinide',
 'time_in_hospital',
 'nummed',
 'tolbutamide']

In [372]:
new_non_num_cols = []
for i in non_num_cols:
    for j in df_pd.columns:
        if i in j:
            new_non_num_cols.append(j)

# new_non_num_cols

### Modeling

In [374]:
full_feature_list = ['age', 'time_in_hospital', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient_log1p',
       'number_emergency_log1p', 'number_inpatient_log1p', 'number_diagnoses',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'diabetesMed', 'service_utilization_log1p',
       'level1_diag1', 'race_?', 'race_AfricanAmerican', 'race_Asian',
       'race_Caucasian', 'race_Hispanic', 'race_Other', 'gender_0', 'gender_1',
       'admission_type_id_1', 'admission_type_id_2', 'admission_type_id_3',
       'admission_type_id_4', 'admission_type_id_5', 'admission_type_id_6',
       'admission_type_id_7', 'admission_type_id_8',
       'discharge_disposition_id_1', 'discharge_disposition_id_2',
       'discharge_disposition_id_3', 'discharge_disposition_id_4',
       'discharge_disposition_id_5', 'discharge_disposition_id_6',
       'discharge_disposition_id_7', 'discharge_disposition_id_8',
       'discharge_disposition_id_9', 'discharge_disposition_id_10',
       'discharge_disposition_id_12', 'discharge_disposition_id_13',
       'discharge_disposition_id_14', 'discharge_disposition_id_15',
       'discharge_disposition_id_16', 'discharge_disposition_id_17',
       'discharge_disposition_id_18', 'discharge_disposition_id_19',
       'discharge_disposition_id_20', 'discharge_disposition_id_22',
       'discharge_disposition_id_23', 'discharge_disposition_id_24',
       'discharge_disposition_id_25', 'discharge_disposition_id_27',
       'discharge_disposition_id_28', 'admission_source_id_1',
       'admission_source_id_2', 'admission_source_id_3',
       'admission_source_id_4', 'admission_source_id_5',
       'admission_source_id_6', 'admission_source_id_7',
       'admission_source_id_8', 'admission_source_id_9',
       'admission_source_id_10', 'admission_source_id_11',
       'admission_source_id_13', 'admission_source_id_14',
       'admission_source_id_17', 'admission_source_id_20',
       'admission_source_id_22', 'admission_source_id_25', 'max_glu_serum_-99',
       'max_glu_serum_0', 'max_glu_serum_1', 'A1Cresult_-99', 'A1Cresult_0',
       'A1Cresult_1', 'nummed', 'numchange']

In [375]:
train_input = df_pd[full_feature_list]
train_output = df_pd['readmitted']

In [376]:
df_pd['readmitted'].value_counts()

0    56476
1     5199
Name: readmitted, dtype: int64

In [377]:
train_input.columns

Index(['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient_log1p', 'number_emergency_log1p',
       'number_inpatient_log1p', 'number_diagnoses', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'diabetesMed', 'service_utilization_log1p',
       'level1_diag1', 'race_?', 'race_AfricanAmerican', 'race_Asian',
       'race_Caucasian', 'race_Hispanic', 'race_Other', 'gender_0', 'gender_1',
       'admission_type_id_1', 'admission_type_id_2', 'admission_type_id_3',
       'admission_type_id_4', 'admission_type_id_5', 'admission_type_id_6',
       'admission_type_id_7', 'admission_type_

In [378]:
from sklearn.linear_model import RandomizedLogisticRegression
clffeat = RandomizedLogisticRegression()
X_train, X_dev, Y_train, Y_dev = train_test_split(train_input, train_output, test_size=0.20, random_state=0)
clffeat.fit(train_input,train_output)
lstfeat = list(clffeat.get_support(indices=True))
score = list(clffeat.all_scores_)



In [379]:
favorfeatures = [list(X_train.columns)[i] for i in lstfeat]

In [380]:
score  = [float(x) for x in score]
Z = [(x[1],x[0]) for x in sorted(zip(score,X_train.columns), reverse = True)]
Znew = []
for x in Z:
    if x[0] in favorfeatures:
        Znew.append(x)

In [381]:
Znew

[('discharge_disposition_id_22', 1.0),
 ('discharge_disposition_id_1', 1.0),
 ('discharge_disposition_id_5', 0.92),
 ('number_inpatient_log1p', 0.835),
 ('discharge_disposition_id_3', 0.635),
 ('age', 0.545),
 ('discharge_disposition_id_28', 0.53),
 ('time_in_hospital', 0.52),
 ('discharge_disposition_id_15', 0.52),
 ('number_diagnoses', 0.495),
 ('diabetesMed', 0.425),
 ('service_utilization_log1p', 0.405)]

In [382]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
X_train, X_dev, Y_train, Y_dev = train_test_split(train_input, train_output, test_size=0.20, random_state=0)
logreg = LogisticRegression(fit_intercept=True, multi_class = 'ovr')
print("Cross Validation Score: {:.2%}".format(np.mean(cross_val_score(logreg, X_train[favorfeatures], Y_train, cv=25))))
logreg.fit(X_train[favorfeatures], Y_train)
print("Dev Set score: {:.2%}".format(logreg.score(X_dev[favorfeatures], Y_dev)))

Cross Validation Score: 91.67%
Dev Set score: 91.16%


In [383]:
Y_dev_predict = logreg.predict(X_dev[favorfeatures])

In [384]:
pd.crosstab(pd.Series(Y_dev, name = 'Actual'), pd.Series(Y_dev_predict, name = 'Predict'), margins = True)

Predict,0,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1580,1580
1,157,157
All,1737,1737


In [385]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
print("Accuracy is {0:.2f}".format(accuracy_score(Y_dev, Y_dev_predict)))
print("Precision is {0:.2f}".format(precision_score(Y_dev, Y_dev_predict)))
print("Recall is {0:.2f}".format(recall_score(Y_dev, Y_dev_predict)))

Accuracy is 0.91
Precision is 0.00
Recall is 0.00


In [386]:
from imblearn.over_sampling import SMOTE

from collections import Counter
print('Original dataset shape {}'.format(Counter(train_output)))
sm = SMOTE(random_state=20)
train_input_new, train_output_new = sm.fit_sample(train_input, train_output)
print('New dataset shape {}'.format(Counter(train_output_new)))

Original dataset shape Counter({0: 56476, 1: 5199})
New dataset shape Counter({0: 56476, 1: 56476})


In [387]:
train_input_new = pd.DataFrame(train_input_new, columns = list(train_input.columns))

In [388]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
X_train, X_dev, Y_train, Y_dev = train_test_split(train_input_new, train_output_new, test_size=0.20, random_state=0)
logreg = LogisticRegression(fit_intercept=True)
print("Cross Validation Score: {:.2%}".format(np.mean(cross_val_score(logreg, X_train, Y_train, cv=25))))
logreg.fit(X_train, Y_train)
print("Dev Set score: {:.2%}".format(logreg.score(X_dev, Y_dev)))

Cross Validation Score: 59.88%
Dev Set score: 59.71%


In [389]:
Y_dev_predict = logreg.predict(X_dev)

In [390]:
pd.crosstab(pd.Series(Y_dev, name = 'Actual'), pd.Series(Y_dev_predict, name = 'Predict'), margins = True)

Predict,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,7147,4121,11268
1,4981,6342,11323
All,12128,10463,22591


In [391]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
print("Accuracy is {0:.2f}".format(accuracy_score(Y_dev, Y_dev_predict)))
print("Precision is {0:.2f}".format(precision_score(Y_dev, Y_dev_predict)))
print("Recall is {0:.2f}".format(recall_score(Y_dev, Y_dev_predict)))

Accuracy is 0.60
Precision is 0.61
Recall is 0.56


In [392]:
[print(fname, coeffs) for fname, coeffs in zip(X_train.columns, np.round(logreg.coef_[0],3))];

age 0.422
time_in_hospital 0.215
num_lab_procedures 0.312
num_procedures -0.09
num_medications 0.092
number_outpatient_log1p -0.13
number_emergency_log1p -0.114
number_inpatient_log1p 0.357
number_diagnoses 0.568
metformin 0.185
repaglinide 0.22
nateglinide -0.395
chlorpropamide -0.834
glimepiride -0.004
acetohexamide 0.0
glipizide 0.262
glyburide 0.188
tolbutamide -1.073
pioglitazone -0.055
rosiglitazone 0.041
acarbose -0.357
miglitol -1.494
troglitazone -0.722
tolazamide -1.251
insulin 0.424
glyburide-metformin -0.161
glipizide-metformin -1.141
glimepiride-pioglitazone 0.0
metformin-rosiglitazone -0.319
metformin-pioglitazone -0.278
diabetesMed 0.423
service_utilization_log1p -0.133
level1_diag1 -0.017
race_? -0.18
race_AfricanAmerican 0.192
race_Asian -0.376
race_Caucasian 0.221
race_Hispanic -0.316
race_Other -0.443
gender_0 -0.465
gender_1 -0.437
admission_type_id_1 0.25
admission_type_id_2 0.25
admission_type_id_3 0.159
admission_type_id_4 0.439
admission_type_id_5 0.052
admissio

In [393]:
logreg.intercept_

array([-0.90196507])

In [399]:
yhat = logreg.predict(train_input)
y = train_output
SS_Residual = sum((y-yhat)**2)
SS_Total = sum((y-np.mean(y))**2)
r_squared = 1 - (float(SS_Residual))/SS_Total
adjusted_r_squared = 1 - (1-r_squared)*(len(y)-1)/(len(y)-train_input.shape[1]-1)
print(r_squared, adjusted_r_squared)
# 0.877643371323 0.863248473832

-3.832650675915084 -3.8404205893038874
