In [6]:
import IPython
import pandas as pd
import numpy as np
from statistics import mode
import matplotlib.pyplot as plt  
from sklearn.preprocessing import MinMaxScaler

In [7]:
# load modified file with some pre-processing completed
df = pd.read_csv("modify.csv", index_col=0)
df.shape

(100120, 52)

In [8]:
df.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
encounter_id,2278392,149190,64410,500364,16680,35754,55842,63768,12522,15738
patient_nbr,8222157,55629189,86047875,82442376,42519267,82637451,84259809,114882984,48330783,63555939
race,Caucasian,Caucasian,AfricanAmerican,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian
gender,0,0,0,1,1,1,1,1,0,0
age,1,2,3,4,5,6,7,8,9,10
admission_type_id,6,1,1,1,1,2,3,1,2,3
discharge_disposition_id,25,1,1,1,1,1,1,1,1,3
admission_source_id,1,7,7,7,7,2,2,7,4,4
time_in_hospital,1,3,2,2,1,3,4,5,13,12
num_lab_procedures,41,59,11,44,51,31,70,73,68,33


In [9]:
# convert data type of nominal features in dataframe to 'object' type
for i in ('encounter_id', 'patient_nbr', 'gender', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id',\
          'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', \
          'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose','miglitol', \
          'troglitazone', 'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin', \
          'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', \
          'age', 'A1Cresult', 'max_glu_serum', 'level1_diag1', 'level1_diag2', 'level1_diag3', 'level2_diag1', 'level2_diag2', 'level2_diag3' ):
    df[i] = df[i].astype('object')

In [10]:
df.dtypes

encounter_id                object
patient_nbr                 object
race                        object
gender                      object
age                         object
admission_type_id           object
discharge_disposition_id    object
admission_source_id         object
time_in_hospital             int64
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride                 object
acetohexamide               object
glipizide                   object
glyburide           

In [11]:
"""
This code converts age as categorical variable to a continuous approximation by assuming mid-point of each age-category as
the actual age value. This is done to avoid having to deal with age as a dummy variable in the models which makes
interpretation very cumbersome. Also, since age category is not purely nominal but ordinal, we do not want to lose that
information by treating it as a simple categorical variable.
"""

# convert age back to integer type
df['age'] = df['age'].astype('int64')
print(df.age.value_counts())
# convert age categories to mid-point values
age_dict = {1:5, 2:15, 3:25, 4:35, 5:45, 6:55, 7:65, 8:75, 9:85, 10:95}
df['age'] = df.age.map(age_dict)
print(df.age.value_counts())

8     25564
7     22186
6     17102
9     16708
5      9626
4      3765
10     2669
3      1650
2       690
1       160
Name: age, dtype: int64
75    25564
65    22186
55    17102
85    16708
45     9626
35     3765
95     2669
25     1650
15      690
5       160
Name: age, dtype: int64


In [12]:
# get list of only numeric features
num_col = list(set(list(df._get_numeric_data().columns))- {'readmitted'})

In [13]:
num_col

['number_inpatient',
 'service_utilization',
 'age',
 'number_outpatient',
 'number_emergency',
 'number_diagnoses',
 'time_in_hospital',
 'num_medications',
 'num_procedures',
 'num_lab_procedures']

In [14]:
# Removing skewnewss and kurtosis using log transformation if it is above a threshold value (2)

statdataframe = pd.DataFrame()
statdataframe['numeric_column'] = num_col
skew_before = []
skew_after = []

kurt_before = []
kurt_after = []

standard_deviation_before = []
standard_deviation_after = []

log_transform_needed = []

log_type = []

for i in num_col:
    skewval = df[i].skew()
    skew_before.append(skewval)
    
    kurtval = df[i].kurtosis()
    kurt_before.append(kurtval)
    
    sdval = df[i].std()
    standard_deviation_before.append(sdval)
    
    if (abs(skewval) >2) & (abs(kurtval) >2):
        log_transform_needed.append('Yes')
        
        if len(df[df[i] == 0])/len(df) <=0.02:
            log_type.append('log')
            skewvalnew = np.log(pd.DataFrame(df[train_data[i] > 0])[i]).skew()
            skew_after.append(skewvalnew)
            
            kurtvalnew = np.log(pd.DataFrame(df[train_data[i] > 0])[i]).kurtosis()
            kurt_after.append(kurtvalnew)
            
            sdvalnew = np.log(pd.DataFrame(df[train_data[i] > 0])[i]).std()
            standard_deviation_after.append(sdvalnew)
            
        else:
            log_type.append('log1p')
            skewvalnew = np.log1p(pd.DataFrame(df[df[i] >= 0])[i]).skew()
            skew_after.append(skewvalnew)
        
            kurtvalnew = np.log1p(pd.DataFrame(df[df[i] >= 0])[i]).kurtosis()
            kurt_after.append(kurtvalnew)
            
            sdvalnew = np.log1p(pd.DataFrame(df[df[i] >= 0])[i]).std()
            standard_deviation_after.append(sdvalnew)
            
    else:
        log_type.append('NA')
        log_transform_needed.append('No')
        
        skew_after.append(skewval)
        kurt_after.append(kurtval)
        standard_deviation_after.append(sdval)

statdataframe['skew_before'] = skew_before
statdataframe['kurtosis_before'] = kurt_before
statdataframe['standard_deviation_before'] = standard_deviation_before
statdataframe['log_transform_needed'] = log_transform_needed
statdataframe['log_type'] = log_type
statdataframe['skew_after'] = skew_after
statdataframe['kurtosis_after'] = kurt_after
statdataframe['standard_deviation_after'] = standard_deviation_after

In [15]:
statdataframe

Unnamed: 0,numeric_column,skew_before,kurtosis_before,standard_deviation_before,log_transform_needed,log_type,skew_after,kurtosis_after,standard_deviation_after
0,number_inpatient,3.626402,20.833542,1.261825,Yes,log1p,1.450492,1.405364,0.510433
1,service_utilization,5.334724,67.77977,2.29273,Yes,log1p,1.119136,0.547353,0.662373
2,age,-0.626715,0.274302,15.947496,No,,-0.626715,0.274302,15.947496
3,number_outpatient,8.818291,148.558544,1.263973,Yes,log1p,2.733914,7.804218,0.429394
4,number_emergency,22.842251,1185.246374,0.935517,Yes,log1p,3.661064,16.27661,0.315405
5,number_diagnoses,-0.867785,-0.109005,1.938211,No,,-0.867785,-0.109005,1.938211
6,time_in_hospital,1.137931,0.871189,2.974528,No,,1.137931,0.871189,2.974528
7,num_medications,1.333039,3.523472,8.092612,No,,1.333039,3.523472,8.092612
8,num_procedures,1.32602,0.890773,1.700335,No,,1.32602,0.890773,1.700335
9,num_lab_procedures,-0.241491,-0.253099,19.620228,No,,-0.241491,-0.253099,19.620228


In [16]:
# performing the log transformation for the columns determined to be needing it above.

for i in range(len(statdataframe)):
    if statdataframe['log_transform_needed'][i] == 'Yes':
        colname = str(statdataframe['numeric_column'][i])
        
        if statdataframe['log_type'][i] == 'log':
            df = df[df[colname] > 0]
            df[colname + "_log"] = np.log(df[colname])
            
        elif statdataframe['log_type'][i] == 'log1p':
            df = df[df[colname] >= 0]
            df[colname + "_log1p"] = np.log1p(df[colname])

In [17]:
df = df.drop(['number_outpatient', 'number_inpatient', 'number_emergency','service_utilization'], axis = 1)

In [18]:
df.shape

(100120, 52)

In [19]:
# get list of only numeric features
numerics = list(set(list(df._get_numeric_data().columns))- {'readmitted'})

In [20]:
numerics

['age',
 'number_diagnoses',
 'time_in_hospital',
 'num_medications',
 'num_procedures',
 'service_utilization_log1p',
 'number_inpatient_log1p',
 'number_outpatient_log1p',
 'number_emergency_log1p',
 'num_lab_procedures']

In [23]:
# show list of features that are categorical
df.encounter_id = df.encounter_id.astype('int64')
df.patient_nbr = df.patient_nbr.astype('int64')
df.diabetesMed = df.diabetesMed.astype('int64')
df.change = df.change.astype('int64')

# convert data type of nominal features in dataframe to 'object' type for aggregating
for i in ('metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', \
          'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose','miglitol', \
          'troglitazone', 'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin', \
          'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone','A1Cresult'):
    df[i] = df[i].astype('int64')

df.dtypes

encounter_id                   int64
patient_nbr                    int64
race                          object
gender                        object
age                            int64
admission_type_id             object
discharge_disposition_id      object
admission_source_id           object
time_in_hospital               int64
num_lab_procedures             int64
num_procedures                 int64
num_medications                int64
diag_1                        object
diag_2                        object
diag_3                        object
number_diagnoses               int64
max_glu_serum                 object
A1Cresult                      int64
metformin                      int64
repaglinide                    int64
nateglinide                    int64
chlorpropamide                 int64
glimepiride                    int64
acetohexamide                  int64
glipizide                      int64
glyburide                      int64
tolbutamide                    int64
p

In [22]:
df.A1Cresult.value_counts()

-99    83243
 1     11935
 0      4942
Name: A1Cresult, dtype: int64

In [24]:
"""
For dealing with multiple encounters of same patient, we collapse these multiple encounters into a single one, such that
the resultant encounter retains the key features of its constituents. For situations where this could not be achieved
effectively, we have dropped those patients altogether. An example is where the category of primary diagnosis
(not exact disease) is different across multiple encounters of the same patient.
"""
# create a rule dictionary for combining patient records with multiple encounters

ruledict = {
    "patient_nbr":"first",
    "race": "first",
    "gender": "first",
    "age": "mean",
    "admission_type_id" : "first",
    "discharge_disposition_id" : "first",
    "admission_source_id" : "first",
    "time_in_hospital": "mean",
    "num_lab_procedures" : "mean",
    "num_procedures" : "mean",
    "num_medications" : "mean",
    "number_outpatient_log1p" : "mean",
    "number_emergency_log1p" : "mean", 
    "number_inpatient_log1p" : "mean",
    "diag_1" : ','.join,
    "diag_2" : ','.join,
    "diag_3" : ','.join,
    "number_diagnoses" : "mean", 
    "max_glu_serum" : "mean",
    "A1Cresult" : "mean",
    "metformin" : "mean",
    "repaglinide" : "mean",
    "nateglinide" : "mean",
    "chlorpropamide" : "mean",
    "glimepiride" : "mean",
    "acetohexamide" : "mean",
    "glipizide" : "mean",
    "glyburide" : "mean",
    "tolbutamide" : "mean",
    "pioglitazone" : "mean",
    "rosiglitazone" : "mean",
    "acarbose" : "mean",
    "miglitol" : "mean",
    "troglitazone" : "mean",
    "tolazamide" : "mean",
    "insulin" : "mean",
    "glyburide-metformin" : "mean",
    "glipizide-metformin" : "mean",
    "glimepiride-pioglitazone" : "mean",
    "metformin-rosiglitazone" : "mean",
    "metformin-pioglitazone" : "mean",
    "change" : lambda x: 1 if np.sum(x) >= 1 else 0, # if any encounter has a change, then composite encounter has change
    "diabetesMed": "mean", # gives proportion of change as this is a binary variable
    "readmitted" : "mean", # simialr logic as above but here values range = 0-2
    "service_utilization_log1p" : "mean",
    "level1_diag1": lambda x: x.iloc[0] if x.nunique()==1 else np.NaN,
    "level1_diag2": lambda x: x.iloc[0] if x.nunique()==1 else np.NaN,
    "level1_diag3": lambda x: x.iloc[0] if x.nunique()==1 else np.NaN,
    "level2_diag1": lambda x: x.iloc[0] if x.nunique()==1 else np.NaN,
    "level2_diag2": lambda x: x.iloc[0] if x.nunique()==1 else np.NaN,
    "level2_diag3": lambda x: x.iloc[0] if x.nunique()==1 else np.NaN,
    "max_glu_serum": lambda x: x.max(),
    "A1Cresult": lambda x: x.max(),
}

In [25]:
df_p = df.groupby('patient_nbr').agg(ruledict).reset_index(drop = True)

In [26]:
df_p[df_p.patient_nbr==5220].T

Unnamed: 0,12
patient_nbr,5220
race,Caucasian
gender,1
age,72.5
admission_type_id,1
discharge_disposition_id,1
admission_source_id,7
time_in_hospital,5.5
num_lab_procedures,41.25
num_procedures,1.75


In [27]:
import scipy as sp
df = df[(np.abs(sp.stats.zscore(df[numerics])) < 3).all(axis=1)]

In [28]:
# apply minimax scaling to all numeric features
for feature in numerics:
    scaler = MinMaxScaler()
    df[numerics] = scaler.fit_transform(df[numerics])

In [29]:
df_p.isnull().sum()

patient_nbr                      0
race                             0
gender                           0
age                              0
admission_type_id                0
discharge_disposition_id         0
admission_source_id              0
time_in_hospital                 0
num_lab_procedures               0
num_procedures                   0
num_medications                  0
number_outpatient_log1p          0
number_emergency_log1p           0
number_inpatient_log1p           0
diag_1                           0
diag_2                           0
diag_3                           0
number_diagnoses                 0
max_glu_serum                    0
A1Cresult                        0
metformin                        0
repaglinide                      0
nateglinide                      0
chlorpropamide                   0
glimepiride                      0
acetohexamide                    0
glipizide                        0
glyburide                        0
tolbutamide         

In [30]:
# drop the records where multiple encounters had dis-similar diagnoses
df_p = df_p.drop(df_p[(df_p.level1_diag1.isnull())].index)
df_p.shape

(58830, 51)

In [31]:
df_p.isnull().sum()

patient_nbr                     0
race                            0
gender                          0
age                             0
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient_log1p         0
number_emergency_log1p          0
number_inpatient_log1p          0
diag_1                          0
diag_2                          0
diag_3                          0
number_diagnoses                0
max_glu_serum                   0
A1Cresult                       0
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide                   0
glipizide                       0
glyburide                       0
tolbutamide                     0
pioglitazone  

In [32]:
df_p.drop(['level1_diag2','level1_diag3','level2_diag1','level2_diag2','level2_diag3','diag_1','diag_2','diag_3'], axis=1, inplace=True)

In [33]:
df_p.isnull().sum()

patient_nbr                  0
race                         0
gender                       0
age                          0
admission_type_id            0
discharge_disposition_id     0
admission_source_id          0
time_in_hospital             0
num_lab_procedures           0
num_procedures               0
num_medications              0
number_outpatient_log1p      0
number_emergency_log1p       0
number_inpatient_log1p       0
number_diagnoses             0
max_glu_serum                0
A1Cresult                    0
metformin                    0
repaglinide                  0
nateglinide                  0
chlorpropamide               0
glimepiride                  0
acetohexamide                0
glipizide                    0
glyburide                    0
tolbutamide                  0
pioglitazone                 0
rosiglitazone                0
acarbose                     0
miglitol                     0
troglitazone                 0
tolazamide                   0
insulin 

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90363 entries, 2 to 101765
Data columns (total 52 columns):
encounter_id                 90363 non-null int64
patient_nbr                  90363 non-null int64
race                         90363 non-null object
gender                       90363 non-null object
age                          90363 non-null float64
admission_type_id            90363 non-null object
discharge_disposition_id     90363 non-null object
admission_source_id          90363 non-null object
time_in_hospital             90363 non-null float64
num_lab_procedures           90363 non-null float64
num_procedures               90363 non-null float64
num_medications              90363 non-null float64
diag_1                       90363 non-null object
diag_2                       90363 non-null object
diag_3                       90363 non-null object
number_diagnoses             90363 non-null float64
max_glu_serum                90363 non-null object
A1Cresult          

In [35]:
df_p.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58830 entries, 1 to 70441
Data columns (total 43 columns):
patient_nbr                  58830 non-null int64
race                         58830 non-null object
gender                       58830 non-null int64
age                          58830 non-null float64
admission_type_id            58830 non-null int64
discharge_disposition_id     58830 non-null int64
admission_source_id          58830 non-null int64
time_in_hospital             58830 non-null float64
num_lab_procedures           58830 non-null float64
num_procedures               58830 non-null float64
num_medications              58830 non-null float64
number_outpatient_log1p      58830 non-null float64
number_emergency_log1p       58830 non-null float64
number_inpatient_log1p       58830 non-null float64
number_diagnoses             58830 non-null float64
max_glu_serum                58830 non-null int64
A1Cresult                    58830 non-null int64
metformin             

In [30]:
import seaborn as sns
from matplotlib.colors import ListedColormap
# my_cmap = ListedColormap(sns.color_palette("RdYlGn", n_colors=15).as_hex())
# my_cmap = ListedColormap(sns.diverging_palette(150, 250, sep=120, n=28, center="light").as_hex())

my_cmap = ListedColormap(sns.light_palette((250, 100, 50), input="husl", n_colors=50).as_hex())
# drop some columns due to their means is round to 0
table = df.drop(['acetohexamide','tolbutamide', 'troglitazone', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone','patient_nbr', 'encounter_id', 'service_utilization_log1p'], axis = 1).corr(method='pearson')
table.style.background_gradient(cmap=my_cmap, axis = 0)

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_diagnoses,change,diabetesMed,readmitted,number_emergency_log1p,number_inpatient_log1p,number_outpatient_log1p
age,1.0,0.107041,0.020637,-0.0552466,0.0245776,0.205741,-0.0362124,-0.0120603,0.0464467,-0.0161375,0.0313813,0.0342327
time_in_hospital,0.107041,1.0,0.310612,0.145689,0.439077,0.213863,0.0990284,0.0569363,0.0459764,0.000919061,0.0926895,-0.0209491
num_lab_procedures,0.020637,0.310612,1.0,0.0183901,0.241781,0.146501,0.0610134,0.0324489,0.040902,0.00646775,0.0450542,-0.034323
num_procedures,-0.0552466,0.145689,0.0183901,1.0,0.316175,0.0502352,-0.0171375,-0.0189926,-0.0344399,-0.0317337,-0.0714389,-0.0254826
num_medications,0.0245776,0.439077,0.241781,0.316175,1.0,0.257815,0.242495,0.192044,0.0557302,0.0326071,0.0845975,0.0535743
number_diagnoses,0.205741,0.213863,0.146501,0.0502352,0.257815,1.0,0.0483603,0.0259287,0.108756,0.0689631,0.124007,0.0854886
change,-0.0362124,0.0990284,0.0610134,-0.0171375,0.242495,0.0483603,1.0,0.504617,0.0411701,0.0433837,0.0210895,0.0261643
diabetesMed,-0.0120603,0.0569363,0.0324489,-0.0189926,0.192044,0.0259287,0.504617,1.0,0.0578791,0.0328113,0.0357886,0.022041
readmitted,0.0464467,0.0459764,0.040902,-0.0344399,0.0557302,0.108756,0.0411701,0.0578791,1.0,0.0782555,0.189031,0.0860094
number_emergency_log1p,-0.0161375,0.000919061,0.00646775,-0.0317337,0.0326071,0.0689631,0.0433837,0.0328113,0.0782555,1.0,0.14003,0.118213


In [31]:
table2 = df_p.drop(['service_utilization_log1p', 'patient_nbr'], axis = 1).corr(method='pearson')
table2.style.background_gradient(cmap=my_cmap, axis = 0)

Unnamed: 0,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient_log1p,number_emergency_log1p,number_inpatient_log1p,number_diagnoses,change,diabetesMed,readmitted,level1_diag1,max_glu_serum,A1Cresult
gender,1.0,-0.0475507,0.0112774,-0.0175784,0.00737787,-0.0216815,0.00521249,0.0687973,-0.00859764,-0.0157099,-0.0272365,-0.0010573,-0.000673306,0.0222032,0.0209756,-0.00646061,-0.0251872,-0.000800684,0.0227826
age,-0.0475507,1.0,-0.00809739,0.111261,0.0446432,0.142707,0.0412997,-0.0180136,0.0687171,0.0307165,-0.0419177,0.0310096,0.269727,-0.0386379,-0.0275382,0.0523924,-0.0129582,0.038078,-0.125831
admission_type_id,0.0112774,-0.00809739,1.0,0.0854243,0.1327,-0.0136165,-0.111474,0.14981,0.107972,0.0485027,-0.0217585,0.0387094,-0.115569,-0.0154187,-0.0165212,0.0366481,0.0662726,0.375425,-0.0719678
discharge_disposition_id,-0.0175784,0.111261,0.0854243,1.0,0.00183148,0.169648,0.0159808,0.013169,0.112316,-0.0203238,-0.0271911,-0.0113982,0.042495,-0.003375,-0.0231163,-0.0461553,0.041125,0.0301281,-0.0175329
admission_source_id,0.00737787,0.0446432,0.1327,0.00183148,1.0,0.00990212,0.109957,-0.128406,-0.0602862,0.0279833,0.0716308,0.0304458,0.0742116,0.00198969,-0.000770859,0.0532555,-0.0876903,0.393777,0.0215226
time_in_hospital,-0.0216815,0.142707,-0.0136165,0.169648,0.00990212,1.0,0.338121,0.163589,0.458965,-0.0190566,-0.00298386,0.0857127,0.234198,0.108473,0.0657564,0.0456769,-0.00817301,0.0171585,0.070372
num_lab_procedures,0.00521249,0.0412997,-0.111474,0.0159808,0.109957,0.338121,1.0,0.0283448,0.250547,-0.0209156,0.0247187,0.0848345,0.157682,0.0753287,0.0481641,0.0852901,-0.0588639,-0.125444,0.262962
num_procedures,0.0687973,-0.0180136,0.14981,0.013169,-0.128406,0.163589,0.0283448,1.0,0.398602,-0.0209611,-0.0468236,-0.0289289,0.0857903,0.00323889,-0.0067289,-0.0117304,0.00735864,-0.0831813,-0.0371566
num_medications,-0.00859764,0.0687171,0.107972,0.112316,-0.0602862,0.458965,0.250547,0.398602,1.0,0.0319424,0.0081143,0.0446328,0.261004,0.24312,0.188856,0.0361618,0.0537183,-0.032638,0.0101522
number_outpatient_log1p,-0.0157099,0.0307165,0.0485027,-0.0203238,0.0279833,-0.0190566,-0.0209156,-0.0209611,0.0319424,1.0,0.142416,0.0892277,0.0941069,0.0286589,0.0321924,0.103922,0.00800444,0.059512,-0.0394156


In [43]:
df_pd = pd.get_dummies(df_p, columns=['race', 'gender', 'admission_type_id', 'discharge_disposition_id',
                                      'admission_source_id', 'max_glu_serum', 'A1Cresult', ])

In [44]:
non_cols = ['race', 'gender', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'max_glu_serum', 'A1Cresult', ]

In [48]:
numerics = list(set(list(df._get_numeric_data().columns))- {'readmitted', 'change'})
numerics

['insulin',
 'metformin',
 'num_medications',
 'repaglinide',
 'service_utilization_log1p',
 'number_outpatient_log1p',
 'num_lab_procedures',
 'nateglinide',
 'glyburide-metformin',
 'pioglitazone',
 'num_procedures',
 'miglitol',
 'glimepiride',
 'metformin-rosiglitazone',
 'glipizide',
 'glipizide-metformin',
 'number_emergency_log1p',
 'diabetesMed',
 'chlorpropamide',
 'rosiglitazone',
 'A1Cresult',
 'age',
 'acetohexamide',
 'number_diagnoses',
 'metformin-pioglitazone',
 'tolazamide',
 'glyburide',
 'encounter_id',
 'glimepiride-pioglitazone',
 'time_in_hospital',
 'patient_nbr',
 'number_inpatient_log1p',
 'tolbutamide',
 'acarbose',
 'troglitazone']

In [49]:
new_non_cols = []
for i in non_cols:
    for j in df_pd.columns:
        if i in j:
            new_non_cols.append(j)

In [50]:
new_non_cols

['race_?',
 'race_AfricanAmerican',
 'race_Asian',
 'race_Caucasian',
 'race_Hispanic',
 'race_Other',
 'gender_0',
 'gender_1',
 'admission_type_id_1',
 'admission_type_id_2',
 'admission_type_id_3',
 'admission_type_id_4',
 'admission_type_id_5',
 'admission_type_id_6',
 'admission_type_id_7',
 'admission_type_id_8',
 'discharge_disposition_id_1',
 'discharge_disposition_id_2',
 'discharge_disposition_id_3',
 'discharge_disposition_id_4',
 'discharge_disposition_id_5',
 'discharge_disposition_id_6',
 'discharge_disposition_id_7',
 'discharge_disposition_id_8',
 'discharge_disposition_id_9',
 'discharge_disposition_id_10',
 'discharge_disposition_id_12',
 'discharge_disposition_id_13',
 'discharge_disposition_id_14',
 'discharge_disposition_id_15',
 'discharge_disposition_id_16',
 'discharge_disposition_id_17',
 'discharge_disposition_id_18',
 'discharge_disposition_id_19',
 'discharge_disposition_id_20',
 'discharge_disposition_id_22',
 'discharge_disposition_id_23',
 'discharge_disp

In [58]:
train_input = df_pd.drop(['readmitted', 'change'], axis= 1)
train_output = df_pd['readmitted']

In [59]:
train_input.columns

Index(['patient_nbr', 'age', 'time_in_hospital', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient_log1p',
       'number_emergency_log1p', 'number_inpatient_log1p', 'number_diagnoses',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'diabetesMed', 'service_utilization_log1p',
       'level1_diag1', 'race_?', 'race_AfricanAmerican', 'race_Asian',
       'race_Caucasian', 'race_Hispanic', 'race_Other', 'gender_0', 'gender_1',
       'admission_type_id_1', 'admission_type_id_2', 'admission_type_id_3',
       'admission_type_id_4', 'admission_type_id_5', 'admission_type_id_6',
       'admission_type_id_7', '

In [60]:
full_feature_list = ['age', 'time_in_hospital', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient_log1p',
       'number_emergency_log1p', 'number_inpatient_log1p', 'number_diagnoses',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'diabetesMed', 'service_utilization_log1p',
       'level1_diag1', 'race_?', 'race_AfricanAmerican', 'race_Asian',
       'race_Caucasian', 'race_Hispanic', 'race_Other', 'gender_0', 'gender_1',
       'admission_type_id_1', 'admission_type_id_2', 'admission_type_id_3',
       'admission_type_id_4', 'admission_type_id_5', 'admission_type_id_6',
       'admission_type_id_7', 'admission_type_id_8',
       'discharge_disposition_id_1', 'discharge_disposition_id_2',
       'discharge_disposition_id_3', 'discharge_disposition_id_4',
       'discharge_disposition_id_5', 'discharge_disposition_id_6',
       'discharge_disposition_id_7', 'discharge_disposition_id_8',
       'discharge_disposition_id_9', 'discharge_disposition_id_10',
       'discharge_disposition_id_12', 'discharge_disposition_id_13',
       'discharge_disposition_id_14', 'discharge_disposition_id_15',
       'discharge_disposition_id_16', 'discharge_disposition_id_17',
       'discharge_disposition_id_18', 'discharge_disposition_id_19',
       'discharge_disposition_id_20', 'discharge_disposition_id_22',
       'discharge_disposition_id_23', 'discharge_disposition_id_24',
       'discharge_disposition_id_25', 'discharge_disposition_id_27',
       'discharge_disposition_id_28', 'admission_source_id_1',
       'admission_source_id_2', 'admission_source_id_3',
       'admission_source_id_4', 'admission_source_id_5',
       'admission_source_id_6', 'admission_source_id_7',
       'admission_source_id_8', 'admission_source_id_9',
       'admission_source_id_10', 'admission_source_id_11',
       'admission_source_id_13', 'admission_source_id_14',
       'admission_source_id_17', 'admission_source_id_20',
       'admission_source_id_22', 'admission_source_id_25', 'max_glu_serum_-99',
       'max_glu_serum_0', 'max_glu_serum_1', 'A1Cresult_-99', 'A1Cresult_0',
       'A1Cresult_1']

In [61]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
X_train, X_dev, Y_train, Y_dev = train_test_split(train_input[full_feature_list], train_output, test_size=0.20, random_state=0)
Y_dev = Y_dev.astype(int)
Y_train = Y_train.astype(int)
logreg = LogisticRegression(fit_intercept=True, penalty='l1')
print("Cross Validation Score: {:.2%}".format(np.mean(cross_val_score(logreg, X_train, Y_train, cv=25))))
logreg.fit(X_train, Y_train)
print("Dev Set score: {:.2%}".format(logreg.score(X_dev, Y_dev)))

Cross Validation Score: 72.68%
Dev Set score: 72.78%


In [62]:
Y_dev_predict = logreg.predict(X_dev)

In [63]:
pd.crosstab(pd.Series(Y_dev, name = 'Actual'), pd.Series(Y_dev_predict, name = 'Predict'), margins = True)

Predict,0.0,1.0,2.0,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,1483,17,46,1546
1.0,167,3,5,175
2.0,194,1,3,198
All,1844,21,54,1919


In [66]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
print("Accuracy is {0:.2f}".format(accuracy_score(Y_dev, Y_dev_predict)))
#print("Precision is {0:.2f}".format(precision_score(Y_dev, Y_dev_predict)))
#print("Recall is {0:.2f}".format(recall_score(Y_dev, Y_dev_predict)))

Accuracy is 0.73


In [67]:
[print(fname, coeffs) for fname, coeffs in zip(X_train.columns, np.round(logreg.coef_[0],3))];

age -0.005
time_in_hospital -0.008
num_lab_procedures -0.003
num_procedures 0.005
num_medications 0.003
number_outpatient_log1p 0.17
number_emergency_log1p -0.0
number_inpatient_log1p -1.297
number_diagnoses -0.082
metformin 0.034
repaglinide -0.274
nateglinide 0.579
chlorpropamide 0.0
glimepiride 0.08
acetohexamide 0.0
glipizide -0.092
glyburide 0.014
tolbutamide 0.0
pioglitazone -0.448
rosiglitazone 0.597
acarbose -0.549
miglitol 0.0
troglitazone 0.0
tolazamide 0.0
insulin 0.035
glyburide-metformin 0.0
glipizide-metformin 0.0
glimepiride-pioglitazone 0.0
metformin-rosiglitazone 0.0
metformin-pioglitazone 0.0
diabetesMed -0.257
service_utilization_log1p -0.648
level1_diag1 0.029
race_? 0.0
race_AfricanAmerican 0.0
race_Asian 0.221
race_Caucasian -0.118
race_Hispanic 0.197
race_Other 0.104
gender_0 0.67
gender_1 0.693
admission_type_id_1 0.069
admission_type_id_2 0.0
admission_type_id_3 0.16
admission_type_id_4 0.0
admission_type_id_5 0.37
admission_type_id_6 -0.856
admission_type_id_7

In [69]:
df_p.readmitted.mean()

0.4771979201696176