In [1]:
import IPython
import pandas as pd
import numpy as np
from statistics import mode
import matplotlib.pyplot as plt  
from sklearn.preprocessing import MinMaxScaler

In [2]:
df_ori = pd.read_csv("diabetic_data.csv")
df_ori.shape

(101766, 50)

In [3]:
print('--Examine the data type--')
print(df_ori.info())
df_ori.describe()

--Examine the data type--
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
encounter_id                101766 non-null int64
patient_nbr                 101766 non-null int64
race                        101766 non-null object
gender                      101766 non-null object
age                         101766 non-null object
weight                      101766 non-null object
admission_type_id           101766 non-null int64
discharge_disposition_id    101766 non-null int64
admission_source_id         101766 non-null int64
time_in_hospital            101766 non-null int64
payer_code                  101766 non-null object
medical_specialty           101766 non-null object
num_lab_procedures          101766 non-null int64
num_procedures              101766 non-null int64
num_medications             101766 non-null int64
number_outpatient           101766 non-null int64
number_emergency            101766 non-null int64
number_

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0
mean,165201600.0,54330400.0,2.024006,3.715642,5.754437,4.395987,43.095641,1.33973,16.021844,0.369357,0.197836,0.635566,7.422607
std,102640300.0,38696360.0,1.445403,5.280166,4.064081,2.985108,19.674362,1.705807,8.127566,1.267265,0.930472,1.262863,1.9336
min,12522.0,135.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,84961190.0,23413220.0,1.0,1.0,1.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0
50%,152389000.0,45505140.0,1.0,1.0,7.0,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0
75%,230270900.0,87545950.0,3.0,4.0,7.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0
max,443867200.0,189502600.0,8.0,28.0,25.0,14.0,132.0,6.0,81.0,42.0,76.0,21.0,16.0


In [4]:
# exploring unique values in each column
# for col in df_ori.columns:
#     print(col, df_ori[col].unique())

# Example:
# len(df_ori['diag_3'].unique())
# Result: 790

In [4]:
df_ori.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [5]:
df = df_ori.copy(deep=True) 

In [6]:
# Additional features
df['service_utilization'] = df['number_outpatient'] + df['number_emergency'] + df['number_inpatient']

In [7]:
# standardize function
def standardize(raw_data):
    return ((raw_data - np.mean(raw_data, axis = 0)) / np.std(raw_data, axis = 0))

In [8]:
# calculate missing values
missingvalues = pd.DataFrame(df.isnull().sum(axis=0), columns = ['number_of_missing_values'])
missingvalues['feature'] = missingvalues.index
missingvalues = missingvalues[['feature','number_of_missing_values']].reset_index (drop = True)
missingvalues[missingvalues.number_of_missing_values > 0]

Unnamed: 0,feature,number_of_missing_values


In [9]:
# calculate missing values, which indicates as '?'
for col in df.columns:
    if df[col].dtype == object:
         print(col,df[col][df[col] == '?'].count())
print('gender', df['gender'][df['gender'] == 'Unknown/Invalid'].count())

race 2273
gender 0
age 0
weight 98569
payer_code 40256
medical_specialty 49949
diag_1 21
diag_2 358
diag_3 1423
max_glu_serum 0
A1Cresult 0
metformin 0
repaglinide 0
nateglinide 0
chlorpropamide 0
glimepiride 0
acetohexamide 0
glipizide 0
glyburide 0
tolbutamide 0
pioglitazone 0
rosiglitazone 0
acarbose 0
miglitol 0
troglitazone 0
tolazamide 0
examide 0
citoglipton 0
insulin 0
glyburide-metformin 0
glipizide-metformin 0
glimepiride-pioglitazone 0
metformin-rosiglitazone 0
metformin-pioglitazone 0
change 0
diabetesMed 0
readmitted 0
gender 3


In [10]:
# calculate the readmission numbers
df['readmitted'] = df['readmitted'].replace('>30', 2)
df['readmitted'] = df['readmitted'].replace('<30', 1) #should we code it into 1 and 2?
df['readmitted'] = df['readmitted'].replace('NO', 0)

#noreadmitted = df['readmitted'][df['readmitted'] == 0].count()
print('>30 readmissions', df['readmitted'][df['readmitted'] == 2].count())
print('<30 readmissions', df['readmitted'][df['readmitted'] == 1].count())
print('no readmissions', df['readmitted'][df['readmitted'] == 0].count())

>30 readmissions 35545
<30 readmissions 11357
no readmissions 54864


In [11]:
# dropping discharge_disposition_id = 11, which means the patient died
# dropping the missing values in gender
# drop_Idx = set(df['race'][df['race'] == '?'].index)
drop_Idx = set(df[(df['diag_1'] == '?') & (df['diag_2'] == '?') & (df['diag_3'] == '?')].index)
drop_Idx = drop_Idx.union(set(df[df['discharge_disposition_id'] == 11].index))
drop_Idx = drop_Idx.union(set(df['gender'][df['gender'] == 'Unknown/Invalid'].index))
new_Idx = list(set(df.index) - set(drop_Idx))
df = df.iloc[new_Idx]

In [12]:
# dropping columns with too many missing values
df = df.drop(['weight', 'payer_code', 'medical_specialty'], axis = 1)

In [13]:
# remove columns having same value in each row: citoglipton, examide
df = df.drop(['citoglipton', 'examide'], axis = 1)

In [14]:
# code the non-numeric values
df['change'] = df['change'].replace('Ch', 1)
df['change'] = df['change'].replace('No', 0)

df['gender'] = df['gender'].replace('Male', 1)
df['gender'] = df['gender'].replace('Female', 0)

df['diabetesMed'] = df['diabetesMed'].replace('Yes', 1)
df['diabetesMed'] = df['diabetesMed'].replace('No', 0)

# code age intervals [0-10) - [90-100) from 1-10
for i in range(0,10):
    df['age'] = df['age'].replace('['+str(10*i)+'-'+str(10*(i+1))+')', i+1)

# Features have been coded
# --
# metformin, repaglinide, nateglinide, chlorpropamide, glimepiride, glipizide, glyburide, pioglitazone, 
# rosiglitazone, acarbose, miglitol, insulin, glyburide-metformin: No, Up, Down, Steady
# --
# tolazamide: No, Steady, Up
# --
# A1Cresult: >7, >8, None, Norm
# --
# max_glu_serum: >200, >300, None, Norm

keys = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'glipizide', 'glyburide', 
        'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'insulin', 'glyburide-metformin', 'tolazamide']
for col in keys:
    df[col] = df[col].replace('No', 0)
    df[col] = df[col].replace('Steady', 0)
    df[col] = df[col].replace('Up', 1)
    df[col] = df[col].replace('Down', 1) 

keys = ['metformin-pioglitazone','metformin-rosiglitazone', 'glimepiride-pioglitazone', 
        'glipizide-metformin', 'troglitazone', 'tolbutamide', 'acetohexamide']
for col in keys:
    df[col] = df[col].replace('No', 0)
    df[col] = df[col].replace('Steady', 0)  

df['A1Cresult'] = df['A1Cresult'].replace('>7', 1)
df['A1Cresult'] = df['A1Cresult'].replace('>8', 1)
df['A1Cresult'] = df['A1Cresult'].replace('Norm', 0)
df['A1Cresult'] = df['A1Cresult'].replace('None', -99) #84748 rows are None
# 1: test result is normal or abnormal
# 2: test it or not

df['max_glu_serum'] = df['max_glu_serum'].replace('>200', 1)
df['max_glu_serum'] = df['max_glu_serum'].replace('>300', 1)
df['max_glu_serum'] = df['max_glu_serum'].replace('Norm', 0)
df['max_glu_serum'] = df['max_glu_serum'].replace('None', -99)
# 1: test result is normal or abnormal
# 2: test it or not

In [15]:
# Creating additional columns for diagnosis
df['level1_diag1'] = df['diag_1']
df['level2_diag1'] = df['diag_1']
df['level1_diag2'] = df['diag_2']
df['level2_diag2'] = df['diag_2']
df['level1_diag3'] = df['diag_3']
df['level2_diag3'] = df['diag_3']

In [16]:
df.loc[df['diag_1'].str.contains('V'), ['level1_diag1', 'level2_diag1']] = 0
df.loc[df['diag_1'].str.contains('E'), ['level1_diag1', 'level2_diag1']] = 0
df.loc[df['diag_2'].str.contains('V'), ['level1_diag2', 'level2_diag2']] = 0
df.loc[df['diag_2'].str.contains('E'), ['level1_diag2', 'level2_diag2']] = 0
df.loc[df['diag_3'].str.contains('V'), ['level1_diag3', 'level2_diag3']] = 0
df.loc[df['diag_3'].str.contains('E'), ['level1_diag3', 'level2_diag3']] = 0
df['level1_diag1'] = df['level1_diag1'].replace('?', -1)
df['level2_diag1'] = df['level2_diag1'].replace('?', -1)
df['level1_diag2'] = df['level1_diag2'].replace('?', -1)
df['level2_diag2'] = df['level2_diag2'].replace('?', -1)
df['level1_diag3'] = df['level1_diag3'].replace('?', -1)
df['level2_diag3'] = df['level2_diag3'].replace('?', -1)

In [17]:
df['level1_diag1'] = df['level1_diag1'].astype(float)
df['level2_diag1'] = df['level2_diag1'].astype(float)
df['level1_diag2'] = df['level1_diag2'].astype(float)
df['level2_diag2'] = df['level2_diag2'].astype(float)
df['level1_diag3'] = df['level1_diag3'].astype(float)
df['level2_diag3'] = df['level2_diag3'].astype(float)

In [18]:
for index, row in df.iterrows():
    if (row['level1_diag1'] >= 390 and row['level1_diag1'] < 460) or (np.floor(row['level1_diag1']) == 785):
        df.loc[index, 'level1_diag1'] = 1
    elif (row['level1_diag1'] >= 460 and row['level1_diag1'] < 520) or (np.floor(row['level1_diag1']) == 786):
        df.loc[index, 'level1_diag1'] = 2
    elif (row['level1_diag1'] >= 520 and row['level1_diag1'] < 580) or (np.floor(row['level1_diag1']) == 787):
        df.loc[index, 'level1_diag1'] = 3
    elif (np.floor(row['level1_diag1']) == 250):
        df.loc[index, 'level1_diag1'] = 4
    elif (row['level1_diag1'] >= 800 and row['level1_diag1'] < 1000):
        df.loc[index, 'level1_diag1'] = 5
    elif (row['level1_diag1'] >= 710 and row['level1_diag1'] < 740):
        df.loc[index, 'level1_diag1'] = 6
    elif (row['level1_diag1'] >= 580 and row['level1_diag1'] < 630) or (np.floor(row['level1_diag1']) == 788):
        df.loc[index, 'level1_diag1'] = 7
    elif (row['level1_diag1'] >= 140 and row['level1_diag1'] < 240):
        df.loc[index, 'level1_diag1'] = 8
    else:
        df.loc[index, 'level1_diag1'] = 0
        
    if (row['level1_diag2'] >= 390 and row['level1_diag2'] < 460) or (np.floor(row['level1_diag2']) == 785):
        df.loc[index, 'level1_diag2'] = 1
    elif (row['level1_diag2'] >= 460 and row['level1_diag2'] < 520) or (np.floor(row['level1_diag2']) == 786):
        df.loc[index, 'level1_diag2'] = 2
    elif (row['level1_diag2'] >= 520 and row['level1_diag2'] < 580) or (np.floor(row['level1_diag2']) == 787):
        df.loc[index, 'level1_diag2'] = 3
    elif (np.floor(row['level1_diag2']) == 250):
        df.loc[index, 'level1_diag2'] = 4
    elif (row['level1_diag2'] >= 800 and row['level1_diag2'] < 1000):
        df.loc[index, 'level1_diag2'] = 5
    elif (row['level1_diag2'] >= 710 and row['level1_diag2'] < 740):
        df.loc[index, 'level1_diag2'] = 6
    elif (row['level1_diag2'] >= 580 and row['level1_diag2'] < 630) or (np.floor(row['level1_diag2']) == 788):
        df.loc[index, 'level1_diag2'] = 7
    elif (row['level1_diag2'] >= 140 and row['level1_diag2'] < 240):
        df.loc[index, 'level1_diag2'] = 8
    else:
        df.loc[index, 'level1_diag2'] = 0
    
    if (row['level1_diag3'] >= 390 and row['level1_diag3'] < 460) or (np.floor(row['level1_diag3']) == 785):
        df.loc[index, 'level1_diag3'] = 1
    elif (row['level1_diag3'] >= 460 and row['level1_diag3'] < 520) or (np.floor(row['level1_diag3']) == 786):
        df.loc[index, 'level1_diag3'] = 2
    elif (row['level1_diag3'] >= 520 and row['level1_diag3'] < 580) or (np.floor(row['level1_diag3']) == 787):
        df.loc[index, 'level1_diag3'] = 3
    elif (np.floor(row['level1_diag3']) == 250):
        df.loc[index, 'level1_diag3'] = 4
    elif (row['level1_diag3'] >= 800 and row['level1_diag3'] < 1000):
        df.loc[index, 'level1_diag3'] = 5
    elif (row['level1_diag3'] >= 710 and row['level1_diag3'] < 740):
        df.loc[index, 'level1_diag3'] = 6
    elif (row['level1_diag3'] >= 580 and row['level1_diag3'] < 630) or (np.floor(row['level1_diag3']) == 788):
        df.loc[index, 'level1_diag3'] = 7
    elif (row['level1_diag3'] >= 140 and row['level1_diag3'] < 240):
        df.loc[index, 'level1_diag3'] = 8
    else:
        df.loc[index, 'level1_diag3'] = 0
    

In [19]:
for index, row in df.iterrows():
    if (row['level2_diag1'] >= 390 and row['level2_diag1'] < 399):
        df.loc[index, 'level2_diag1'] = 1
    elif (row['level2_diag1'] >= 401 and row['level2_diag1'] < 415):
        df.loc[index, 'level2_diag1'] = 2
    elif (row['level2_diag1'] >= 415 and row['level2_diag1'] < 460):
        df.loc[index, 'level2_diag1'] = 3
    elif (np.floor(row['level2_diag1']) == 785):
        df.loc[index, 'level2_diag1'] = 4
    elif (row['level2_diag1'] >= 460 and row['level2_diag1'] < 489):
        df.loc[index, 'level2_diag1'] = 5
    elif (row['level2_diag1'] >= 490 and row['level2_diag1'] < 497):
        df.loc[index, 'level2_diag1'] = 6
    elif (row['level2_diag1'] >= 500 and row['level2_diag1'] < 520):
        df.loc[index, 'level2_diag1'] = 7
    elif (np.floor(row['level2_diag1']) == 786):
        df.loc[index, 'level2_diag1'] = 8
    elif (row['level2_diag1'] >= 520 and row['level2_diag1'] < 530):
        df.loc[index, 'level2_diag1'] = 9
    elif (row['level2_diag1'] >= 530 and row['level2_diag1'] < 544):
        df.loc[index, 'level2_diag1'] = 10
    elif (row['level2_diag1'] >= 550 and row['level2_diag1'] < 554):
        df.loc[index, 'level2_diag1'] = 11
    elif (row['level2_diag1'] >= 555 and row['level2_diag1'] < 580):
        df.loc[index, 'level2_diag1'] = 12
    elif (np.floor(row['level2_diag1']) == 787):
        df.loc[index, 'level2_diag1'] = 13
    elif (np.floor(row['level2_diag1']) == 250):
        df.loc[index, 'level2_diag1'] = 14
    elif (row['level2_diag1'] >= 800 and row['level2_diag1'] < 1000):
        df.loc[index, 'level2_diag1'] = 15
    elif (row['level2_diag1'] >= 710 and row['level2_diag1'] < 740):
        df.loc[index, 'level2_diag1'] = 16
    elif (row['level2_diag1'] >= 580 and row['level2_diag1'] < 630):
        df.loc[index, 'level2_diag1'] = 17
    elif (np.floor(row['level2_diag1']) == 788):
        df.loc[index, 'level2_diag1'] = 18
    elif (row['level2_diag1'] >= 140 and row['level2_diag1'] < 240):
        df.loc[index, 'level2_diag1'] = 19
    elif row['level2_diag1'] >= 240 and row['level2_diag1'] < 280 and (np.floor(row['level2_diag1']) != 250):
        df.loc[index, 'level2_diag1'] = 20
    elif (row['level2_diag1'] >= 680 and row['level2_diag1'] < 710) or (np.floor(row['level2_diag1']) == 782):
        df.loc[index, 'level2_diag1'] = 21
    elif (row['level2_diag1'] >= 290 and row['level2_diag1'] < 320):
        df.loc[index, 'level2_diag1'] = 22
    else:
        df.loc[index, 'level2_diag1'] = 0
        
    if (row['level2_diag2'] >= 390 and row['level2_diag2'] < 399):
        df.loc[index, 'level2_diag2'] = 1
    elif (row['level2_diag2'] >= 401 and row['level2_diag2'] < 415):
        df.loc[index, 'level2_diag2'] = 2
    elif (row['level2_diag2'] >= 415 and row['level2_diag2'] < 460):
        df.loc[index, 'level2_diag2'] = 3
    elif (np.floor(row['level2_diag2']) == 785):
        df.loc[index, 'level2_diag2'] = 4
    elif (row['level2_diag2'] >= 460 and row['level2_diag2'] < 489):
        df.loc[index, 'level2_diag2'] = 5
    elif (row['level2_diag2'] >= 490 and row['level2_diag2'] < 497):
        df.loc[index, 'level2_diag2'] = 6
    elif (row['level2_diag2'] >= 500 and row['level2_diag2'] < 520):
        df.loc[index, 'level2_diag2'] = 7
    elif (np.floor(row['level2_diag2']) == 786):
        df.loc[index, 'level2_diag2'] = 8
    elif (row['level2_diag2'] >= 520 and row['level2_diag2'] < 530):
        df.loc[index, 'level2_diag2'] = 9
    elif (row['level2_diag2'] >= 530 and row['level2_diag2'] < 544):
        df.loc[index, 'level2_diag2'] = 10
    elif (row['level2_diag2'] >= 550 and row['level2_diag2'] < 554):
        df.loc[index, 'level2_diag2'] = 11
    elif (row['level2_diag2'] >= 555 and row['level2_diag2'] < 580):
        df.loc[index, 'level2_diag2'] = 12
    elif (np.floor(row['level2_diag2']) == 787):
        df.loc[index, 'level2_diag2'] = 13
    elif (np.floor(row['level2_diag2']) == 250):
        df.loc[index, 'level2_diag2'] = 14
    elif (row['level2_diag2'] >= 800 and row['level2_diag2'] < 1000):
        df.loc[index, 'level2_diag2'] = 15
    elif (row['level2_diag2'] >= 710 and row['level2_diag2'] < 740):
        df.loc[index, 'level2_diag2'] = 16
    elif (row['level2_diag2'] >= 580 and row['level2_diag2'] < 630):
        df.loc[index, 'level2_diag2'] = 17
    elif (np.floor(row['level2_diag2']) == 788):
        df.loc[index, 'level2_diag2'] = 18
    elif (row['level2_diag2'] >= 140 and row['level2_diag2'] < 240):
        df.loc[index, 'level2_diag2'] = 19
    elif row['level2_diag2'] >= 240 and row['level2_diag2'] < 280 and (np.floor(row['level2_diag2']) != 250):
        df.loc[index, 'level2_diag2'] = 20
    elif (row['level2_diag2'] >= 680 and row['level2_diag2'] < 710) or (np.floor(row['level2_diag2']) == 782):
        df.loc[index, 'level2_diag2'] = 21
    elif (row['level2_diag2'] >= 290 and row['level2_diag2'] < 320):
        df.loc[index, 'level2_diag2'] = 22
    else:
        df.loc[index, 'level2_diag2'] = 0
        
        
    if (row['level2_diag3'] >= 390 and row['level2_diag3'] < 399):
        df.loc[index, 'level2_diag3'] = 1
    elif (row['level2_diag3'] >= 401 and row['level2_diag3'] < 415):
        df.loc[index, 'level2_diag3'] = 2
    elif (row['level2_diag3'] >= 415 and row['level2_diag3'] < 460):
        df.loc[index, 'level2_diag3'] = 3
    elif (np.floor(row['level2_diag3']) == 785):
        df.loc[index, 'level2_diag3'] = 4
    elif (row['level2_diag3'] >= 460 and row['level2_diag3'] < 489):
        df.loc[index, 'level2_diag3'] = 5
    elif (row['level2_diag3'] >= 490 and row['level2_diag3'] < 497):
        df.loc[index, 'level2_diag3'] = 6
    elif (row['level2_diag3'] >= 500 and row['level2_diag3'] < 520):
        df.loc[index, 'level2_diag3'] = 7
    elif (np.floor(row['level2_diag3']) == 786):
        df.loc[index, 'level2_diag3'] = 8
    elif (row['level2_diag3'] >= 520 and row['level2_diag3'] < 530):
        df.loc[index, 'level2_diag3'] = 9
    elif (row['level2_diag3'] >= 530 and row['level2_diag3'] < 544):
        df.loc[index, 'level2_diag3'] = 10
    elif (row['level2_diag3'] >= 550 and row['level2_diag3'] < 554):
        df.loc[index, 'level2_diag3'] = 11
    elif (row['level2_diag3'] >= 555 and row['level2_diag3'] < 580):
        df.loc[index, 'level2_diag3'] = 12
    elif (np.floor(row['level2_diag3']) == 787):
        df.loc[index, 'level2_diag3'] = 13
    elif (np.floor(row['level2_diag3']) == 250):
        df.loc[index, 'level2_diag3'] = 14
    elif (row['level2_diag3'] >= 800 and row['level2_diag3'] < 1000):
        df.loc[index, 'level2_diag3'] = 15
    elif (row['level2_diag3'] >= 710 and row['level2_diag3'] < 740):
        df.loc[index, 'level2_diag3'] = 16
    elif (row['level2_diag3'] >= 580 and row['level2_diag3'] < 630):
        df.loc[index, 'level2_diag3'] = 17
    elif (np.floor(row['level2_diag3']) == 788):
        df.loc[index, 'level2_diag3'] = 18
    elif (row['level2_diag3'] >= 140 and row['level2_diag3'] < 240):
        df.loc[index, 'level2_diag3'] = 19
    elif row['level2_diag3'] >= 240 and row['level2_diag3'] < 280 and (np.floor(row['level2_diag3']) != 250):
        df.loc[index, 'level2_diag3'] = 20
    elif (row['level2_diag3'] >= 680 and row['level2_diag3'] < 710) or (np.floor(row['level2_diag3']) == 782):
        df.loc[index, 'level2_diag3'] = 21
    elif (row['level2_diag3'] >= 290 and row['level2_diag3'] < 320):
        df.loc[index, 'level2_diag3'] = 22
    else:
        df.loc[index, 'level2_diag3'] = 0
    

In [20]:
df.to_csv('./modify.csv')

In [21]:
for i in ('encounter_id', 'patient_nbr', 'gender', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id',\
          'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', \
          'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose','miglitol', \
          'troglitazone', 'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin', \
          'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', \
          'age', 'A1Cresult', 'max_glu_serum', 'level1_diag1', 'level1_diag2', 'level1_diag3', 'level2_diag1', 'level2_diag2', 'level2_diag3' ):
    df[i] = df[i].astype('object')

In [22]:
df.dtypes

encounter_id                object
patient_nbr                 object
race                        object
gender                      object
age                         object
admission_type_id           object
discharge_disposition_id    object
admission_source_id         object
time_in_hospital             int64
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride                 object
acetohexamide               object
glipizide                   object
glyburide           

In [23]:
num_col = list(set(list(df._get_numeric_data().columns))- {'readmitted'})

In [24]:
num_col

['time_in_hospital',
 'num_medications',
 'number_inpatient',
 'num_procedures',
 'number_diagnoses',
 'num_lab_procedures',
 'service_utilization',
 'number_emergency',
 'number_outpatient']

In [25]:
statdataframe = pd.DataFrame()
statdataframe['numeric_column'] = num_col
skew_before = []
skew_after = []

kurt_before = []
kurt_after = []

standard_deviation_before = []
standard_deviation_after = []

log_transform_needed = []

log_type = []

for i in num_col:
    skewval = df[i].skew()
    skew_before.append(skewval)
    
    kurtval = df[i].kurtosis()
    kurt_before.append(kurtval)
    
    sdval = df[i].std()
    standard_deviation_before.append(sdval)
    
    if (abs(skewval) >2) & (abs(kurtval) >2):
        log_transform_needed.append('Yes')
        
        if len(df[df[i] == 0])/len(df) <=0.02:
            log_type.append('log')
            skewvalnew = np.log(pd.DataFrame(df[train_data[i] > 0])[i]).skew()
            skew_after.append(skewvalnew)
            
            kurtvalnew = np.log(pd.DataFrame(df[train_data[i] > 0])[i]).kurtosis()
            kurt_after.append(kurtvalnew)
            
            sdvalnew = np.log(pd.DataFrame(df[train_data[i] > 0])[i]).std()
            standard_deviation_after.append(sdvalnew)
            
        else:
            log_type.append('log1p')
            skewvalnew = np.log1p(pd.DataFrame(df[df[i] >= 0])[i]).skew()
            skew_after.append(skewvalnew)
        
            kurtvalnew = np.log1p(pd.DataFrame(df[df[i] >= 0])[i]).kurtosis()
            kurt_after.append(kurtvalnew)
            
            sdvalnew = np.log1p(pd.DataFrame(df[df[i] >= 0])[i]).std()
            standard_deviation_after.append(sdvalnew)
            
    else:
        log_type.append('NA')
        log_transform_needed.append('No')
        
        skew_after.append(skewval)
        kurt_after.append(kurtval)
        standard_deviation_after.append(sdval)

statdataframe['skew_before'] = skew_before
statdataframe['kurtosis_before'] = kurt_before
statdataframe['standard_deviation_before'] = standard_deviation_before
statdataframe['log_transform_needed'] = log_transform_needed
statdataframe['log_type'] = log_type
statdataframe['skew_after'] = skew_after
statdataframe['kurtosis_after'] = kurt_after
statdataframe['standard_deviation_after'] = standard_deviation_after

In [26]:
statdataframe

Unnamed: 0,numeric_column,skew_before,kurtosis_before,standard_deviation_before,log_transform_needed,log_type,skew_after,kurtosis_after,standard_deviation_after
0,time_in_hospital,1.137931,0.871189,2.974528,No,,1.137931,0.871189,2.974528
1,num_medications,1.333039,3.523472,8.092612,No,,1.333039,3.523472,8.092612
2,number_inpatient,3.626402,20.833542,1.261825,Yes,log1p,1.450492,1.405364,0.510433
3,num_procedures,1.32602,0.890773,1.700335,No,,1.32602,0.890773,1.700335
4,number_diagnoses,-0.867785,-0.109005,1.938211,No,,-0.867785,-0.109005,1.938211
5,num_lab_procedures,-0.241491,-0.253099,19.620228,No,,-0.241491,-0.253099,19.620228
6,service_utilization,5.334724,67.77977,2.29273,Yes,log1p,1.119136,0.547353,0.662373
7,number_emergency,22.842251,1185.246374,0.935517,Yes,log1p,3.661064,16.27661,0.315405
8,number_outpatient,8.818291,148.558544,1.263973,Yes,log1p,2.733914,7.804218,0.429394


In [27]:
# performing the log transformation for the columns determined to be needing it above.

for i in range(len(statdataframe)):
    if statdataframe['log_transform_needed'][i] == 'Yes':
        colname = str(statdataframe['numeric_column'][i])
        
        if statdataframe['log_type'][i] == 'log':
            df = df[df[colname] > 0]
            df[colname + "_log"] = np.log(df[colname])
            
        elif statdataframe['log_type'][i] == 'log1p':
            df = df[df[colname] >= 0]
            df[colname + "_log1p"] = np.log1p(df[colname])

In [28]:
df = df.drop(['number_outpatient', 'number_inpatient', 'number_emergency','service_utilization'], axis = 1)

In [29]:
df.shape

(100120, 52)

In [32]:
numerics = list(set(list(df._get_numeric_data().columns))- {'readmitted'})

In [33]:
numerics

['time_in_hospital',
 'num_medications',
 'number_inpatient_log1p',
 'number_emergency_log1p',
 'num_procedures',
 'number_diagnoses',
 'service_utilization_log1p',
 'number_outpatient_log1p',
 'num_lab_procedures']

In [34]:
for feature in numerics:
    scaler = MinMaxScaler()
    df[numerics] = scaler.fit_transform(df[numerics])

In [35]:
df.dtypes[df.dtypes == 'object']

encounter_id                object
patient_nbr                 object
race                        object
gender                      object
age                         object
admission_type_id           object
discharge_disposition_id    object
admission_source_id         object
diag_1                      object
diag_2                      object
diag_3                      object
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride                 object
acetohexamide               object
glipizide                   object
glyburide                   object
tolbutamide                 object
pioglitazone                object
rosiglitazone               object
acarbose                    object
miglitol                    object
troglitazone                object
tolazamide                  object
insulin             

In [36]:
ruledict = {
    "encounter_id": "count",
    "race": "first",
    "gender": "first",
    "age": "first", #last
    "admission_type_id" : "first",
    "discharge_disposition_id" : "first",
    "admission_source_id" : "first",
    "time_in_hospital": "mean",
    "num_lab_procedures" : "mean",
    "num_procedures" : "mean",
    "num_medications" : "mean",
    "number_outpatient_log1p" : "mean",
    "number_emergency_log1p" : "mean", 
    "number_inpatient_log1p" : "mean",
    "diag_1" : ','.join,
    "diag_2" : ','.join,
    "diag_3" : ','.join,
    "number_diagnoses" : "mean", 
#     "max_glu_serum" : ','.join,
#     "A1Cresult" : ','.join,
#     "metformin" : ','.join,
#     "repaglinide" : ','.join,
#     "nateglinide" : ','.join,
#     "chlorpropamide" : ','.join,
#     "glimepiride" : ','.join,
#     "acetohexamide" : "mean",
#     "glipizide" : ','.join,
#     "glyburide" : ','.join,
#     "tolbutamide" : "mean",
#     "pioglitazone" : ','.join,
#     "rosiglitazone" : ','.join,
#     "acarbose" : ','.join,
#     "miglitol" : ','.join,
#     "troglitazone" : "mean",
#     "tolazamide" : ','.join,
#     "insulin" : ','.join,
#     "glyburide-metformin" : ','.join,
#     "glipizide-metformin" : "mean",
#     "glimepiride-pioglitazone" : "mean",
#     "metformin-rosiglitazone" : "mean",
#     "metformin-pioglitazone" : "mean",
    "change" : lambda x: ', '.join(x.astype(str)),
    "diabetesMed": lambda x: ', '.join(x.astype(str)),
    "readmitted" : lambda x: ', '.join(x.astype(str)),
    "service_utilization_log1p" : "mean",
    "level1_diag1": lambda x: ', '.join(x.astype(str)),
    "level1_diag2": lambda x: ', '.join(x.astype(str)),
    "level1_diag3": lambda x: ', '.join(x.astype(str)),
    "level2_diag1": lambda x: ', '.join(x.astype(str)),
    "level2_diag2": lambda x: ', '.join(x.astype(str)),
    "level2_diag3": lambda x: ', '.join(x.astype(str))
}

In [37]:
df_p = df.groupby('patient_nbr').agg(ruledict).reset_index(drop = True)

In [38]:
df_p

Unnamed: 0,encounter_id,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,...,change,diabetesMed,readmitted,service_utilization_log1p,level1_diag1,level1_diag2,level1_diag3,level2_diag1,level2_diag2,level2_diag3
0,2,Caucasian,0,6,2,1,1,0.346154,0.404580,0.583333,...,"1, 1","1, 1","1, 2",0.078866,"1.0, 5.0","5.0, 0.0","3.0, 4.0","2.0, 15.0","15.0, 0.0","12.0, 14.0"
1,1,Caucasian,0,6,3,1,1,0.076923,0.366412,0.166667,...,0,0,0,0.000000,6.0,0.0,4.0,16.0,22.0,14.0
2,1,Caucasian,0,9,1,3,7,0.230769,0.511450,0.333333,...,0,1,0,0.000000,5.0,2.0,0.0,15.0,6.0,0.0
3,1,Caucasian,0,9,1,1,7,0.153846,0.343511,0.000000,...,1,1,0,0.000000,0.0,1.0,1.0,20.0,3.0,3.0
4,1,AfricanAmerican,0,4,1,1,7,0.307692,0.366412,0.000000,...,0,1,0,0.000000,7.0,8.0,4.0,17.0,19.0,14.0
5,5,AfricanAmerican,0,6,1,1,7,0.569231,0.311450,0.333333,...,"0, 1, 0, 0, 1","1, 1, 1, 1, 1","2, 2, 2, 2, 0",0.194639,"0.0, 0.0, 0.0, 0.0, 0.0","4.0, 0.0, 0.0, 4.0, 0.0","0.0, 4.0, 1.0, 0.0, 2.0","0.0, 0.0, 0.0, 0.0, 0.0","14.0, 0.0, 20.0, 14.0, 0.0","0.0, 14.0, 3.0, 0.0, 5.0"
6,1,Caucasian,0,7,3,1,1,0.615385,0.389313,0.166667,...,0,1,0,0.000000,2.0,1.0,4.0,6.0,3.0,14.0
7,3,Caucasian,0,5,2,1,1,0.076923,0.376590,0.722222,...,"0, 0, 1","1, 1, 1","2, 1, 0",0.135911,"5.0, 1.0, 0.0","1.0, 1.0, 4.0","1.0, 4.0, 1.0","15.0, 2.0, 0.0","2.0, 2.0, 14.0","2.0, 14.0, 2.0"
8,1,Caucasian,1,8,3,6,4,1.000000,0.152672,0.000000,...,0,1,2,0.442811,0.0,6.0,1.0,0.0,16.0,3.0
9,1,Caucasian,0,8,2,3,1,0.846154,0.351145,0.333333,...,1,1,0,0.000000,0.0,0.0,3.0,21.0,21.0,12.0


In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100120 entries, 0 to 101765
Data columns (total 52 columns):
encounter_id                 100120 non-null object
patient_nbr                  100120 non-null object
race                         100120 non-null object
gender                       100120 non-null object
age                          100120 non-null object
admission_type_id            100120 non-null object
discharge_disposition_id     100120 non-null object
admission_source_id          100120 non-null object
time_in_hospital             100120 non-null float64
num_lab_procedures           100120 non-null float64
num_procedures               100120 non-null float64
num_medications              100120 non-null float64
diag_1                       100120 non-null object
diag_2                       100120 non-null object
diag_3                       100120 non-null object
number_diagnoses             100120 non-null float64
max_glu_serum                100120 non-null object


In [71]:
import seaborn as sns
from matplotlib.colors import ListedColormap
# my_cmap = ListedColormap(sns.color_palette("RdYlGn", n_colors=15).as_hex())
# my_cmap = ListedColormap(sns.diverging_palette(150, 250, sep=120, n=28, center="light").as_hex())

my_cmap = ListedColormap(sns.light_palette((250, 100, 50), input="husl", n_colors=50).as_hex())
# drop some columns due to their means is round to 0
table = df.drop(['acetohexamide','tolbutamide', 'troglitazone', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone','patient_nbr', 'encounter_id', 'service_utilization_log1p'], axis = 1).corr(method='pearson')
table.style.background_gradient(cmap=my_cmap, axis = 0)

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_diagnoses,readmitted,number_inpatient_log1p,number_emergency_log1p,number_outpatient_log1p,race_?,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,gender_0,gender_1,age_1,age_2,age_3,age_4,age_5,age_6,age_7,age_8,age_9,age_10,admission_type_id_1,admission_type_id_2,admission_type_id_3,admission_type_id_4,admission_type_id_5,admission_type_id_6,admission_type_id_7,admission_type_id_8,discharge_disposition_id_1,discharge_disposition_id_2,discharge_disposition_id_3,discharge_disposition_id_4,discharge_disposition_id_5,discharge_disposition_id_6,discharge_disposition_id_7,discharge_disposition_id_8,discharge_disposition_id_9,discharge_disposition_id_10,discharge_disposition_id_12,discharge_disposition_id_13,discharge_disposition_id_14,discharge_disposition_id_15,discharge_disposition_id_16,discharge_disposition_id_17,discharge_disposition_id_18,discharge_disposition_id_19,discharge_disposition_id_20,discharge_disposition_id_22,discharge_disposition_id_23,discharge_disposition_id_24,discharge_disposition_id_25,discharge_disposition_id_27,discharge_disposition_id_28,admission_source_id_1,admission_source_id_2,admission_source_id_3,admission_source_id_4,admission_source_id_5,admission_source_id_6,admission_source_id_7,admission_source_id_8,admission_source_id_9,admission_source_id_10,admission_source_id_11,admission_source_id_13,admission_source_id_14,admission_source_id_17,admission_source_id_20,admission_source_id_22,admission_source_id_25,max_glu_serum_-99,max_glu_serum_0,max_glu_serum_1,A1Cresult_-99,A1Cresult_0,A1Cresult_1
time_in_hospital,1.0,0.319768,0.18996,0.464103,0.220954,0.0429999,0.0882617,-0.00752564,-0.0167823,-0.00484391,0.0178609,-0.0115802,-0.00582177,-0.0160632,-0.00467612,0.0305088,-0.0305088,-0.0251615,-0.0335147,-0.0356746,-0.0389957,-0.0387156,-0.0403948,-0.00290573,0.0378182,0.0630477,0.0221079,-0.00588291,0.0338278,-0.0124051,-0.00399662,-0.0339679,0.0148364,0.00475607,-0.0252609,-0.284528,-0.0260924,0.189028,0.0294322,0.0464033,0.13543,-0.041233,0.0271014,0.001118,0.00462543,-0.00133015,0.0329787,0.0223036,0.016934,-0.00137246,0.00611914,0.0559821,-0.00755665,0.000166108,0.0669972,0.0576745,-0.00332768,-0.00309547,0.00810296,-0.00695606,-0.0304582,0.00909651,0.00617303,0.0867111,0.046067,0.00211123,-0.00291549,0.00498177,-0.0115708,-4.34344e-05,-0.00283909,-0.00147628,-0.00208779,-0.0217338,0.00270498,0.0206512,0.00242001,-0.00928693,-0.0220324,0.0345833,-0.0665278,0.0407111,0.0496468
num_lab_procedures,0.319768,1.0,0.0517537,0.265151,0.149978,0.0429503,0.0432846,0.0033139,-0.0206543,0.00833454,0.0244595,-0.00792195,-0.0235361,-0.00161407,0.00228876,0.00304995,-0.00304995,-0.00451165,0.000614027,0.00109461,0.000713517,-0.00371955,-0.0105584,-0.0131103,0.000842927,0.021158,0.0131154,0.223886,-0.0536508,-0.197441,0.00165872,-0.1824,0.108596,0.00558204,0.00428391,-0.0601275,0.0108943,0.0528544,-0.00357595,0.0201391,0.0201732,0.00819577,0.00406424,0.000252421,0.00436247,-0.00156532,0.0156253,0.0144241,0.00295455,-0.0140552,-0.0154222,0.000973877,-0.000316101,-0.00169571,-0.0035982,0.00990169,-0.00163489,-0.0200258,0.00174917,0.00624555,-0.194931,-0.0105175,-0.00562577,-0.0375953,0.0183205,0.056808,0.242498,-0.0045402,0.00840509,0.000139516,-0.00852976,-0.00675628,0.00525225,-0.136817,0.00414139,-0.00703696,0.00263253,0.162586,-0.137196,-0.0899791,-0.259286,0.129257,0.213155
num_procedures,0.18996,0.0517537,1.0,0.381564,0.0697181,-0.0402652,-0.0772778,-0.0524081,-0.0338067,-0.00397474,-0.0318039,0.000143331,0.0322547,-0.0190138,0.0139349,-0.0605151,0.0605151,-0.0274897,-0.049153,-0.0431786,-0.0257668,0.000452136,0.0575888,0.0725547,0.0128564,-0.0848363,-0.0642517,-0.204134,0.0561656,0.210122,0.000406739,-0.0582254,0.0406256,0.0118497,0.0243674,0.0299135,0.00159045,-0.047983,-0.0305037,-0.0157405,0.0105014,-0.0313806,0.0165122,-0.00281794,-0.00074731,-0.00321125,-0.0153716,-0.0178671,-0.0041791,-0.0082043,-0.00776546,0.0292102,0.00877561,-0.000869553,0.0152715,0.0100669,-0.0107011,-0.003039,-0.00303741,-0.0216105,0.177611,0.0446772,0.0146083,0.122687,-0.00446602,-0.0401732,-0.203431,-0.00430132,-0.0155289,0.00614692,0.00701629,0.00310254,-0.000869553,-0.0031033,0.0279408,-0.00373984,0.00701629,0.0790896,-0.0552464,-0.0550989,0.0134151,-0.00853733,-0.00979176
num_medications,0.464103,0.265151,0.381564,1.0,0.261007,0.0425426,0.0729215,0.0252293,0.0551993,-0.00392263,-0.0400853,-0.0273546,0.0573967,-0.0351721,-0.0128404,0.0226189,-0.0226189,-0.0493158,-0.0793415,-0.0642403,-0.0464169,-0.0254197,0.0323044,0.0742289,0.0273864,-0.0396274,-0.0441777,-0.0845836,-0.0560678,0.15561,-0.00541131,-0.00212891,0.0136948,0.00223993,0.0105155,-0.179661,-0.00768218,0.0982159,-0.0101227,0.00263558,0.132543,-0.0337995,0.0104898,-0.00107496,-0.0031711,-0.00134037,0.00467496,0.000768069,0.00478208,-0.00786571,-0.00529645,0.005365,-0.00467428,0.00166708,0.0763083,0.0591424,0.002869,-0.0194032,1.611e-05,-0.00677648,0.0950798,0.0295186,0.0186921,0.0397011,0.017084,-0.0585381,-0.106931,0.00214558,-0.0195973,0.00195347,-0.00247515,-0.00116439,-0.0016467,0.0250051,0.0128921,0.00521112,0.00221937,0.0195429,-0.0388322,0.0112074,-0.0205039,0.0146439,0.0138995
number_diagnoses,0.220954,0.149978,0.0697181,0.261007,1.0,0.112033,0.129535,0.0822186,0.112152,-0.0579033,-0.0794205,-0.0153363,0.110077,-0.0365245,-0.0151283,0.00335475,-0.00335475,-0.0979834,-0.149087,-0.103192,-0.0978202,-0.0790371,-0.0420849,0.0233443,0.0696634,0.108612,0.0407403,0.102339,-0.00520577,-0.0744176,-0.00211079,0.00412644,-0.0802879,0.00677708,-0.0555081,-0.173515,0.0250791,0.145402,0.0139585,-0.00222416,0.104967,-0.0110761,-0.00081768,0.00334645,-0.0122862,0.00260962,0.0323558,0.029452,0.0148393,-0.00713047,0.000117348,-0.0684211,0.00445377,0.00366807,0.0374784,0.0197505,0.00432018,-0.0255761,0.00142442,0.00956368,-0.0952897,-0.0113244,-0.023486,-0.012306,0.037002,-0.151599,0.138119,-0.00847986,-0.0480368,0.00330073,-0.000943937,0.00259371,-0.000943937,-0.00311468,0.0191426,0.00569021,0.00366807,0.0523839,-0.042317,-0.0308502,0.0103983,0.0272399,-0.0302239
readmitted,0.0429999,0.0429503,-0.0402652,0.0425426,0.112033,1.0,0.207041,0.119912,0.103865,-0.0422427,-0.00361392,-0.0181952,0.0298119,-0.0131004,-0.0166052,0.0179859,-0.0179859,-0.0207519,-0.0104693,-0.00871526,-0.0171668,-0.012596,-0.016677,0.000682098,0.0239615,0.0221641,-0.0182269,0.0278706,-0.00115435,-0.0547715,-0.00349562,0.00382643,0.0360599,-0.0119394,-0.0131085,-0.0212636,-0.00556835,0.0120714,-0.00133858,-0.00279742,0.0560004,0.00260526,0.000679012,-0.00320156,0.00426762,-0.000928129,-0.0406311,-0.0476286,0.0052197,0.00303095,-0.00139622,-0.0301726,-0.0079592,-0.00397948,-0.00389578,-0.00358888,-0.000260047,0.00322966,-0.00323582,0.0013156,-0.0412184,-0.0167907,-0.00184676,-0.0574121,-0.0117028,-0.0612757,0.0791625,-0.00207548,-0.0177179,-0.00312655,-0.00397948,-0.0028139,-0.00397948,0.00646966,0.0143919,-0.00185634,-0.00397948,-0.011183,-0.00372849,0.0191665,0.0179397,-0.0194252,-0.00774054
number_inpatient_log1p,0.0882617,0.0432846,-0.0772778,0.0729215,0.129535,0.207041,1.0,0.27494,0.152502,-0.0443948,0.0249778,-0.0121334,0.00106917,-0.00232758,-0.0197432,0.0148883,-0.0148883,-0.0211876,-0.00968898,0.0369913,0.00986334,0.0110576,-0.0128985,-0.00890591,-0.00211363,0.00617322,-0.00636129,0.0447649,0.000409899,-0.0444001,-0.00500326,-0.0173867,-0.0034582,-0.00853381,-0.0102638,-0.0859398,0.00205399,0.0522206,0.00958378,-0.007171,0.0663973,0.0176424,0.00407926,0.0104361,0.00135461,-0.00100581,0.0139535,0.0155578,0.0100257,0.00260739,-0.00226664,0.00347991,-0.00113672,-0.00284438,-0.013726,0.0119745,0.00424124,-0.0219922,0.00126063,0.00525452,-0.0629246,-0.00763695,-0.0040151,-0.0184875,0.0231908,-0.0302599,0.0776762,-0.00381733,-0.0111659,-0.00265412,0.000190335,-0.00201127,-0.00284438,-0.0111739,-0.00344488,0.00902373,-0.00284438,-0.0235365,0.0062114,0.0264809,0.0750818,-0.0412575,-0.0591644
number_emergency_log1p,-0.00752564,0.0033139,-0.0524081,0.0252293,0.0822186,0.119912,0.27494,1.0,0.168623,-0.0353901,0.0485953,-0.0116052,-0.0327281,0.0059479,0.00427239,0.0289761,-0.0289761,-0.0107356,-0.00473378,0.0641755,0.0571899,0.0569879,0.0175558,-0.0294651,-0.0462975,-0.0195694,-0.011882,0.0292336,0.0107051,-0.0527317,-0.00323101,0.0406793,-0.028436,-0.00433502,-0.00595677,-0.0060608,0.00207624,0.0145056,-0.00737053,-0.00925296,0.0267613,0.0366303,-0.00822616,0.0105917,-0.00250268,-0.00176963,-0.00024934,0.00229976,0.00305265,0.0116268,0.0102574,-0.0593463,-0.000434197,0.00346631,-0.0130939,-0.000924886,-0.00348539,0.000169401,0.000821553,0.00735574,-0.0767069,-0.0241164,-0.0100325,-0.045294,0.0055791,-0.0399401,0.0799704,0.00650364,-0.00936497,-0.00288987,-0.00144489,-0.00102169,-0.00144489,0.0480874,-0.00485328,-0.00153434,-0.00144489,-0.0534232,0.0266747,0.0477094,0.0151334,-0.00572727,-0.0136556
number_outpatient_log1p,-0.0167823,-0.0206543,-0.0338067,0.0551993,0.112152,0.103865,0.152502,0.168623,1.0,-0.0250144,-0.0622482,-0.0147571,0.0732545,-0.00873071,-0.0113423,0.0139933,-0.0139933,-0.0152502,-0.01667,-0.00201165,-0.0165371,-0.01401,-0.0140826,0.0115986,0.0140594,0.0158226,-0.0125846,-0.00267646,-0.0320866,-0.031703,-0.000784637,0.154119,-0.0296073,-0.00538204,0.00327088,-0.0513521,0.0182688,0.0327005,-0.00745939,-0.0283944,0.0843796,-0.00733893,-0.00543619,0.000763326,-0.00310713,-0.00219704,-0.00696371,0.0127702,0.00616129,0.00244121,0.00503029,-0.0670546,1.97186e-05,-0.00179387,-0.0104078,0.00670685,0.00739903,0.00397183,-0.0028364,-0.000454792,0.0300959,-0.0253898,-0.0155473,-0.0513548,-0.0188023,-0.0525336,-0.0236677,-0.00359569,-0.0137339,-0.00358784,0.00181359,-0.00126845,-0.00179387,0.0838337,-0.0131728,-0.00292147,-0.00179387,-0.082254,0.0597802,0.055013,0.0400716,-0.0248867,-0.0296594
race_?,-0.00484391,0.00833454,-0.00397474,-0.00392263,-0.0579033,-0.0422427,-0.0443948,-0.0353901,-0.0250144,1.0,-0.0728969,-0.012049,-0.260207,-0.021715,-0.0185493,-0.0114592,0.0114592,-0.00435635,-0.00605853,0.00484829,-0.00288525,0.000670423,-0.00415035,-0.0014166,0.00229662,0.00266244,0.00350983,-0.0153842,0.048982,-0.0108615,-0.00151092,-0.0156039,-0.0154349,0.00301335,-0.00496848,0.0164576,0.00911497,0.00550709,0.0239092,0.00909316,-0.0169975,-0.00164984,-0.0029091,-0.00218965,-0.00117033,-0.000827534,-0.00740252,-0.00589986,-0.00379338,-0.00158467,-0.00178778,-0.023121,-0.00135139,-0.000675676,-0.0162157,-0.00338378,-0.00331088,-0.00416655,-0.00106835,-0.00382178,0.00754829,0.0233901,-0.00650439,0.0485504,-0.00917157,0.10266,-0.0570103,-0.00185053,0.130533,0.00620903,-0.000675676,-0.000477772,-0.000675676,-0.0183217,-0.00433209,-0.00165514,-0.000675676,0.019065,-0.0138823,-0.0127251,-0.00792713,-0.00793032,0.0144601


In [41]:
df = pd.get_dummies(df, columns=['race', 'gender', 'age', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'max_glu_serum', 'A1Cresult'])

In [42]:
non_cols = ['race', 'gender', 'age', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'max_glu_serum', 'A1Cresult']

In [43]:
numerics

['time_in_hospital',
 'num_medications',
 'number_inpatient_log1p',
 'number_emergency_log1p',
 'num_procedures',
 'number_diagnoses',
 'service_utilization_log1p',
 'number_outpatient_log1p',
 'num_lab_procedures']

In [44]:
new_non_cols = []
for i in non_cols:
    for j in df.columns:
        if i in j:
            new_non_cols.append(j)

In [72]:
train_input = df[numerics + new_non_cols]
train_output = df['change']

In [52]:
full_feature_list = ['time_in_hospital', 'num_medications', 'number_inpatient_log1p',
       'number_emergency_log1p', 'num_procedures', 'number_diagnoses',
       'service_utilization_log1p', 'number_outpatient_log1p',
       'num_lab_procedures', 'race_?', 'race_AfricanAmerican', 'race_Asian',
       'race_Caucasian', 'race_Hispanic', 'race_Other', 'gender_0', 'gender_1',
       'age_1', 'age_2', 'age_3', 'age_4', 'age_5', 'age_6', 'age_7', 'age_8',
       'age_9', 'age_10', 'admission_type_id_1', 'admission_type_id_2',
       'admission_type_id_3', 'admission_type_id_4', 'admission_type_id_5',
       'admission_type_id_6', 'admission_type_id_7', 'admission_type_id_8',
       'discharge_disposition_id_1', 'discharge_disposition_id_2',
       'discharge_disposition_id_3', 'discharge_disposition_id_4',
       'discharge_disposition_id_5', 'discharge_disposition_id_6',
       'discharge_disposition_id_7', 'discharge_disposition_id_8',
       'discharge_disposition_id_9', 'discharge_disposition_id_10',
       'discharge_disposition_id_12', 'discharge_disposition_id_13',
       'discharge_disposition_id_14', 'discharge_disposition_id_15',
       'discharge_disposition_id_16', 'discharge_disposition_id_17',
       'discharge_disposition_id_18', 'discharge_disposition_id_19',
       'discharge_disposition_id_20', 'discharge_disposition_id_22',
       'discharge_disposition_id_23', 'discharge_disposition_id_24',
       'discharge_disposition_id_25', 'discharge_disposition_id_27',
       'discharge_disposition_id_28', 'admission_source_id_1',
       'admission_source_id_2', 'admission_source_id_3',
       'admission_source_id_4', 'admission_source_id_5',
       'admission_source_id_6', 'admission_source_id_7',
       'admission_source_id_8', 'admission_source_id_9',
       'admission_source_id_10', 'admission_source_id_11',
       'admission_source_id_13', 'admission_source_id_14',
       'admission_source_id_17', 'admission_source_id_20',
       'admission_source_id_22', 'admission_source_id_25', 'max_glu_serum_-99',
       'max_glu_serum_0', 'max_glu_serum_1', 'A1Cresult_-99', 'A1Cresult_0',
       'A1Cresult_1']

In [65]:
feature_list = ['time_in_hospital', 'num_medications', 'num_procedures', 'number_diagnoses',
       'number_inpatient_log1p', 'number_emergency_log1p', 'number_outpatient_log1p',
       'num_lab_procedures', 'race_?', 'race_AfricanAmerican', 'race_Asian',
       'race_Caucasian', 'race_Hispanic', 'race_Other', 'gender_0', 'gender_1',
       'age_1', 'age_2', 'age_3', 'age_4', 'age_5', 'age_6', 'age_7', 'age_8', 'age_9', 'age_10',
       'max_glu_serum_-99', 'max_glu_serum_0', 'max_glu_serum_1',
       'A1Cresult_-99', 'A1Cresult_0', 'A1Cresult_1']

In [66]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
X_train, X_dev, Y_train, Y_dev = train_test_split(train_input[feature_list], train_output, test_size=0.20, random_state=0)
Y_dev = Y_dev.astype(int)
Y_train = Y_train.astype(int)
logreg = LogisticRegression(fit_intercept=True)
print("Cross Validation Score: {:.2%}".format(np.mean(cross_val_score(logreg, X_train, Y_train, cv=25))))
logreg.fit(X_train, Y_train)
print("Dev Set score: {:.2%}".format(logreg.score(X_dev, Y_dev)))

Cross Validation Score: 62.51%
Dev Set score: 62.55%


In [67]:
[print(fname, coeffs) for fname, coeffs in zip(X_train.columns, np.round(logreg.coef_[0],3))];

time_in_hospital -0.001
num_medications 6.513
num_procedures -0.831
number_diagnoses -0.112
number_inpatient_log1p -0.11
number_emergency_log1p 1.136
number_outpatient_log1p 0.21
num_lab_procedures -0.456
race_? -0.105
race_AfricanAmerican -0.197
race_Asian -0.088
race_Caucasian -0.187
race_Hispanic -0.023
race_Other 0.223
gender_0 -0.242
gender_1 -0.133
age_1 -0.82
age_2 -0.018
age_3 0.384
age_4 0.093
age_5 0.129
age_6 0.051
age_7 0.06
age_8 -0.006
age_9 -0.063
age_10 -0.186
max_glu_serum_-99 -0.023
max_glu_serum_0 -0.591
max_glu_serum_1 0.238
A1Cresult_-99 -0.344
A1Cresult_0 -0.373
A1Cresult_1 0.34
