In [1]:
# import libraries needed
import pandas as pd
import numpy as np
from statistics import mode
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
warnings.filterwarnings("ignore")

In [2]:
# load original data into dataframe
diabetesTrain = pd.read_csv('../data/train_multi.csv')
diabetesTest = pd.read_csv('../data/test_multi.csv')

diabetesTrain['IsTrain']=1
diabetesTest['IsTrain']=0

In [3]:
#load the initial data file:
diabetes = pd.concat([diabetesTrain, diabetesTest], axis=0)
diabetes.index = list(range(len(diabetes)))

In [4]:
print(diabetes.info())
diabetes.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101766 entries, 0 to 101765
Data columns (total 51 columns):
encounter_id                101766 non-null int64
patient_nbr                 101766 non-null int64
race                        99493 non-null object
gender                      101766 non-null object
age                         101766 non-null object
weight                      3197 non-null object
admission_type_id           101766 non-null int64
discharge_disposition_id    101766 non-null int64
admission_source_id         101766 non-null int64
time_in_hospital            101766 non-null int64
payer_code                  61510 non-null object
medical_specialty           51817 non-null object
num_lab_procedures          101766 non-null int64
num_procedures              101766 non-null int64
num_medications             101766 non-null int64
number_outpatient           101766 non-null int64
number_emergency            101766 non-null int64
number_inpatient            101766 non

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,IsTrain
count,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0
mean,165201600.0,54330400.0,2.024006,3.715642,5.754437,4.395987,43.095641,1.33973,16.021844,0.369357,0.197836,0.635566,7.422607,0.800012
std,102640300.0,38696360.0,1.445403,5.280166,4.064081,2.985108,19.674362,1.705807,8.127566,1.267265,0.930472,1.262863,1.9336,0.399993
min,12522.0,135.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
25%,84961190.0,23413220.0,1.0,1.0,1.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0,1.0
50%,152389000.0,45505140.0,1.0,1.0,7.0,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0,1.0
75%,230270900.0,87545950.0,3.0,4.0,7.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0,1.0
max,443867200.0,189502600.0,8.0,28.0,25.0,14.0,132.0,6.0,81.0,42.0,76.0,21.0,16.0,1.0


In [5]:
# exploring unique values in each column
for col in diabetes.columns:
    print(col, diabetes[col].unique())

encounter_id [  2278392    149190    500364 ... 443835512 443842340 443854148]
patient_nbr [  8222157  55629189  82442376 ... 175326800 139605341 120975314]
race ['Caucasian' 'AfricanAmerican' nan 'Other' 'Asian' 'Hispanic']
gender ['Female' 'Male' 'Unknown/Invalid']
age ['[0-10)' '[10-20)' '[30-40)' '[40-50)' '[50-60)' '[60-70)' '[70-80)'
 '[90-100)' '[80-90)' '[20-30)']
weight [nan '[75-100)' '[50-75)' '[0-25)' '[25-50)' '[100-125)' '[125-150)'
 '[175-200)' '[150-175)' '>200']
admission_type_id [6 1 2 3 4 5 8 7]
discharge_disposition_id [25  1  3  6  2 11  5  7 10  4 14 18  8 13 12 16 17 22 23 20 15  9 24 28
 19 27]
admission_source_id [ 1  7  2  4  5 20  6  3 17  8  9 10 22 14 11 13 25]
time_in_hospital [ 1  3  2  4  5 12  7 10  6 11 13  9 14  8]
payer_code [nan 'MC' 'MD' 'HM' 'SP' 'CP' 'UN' 'BC' 'SI' 'DM' 'CM' 'CH' 'PO' 'WC' 'OT'
 'OG' 'MP' 'FR']
medical_specialty ['Pediatrics-Endocrinology' nan 'InternalMedicine'
 'Family/GeneralPractice' 'Cardiology' 'Surgery-General'
 'Surgery-C

troglitazone ['No' 'Steady']
tolazamide ['No' 'Steady' 'Up']
examide ['No']
citoglipton ['No']
insulin ['No' 'Up' 'Steady' 'Down']
glyburide.metformin ['No' 'Steady' 'Up' 'Down']
glipizide.metformin ['No' 'Steady']
glimepiride.pioglitazone ['No' 'Steady']
metformin.rosiglitazone ['No' 'Steady']
metformin.pioglitazone ['No' 'Steady']
change ['No' 'Ch']
diabetesMed ['No' 'Yes']
readmitted ['NO' '>30' '<30']
IsTrain [1 0]


In [6]:
diabetes.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,insulin,glyburide.metformin,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change,diabetesMed,readmitted,IsTrain
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,...,No,No,No,No,No,No,No,No,NO,1
1,149190,55629189,Caucasian,Female,[10-20),,1,1,7,3,...,Up,No,No,No,No,No,Ch,Yes,>30,1
2,500364,82442376,Caucasian,Male,[30-40),,1,1,7,2,...,Up,No,No,No,No,No,Ch,Yes,NO,1
3,16680,42519267,Caucasian,Male,[40-50),,1,1,7,1,...,Steady,No,No,No,No,No,Ch,Yes,NO,1
4,35754,82637451,Caucasian,Male,[50-60),,2,1,2,3,...,Steady,No,No,No,No,No,No,Yes,>30,1


In [7]:
# make a copy of the dataframe for preprocessing
df = diabetes.copy(deep=True)

### Look into missingness of data set

In [8]:
# calculate missing values
missingvalues = pd.DataFrame(df.isnull().sum(axis=0), columns = ['number_of_missing_values'])
missingvalues['feature'] = missingvalues.index
missingvalues = missingvalues[['feature','number_of_missing_values']].reset_index (drop = True)
missingvalues[missingvalues.number_of_missing_values > 0]

Unnamed: 0,feature,number_of_missing_values
2,race,2273
5,weight,98569
10,payer_code,40256
11,medical_specialty,49949
18,diag_1,21
19,diag_2,358
20,diag_3,1423


In [9]:
# calculate missing values, which indicates as 'Unknown/Invalid'
print('gender', df['gender'][df['gender'] == 'Unknown/Invalid'].count())

gender 3


In [10]:
# re-encoding admission type, discharge type and admission source into fewer categories

df['admission_type_id'] = df['admission_type_id'].replace(2,1)
df['admission_type_id'] = df['admission_type_id'].replace(7,1)
df['admission_type_id'] = df['admission_type_id'].replace(6,5)
df['admission_type_id'] = df['admission_type_id'].replace(8,5)

df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(6,1)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(8,1)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(9,1)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(13,1)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(3,2)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(4,2)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(5,2)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(14,2)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(22,2)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(23,2)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(24,2)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(12,10)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(15,10)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(16,10)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(17,10)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(25,18)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(26,18)

df['admission_source_id'] = df['admission_source_id'].replace(2,1)
df['admission_source_id'] = df['admission_source_id'].replace(3,1)
df['admission_source_id'] = df['admission_source_id'].replace(5,4)
df['admission_source_id'] = df['admission_source_id'].replace(6,4)
df['admission_source_id'] = df['admission_source_id'].replace(10,4)
df['admission_source_id'] = df['admission_source_id'].replace(22,4)
df['admission_source_id'] = df['admission_source_id'].replace(25,4)
df['admission_source_id'] = df['admission_source_id'].replace(15,9)
df['admission_source_id'] = df['admission_source_id'].replace(17,9)
df['admission_source_id'] = df['admission_source_id'].replace(20,9)
df['admission_source_id'] = df['admission_source_id'].replace(21,9)
df['admission_source_id'] = df['admission_source_id'].replace(13,11)
df['admission_source_id'] = df['admission_source_id'].replace(14,11)

In [11]:
# calculate the readmission numbers
df['readmitted'] = df['readmitted'].replace('>30', 2)
df['readmitted'] = df['readmitted'].replace('<30', 1) #should we code it into 1 and 2?
df['readmitted'] = df['readmitted'].replace('NO', 0)

print('>30 readmissions', df['readmitted'][df['readmitted'] == 2].count())
print('<30 readmissions', df['readmitted'][df['readmitted'] == 1].count())
print('no readmissions', df['readmitted'][df['readmitted'] == 0].count())

>30 readmissions 35545
<30 readmissions 11357
no readmissions 54864


In [12]:
# dropping unknown race
drop_Idx = set(df['race'][df['race'] == np.nan].index)
# dropping rows where all diagnoses is ?
drop_Idx = set(df[(df['diag_1'] == np.nan) & (df['diag_2'] == np.nan) & (df['diag_3'] == np.nan)].index)
# dropping discharge_disposition_id = 11, which means the patient died
drop_Idx = drop_Idx.union(set(df[df['discharge_disposition_id'] == 11].index))
# dropping the missing values in gender
drop_Idx = drop_Idx.union(set(df['gender'][df['gender'] == 'Unknown/Invalid'].index))
new_Idx = list(set(df.index) - set(drop_Idx))
df = df.iloc[new_Idx]

In [13]:
# dropping columns with too many missing values
df = df.drop(['weight', 'payer_code', 'medical_specialty'], axis = 1)

In [14]:
# remove columns having same value in each row: citoglipton, examide
df = df.drop(['citoglipton', 'examide'], axis = 1)

In [15]:
# code the non-numeric values
df['change'] = df['change'].replace('Ch', 1)
df['change'] = df['change'].replace('No', 0)

df['gender'] = df['gender'].replace('Male', 1)
df['gender'] = df['gender'].replace('Female', 0)

df['diabetesMed'] = df['diabetesMed'].replace('Yes', 1)
df['diabetesMed'] = df['diabetesMed'].replace('No', 0)

# code age intervals [0-10) - [90-100) from 1-10
for i in range(0,10):
    df['age'] = df['age'].replace('['+str(10*i)+'-'+str(10*(i+1))+')', i+1)

### Feature Creation

#### number of medication changes
- 23 features for 23 drugs (or combos) which indicate 
    for each of these, whether a change in that medication was made or not during the current hospital stay 
    of patient.
    
- Medication change for diabetics upon admission has been shown by previous research to be associated 
    with lower readmission rates. 
    
- simplify model and see if there is a relationship with number of changes regardless of drug

In [16]:
# list of medications for use to create new feature
keys = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'glipizide', 'glyburide', 
        'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'insulin', 'glyburide.metformin', 'tolazamide', 
        'metformin.pioglitazone','metformin.rosiglitazone', 'glimepiride.pioglitazone', 
        'glipizide.metformin', 'troglitazone', 'tolbutamide', 'acetohexamide']

In [17]:
for col in keys:
    colname = str(col) + 'temp'
    df[colname] = df[col].apply(lambda x: 0 if (x == 'No' or x == 'Steady') else 1)
#change in drug/medications
df['numchange'] = 0

for col in keys:
    colname = str(col) + 'temp'
    df['numchange'] = df['numchange'] + df[colname]
    del df[colname]

In [18]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,glyburide.metformin,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change,diabetesMed,readmitted,IsTrain,numchange
0,2278392,8222157,Caucasian,0,1,5,18,1,1,41,...,No,No,No,No,No,0,0,0,1,0
1,149190,55629189,Caucasian,0,2,1,1,7,3,59,...,No,No,No,No,No,1,1,2,1,1
2,500364,82442376,Caucasian,1,4,1,1,7,2,44,...,No,No,No,No,No,1,1,0,1,1
3,16680,42519267,Caucasian,1,5,1,1,7,1,51,...,No,No,No,No,No,1,1,0,1,0
4,35754,82637451,Caucasian,1,6,1,1,1,3,31,...,No,No,No,No,No,0,1,2,1,0


In [19]:
df['numchange'].value_counts()

0    72869
1    25832
2     1308
3      107
4        5
Name: numchange, dtype: int64

In [20]:
# recode A1Cresult and max_glu_serum
for col in keys:
    df[col] = df[col].replace('No', 0)
    df[col] = df[col].replace('Steady', 1)
    df[col] = df[col].replace('Up', 1)
    df[col] = df[col].replace('Down', 1) 

df['A1Cresult'] = df['A1Cresult'].replace('>7', 1)
df['A1Cresult'] = df['A1Cresult'].replace('>8', 1)
df['A1Cresult'] = df['A1Cresult'].replace('Norm', 0)
df['A1Cresult'] = df['A1Cresult'].replace('None', -99) #83243 None


df['max_glu_serum'] = df['max_glu_serum'].replace('>200', 1)
df['max_glu_serum'] = df['max_glu_serum'].replace('>300', 1)
df['max_glu_serum'] = df['max_glu_serum'].replace('Norm', 0)
df['max_glu_serum'] = df['max_glu_serum'].replace('None', -99) #94895 None

In [21]:
#inspect df
print(df['A1Cresult'].value_counts())
print(df['max_glu_serum'].value_counts())

-99    83244
 1     11935
 0      4942
Name: A1Cresult, dtype: int64
-99    94896
 1      2651
 0      2574
Name: max_glu_serum, dtype: int64


#### Create new feature for number of medications used:
indication of severity of condition and/or the intensity of care during encounter

In [22]:
df['num_meds'] = 0

for col in keys:
    df['num_meds'] = df['num_meds'] + df[col]

In [23]:
df['num_meds'].value_counts()

1    46438
0    22845
2    21712
3     7738
4     1325
5       58
6        5
Name: num_meds, dtype: int64

#### Recode disease codes

In [24]:
# disease codes starting with V or E are in “other” category; so recode them to 0
df.loc[df['diag_1'].str.contains('V', na=False), ['diag_1']] = 0
df.loc[df['diag_1'].str.contains('E', na=False), ['diag_1']] = 0

# replace the unknown values with -1
df['diag_1'] = df['diag_1'].replace('?', -1)

# convert this variable to float type to enable computations later
df['diag_1'] = df['diag_1'].astype(float)

# iterate and recode disease codes between certain ranges to certain categories
for index, row in df.iterrows():
    if (row['diag_1'] >= 390 and row['diag_1'] < 460) or (np.floor(row['diag_1']) == 785):
        df.loc[index, 'diag_1'] = 1
    elif (row['diag_1'] >= 460 and row['diag_1'] < 520) or (np.floor(row['diag_1']) == 786):
        df.loc[index, 'diag_1'] = 2
    elif (row['diag_1'] >= 520 and row['diag_1'] < 580) or (np.floor(row['diag_1']) == 787):
        df.loc[index, 'diag_1'] = 3
    elif (np.floor(row['diag_1']) == 250):
        df.loc[index, 'diag_1'] = 4
    elif (row['diag_1'] >= 800 and row['diag_1'] < 1000):
        df.loc[index, 'diag_1'] = 5
    elif (row['diag_1'] >= 710 and row['diag_1'] < 740):
        df.loc[index, 'diag_1'] = 6
    elif (row['diag_1'] >= 580 and row['diag_1'] < 630) or (np.floor(row['diag_1']) == 788):
        df.loc[index, 'diag_1'] = 7
    elif (row['diag_1'] >= 140 and row['diag_1'] < 240):
        df.loc[index, 'diag_1'] = 8
    else:
        df.loc[index, 'diag_1'] = 0

In [25]:
# disease codes starting with V or E are in “other” category; so recode them to 0
df.loc[df['diag_2'].str.contains('V', na=False), ['diag_2']] = 0
df.loc[df['diag_2'].str.contains('E', na=False), ['diag_2']] = 0

# replace the unknown values with -1
df['diag_2'] = df['diag_2'].replace('?', -1)

# convert this variable to float type to enable computations later
df['diag_2'] = df['diag_2'].astype(float)

# iterate and recode disease codes between certain ranges to certain categories
for index, row in df.iterrows():
    if (row['diag_2'] >= 390 and row['diag_2'] < 460) or (np.floor(row['diag_2']) == 785):
        df.loc[index, 'diag_2'] = 1
    elif (row['diag_2'] >= 460 and row['diag_2'] < 520) or (np.floor(row['diag_2']) == 786):
        df.loc[index, 'diag_2'] = 2
    elif (row['diag_2'] >= 520 and row['diag_2'] < 580) or (np.floor(row['diag_2']) == 787):
        df.loc[index, 'diag_2'] = 3
    elif (np.floor(row['diag_2']) == 250):
        df.loc[index, 'diag_2'] = 4
    elif (row['diag_2'] >= 800 and row['diag_2'] < 1000):
        df.loc[index, 'diag_2'] = 5
    elif (row['diag_2'] >= 710 and row['diag_2'] < 740):
        df.loc[index, 'diag_2'] = 6
    elif (row['diag_2'] >= 580 and row['diag_2'] < 630) or (np.floor(row['diag_2']) == 788):
        df.loc[index, 'diag_2'] = 7
    elif (row['diag_2'] >= 140 and row['diag_2'] < 240):
        df.loc[index, 'diag_2'] = 8
    else:
        df.loc[index, 'diag_2'] = 0

In [26]:
# disease codes starting with V or E are in “other” category; so recode them to 0
df.loc[df['diag_3'].str.contains('V', na=False), ['diag_3']] = 0
df.loc[df['diag_3'].str.contains('E', na=False), ['diag_3']] = 0

# replace the unknown values with -1
df['diag_3'] = df['diag_3'].replace('?', -1)

# convert this variable to float type to enable computations later
df['diag_3'] = df['diag_3'].astype(float)

# iterate and recode disease codes between certain ranges to certain categories
for index, row in df.iterrows():
    if (row['diag_3'] >= 390 and row['diag_3'] < 460) or (np.floor(row['diag_3']) == 785):
        df.loc[index, 'diag_3'] = 1
    elif (row['diag_3'] >= 460 and row['diag_3'] < 520) or (np.floor(row['diag_3']) == 786):
        df.loc[index, 'diag_3'] = 2
    elif (row['diag_3'] >= 520 and row['diag_3'] < 580) or (np.floor(row['diag_3']) == 787):
        df.loc[index, 'diag_3'] = 3
    elif (np.floor(row['diag_3']) == 250):
        df.loc[index, 'diag_3'] = 4
    elif (row['diag_3'] >= 800 and row['diag_3'] < 1000):
        df.loc[index, 'diag_3'] = 5
    elif (row['diag_3'] >= 710 and row['diag_3'] < 740):
        df.loc[index, 'diag_3'] = 6
    elif (row['diag_3'] >= 580 and row['diag_3'] < 630) or (np.floor(row['diag_3']) == 788):
        df.loc[index, 'diag_3'] = 7
    elif (row['diag_3'] >= 140 and row['diag_3'] < 240):
        df.loc[index, 'diag_3'] = 8
    else:
        df.loc[index, 'diag_3'] = 0

In [27]:
# convert data type of nominal features in dataframe to 'object' type
i = ['encounter_id', 'patient_nbr', 'gender', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
          'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 
          'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose','miglitol',
          'troglitazone', 'tolazamide', 'insulin', 'glyburide.metformin', 'glipizide.metformin',
          'glimepiride.pioglitazone', 'metformin.rosiglitazone', 'metformin.pioglitazone', 'change', 'diabetesMed',
          'age', 'A1Cresult', 'max_glu_serum', 'diag_1', 'diag_2', 'diag_3']

df[i] = df[i].astype('object')

In [28]:
df.dtypes

encounter_id                object
patient_nbr                 object
race                        object
gender                      object
age                         object
admission_type_id           object
discharge_disposition_id    object
admission_source_id         object
time_in_hospital             int64
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride                 object
acetohexamide               object
glipizide                   object
glyburide           

In [29]:
# get list of only numeric features to be used in checking for skewness and kurtosis
num_col = list(set(list(df._get_numeric_data().columns))- {'IsTrain','readmitted'})
num_col

['number_inpatient',
 'number_emergency',
 'num_procedures',
 'time_in_hospital',
 'number_diagnoses',
 'numchange',
 'num_medications',
 'num_lab_procedures',
 'num_meds',
 'number_outpatient']

In [30]:
# Removing skewnewss and kurtosis using log transformation if it is above a threshold value (2)

statdataframe = pd.DataFrame()
statdataframe['numeric_column'] = num_col
skew_before = []
skew_after = []

kurt_before = []
kurt_after = []

standard_deviation_before = []
standard_deviation_after = []

log_transform_needed = []

log_type = []

for i in num_col:
    skewval = df[i].skew()
    skew_before.append(skewval)
    
    kurtval = df[i].kurtosis()
    kurt_before.append(kurtval)
    
    sdval = df[i].std()
    standard_deviation_before.append(sdval)
    
    if (abs(skewval) >2) & (abs(kurtval) >2):
        log_transform_needed.append('Yes')
        
        if len(df[df[i] == 0])/len(df) <=0.02:
            log_type.append('log')
            skewvalnew = np.log(pd.DataFrame(df[train_data[i] > 0])[i]).skew()
            skew_after.append(skewvalnew)
            
            kurtvalnew = np.log(pd.DataFrame(df[train_data[i] > 0])[i]).kurtosis()
            kurt_after.append(kurtvalnew)
            
            sdvalnew = np.log(pd.DataFrame(df[train_data[i] > 0])[i]).std()
            standard_deviation_after.append(sdvalnew)
            
        else:
            log_type.append('log1p')
            skewvalnew = np.log1p(pd.DataFrame(df[df[i] >= 0])[i]).skew()
            skew_after.append(skewvalnew)
        
            kurtvalnew = np.log1p(pd.DataFrame(df[df[i] >= 0])[i]).kurtosis()
            kurt_after.append(kurtvalnew)
            
            sdvalnew = np.log1p(pd.DataFrame(df[df[i] >= 0])[i]).std()
            standard_deviation_after.append(sdvalnew)
            
    else:
        log_type.append('NA')
        log_transform_needed.append('No')
        
        skew_after.append(skewval)
        kurt_after.append(kurtval)
        standard_deviation_after.append(sdval)

statdataframe['skew_before'] = skew_before
statdataframe['kurtosis_before'] = kurt_before
statdataframe['standard_deviation_before'] = standard_deviation_before
statdataframe['log_transform_needed'] = log_transform_needed
statdataframe['log_type'] = log_type
statdataframe['skew_after'] = skew_after
statdataframe['kurtosis_after'] = kurt_after
statdataframe['standard_deviation_after'] = standard_deviation_after

In [31]:
statdataframe

Unnamed: 0,numeric_column,skew_before,kurtosis_before,standard_deviation_before,log_transform_needed,log_type,skew_after,kurtosis_after,standard_deviation_after
0,number_inpatient,3.626407,20.833698,1.261819,Yes,log1p,1.45047,1.405323,0.510432
1,number_emergency,22.842356,1185.257368,0.935512,Yes,log1p,3.661086,16.27681,0.315404
2,num_procedures,1.326013,0.890779,1.700327,No,,1.326013,0.890779,1.700327
3,time_in_hospital,1.137904,0.871134,2.974524,No,,1.137904,0.871134,2.974524
4,number_diagnoses,-0.867752,-0.109084,1.938216,No,,-0.867752,-0.109084,1.938216
5,numchange,1.425326,1.433667,0.487856,No,,1.425326,1.433667,0.487856
6,num_medications,1.333051,3.523542,8.092574,No,,1.333051,3.523542,8.092574
7,num_lab_procedures,-0.24151,-0.253134,19.620578,No,,-0.24151,-0.253134,19.620578
8,num_meds,0.67512,0.27723,0.921623,No,,0.67512,0.27723,0.921623
9,number_outpatient,8.818332,148.559902,1.263968,Yes,log1p,2.733932,7.804336,0.429392


In [32]:
# log transformation for the columns determined to be needing it above.

for i in range(len(statdataframe)):
    if statdataframe['log_transform_needed'][i] == 'Yes':
        colname = str(statdataframe['numeric_column'][i])
        
        if statdataframe['log_type'][i] == 'log':
            df = df[df[colname] > 0]
            df[colname + "_log"] = np.log(df[colname])
            
        elif statdataframe['log_type'][i] == 'log1p':
            df = df[df[colname] >= 0]
            df[colname + "_log1p"] = np.log1p(df[colname])

In [33]:
# get list of only numeric features
numeric = list(set(list(df._get_numeric_data().columns))- {'IsTrain','readmitted'})

In [34]:
# show list of features that are categorical
# df.encounter_id = df.encounter_id.astype('int64')
# df.patient_nbr = df.patient_nbr.astype('int64')
df.diabetesMed = df.diabetesMed.astype('int64')
df.change = df.change.astype('int64')

# convert data type of nominal features in dataframe to 'object' type for aggregating
i = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', \
          'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose','miglitol', \
          'troglitazone', 'tolazamide', 'insulin', 'glyburide.metformin', 'glipizide.metformin', \
          'glimepiride.pioglitazone', 'metformin.rosiglitazone', 'metformin.pioglitazone','A1Cresult']
df[i] = df[i].astype('int64')

df.dtypes

encounter_id                 object
patient_nbr                  object
race                         object
gender                       object
age                          object
admission_type_id            object
discharge_disposition_id     object
admission_source_id          object
time_in_hospital              int64
num_lab_procedures            int64
num_procedures                int64
num_medications               int64
number_outpatient             int64
number_emergency              int64
number_inpatient              int64
diag_1                       object
diag_2                       object
diag_3                       object
number_diagnoses              int64
max_glu_serum                object
A1Cresult                     int64
metformin                     int64
repaglinide                   int64
nateglinide                   int64
chlorpropamide                int64
glimepiride                   int64
acetohexamide                 int64
glipizide                   

In [35]:
df.A1Cresult.value_counts()

-99    83244
 1     11935
 0      4942
Name: A1Cresult, dtype: int64

In [36]:
# convert readmitted over 30 days to 0
df['readmitted'] = df['readmitted'].apply(lambda x: 0 if x == 2 else x)

#### Interaction variables

- effect of one of the variables differs depending on the level of the other variable
- e.g. length of stay affects number of procedures done


In [37]:
interaction_var = [('num_medications','time_in_hospital'),
('num_medications','num_procedures'),
('time_in_hospital','num_lab_procedures'),
('num_medications','num_lab_procedures'),
('num_medications','number_diagnoses'),
('age','number_diagnoses'),
('change','num_medications'),
('number_diagnoses','time_in_hospital'),
('num_medications','numchange')]

for inter in interaction_var:
    name = inter[0] + '|' + inter[1]
    df[name] = df[inter[0]] * df[inter[1]]

In [38]:
df[['num_medications','time_in_hospital', 'num_medications|time_in_hospital']].head()

Unnamed: 0,num_medications,time_in_hospital,num_medications|time_in_hospital
0,1,1,1
1,18,3,54
2,16,2,32
3,8,1,8
4,16,3,48


In [39]:
### Scaling applied to df; using df2 from here
datf = pd.DataFrame()
datf['features'] = numeric
datf['std_dev'] = datf['features'].apply(lambda x: df[x].std())
datf['mean'] = datf['features'].apply(lambda x: df[x].mean())

In [40]:
# Logical order: duplicate removal, then outlier removal followed by scaling
# dropping multiple encounters while keeping either first or last encounter of these patients
df2 = df.drop_duplicates(subset= ['patient_nbr'], keep = 'first')
df2.shape

(70443, 60)

In [41]:
# standardize function
def standardize(raw_data):
    return ((raw_data - np.mean(raw_data, axis = 0)) / np.std(raw_data, axis = 0))

In [42]:
df2[numeric] = standardize(df2[numeric])
import scipy.stats as sp
df2 = df2[(np.abs(sp.zscore(df2[numeric])) < 3).all(axis=1)]

In [43]:
import seaborn as sns
from matplotlib.colors import ListedColormap

my_cmap = ListedColormap(sns.light_palette((250, 100, 50), input="husl", n_colors=50).as_hex())
# drop some columns due to their means is round to 0
# table = df.drop(['acetohexamide','tolbutamide', 'troglitazone', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone','patient_nbr', 'encounter_id', 'service_utilization_log1p'], axis = 1).corr(method='pearson')
table = df2.drop(['patient_nbr', 'encounter_id'], axis=1).corr(method='pearson')
table.style.background_gradient(cmap=my_cmap, axis = 0)

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide.metformin,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change,diabetesMed,readmitted,IsTrain,numchange,num_meds,number_inpatient_log1p,number_emergency_log1p,number_outpatient_log1p,num_medications|time_in_hospital,num_medications|num_procedures,time_in_hospital|num_lab_procedures,num_medications|num_lab_procedures,num_medications|number_diagnoses,change|num_medications,number_diagnoses|time_in_hospital,num_medications|numchange
time_in_hospital,1.0,0.321422,0.142128,0.441269,-0.0242403,-0.00377071,0.0603086,0.23108,0.0686566,-0.0131906,0.0314425,0.00683306,0.00130456,0.0113109,,0.0142184,0.0248419,0.00305761,-0.00419106,0.00071561,-0.000700619,-0.00209051,0.00565119,-0.00651072,0.0960369,-0.00153164,0.00082738,,-0.00307462,0.00277518,0.0952036,0.0605008,0.0535551,0.00614847,0.13827,0.065374,0.0603086,-0.00377071,-0.0249324,0.859278,0.268218,0.854669,0.503453,0.456436,0.232548,0.935697,0.22915
num_lab_procedures,0.321422,1.0,0.0128407,0.230022,-0.0382266,0.00541652,0.0425208,0.143153,0.271241,-0.0517005,0.00554208,-0.00787776,-0.00324057,-0.00695697,,0.0158403,0.000510257,0.00168761,-0.0183142,-0.0108082,-0.0042415,-0.00411569,0.00513405,-0.00222464,0.102512,-0.0159054,-0.00059664,,0.00180388,-0.00392897,0.0647629,0.0390746,0.0282952,-0.00976447,0.11242,0.0270223,0.0425208,0.00541652,-0.0396103,0.329521,0.113424,0.67301,0.730303,0.259308,0.134591,0.324129,0.155646
num_procedures,0.142128,0.0128407,1.0,0.336372,-0.0182694,-0.0276392,-0.0209318,0.0721486,-0.0312722,-0.0564241,0.00316871,-0.00613946,0.00516638,0.00787612,,0.00493563,0.00421683,-0.00239266,0.013088,0.0103994,0.00122053,-0.00103091,-0.00560175,-1.18997e-05,0.000411625,-0.00792691,-0.00409578,,-0.00323412,-0.000890984,-0.0154681,-0.0177623,0.00252691,-0.00615282,-0.011331,-0.0138177,-0.0209318,-0.0276392,-0.0178414,0.263069,0.877877,0.136626,0.245463,0.294636,0.112056,0.153807,0.0763169
num_medications,0.441269,0.230022,0.336372,1.0,0.0337059,0.0143535,0.0448937,0.263812,0.0119833,0.0785685,0.0259391,0.0281815,0.00167633,0.0458825,,0.0558185,0.0441653,0.00298901,0.0750351,0.0586854,0.0123469,0.00250466,0.0042301,-0.000880399,0.192207,0.0111756,0.00483698,,-0.00223379,0.00281629,0.234656,0.190265,0.0412812,-0.000954342,0.192817,0.22801,0.0448937,0.0143535,0.0356686,0.752277,0.59909,0.43997,0.773731,0.895569,0.534099,0.46374,0.393927
number_outpatient,-0.0242403,-0.0382266,-0.0182694,0.0337059,1.0,0.0984239,0.0564355,0.0637195,-0.0327182,0.0247493,0.00367538,0.0150669,-0.00509704,-0.000482775,,0.00514305,0.0122648,-0.0024335,0.0199191,0.0112678,0.015018,0.00292247,-0.00212656,-0.00477226,-0.00755204,-0.0029738,0.0131781,,-0.00122775,-0.00122775,0.0278144,0.0245636,0.0121446,0.0117249,0.0177762,0.0232899,0.0564355,0.0984239,0.992823,-0.00224822,-0.0119434,-0.0310818,-0.0020623,0.0489253,0.0334948,-0.00156223,0.0213346
number_emergency,-0.00377071,0.00541652,-0.0276392,0.0143535,0.0984239,1.0,0.0823429,0.0520307,-0.0139423,-0.00332065,0.0130169,0.00638293,-0.00769707,0.00499763,,0.00430984,-0.014163,-0.00378824,0.0158191,0.0125039,-0.00234467,0.0149687,-0.00164018,0.00160183,0.0391356,0.00174101,-0.00231963,,-0.000946945,-0.000946945,0.0415306,0.0354441,0.015117,0.00542,0.0465269,0.027477,0.0823429,1.0,0.10065,0.00524618,-0.0171189,-0.00170949,0.0148101,0.0342056,0.0358569,0.0157441,0.0422041
number_inpatient,0.0603086,0.0425208,-0.0209318,0.0448937,0.0564355,0.0823429,1.0,0.0699958,-0.0519959,-0.021964,0.016319,0.00122465,0.00342653,0.00201904,,0.000311792,-0.00220992,0.00108808,-0.00332678,-0.00300814,-0.00551716,-0.00192041,-0.00237858,-0.00495221,0.0278987,-0.00612131,-0.00336391,,0.0117887,-0.00137325,0.00895118,0.0236349,0.0558543,0.0363126,0.0190709,0.00476615,1.0,0.0823429,0.0586904,0.0513956,-0.00674001,0.0539872,0.0519943,0.0641483,0.020045,0.0703599,0.0244951
number_diagnoses,0.23108,0.143153,0.0721486,0.263812,0.0637195,0.0520307,0.0699958,1.0,-0.00614239,-0.068781,0.0326669,0.015068,-0.0142341,0.013258,,-0.00713844,-0.0221087,-0.000467598,0.00781752,-0.00596435,0.00535556,-0.00143349,0.00520647,-0.0125278,0.0801757,-0.00223638,0.00240136,,0.00368111,-0.00644687,0.0433029,0.0193094,0.0450799,0.0115086,0.065245,0.0115529,0.0699958,0.0520307,0.0656108,0.254672,0.122663,0.226561,0.263454,0.614944,0.130597,0.495774,0.125167
A1Cresult,0.0686566,0.271241,-0.0312722,0.0119833,-0.0327182,-0.0139423,-0.0519959,-0.00614239,1.0,0.0369654,0.0150467,0.0011316,-0.00372833,0.0161293,,0.00771678,0.000619434,-0.00228707,-0.00522496,0.00301742,-0.00253856,-0.00187533,-0.00326477,-0.000415913,0.0999305,-0.00591185,0.00396085,,-0.00188488,-0.00188488,0.0920847,0.0725244,-0.00678327,-0.00754762,0.110001,0.0773425,-0.0519959,-0.0139423,-0.0340228,0.057138,-0.000952452,0.172135,0.170879,0.0218947,0.0745783,0.0543342,0.0957209
metformin,-0.0131906,-0.0517005,-0.0564241,0.0785685,0.0247493,-0.00332065,-0.021964,-0.068781,0.0369654,1.0,0.000686546,0.0161401,-0.0105832,0.039933,,0.0705006,0.138535,-0.00821935,0.0504043,0.0920853,0.00432394,0.00999827,-0.00355871,3.82331e-06,-0.0271567,-0.0288618,-0.000977218,,0.00787935,0.00787935,0.352689,0.295329,-0.00774915,-0.00575451,0.0183353,0.541675,-0.021964,-0.00332065,0.0240559,0.0167058,-0.0125963,-0.0372305,0.00672593,0.0276982,0.29912,-0.0351848,0.0248369


In [44]:
pd.options.display.max_rows = 400

c = df2.corr().abs()
s = c.unstack()
print(s.shape)
so = s.sort_values(ascending=False)

so[38:120]

(2209,)


glyburide                            glyburide                              1.000000
tolbutamide                          tolbutamide                            1.000000
pioglitazone                         pioglitazone                           1.000000
rosiglitazone                        rosiglitazone                          1.000000
change|num_medications               change|num_medications                 1.000000
tolazamide                           tolazamide                             1.000000
time_in_hospital                     time_in_hospital                       1.000000
number_emergency_log1p               number_emergency                       1.000000
number_emergency                     number_emergency_log1p                 1.000000
number_inpatient                     number_inpatient_log1p                 1.000000
number_inpatient_log1p               number_inpatient                       1.000000
number_outpatient_log1p              number_outpatient           

In [57]:
df2['diag_1'] = df2['diag_1'].astype('object')
df_pd = pd.get_dummies(df2, columns=['race', 'gender', 'admission_type_id', 'discharge_disposition_id',
                                      'admission_source_id', 'max_glu_serum', 'A1Cresult', 'diag_1'])

non_num_cols = ['race', 'gender', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'max_glu_serum', 'A1Cresult', 'diag_1' ]

In [46]:
num_cols = list(set(list(df._get_numeric_data().columns))- {'readmitted', 'change','IsTrain'})
num_cols

['number_emergency',
 'glyburide.metformin',
 'glipizide.metformin',
 'number_diagnoses|time_in_hospital',
 'diabetesMed',
 'num_medications|num_procedures',
 'number_inpatient',
 'glipizide',
 'number_emergency_log1p',
 'num_medications|number_diagnoses',
 'chlorpropamide',
 'insulin',
 'troglitazone',
 'num_procedures',
 'time_in_hospital',
 'number_diagnoses',
 'metformin',
 'metformin.pioglitazone',
 'num_medications|time_in_hospital',
 'num_lab_procedures',
 'miglitol',
 'change|num_medications',
 'repaglinide',
 'glimepiride.pioglitazone',
 'number_outpatient_log1p',
 'num_medications|num_lab_procedures',
 'A1Cresult',
 'acarbose',
 'num_meds',
 'number_inpatient_log1p',
 'acetohexamide',
 'rosiglitazone',
 'num_medications|numchange',
 'num_medications',
 'time_in_hospital|num_lab_procedures',
 'metformin.rosiglitazone',
 'glyburide',
 'tolbutamide',
 'pioglitazone',
 'glimepiride',
 'numchange',
 'tolazamide',
 'nateglinide',
 'number_outpatient']

In [58]:
new_non_num_cols = []
for i in non_num_cols:
    for j in df_pd.columns:
        if i in j:
            new_non_num_cols.append(j)

# new_non_num_cols
new_non_num_cols

['race_AfricanAmerican',
 'race_Asian',
 'race_Caucasian',
 'race_Hispanic',
 'race_Other',
 'gender_0',
 'gender_1',
 'admission_type_id_1',
 'admission_type_id_3',
 'admission_type_id_4',
 'admission_type_id_5',
 'discharge_disposition_id_1',
 'discharge_disposition_id_2',
 'discharge_disposition_id_7',
 'discharge_disposition_id_10',
 'discharge_disposition_id_18',
 'discharge_disposition_id_19',
 'discharge_disposition_id_20',
 'discharge_disposition_id_27',
 'discharge_disposition_id_28',
 'admission_source_id_1',
 'admission_source_id_4',
 'admission_source_id_7',
 'admission_source_id_8',
 'admission_source_id_9',
 'admission_source_id_11',
 'max_glu_serum_-99',
 'max_glu_serum_0',
 'max_glu_serum_1',
 'A1Cresult_-99',
 'A1Cresult_0',
 'A1Cresult_1',
 'diag_1_0.0',
 'diag_1_1.0',
 'diag_1_2.0',
 'diag_1_3.0',
 'diag_1_4.0',
 'diag_1_5.0',
 'diag_1_6.0',
 'diag_1_7.0',
 'diag_1_8.0']

In [59]:
l = []
for feature in list(df_pd.columns):
    if '|' in feature:
        l.append(feature)
l

['num_medications|time_in_hospital',
 'num_medications|num_procedures',
 'time_in_hospital|num_lab_procedures',
 'num_medications|num_lab_procedures',
 'num_medications|number_diagnoses',
 'age|number_diagnoses',
 'change|num_medications',
 'number_diagnoses|time_in_hospital',
 'num_medications|numchange']

In [60]:
df_pd['IsTrain'].head()

1    1
2    1
3    1
4    1
5    1
Name: IsTrain, dtype: int64

In [61]:
df_train = df_pd[df_pd['IsTrain']==1]
df_train.index = list(range(len(df_train)))

df_test = df_pd[df_pd['IsTrain']==0]
df_test.index = list(range(len(df_test)))

In [62]:
df_train.drop(['encounter_id','patient_nbr','IsTrain', 'diag_2', 'diag_3'], axis = 1, inplace = True)
df_test.drop(['encounter_id','patient_nbr','IsTrain', 'diag_2', 'diag_3'], axis = 1, inplace=True)

In [63]:
df_train.columns

Index(['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses', 'metformin', 'repaglinide',
       'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide',
       'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
       'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
       'insulin', 'glyburide.metformin', 'glipizide.metformin',
       'glimepiride.pioglitazone', 'metformin.rosiglitazone',
       'metformin.pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'numchange', 'num_meds', 'number_inpatient_log1p',
       'number_emergency_log1p', 'number_outpatient_log1p',
       'num_medications|time_in_hospital', 'num_medications|num_procedures',
       'time_in_hospital|num_lab_procedures',
       'num_medications|num_lab_procedures',
       'num_medications|number_diagnoses', 'age|number_diagnoses',
       'change|num_

In [64]:
df_train.head()

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,metformin,...,A1Cresult_1,diag_1_0.0,diag_1_1.0,diag_1_2.0,diag_1_3.0,diag_1_4.0,diag_1_5.0,diag_1_6.0,diag_1_7.0,diag_1_8.0
0,2,-0.438603,0.805968,-0.80931,0.277594,-0.264763,-0.205982,-0.329314,0.879788,0,...,0,1,0,0,0,0,0,0,0,0
1,4,-0.777582,0.051157,-0.238557,0.036471,-0.264763,-0.205982,-0.329314,-0.121681,0,...,0,1,0,0,0,0,0,0,0,0
2,5,-1.116561,0.403402,-0.80931,-0.928017,-0.264763,-0.205982,-0.329314,-1.123149,0,...,0,0,0,0,0,0,0,0,0,1
3,6,-0.438603,-0.603012,2.615208,0.036471,-0.264763,-0.205982,-0.329314,0.879788,0,...,0,0,1,0,0,0,0,0,0,0
4,7,-0.099625,1.359497,-0.238557,0.639277,-0.264763,-0.205982,-0.329314,-0.121681,1,...,0,0,1,0,0,0,0,0,0,0


In [65]:
df_test.head()

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,metformin,...,A1Cresult_1,diag_1_0.0,diag_1_1.0,diag_1_2.0,diag_1_3.0,diag_1_4.0,diag_1_5.0,diag_1_6.0,diag_1_7.0,diag_1_8.0
0,3,-0.777582,-1.609427,2.044455,-0.325212,1.592509,-0.205982,1.224358,-0.622415,0,...,0,1,0,0,0,0,0,0,0,0
1,9,2.951182,1.258855,0.332196,1.483204,-0.264763,-0.205982,-0.329314,0.379053,0,...,0,0,1,0,0,0,0,0,0,0
2,5,1.595268,0.202119,0.332196,0.157032,-0.264763,-0.205982,-0.329314,0.879788,0,...,0,0,0,0,0,1,0,0,0,0
3,8,-0.438603,-1.206861,1.473702,0.277594,-0.264763,-0.205982,-0.329314,-0.622415,0,...,0,0,0,0,0,0,0,0,0,1
4,8,-1.116561,0.504044,-0.80931,-0.686895,-0.264763,-0.205982,-0.329314,0.379053,0,...,0,0,0,1,0,0,0,0,0,0


In [66]:
#### save final dataset to csv, split back into train and test data 
df_train.to_csv('df_train.csv', index=0)
df_test.to_csv('df_test.csv', index=0)
# no encounter or patient id