In [25]:
import pandas as pd
import numpy as np
import math

In [26]:
# Import the train and test CSV files and tag every row with an IsTrain flag
# (1 = train, 0 = test) so the two sets can be combined now and told apart later.
diabetesTrain = pd.read_csv('train_multi.csv').assign(IsTrain=1)
diabetesTest = pd.read_csv('test_multi.csv').assign(IsTrain=0)

In [27]:
# Stack the train rows on top of the test rows, then relabel the rows 0..n-1.
# The concatenation keeps each file's own row labels, so without the relabel
# the combined frame would carry duplicate index values.
diabetes01 = pd.concat((diabetesTrain, diabetesTest), axis=0)
diabetes01.index = list(range(diabetes01.shape[0]))

In [28]:
#Examine the initial data file:
diabetes01.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,insulin,glyburide.metformin,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change,diabetesMed,readmitted,IsTrain
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,...,No,No,No,No,No,No,No,No,NO,1
1,64410,86047875,AfricanAmerican,Female,[20-30),,1,1,7,2,...,No,No,No,No,No,No,No,Yes,NO,1
2,55842,84259809,Caucasian,Male,[60-70),,3,1,2,4,...,Steady,No,No,No,No,No,Ch,Yes,NO,1
3,12522,48330783,Caucasian,Female,[80-90),,2,1,4,13,...,Steady,No,No,No,No,No,Ch,Yes,NO,1
4,15738,63555939,Caucasian,Female,[90-100),,3,3,4,12,...,Steady,No,No,No,No,No,Ch,Yes,NO,1


In [29]:
# Rename the medication columns so it is clear they all describe the same kind
# of thing: the 23 individual drug columns at positions 24-46 get a "med_"
# prefix, and the column at position 48 (shown as diabetesMed in the head()
# output above) becomes "med_any". Position 47 ("change") is left untouched.
# NOTE: these are positional indices, so they depend on the CSV column order.
Dcolumns = [
    "med_" + label if 24 <= position < 47 else label
    for position, label in enumerate(diabetes01.columns)
]
Dcolumns[48] = "med_any"

diabetes01.columns = Dcolumns

In [30]:
# Age is recorded as ten-year brackets. Encode each bracket as its ordinal
# position (1 for '[0-10)' up to 10 for '[90-100)') so it is numeric for now.
diabetes01['age'].value_counts()
age_bracket_codes = {
    '[0-10)': 1,
    '[10-20)': 2,
    '[20-30)': 3,
    '[30-40)': 4,
    '[40-50)': 5,
    '[50-60)': 6,
    '[60-70)': 7,
    '[70-80)': 8,
    '[80-90)': 9,
    '[90-100)': 10,
}
diabetes01['age'] = diabetes01['age'].replace(age_bracket_codes)

In [31]:
# Some patients show up multiple times in the data. That may well be relevant,
# but it is already captured by other features (number_outpatient,
# number_inpatient, etc.), so encounter_id and patient_nbr are removed.
#
# Show the ten most frequent patients. The first positional parameter of
# value_counts() is `normalize`, so value_counts(10) would return proportions
# rather than limiting the result to ten rows; use .head(10) for that.
diabetes01['patient_nbr'].value_counts().head(10)
diabetes01 = diabetes01.drop(columns=['encounter_id', 'patient_nbr'])

In [32]:
diabetes01.head()

Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,med_insulin,med_glyburide.metformin,med_glipizide.metformin,med_glimepiride.pioglitazone,med_metformin.rosiglitazone,med_metformin.pioglitazone,change,med_any,readmitted,IsTrain
0,Caucasian,Female,1,,6,25,1,1,,Pediatrics-Endocrinology,...,No,No,No,No,No,No,No,No,NO,1
1,AfricanAmerican,Female,3,,1,1,7,2,,,...,No,No,No,No,No,No,No,Yes,NO,1
2,Caucasian,Male,7,,3,1,2,4,,,...,Steady,No,No,No,No,No,Ch,Yes,NO,1
3,Caucasian,Female,9,,2,1,4,13,,,...,Steady,No,No,No,No,No,Ch,Yes,NO,1
4,Caucasian,Female,10,,3,3,4,12,,InternalMedicine,...,Steady,No,No,No,No,No,Ch,Yes,NO,1


In [33]:
#Race has a ? variable, which we need to keep track of for now (will address/impute later):
diabetes01['race'].value_counts()
FeaturesWithMissing = ['race']

In [34]:
#Gender has an Unknown variable also, with only three values. Let's see what these look like:
diabetes01['gender'].value_counts()

Female             35190
Male               31028
Unknown/Invalid        3
Name: gender, dtype: int64

In [35]:
'''#Examine in more detail some info from these 3 gender-unknown patients.
diabetes01[diabetes01['gender']=='Unknown/Invalid']
print(diabetes01.iloc[30506,])
print(diabetes01.iloc[75551,])
print(diabetes01.iloc[82573,])'''

"#Examine in more detail some info from these 3 gender-unknown patients.\ndiabetes01[diabetes01['gender']=='Unknown/Invalid']\nprint(diabetes01.iloc[30506,])\nprint(diabetes01.iloc[75551,])\nprint(diabetes01.iloc[82573,])"

In [36]:
# The 'Unknown/Invalid' gender rows could be removed, but to keep things simple
# they are imputed with "Female", the more common value.
diabetes01['gender'] = diabetes01['gender'].replace({'Unknown/Invalid': 'Female'})
diabetes01['gender'].value_counts()

Female    35193
Male      31028
Name: gender, dtype: int64

In [37]:
# 97% of the weight values are missing, and the missingness could be non-random,
# so nothing useful can be done with this variable: remove it.
diabetes01 = diabetes01.drop(columns=['weight'])

In [38]:
# Admission types look OK, except that 5, 6 and 8 mean the same thing.
# Collapse 6 and 8 into 5.
diabetes01['admission_type_id'].value_counts()
diabetes01['admission_type_id'] = diabetes01['admission_type_id'].replace({8: 5, 6: 5})

In [39]:
'''These need some adjusting. 11, 19, 20, 21 mean the patient died. Clearly, readmission rates will be 0 here, and this could \
be written into an algorithm, but for now, they should certainly be rewritten as the same thing.

18, 25, and 26 are all the same thing also (unknown).'''
diabetes01['discharge_disposition_id'].value_counts()

1     38717
3      9038
6      7540
18     2664
11     1642
22     1474
2      1460
5       834
25      613
4       537
7       402
14      365
13      363
23      268
28      105
8        70
15       45
24       32
9        19
17        9
19        8
16        5
27        4
12        3
10        2
20        2
Name: discharge_disposition_id, dtype: int64

In [40]:
#Write to csv to visualize in ggplot2, a far superior visualization tool:
#diabetes01.to_csv('diabetesmod.csv')

In [41]:
# After visualization and a careful reading of the descriptions, the 30
# discharge disposition ids are rewritten as a handful of named categories.
# replacelist[k] is the category assigned to id k + 1.
replacelist = [
    # ids 1-10
    'home', 'hospital', 'nursing', 'nursing', 'hospice',
    'hhealth', 'leftAMA', 'hhealth', 'hospital', 'hospital',
    # ids 11-20
    'died', 'hospital', 'hospice', 'hospice', 'hospital',
    'outpatient', 'outpatient', 'unknown', 'died', 'died',
    # ids 21-30
    'died', 'outpatient', 'hospital', 'nursing', 'unknown',
    'unknown', 'nursing', 'psych', 'hospital', 'outpatient',
]

discharge_id_to_category = dict(zip(range(1, 31), replacelist))
diabetes01['discharge_disposition_id'] = (
    diabetes01['discharge_disposition_id'].replace(discharge_id_to_category)
)

In [42]:
diabetes01['discharge_disposition_id'].value_counts()

home          38717
nursing        9611
hhealth        7610
unknown        3277
hospital       1797
died           1652
hospice        1562
outpatient     1488
leftAMA         402
psych           105
Name: discharge_disposition_id, dtype: int64

In [43]:
# The column now holds category names rather than ids, so rename it from
# discharge_disposition_id to discharge_disposition.
newcollist = diabetes01.columns.tolist()
discharge_position = newcollist.index('discharge_disposition_id')
newcollist[discharge_position] = 'discharge_disposition'
diabetes01.columns = newcollist

In [44]:
diabetes01['admission_source_id'].value_counts()

7     35827
1     19925
17     4323
4      2515
6      1866
2       794
5       619
3       129
9       109
20       80
8        12
22        9
10        6
11        2
14        2
25        2
13        1
Name: admission_source_id, dtype: int64

In [45]:
# time_in_hospital: print the number of missing values, then the summary statistics.
print(diabetes01['time_in_hospital'].isna().sum())
print(diabetes01['time_in_hospital'].describe())

0
count    66221.000000
mean         4.342550
std          2.982172
min          1.000000
25%          2.000000
50%          4.000000
75%          6.000000
max         14.000000
Name: time_in_hospital, dtype: float64


In [46]:
#For now, we can keep payer_code, although there are a large number of missing values and this feature would not seem to be important
diabetes01['payer_code'].value_counts()

MC    20457
HM     3954
BC     3354
SP     3138
MD     2273
CP     1752
UN     1752
CM     1276
OG      700
PO      448
DM      329
CH      113
WC      111
OT       57
MP       40
SI       38
FR        1
Name: payer_code, dtype: int64

In [47]:
#This category is completely unwieldy, and we will have to figure out what to do with this too. It has 40% missing values:
diabetes01['medical_specialty'].value_counts()

InternalMedicine                        9912
Family/GeneralPractice                  4777
Emergency/Trauma                        4559
Cardiology                              3499
Surgery-General                         2053
Orthopedics                             1073
Nephrology                               947
Orthopedics-Reconstructive               940
Radiologist                              757
Psychiatry                               592
ObstetricsandGynecology                  561
Pulmonology                              543
Surgery-Cardiovascular/Thoracic          522
Urology                                  500
Surgery-Neuro                            392
Gastroenterology                         350
Surgery-Vascular                         333
PhysicalMedicineandRehabilitation        297
Oncology                                 248
Pediatrics                               175
Neurology                                157
Hematology/Oncology                      143
Pediatrics

In [48]:
# num_lab_procedures: summary statistics, followed by the count of missing values.
print(diabetes01['num_lab_procedures'].describe())
diabetes01['num_lab_procedures'].isna().sum()

count    66221.000000
mean        42.697921
std         19.720143
min          1.000000
25%         31.000000
50%         44.000000
75%         56.000000
max        132.000000
Name: num_lab_procedures, dtype: float64


0

In [49]:
# num_procedures: summary statistics, followed by the count of missing values.
print(diabetes01['num_procedures'].describe())
diabetes01['num_procedures'].isna().sum()

count    66221.000000
mean         1.388110
std          1.723031
min          0.000000
25%          0.000000
50%          1.000000
75%          2.000000
max          6.000000
Name: num_procedures, dtype: float64


0

In [50]:
# num_medications: summary statistics, followed by the count of missing values.
print(diabetes01['num_medications'].describe())
diabetes01['num_medications'].isna().sum()

count    66221.000000
mean        15.881790
std          8.384629
min          1.000000
25%         10.000000
50%         14.000000
75%         20.000000
max         81.000000
Name: num_medications, dtype: float64


0

In [51]:
# number_outpatient: summary statistics, followed by the count of missing values.
print(diabetes01['number_outpatient'].describe())
diabetes01['number_outpatient'].isna().sum()

count    66221.000000
mean         0.301204
std          1.083986
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max         40.000000
Name: number_outpatient, dtype: float64


0

In [52]:
# number_emergency: summary statistics, followed by the count of missing values.
print(diabetes01['number_emergency'].describe())
diabetes01['number_emergency'].isna().sum()

count    66221.000000
mean         0.151765
std          0.746954
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max         64.000000
Name: number_emergency, dtype: float64


0

In [53]:
# number_inpatient: summary statistics, followed by the count of missing values.
print(diabetes01['number_inpatient'].describe())
diabetes01['number_inpatient'].isna().sum()

count    66221.000000
mean         0.526374
std          1.172518
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max         21.000000
Name: number_inpatient, dtype: float64


0

In [54]:
# number_diagnoses: summary statistics, followed by the count of missing values.
print(diabetes01['number_diagnoses'].describe())
diabetes01['number_diagnoses'].isna().sum()

count    66221.000000
mean         7.302215
std          1.985369
min          1.000000
25%          6.000000
50%          8.000000
75%          9.000000
max         16.000000
Name: number_diagnoses, dtype: float64


0

In [55]:
'''Num_diagnoses and diag_1/diag_2/diag_3 are highly related to each other. If num_diagnoses are 3 or under, diag_3 (and
diag_2) will be empty. A "primary" diagnosis is essentially what the patient is in there for. Secondary diagnoses are other
things the patient has. I would recommend combining diag_1 thru diag_3 together'''

'Num_diagnoses and diag_1/diag_2/diag_3 are highly related to each other. If num_diagnoses are 3 or under, diag_3 (and\ndiag_2) will be empty. A "primary" diagnosis is essentially what the patient is in there for. Secondary diagnoses are other\nthings the patient has. I would recommend combining diag_1 thru diag_3 together'

In [56]:
# Max serum glucose is present, but for many encounters it was not measured.
# Relabel those 'None' entries as "NotTaken".
diabetes01['max_glu_serum'].value_counts()
diabetes01['max_glu_serum'] = diabetes01['max_glu_serum'].replace({'None': 'NotTaken'})

In [57]:
# Same story for the A1C measurement: 'None' entries become "NotTaken".
diabetes01['A1Cresult'].value_counts()
diabetes01['A1Cresult'] = diabetes01['A1Cresult'].replace({'None': 'NotTaken'})

In [58]:
#Write a function to rewrite the disease codes according to a modified version of the publication:
def convertdiseases(min, max, newname):
    """Relabel diagnosis codes that fall in ``[min, max)`` as ``newname``.

    Works in place on the notebook-level ``diabetes01`` frame, rewriting its
    ``diag_1``, ``diag_2`` and ``diag_3`` columns. A value is relabelled when
    ``float(value)`` succeeds and ``min <= float(value) < max``. Values that
    cannot be parsed as a number (for example 'V57', '?', or a category name
    written by an earlier call) and missing values are left unchanged.

    Parameters
    ----------
    min : number
        Inclusive lower bound of the code range.
    max : number
        Exclusive upper bound of the code range.
    newname : str
        Category label written in place of every matching code.

    Returns
    -------
    None
    """
    # NOTE: `min` and `max` shadow the builtins inside this function; the names
    # are kept so existing positional and keyword callers continue to work.

    def _relabel(code):
        """Return ``newname`` if ``code`` is a number in range, else ``code``."""
        try:
            numeric = float(code)
        except (TypeError, ValueError, OverflowError):
            # Not a numeric code: leave it exactly as it is.
            return code
        return newname if min <= numeric < max else code

    for column in ('diag_1', 'diag_2', 'diag_3'):
        relabelled = [_relabel(code) for code in diabetes01[column].tolist()]
        # Reuse the frame's own index. A bare pd.Series(list) is labelled
        # 0..n-1 and assignment aligns on labels, so with any other index the
        # column would be filled with NaN or with values from the wrong rows.
        diabetes01[column] = pd.Series(relabelled, index=diabetes01.index)


In [59]:
convertdiseases(340, 459, 'circulatory')

In [60]:
#That worked pretty well. Let's do this for all additional values and combinations:
diabetes01['diag_1'].value_counts()

circulatory    20021
786             2598
486             2139
715             1583
780             1289
682             1268
491             1202
996             1196
276             1176
38              1134
584              996
250.8            978
599              978
820              785
V57              782
518              748
574              681
250.6            661
577              653
562              626
296              590
722              585
250.7            555
250.13           555
560              550
493              550
998              520
507              467
250.02           445
578              401
               ...  
187                1
27                 1
250.51             1
E909               1
75                 1
994                1
804                1
V60                1
160                1
885                1
39                 1
971                1
649                1
170                1
817                1
299                1
914          

In [61]:
# Relabel the remaining ICD-9 ranges. Each entry is
# (inclusive lower bound, exclusive upper bound, category) and the entries are
# applied in the order listed.
icd9_category_ranges = [
    (785, 786, 'circulatory'),
    (745, 748, 'circulatory'),
    (459, 460, 'circulatory'),
    (460, 520, 'respiratory'),
    (786, 787, 'respiratory'),
    (748, 749, 'respiratory'),
    (520, 580, 'digestive'),
    (787, 788, 'digestive'),
    (749, 752, 'digestive'),
    (800, 1000, 'injury'),
    (710, 740, 'musculoskeletal'),
    (754, 757, 'musculoskeletal'),
    (580, 630, 'urogenital'),
    (788, 789, 'urogenital'),
    (752, 754, 'urogenital'),
    (140, 240, 'neoplasm'),
    (1, 140, 'infection'),
    (290, 320, 'mentaldis'),
    (280, 290, 'blooddis'),
    (320, 360, 'nervous'),
    (360, 390, 'nervous'),
    (740, 743, 'nervous'),
    (630, 680, 'pregnancy'),
    (780, 782, 'other'),
    (784, 785, 'other'),
    (790, 800, 'other'),
    (743, 745, 'other'),
    (757, 760, 'other'),
]
for range_start, range_stop, category in icd9_category_ranges:
    convertdiseases(range_start, range_stop, category)

In [62]:
'''This successfully converted our targets. Now we still have things to convert. All patients have diabetes, so the 250
classifications are not important insofar as they diagnose diabetes. We can, however, glean additional diabetic info from the 
decimal codes, where they exist. They would need to go to their own categories, however.'''

'''First, let's get rid of the E and V codes, which the publication refers to as injuries or additional diagnostic information'''
diabetes01['diag_1'].value_counts()

circulatory        20175
respiratory         8891
digestive           6114
injury              4737
musculoskeletal     3499
urogenital          3404
neoplasm            2633
infection           1824
other               1635
mentaldis           1484
682                 1268
276                 1176
250.8                978
V57                  782
blooddis             664
250.6                661
pregnancy            574
250.7                555
250.13               555
250.02               445
250.11               371
789                  336
278                  334
250.12               283
nervous              266
250.82               236
250.1                205
250                  192
V58                  161
707                  157
                   ...  
709                    4
694                    4
705                    4
686                    4
261                    3
272                    3
245                    3
262                    3
692                    3


In [63]:
# Relabel the V-codes and E-codes (a letter followed by digits) as 'injury' in
# all three diagnosis columns, V pattern first and then E, column by column.
for diag_column in ('diag_1', 'diag_2', 'diag_3'):
    for code_pattern in ('V[0-9]+', 'E[0-9]+'):
        diabetes01[diag_column] = diabetes01[diag_column].replace(code_pattern, 'injury', regex=True)

In [64]:
#This is looking better, but apparently some ICD-9 codes weren't covered in our initial conversion:
diabetes01['diag_1'].value_counts()

circulatory        20175
respiratory         8891
digestive           6114
injury              5821
musculoskeletal     3499
urogenital          3404
neoplasm            2633
infection           1824
other               1635
mentaldis           1484
682                 1268
276                 1176
250.8                978
blooddis             664
250.6                661
pregnancy            574
250.7                555
250.13               555
250.02               445
250.11               371
789                  336
278                  334
250.12               283
nervous              266
250.82               236
250.1                205
250                  192
707                  157
250.03               154
250.4                135
                   ...  
273                    6
250.9                  6
251                    6
250.5                  5
708                    5
685                    5
705                    4
709                    4
694                    4


In [65]:
# Cover the codes missed so far. The two 'metabolic' ranges stop at 250 and
# restart at 251, so the 250.xx diabetes codes are left as they are.
for range_start, range_stop, category in [
    (240, 250, 'metabolic'),
    (251, 280, 'metabolic'),
    (680, 710, 'skin'),
    (782, 783, 'skin'),
]:
    convertdiseases(range_start, range_stop, category)

In [66]:
#All that is missing now is 783, 789
diabetes01['diag_1'].value_counts()

circulatory        20175
respiratory         8891
digestive           6114
injury              5821
musculoskeletal     3499
urogenital          3404
neoplasm            2633
infection           1824
metabolic           1798
other               1635
skin                1625
mentaldis           1484
250.8                978
blooddis             664
250.6                661
pregnancy            574
250.13               555
250.7                555
250.02               445
250.11               371
789                  336
250.12               283
nervous              266
250.82               236
250.1                205
250                  192
250.03               154
250.4                135
250.81               119
250.22               104
250.2                 80
250.41                55
250.83                55
250.01                52
250.42                51
250.92                33
783                   22
250.23                19
250.43                16
250.32                14


In [67]:
# Relabel the last two uncovered codes seen in the counts above:
# 789 joins 'other' and 783 joins 'metabolic'.
convertdiseases(789, 790, 'other')
convertdiseases(783, 784, 'metabolic')

In [68]:
'''We now have all values, other than ? and the diabetes codes, as a category. Let's create a new column with additional 
diabetes info (from the 250 codes), then revert all 250's and ? to 'NoDisease', a value which will go away after diag1, 2, 3
combination.'''

"We now have all values, other than ? and the diabetes codes, as a category. Let's create a new column with additional \ndiabetes info (from the 250 codes), then revert all 250's and ? to 'NoDisease', a value which will go away after diag1, 2, 3\ncombination."

In [69]:
#We can create 4 columns (eventually condensed to 2): diabetes_feature (1-3) and Type (1 or 2):

#Write a function to migrate data from diabetes codes to new columns, then revert them to "NoDisease":
'''This function returns a modified copy of our DF with four new columns: diabfeature1, 2, 3, which are extra diabetic 
features of the patient, and diabtype, which reports either Type1, Type2, or (typically) Unknown for the patient, 
based on ICD code'''
def convertdiabetescodes(df):
    """Derive diabetes type and complication columns from ICD-9 250.xy codes.

    NOTE: this notebook defines ``convertdiabetescodes`` again in the following
    cell; the later definition is the one in effect when the function is called.

    For each of ``diag_1``, ``diag_2`` and ``diag_3`` the value is parsed as a
    number. If it is a 250.xy code, the first decimal digit selects the
    complication and the second decimal digit selects the diabetes type
    (1 or 3 -> 'Type1', 2 -> 'Type2', anything else -> 'Unknown'). Values that
    are not numeric, are missing, or are not 250.xy codes yield 'No' for both.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``diag_1``, ``diag_2`` and ``diag_3`` columns. Not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with four extra columns: ``diabfeature1``,
        ``diabfeature2``, ``diabfeature3`` (complication decoded from the
        matching diag column) and ``diabtype`` ('Type1' if any of the three
        codes says Type1, else 'Type2' if any says Type2, else 'Unknown').
    """
    diag_columns = ('diag_1', 'diag_2', 'diag_3')
    # Indexed by the first decimal digit of the code (250.0x ... 250.9x).
    complications = ('No', 'ketoacidosis', 'hyperosmolarity', 'coma', 'renal',
                     'ophthalmic', 'neurologic', 'circulatory', 'other', 'other')
    # Keyed by the second decimal digit; digits not listed mean 'Unknown'.
    type_by_digit = {1: 'Type1', 3: 'Type1', 2: 'Type2'}

    def _decode(code):
        """Return ``(diabetes type, complication)`` for one diagnosis value."""
        try:
            # Work in whole hundredths so the digit tests use exact integers;
            # round() guards against float error such as 114.99999999999999.
            hundredths = int(round(100 * float(code)))
        except (TypeError, ValueError, OverflowError):
            return 'No', 'No'
        if hundredths // 100 != 250:
            # Numeric, but not a diabetes code: nothing to extract.
            return 'No', 'No'
        return (type_by_digit.get(hundredths % 10, 'Unknown'),
                complications[(hundredths % 100) // 10])

    df2 = df.copy()
    decoded = [[_decode(code) for code in df2[column]] for column in diag_columns]

    for position, column_codes in enumerate(decoded, start=1):
        df2['diabfeature%d' % position] = [feature for _, feature in column_codes]

    # Compare the per-row types element by element. Comparing the whole lists
    # to a string is always False and labels every row 'Unknown'.
    finaltype = []
    for row_types in zip(*[[kind for kind, _ in column_codes] for column_codes in decoded]):
        if 'Type1' in row_types:
            finaltype.append('Type1')
        elif 'Type2' in row_types:
            finaltype.append('Type2')
        else:
            finaltype.append('Unknown')
    df2['diabtype'] = finaltype

    return df2

In [70]:
#We can create 4 columns (eventually condensed to 2): diabetes_feature (1-3) and Type (1 or 2):

#Write a function to migrate data from diabetes codes to new columns, then revert them to "NoDisease":
'''This function returns a modified copy of our DF with four new columns: diabfeature1, 2, 3, which are extra diabetic 
features of the patient, and diabtype, which reports either Type1, Type2, or (typically) Unknown for the patient, 
based on ICD code'''
def convertdiabetescodes(df):
    """Derive diabetes type and complication columns from ICD-9 250.xy codes.

    For each of ``diag_1``, ``diag_2`` and ``diag_3`` the value is parsed as a
    number. If it is a 250.xy code, the first decimal digit selects the
    complication and the second decimal digit selects the diabetes type
    (1 or 3 -> 'Type1', 2 -> 'Type2', anything else -> 'Unknown'). Values that
    are not numeric, are missing, or are not 250.xy codes yield 'No' for both,
    so a non-diabetes decimal code is never read as a diabetes complication.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``diag_1``, ``diag_2`` and ``diag_3`` columns. Not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with four extra columns: ``diabfeature1``,
        ``diabfeature2``, ``diabfeature3`` (complication decoded from the
        matching diag column) and ``diabtype`` ('Type1' if any of the three
        codes says Type1, else 'Type2' if any says Type2, else 'Unknown').
    """
    diag_columns = ('diag_1', 'diag_2', 'diag_3')
    # Indexed by the first decimal digit of the code (250.0x ... 250.9x).
    complications = ('No', 'ketoacidosis', 'hyperosmolarity', 'coma', 'renal',
                     'ophthalmic', 'neurologic', 'circulatory', 'other', 'other')
    # Keyed by the second decimal digit; digits not listed mean 'Unknown'.
    type_by_digit = {1: 'Type1', 3: 'Type1', 2: 'Type2'}

    def _decode(code):
        """Return ``(diabetes type, complication)`` for one diagnosis value."""
        try:
            # round() rather than int() truncation: 100 * float can land just
            # below the intended integer (e.g. 114.99999999999999).
            hundredths = int(round(100 * float(code)))
        except (TypeError, ValueError, OverflowError):
            # Only parsing failures are expected here; anything else should surface.
            return 'No', 'No'
        if hundredths // 100 != 250:
            # Numeric, but not a diabetes code: nothing to extract.
            return 'No', 'No'
        return (type_by_digit.get(hundredths % 10, 'Unknown'),
                complications[(hundredths % 100) // 10])

    df2 = df.copy()
    decoded = [[_decode(code) for code in df2[column]] for column in diag_columns]

    for position, column_codes in enumerate(decoded, start=1):
        df2['diabfeature%d' % position] = [feature for _, feature in column_codes]

    # Type1 takes precedence over Type2 when the three codes disagree.
    finaltype = []
    for row_types in zip(*[[kind for kind, _ in column_codes] for column_codes in decoded]):
        if 'Type1' in row_types:
            finaltype.append('Type1')
        elif 'Type2' in row_types:
            finaltype.append('Type2')
        else:
            finaltype.append('Unknown')
    df2['diabtype'] = finaltype

    return df2

In [71]:
#Now, let's update diabetes01 to diabetes02 with these extra columns.
#convertdiabetescodes returns a frame carrying diabfeature1, diabfeature2, diabfeature3 and diabtype:
diabetes02 = convertdiabetescodes(diabetes01)

In [72]:
#Now, we can see that we have Type1/2 information for some of the rows
#(everything else is labelled 'Unknown'):
diabetes02['diabtype'].value_counts()

Unknown    58273
Type2       4036
Type1       3912
Name: diabtype, dtype: int64

In [73]:
#We can also see supplementary diabetic features for these patients, based on the diagnosis code presented
#('No' means no such feature was derived for the row):
diabetes02['diabfeature1'].value_counts()

No                 61625
other               1442
ketoacidosis        1414
neurologic           661
circulatory          555
renal                257
hyperosmolarity      211
coma                  48
ophthalmic             8
Name: diabfeature1, dtype: int64

In [74]:
#Save file and reopen:

#diabetes02.to_csv('diabetes02.csv')
#diabetes02 = pd.read_csv('diabetes02.csv', index_col=0)

In [75]:
#Let's see if we can impute additional columns for Type1/Type2 based on what we have here.
#diabetic_data.csv is read separately so that patient_nbr is available for the inspection below:
diabetes00 = pd.read_csv('diabetic_data.csv')

In [76]:
# Copy the derived diabtype labels onto the raw frame so they can be viewed next to patient_nbr.
# NOTE(review): this assignment aligns rows by index label, not by encounter. It is only correct
# if row i of diabetic_data.csv is the same encounter as row i of diabetes02 - TODO confirm,
# because diabetes02 descends from the train/test concatenation whose index was reset to 0..n-1.
# Rows of diabetes00 with no matching label in diabetes02 receive NaN.
diabetes00['diabtype'] = diabetes02['diabtype']

In [77]:
'''A visual inspection of some random samples of this data is discouraging, and shows differening type status (1 vs 2) for the same patient in different encounters. Based on this fact, and the low prevalence of this information to begin with, we should not include diabtype in our analysis or try to impute it within the same patient.'''
# Keep only rows whose diabtype is not 'Unknown', ordered by patient so that repeat
# encounters of one patient sit next to each other.
typed_by_patient = (
    diabetes00[diabetes00['diabtype'] != 'Unknown']
    .sort_values('patient_nbr')[['patient_nbr', 'diabtype']]
)
# Two windows were inspected; only the last expression of the cell is rendered.
typed_by_patient.head(100)
typed_by_patient.iloc[300:400]

Unnamed: 0,patient_nbr,diabtype
5097,459639,Type1
26493,460359,Type2
1127,461520,Type2
19161,461583,Type1
1195,462825,Type1
875,463302,Type1
6825,463338,Type1
7150,463473,Type2
16999,463932,Type2
398,463932,Type1


In [78]:
#Drop diabtype from this dataframe (the inspection above concluded it should not be used):
diabetes02 = diabetes02.drop('diabtype', axis=1)

In [79]:
'''We also need to get rid of diabetes from the diag list (we know these patients have diabetes.) Anything still numeric (or ?)
in the diag_ features should be changed to "Nothing"'''

# The three diagnosis columns get identical treatment, so handle them in one loop
# instead of three copy-pasted blocks.
for diag_col in ['diag_1', 'diag_2', 'diag_3']:
    # '?' becomes 'Nothing'; astype('str') also turns a real missing value into the text 'nan'.
    codes = diabetes02[diag_col].replace('?', 'Nothing').astype('str')

    # Vectorised numeric check: entries that parse as a number come back non-null,
    # everything else (disease names, 'Nothing', 'nan') is coerced to NaN.
    # Presumably the numeric leftovers are codes that were never mapped to a named group - TODO confirm.
    still_numeric = pd.to_numeric(codes, errors='coerce').notnull()

    # mask() keeps the frame's own index, so the write-back cannot misalign rows
    # even if diabetes02 is not indexed 0..n-1.
    # NOTE(review): the text 'nan' is left untouched here (it shows up as its own level
    # in the value counts); decide whether it should be folded into 'Nothing' as well.
    diabetes02[diag_col] = codes.mask(still_numeric, 'Nothing')
        

In [80]:
#Now, we have our data frame reduced to all string representations of diseases, or "Nothing." We can now combine diag 1-3.
#NOTE(review): a 'nan' level also appears in these counts - presumably missing values stringified by astype('str'); confirm whether it should be 'Nothing':
diabetes02['diag_3'].value_counts()

circulatory        20189
Nothing            11335
metabolic           6213
injury              4651
respiratory         4627
urogenital          4189
digestive           2516
mentaldis           2051
other               1756
blooddis            1599
skin                1558
neoplasm            1346
musculoskeletal     1276
infection           1214
nan                 1085
nervous              363
pregnancy            253
Name: diag_3, dtype: int64

In [81]:
#Let's combine diagnosis (diag_1, diag_2, diag_3) into dummy variables and add them. We will do this for diag and diabfeature:

#This is a function to do this
def createcombineddummies(df, c1, c2, c3=None, c4=None, c5=None, prefix='', negative='Nothing'):

    '''Combine two to five categorical columns into one set of 0/1 indicator columns.

    Each column is dummified with the same prefix, the indicator frames are added
    together, and the totals are capped at 1, so a row is flagged for a level if that
    level occurs in ANY of the given columns.

    df       -- source DataFrame
    c1..c5   -- names of the columns to combine (c3-c5 are optional)
    prefix   -- prefix for the indicator column names (prefix + '_' + level)
    negative -- level that means "nothing recorded"; its indicator column is dropped
                from the result if it exists (default 'Nothing')

    Returns a new integer DataFrame indexed like df. Levels that are missing from some
    of the columns are handled: they count as 0 for the columns that lack them.
    '''

    collist = [c1, c2] + [col for col in (c3, c4, c5) if col is not None]

    # Cast to int so the frames can be summed regardless of the dtype get_dummies returns.
    dummy_frames = [pd.get_dummies(df[col], prefix=prefix).astype(int) for col in collist]

    # Union of all indicator names. When every frame has the same columns the original
    # order is kept; otherwise the union is what frame addition would align to.
    all_columns = dummy_frames[0].columns
    for frame in dummy_frames[1:]:
        all_columns = all_columns.union(frame.columns)

    # Give every frame the full column set (absent levels filled with 0) before adding;
    # adding frames with different columns directly would produce NaN in those columns.
    combined = dummy_frames[0].reindex(columns=all_columns, fill_value=0)
    for frame in dummy_frames[1:]:
        combined = combined + frame.reindex(columns=all_columns, fill_value=0)

    # Reduce counts to presence/absence (e.g. a patient with 2 respiratory conditions -> 1).
    combined = combined.clip(upper=1)

    # The negative level carries no information; tolerate it being absent.
    return combined.drop(prefix + '_' + negative, axis=1, errors='ignore')

In [82]:
#Let's try creating this dummy DF and see how it looks.
#One indicator column per diagnosis group, flagged if the group occurs in any of diag_1, diag_2 or diag_3:
diagDummy = createcombineddummies(diabetes02, 'diag_1', 'diag_2', 'diag_3', prefix='diag')

In [83]:
#Preview the first rows of the combined indicators.
#This looks good. Let's combine it with diabetes02 and remove diag_1-3
diagDummy.head(10)

Unnamed: 0,diag_blooddis,diag_circulatory,diag_digestive,diag_infection,diag_injury,diag_mentaldis,diag_metabolic,diag_musculoskeletal,diag_nan,diag_neoplasm,diag_nervous,diag_other,diag_pregnancy,diag_respiratory,diag_skin,diag_urogenital
0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
2,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0
5,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
8,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
9,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [84]:
# diabetes03 = diabetes02 without the raw diag_1-3 columns, with the combined
# diagnosis indicators (diagDummy) attached on the right.
diabetes03 = pd.concat(
    [diabetes02.drop(['diag_1', 'diag_2', 'diag_3'], axis=1), diagDummy],
    axis=1,
)

In [85]:
# Preview diabetes03: the diag_* indicator columns should now appear at the right-hand side.
diabetes03.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,...,diag_metabolic,diag_musculoskeletal,diag_nan,diag_neoplasm,diag_nervous,diag_other,diag_pregnancy,diag_respiratory,diag_skin,diag_urogenital
0,Caucasian,Female,1,5,unknown,1,1,,Pediatrics-Endocrinology,41,...,0,0,1,0,0,0,0,0,0,0
1,AfricanAmerican,Female,3,1,home,7,2,,,11,...,0,0,0,0,0,0,1,0,0,0
2,Caucasian,Male,7,3,home,2,4,,,70,...,0,0,0,0,0,0,0,0,0,0
3,Caucasian,Female,9,2,home,4,13,,,68,...,0,0,0,0,0,0,0,0,0,0
4,Caucasian,Female,10,3,nursing,4,12,,InternalMedicine,33,...,0,0,0,1,0,0,0,1,0,0


In [86]:
#We need to modify this function (Annoying) because we have a different negative value for these columns:

#This is a function to do this
def createcombineddummiesF(df, c1, c2, c3=None, c4=None, c5=None, prefix='', negative='No'):

    '''Combine two to five categorical columns into one set of 0/1 indicator columns.

    Same idea as the diagnosis version, but for columns whose "nothing recorded"
    level is 'No'. Each column is dummified with the same prefix, the indicator
    frames are added together, and the totals are capped at 1, so a row is flagged
    for a level if that level occurs in ANY of the given columns.

    df       -- source DataFrame
    c1..c5   -- names of the columns to combine (c3-c5 are optional)
    prefix   -- prefix for the indicator column names (prefix + '_' + level)
    negative -- level that means "nothing recorded"; its indicator column is dropped
                from the result if it exists (default 'No')

    Returns a new integer DataFrame indexed like df. Levels that are missing from some
    of the columns are handled: they count as 0 for the columns that lack them.
    '''

    collist = [c1, c2] + [col for col in (c3, c4, c5) if col is not None]

    # Cast to int so the frames can be summed regardless of the dtype get_dummies returns.
    dummy_frames = [pd.get_dummies(df[col], prefix=prefix).astype(int) for col in collist]

    # Union of all indicator names. When every frame has the same columns the original
    # order is kept; otherwise the union is what frame addition would align to.
    all_columns = dummy_frames[0].columns
    for frame in dummy_frames[1:]:
        all_columns = all_columns.union(frame.columns)

    # Give every frame the full column set (absent levels filled with 0) before adding;
    # adding frames with different columns directly would produce NaN in those columns.
    combined = dummy_frames[0].reindex(columns=all_columns, fill_value=0)
    for frame in dummy_frames[1:]:
        combined = combined + frame.reindex(columns=all_columns, fill_value=0)

    # Reduce counts to presence/absence (the same feature listed twice still counts once).
    combined = combined.clip(upper=1)

    # The negative level carries no information; tolerate it being absent.
    return combined.drop(prefix + '_' + negative, axis=1, errors='ignore')

In [87]:
#Now do a similar thing for the diabfeature columns:
#one indicator per diabetic feature, flagged if it occurs in any of diabfeature1-3 ('No' is dropped).
featureDummy = createcombineddummiesF(diabetes02, 'diabfeature1', 'diabfeature2', 'diabfeature3', prefix='diabfeat')

In [88]:
#Out of curiosity, let's see how many rows are flagged in each column.
#DataFrame.sum() totals each column (axis=0) explicitly; np.sum(frame) forwards axis=None,
#whose meaning for DataFrames differs between pandas versions.
featureDummy.sum()

diabfeat_circulatory         719
diabfeat_coma                 59
diabfeat_hyperosmolarity     260
diabfeat_ketoacidosis       1668
diabfeat_neurologic         1828
diabfeat_ophthalmic          366
diabfeat_other              2290
diabfeat_renal              1016
dtype: int64

In [89]:
'''This is a decent amount of information which could possibly help with predicting readmission rates (not sure about coma)'''

'This is a decent amount of information which could possibly help with predicting readmission rates (not sure about coma)'

In [90]:
# Swap the raw diabfeature1-3 columns for the combined diabfeat_* indicators.
# (Running this cell twice fails, because the raw columns are gone after the first run.)
raw_feature_columns = ['diabfeature1', 'diabfeature2', 'diabfeature3']
diabetes03 = pd.concat([diabetes03.drop(raw_feature_columns, axis=1), featureDummy], axis=1)

In [91]:
# Preview diabetes03 again: the diabfeat_* indicator columns should now follow the diag_* ones.
diabetes03.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,...,diag_skin,diag_urogenital,diabfeat_circulatory,diabfeat_coma,diabfeat_hyperosmolarity,diabfeat_ketoacidosis,diabfeat_neurologic,diabfeat_ophthalmic,diabfeat_other,diabfeat_renal
0,Caucasian,Female,1,5,unknown,1,1,,Pediatrics-Endocrinology,41,...,0,0,0,0,0,0,0,0,1,0
1,AfricanAmerican,Female,3,1,home,7,2,,,11,...,0,0,0,0,0,0,0,0,0,0
2,Caucasian,Male,7,3,home,2,4,,,70,...,0,0,0,0,0,0,0,0,0,0
3,Caucasian,Female,9,2,home,4,13,,,68,...,0,0,0,0,0,0,0,0,0,0
4,Caucasian,Female,10,3,nursing,4,12,,InternalMedicine,33,...,0,0,0,0,0,0,0,0,0,0


In [92]:
#For now, medical_specialty and payer_code are both dropped (the publication keeps specialty,
#but it should be quite correlated with the diagnoses):
diabetes03 = diabetes03.drop(['payer_code', 'medical_specialty'], axis=1)

In [93]:
#diabetes03['medical_specialty'].value_counts()

In [94]:
'''#This is an attempt to succinctly reduce this list, partially influenced by the paper:
diabetes03['medical_specialty'] = diabetes03['medical_specialty'].replace('Emergency/Trauma', 'Emergency')
diabetes03['medical_specialty'] = diabetes03['medical_specialty'].replace('Family/GeneralPractice', 'FamilyGP')
diabetes03['medical_specialty'] = diabetes03['medical_specialty'].replace('Surgery-General', 'Surgery')
diabetes03['medical_specialty'] = diabetes03['medical_specialty'].replace('Orthopedics-Reconstructive', 'Orthopedics')
diabetes03['medical_specialty'] = diabetes03['medical_specialty'].replace('Surgery-Cardiovascular/Thoracic', 'Surgery')
diabetes03['medical_specialty'] = diabetes03['medical_specialty'].replace('Surgery-Vascular', 'Surgery')
diabetes03['medical_specialty'] = diabetes03['medical_specialty'].replace('Surgery-Neuro', 'Surgery')
diabetes03['medical_specialty'] = diabetes03['medical_specialty'].replace('PhysicalMedicineandRehabilitation', 'Rehab')'''

'''I'm kind of over it. There are so many and honestly, this data should correlate with other things'''

"I'm kind of over it. There are so many and honestly, this data should correlate with other things"

In [95]:
'''diabetes03['medical_specialty'] = diabetes03['medical_specialty'].replace('PhysicalMedicineandRehabilitation', 'Rehab')'''

"diabetes03['medical_specialty'] = diabetes03['medical_specialty'].replace('PhysicalMedicineandRehabilitation', 'Rehab')"

In [96]:
#Now, we need to adjust the values in referral (admission_source_id). Every value above 7 (8 and above) is extremely rare or missing.
#clip(upper=8) rewrites everything 8 and above as just 8 and leaves smaller values unchanged:
diabetes03['admission_source_id'] = diabetes03['admission_source_id'].clip(upper=8)

In [97]:
#Check the result: the maximum is now 8, i.e. all values from 9 to 25 were converted to 8:
diabetes03['admission_source_id'].describe()

count    66221.000000
mean         5.034777
std          2.788706
min          1.000000
25%          1.000000
50%          7.000000
75%          7.000000
max          8.000000
Name: admission_source_id, dtype: float64

In [98]:
#At this point the data can be forked into two DFs: one (AllDummy) dummifies every column,
#medication columns included; the other (OrdMed) makes the medication columns ordinal:
#not taking = 0, down = 0.5, steady = 1, up = 1.5. Not sure which of the two would suit which model.

#A few last things first. Convert the '?' in 'race' to 'unknown':
diabetes03['race'] = diabetes03['race'].replace('?', 'unknown')

#For now the outcome (readmitted) stays a single un-dummified column with 3 digitized values:
#NO=0, >30=1, <30=2 (this is completely subject to change)
readmitted_codes = {'NO': 0, '>30': 1, '<30': 2}
diabetes03['readmitted'] = diabetes03['readmitted'].replace(readmitted_codes)

In [99]:
#One other thing: check the value counts of all the med columns (some are VERY sparse).
#The columns are picked by position (16 up to, not including, 39); presumably that is
#where the med_* columns sit at this point - verify if columns are added or removed earlier.
for colname in list(diabetes03.columns)[16:39]:
    print(diabetes03[colname].value_counts())

No        52939
Steady    12177
Up          720
Down        385
Name: med_metformin, dtype: int64
No        65322
Steady      796
Up           77
Down         26
Name: med_repaglinide, dtype: int64
No        65780
Steady      420
Up           14
Down          7
Name: med_nateglinide, dtype: int64
No        66169
Steady       50
Down          1
Up            1
Name: med_chlorpropamide, dtype: int64
No        62937
Steady     2933
Up          225
Down        126
Name: med_glimepiride, dtype: int64
No    66221
Name: med_acetohexamide, dtype: int64
No        58265
Steady     7123
Up          484
Down        349
Name: med_glipizide, dtype: int64
No        59289
Steady     6053
Up          538
Down        341
Name: med_glyburide, dtype: int64
No        66205
Steady       16
Name: med_tolbutamide, dtype: int64
No        61650
Steady     4356
Up          142
Down         73
Name: med_pioglitazone, dtype: int64
No        62265
Steady     3765
Up          126
Down         65
Name: med_rosiglitaz

In [100]:
#Some of these drugs can clearly be removed from this DF
#(the value counts above show, for example, that med_acetohexamide is 'No' on every row):
removelist = ['med_metformin.pioglitazone', 'med_metformin.rosiglitazone', 'med_glimepiride.pioglitazone',
             'med_troglitazone', 'med_acetohexamide', 'med_citoglipton', 'med_examide']
diabetes03 = diabetes03.drop(removelist, axis=1)

## This is where this file differs from the original file. In the first one, we had two DFs: one with all dummies for the medications, and another with those columns converted to ordinal instead. Here, we take out the ordinal version and create two additional DFs instead: one that says whether the patient is taking that medication at all, and another that says whether the patient has had any change in that medication. We also add another column (to all 3 DFs!) named diabchange, indicating whether there was any change at all to any diabetic medicine (1 = yes, 0 = no).

In [101]:
#Create three DF's, each corresponding to one of the criteria listed above.
#Each is an independent copy of diabetes03, so later edits to one do not affect the others:
DiabetesAllDummy = diabetes03.copy()
DiabetesTakingMed = diabetes03.copy()
DiabetesAnyChange = diabetes03.copy()

In [102]:
#Function to replace a DF with dummy variables:
def ReplaceWithDummies(df, dummylist):
    '''Return a copy of df in which each column named in dummylist is replaced by dummy columns.

    For every listed column the most frequent level is used as the reference category:
    its dummy column is left out. The remaining dummy columns (named column + "_" + level)
    are appended at the right-hand side of the frame and the original column is removed.
    The input frame is not modified.
    '''
    result = df.copy()
    for column in dummylist:
        # The level with the highest count becomes the omitted reference category.
        level_counts = result[column].value_counts().sort_values(ascending=False)
        reference = column + "_" + str(level_counts.index[0])

        indicators = pd.get_dummies(result[column], prefix=column).drop(reference, axis=1)

        # Remove the raw column and attach its indicators after the existing columns.
        result = pd.concat([result.drop(column, axis=1), indicators], axis=1)
    return result

In [103]:
#Define the columns to be dummified.
#The medication columns are picked by position (16 up to, not including, 32) in diabetes03:
MedColumns = list(diabetes03.columns[16:32])

#Non-medication categorical columns, dummified in every frame:
OtherDummyColumns = [
    'race', 'gender', 'discharge_disposition', 'max_glu_serum', 'A1Cresult',
    'change', 'med_any', 'admission_type_id', 'admission_source_id',
]

#AllDummy additionally dummifies the medication columns (a new list; the two above stay unchanged):
AllDummyColumns = OtherDummyColumns + MedColumns

In [104]:
#Make all dummy DFs: every listed column for "AllDummy", only the non-med columns for TakingMed and AnyChange
#(their medication columns are recoded to 0/1 in the following cells instead):
DiabetesAllDummy = ReplaceWithDummies(DiabetesAllDummy, AllDummyColumns)
DiabetesTakingMed = ReplaceWithDummies(DiabetesTakingMed, OtherDummyColumns)
DiabetesAnyChange = ReplaceWithDummies(DiabetesAnyChange, OtherDummyColumns)

In [105]:
#TakingMed DF: 0 when the drug is not taken ('No'), 1 for every other status:
taking_codes = {'No': 0, 'Steady': 1, 'Down': 1, 'Up': 1}
for med in MedColumns:
    DiabetesTakingMed[med] = DiabetesTakingMed[med].replace(taking_codes)

In [106]:
#AnyChange DF: 0 when the dose is unchanged ('No' or 'Steady'), 1 when it went 'Up' or 'Down':
change_codes = {'No': 0, 'Steady': 0, 'Down': 1, 'Up': 1}
for med in MedColumns:
    DiabetesAnyChange[med] = DiabetesAnyChange[med].replace(change_codes)

In [107]:
#Next, create a new feature, "diabchange", which says whether ANY diabetic medication was changed at all.
#Count the changed medications per row (the AnyChange columns are 0/1), then cap the count at 1:
changed_med_count = pd.Series(0, index=DiabetesAnyChange.index)
for med in MedColumns:
    changed_med_count = changed_med_count + DiabetesAnyChange[med]
DiabetesAnyChange['diabchange'] = changed_med_count.clip(upper=1)

In [108]:
#Add this diabchange column to DiabetesAllDummy and DiabetesTakingMed as well.
#The assignment aligns on the index, so it relies on all three frames still sharing the same index:
DiabetesAllDummy['diabchange'] = DiabetesAnyChange['diabchange']
DiabetesTakingMed['diabchange'] = DiabetesAnyChange['diabchange']

In [109]:
#Check that all values are numeric by converting every column of all three frames.
#pd.to_numeric raises on a value it cannot parse, so a clean run means the frames are fully numeric.
for frame in (DiabetesAllDummy, DiabetesTakingMed, DiabetesAnyChange):
    for var in list(frame.columns):
        frame[var] = pd.to_numeric(frame[var])
#Yes - all three frames convert without error.

In [110]:
#Finally, remove the rows where the patient died, then drop the indicator column itself
def _without_deceased(frame):
    '''Return frame without the rows flagged discharge_disposition_died, without that column, indexed 0..n-1.'''
    survivors = frame[frame['discharge_disposition_died'] == 0]
    survivors = survivors.drop('discharge_disposition_died', axis=1)
    # Renumber the rows so the index has no gaps where rows were removed.
    survivors.index = list(range(len(survivors)))
    return survivors

DiabetesAllDummy = _without_deceased(DiabetesAllDummy)
DiabetesTakingMed = _without_deceased(DiabetesTakingMed)
DiabetesAnyChange = _without_deceased(DiabetesAnyChange)

In [111]:
#Write these three DF's to CSV and use them for further analysis
#(the index is written as the first, unnamed column; read back with index_col=0):
DiabetesAllDummy.to_csv('DiabetesAllDummy.csv')
DiabetesTakingMed.to_csv('DiabetesTakingMed.csv')
DiabetesAnyChange.to_csv('DiabetesAnyChange.csv')

## Important Note: For now, readmitted is kept as a single column with 3 values: 0 (never), 1 (>30 days), and 2 (<30 days). Technically, '2' is the one we are looking for, so any analysis needs to convert 2 to 1 and 1 to 0. I didn't want to remove this data yet, because it could possibly be relevant in some way.