In [425]:
import pandas as pd
import numpy as np
import math

In [426]:
#load the initial data file:
diabetes01 = pd.read_csv('diabetic_data.csv')

In [427]:
#Examine the initial data file:
diabetes01.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [428]:
#Rewrite the 24 medication columns to make clearer that they are all comparing the same thing:

Dcolumns = list(diabetes01.columns)
for i in range(24, 47):
    Dcolumns[i] = "med_" + Dcolumns[i]
Dcolumns[48] = "med_any"

diabetes01.columns = Dcolumns

In [429]:
#age is strictly divided into decades of life. We should make this numeric for now:
diabetes01['age'].value_counts()
diabetes01['age'] = diabetes01['age'].replace(['[0-10)', '[10-20)', '[20-30)', '[30-40)', '[40-50)', '[50-60)', 
                                              '[60-70)', '[70-80)', '[80-90)', '[90-100)'], [1,2,3,4,5,6,7,8,9,10])

In [430]:
'''Some patients show up multiple times in the analysis. This certainly might be relevant, 
but it is covered in other features (num_outpatient, num_inpatient, etc). We should remove encounter_id and patient_nbr.'''
diabetes01['patient_nbr'].value_counts(10)
diabetes01 = diabetes01.drop(['encounter_id', 'patient_nbr'], axis=1)

In [431]:
diabetes01.head()

Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,med_citoglipton,med_insulin,med_glyburide-metformin,med_glipizide-metformin,med_glimepiride-pioglitazone,med_metformin-rosiglitazone,med_metformin-pioglitazone,change,med_any,readmitted
0,Caucasian,Female,1,?,6,25,1,1,?,Pediatrics-Endocrinology,...,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,2,?,1,1,7,3,?,?,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,3,?,1,1,7,2,?,?,...,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,4,?,1,1,7,2,?,?,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,5,?,1,1,7,1,?,?,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [432]:
#Race has a ? variable, which we need to keep track of for now (will address/impute later):
diabetes01['race'].value_counts()
FeaturesWithMissing = ['race']

In [433]:
#Gender has an Unknown variable also, with only three values. Let's see what these look like:
diabetes01['gender'].value_counts()

Female             54708
Male               47055
Unknown/Invalid        3
Name: gender, dtype: int64

In [434]:
#Examine in more detail some info from these 3 gender-unknown patients.
diabetes01[diabetes01['gender']=='Unknown/Invalid']
print(diabetes01.iloc[30506,])
print(diabetes01.iloc[75551,])
print(diabetes01.iloc[82573,])

race                                      Other
gender                          Unknown/Invalid
age                                           8
weight                                        ?
admission_type_id                             3
discharge_disposition_id                      1
admission_source_id                           1
time_in_hospital                              1
payer_code                                    ?
medical_specialty                    Cardiology
num_lab_procedures                            5
num_procedures                                5
num_medications                              22
number_outpatient                             0
number_emergency                              0
number_inpatient                              0
diag_1                                      414
diag_2                                      411
diag_3                                      401
number_diagnoses                              4
max_glu_serum                           

In [435]:
#We could remove these, but to keep it from becoming complicated, let's just impute with "female" (more common) instead:

diabetes01['gender'] = diabetes01['gender'].replace(['Unknown/Invalid'], ['Female'])
diabetes01['gender'].value_counts()

Female    54711
Male      47055
Name: gender, dtype: int64

In [436]:
#97% of data on weight is missing. We can not do anything with this, particularly because it could be non-random. Remove this variable:
diabetes01 = diabetes01.drop(['weight'], axis=1)

In [437]:
#Admission types look OK, except that 8 and 6 are the same. We should combine them:
diabetes01['admission_type_id'].value_counts()
diabetes01['admission_type_id'] = diabetes01['admission_type_id'].replace([8], [6])

In [438]:
'''These need some adjusting. 11, 19, 20, 21 mean the patient died. Clearly, readmission rates will be 0 here, and this could \
be written into an algorithm, but for now, they should certainly be rewritten as the same thing.

18, 25, and 26 are all the same thing also (unknown).'''
diabetes01['discharge_disposition_id'].value_counts()

1     60234
3     13954
6     12902
18     3691
2      2128
22     1993
11     1642
5      1184
25      989
4       815
7       623
23      412
13      399
14      372
28      139
8       108
15       63
24       48
9        21
17       14
16       11
19        8
10        6
27        5
12        3
20        2
Name: discharge_disposition_id, dtype: int64

In [439]:
#Write to csv to visualize in ggplot2, a far superior visualization tool:
#diabetes01.to_csv('diabetesmod.csv')

In [440]:
'''After visualization, and careful reading of the descriptions, I would rewrite the 30 discharge categories.'''
replacelist = ['home', 'hospital', 'nursing', 'nursing', 'hospice', 'hhealth', 'leftAMA', 'hhealth', 'hospital', 'hospital',
              'died', 'hospital', 'hospice', 'hospice', 'hospital', 'outpatient', 'outpatient', 'unknown', 'died', 'died', 
              'died', 'outpatient', 'hospital', 'nursing', 'unknown', 'unknown', 'nursing', 'psych', 'hospital', 'outpatient']

diabetes01['discharge_disposition_id'] = diabetes01['discharge_disposition_id'].replace(list(range(1,31)), replacelist)

In [441]:
diabetes01['discharge_disposition_id'].value_counts()

home          60234
nursing       14822
hhealth       13010
unknown        4680
hospital       2633
outpatient     2018
hospice        1955
died           1652
leftAMA         623
psych           139
Name: discharge_disposition_id, dtype: int64

In [442]:
#Rewrite the column as discharge disposition:
newcollist = list(diabetes01.columns)
newcollist[newcollist.index('discharge_disposition_id')]='discharge_disposition'
diabetes01.columns = newcollist

In [443]:
diabetes01['admission_source_id'].value_counts()

7     57494
1     29565
17     6781
4      3187
6      2264
2      1104
5       855
3       187
20      161
9       125
8        16
22       12
10        8
11        2
14        2
25        2
13        1
Name: admission_source_id, dtype: int64

In [444]:
#There are no missing values in the "time in hospital"
print(sum(diabetes01['time_in_hospital'].isna()))
print(diabetes01['time_in_hospital'].describe())

0
count    101766.000000
mean          4.395987
std           2.985108
min           1.000000
25%           2.000000
50%           4.000000
75%           6.000000
max          14.000000
Name: time_in_hospital, dtype: float64


In [445]:
#For now, we can keep payer_code, although there are a large number of missing values and this feature would not seem to be important
diabetes01['payer_code'].value_counts()

?     40256
MC    32439
HM     6274
SP     5007
BC     4655
MD     3532
CP     2533
UN     2448
CM     1937
OG     1033
PO      592
DM      549
CH      146
WC      135
OT       95
MP       79
SI       55
FR        1
Name: payer_code, dtype: int64

In [446]:
#This category is completely unwieldy, and we will have to figure out what to do with this too. It has 40% missing values:
diabetes01['medical_specialty'].value_counts()

?                                    49949
InternalMedicine                     14635
Emergency/Trauma                      7565
Family/GeneralPractice                7440
Cardiology                            5352
Surgery-General                       3099
Nephrology                            1613
Orthopedics                           1400
Orthopedics-Reconstructive            1233
Radiologist                           1140
Pulmonology                            871
Psychiatry                             854
Urology                                685
ObstetricsandGynecology                671
Surgery-Cardiovascular/Thoracic        652
Gastroenterology                       564
Surgery-Vascular                       533
Surgery-Neuro                          468
PhysicalMedicineandRehabilitation      391
Oncology                               348
Pediatrics                             254
Hematology/Oncology                    207
Neurology                              203
Pediatrics-

In [447]:
#Num_lab_procedures is clear
print(diabetes01['num_lab_procedures'].describe())
sum(diabetes01['num_lab_procedures'].isna())

count    101766.000000
mean         43.095641
std          19.674362
min           1.000000
25%          31.000000
50%          44.000000
75%          57.000000
max         132.000000
Name: num_lab_procedures, dtype: float64


0

In [448]:
#Num_procedures is also clear:
print(diabetes01['num_procedures'].describe())
sum(diabetes01['num_procedures'].isna())

count    101766.000000
mean          1.339730
std           1.705807
min           0.000000
25%           0.000000
50%           1.000000
75%           2.000000
max           6.000000
Name: num_procedures, dtype: float64


0

In [449]:
#Num_medications is also clear:
print(diabetes01['num_medications'].describe())
sum(diabetes01['num_medications'].isna())

count    101766.000000
mean         16.021844
std           8.127566
min           1.000000
25%          10.000000
50%          15.000000
75%          20.000000
max          81.000000
Name: num_medications, dtype: float64


0

In [450]:
#Num_outpatient is also clear:
print(diabetes01['number_outpatient'].describe())
sum(diabetes01['number_outpatient'].isna())

count    101766.000000
mean          0.369357
std           1.267265
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          42.000000
Name: number_outpatient, dtype: float64


0

In [451]:
#Num_emergency is also clear:
print(diabetes01['number_emergency'].describe())
sum(diabetes01['number_emergency'].isna())

count    101766.000000
mean          0.197836
std           0.930472
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          76.000000
Name: number_emergency, dtype: float64


0

In [452]:
#Num_inpatient is also clear:
print(diabetes01['number_inpatient'].describe())
sum(diabetes01['number_inpatient'].isna())

count    101766.000000
mean          0.635566
std           1.262863
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max          21.000000
Name: number_inpatient, dtype: float64


0

In [453]:
#Num_diagnoses is also clear:
print(diabetes01['number_diagnoses'].describe())
sum(diabetes01['number_diagnoses'].isna())

count    101766.000000
mean          7.422607
std           1.933600
min           1.000000
25%           6.000000
50%           8.000000
75%           9.000000
max          16.000000
Name: number_diagnoses, dtype: float64


0

In [454]:
'''Num_diagnoses and diag_1/diag_2/diag_3 are highly related to each other. If num_diagnoses are 3 or under, diag_3 (and
diag_2) will be empty. A "primary" diagnosis is essentially what the patient is in there for. Secondary diagnoses are other
things the patient has. I would recommend combining diag_1 thru diag_3 together'''

'Num_diagnoses and diag_1/diag_2/diag_3 are highly related to each other. If num_diagnoses are 3 or under, diag_3 (and\ndiag_2) will be empty. A "primary" diagnosis is essentially what the patient is in there for. Secondary diagnoses are other\nthings the patient has. I would recommend combining diag_1 thru diag_3 together'

In [455]:
#Max glucose in serum is present, though many points were not measured. We should rewrite these as "NotTaken"
diabetes01['max_glu_serum'].value_counts()
diabetes01['max_glu_serum'] = diabetes01['max_glu_serum'].replace(['None'], ['NotTaken'])

In [456]:
#Same story for A1C measurement
diabetes01['A1Cresult'].value_counts()
diabetes01['A1Cresult'] = diabetes01['A1Cresult'].replace(['None'], ['NotTaken'])

In [457]:
#Write a function to rewrite the disease codes according to a modified version of the publication:
def convertdiseases(min, max, newname):
    d1 = diabetes01['diag_1'].tolist()
    d2 = diabetes01['diag_2'].tolist()
    d3 = diabetes01['diag_3'].tolist()
    
    for i in range(len(d1)):
        try:
            if float(d1[i]) >= min and float(d1[i]) < max:
                d1[i] = newname
        except:
            pass
        try:
            if float(d2[i]) >= min and float(d2[i]) < max:
                d2[i] = newname
        except:
            pass
        try:
            if float(d3[i]) >= min and float(d3[i]) < max:
                d3[i] = newname
        except:
            pass
    
    diabetes01['diag_1'] = pd.Series(d1)
    diabetes01['diag_2'] = pd.Series(d2)
    diabetes01['diag_3'] = pd.Series(d3)


In [458]:
convertdiseases(340, 459, 'circulatory')

In [459]:
#That worked pretty well. Let's do this for all additional values and combinations:
diabetes01['diag_1'].value_counts()

circulatory    31074
786             4016
486             3508
491             2275
715             2151
682             2042
780             2019
996             1967
276             1889
38              1688
250.8           1680
599             1595
584             1520
V57             1207
250.6           1183
518             1115
820             1082
577             1057
493             1056
562              989
574              965
296              896
560              876
250.7            871
250.13           851
998              784
722              771
250.02           675
578              663
250.11           625
               ...  
834                1
704                1
84                 1
114                1
982                1
838                1
976                1
E909               1
885                1
V70                1
250.51             1
97                 1
145                1
57                 1
684                1
217                1
207          

In [460]:
convertdiseases(785, 786, 'circulatory')
convertdiseases(745, 748, 'circulatory')
convertdiseases(459, 460, 'circulatory')
convertdiseases(460, 520, 'respiratory')
convertdiseases(786, 787, 'respiratory')
convertdiseases(748, 749, 'respiratory')
convertdiseases(520, 580, 'digestive')
convertdiseases(787, 788, 'digestive')
convertdiseases(749, 752, 'digestive')
convertdiseases(800, 1000, 'injury')
convertdiseases(710, 740, 'musculoskeletal')
convertdiseases(754, 757, 'musculoskeletal')
convertdiseases(580, 630, 'urogenital')
convertdiseases(788, 789, 'urogenital')
convertdiseases(752, 754, 'urogenital')
convertdiseases(140, 240, 'neoplasm')
convertdiseases(1, 140, 'infection')
convertdiseases(290, 320, 'mentaldis')
convertdiseases(280, 290, 'blooddis')
convertdiseases(320, 360, 'nervous')
convertdiseases(360, 390, 'nervous')
convertdiseases(740, 743, 'nervous')
convertdiseases(630, 680, 'pregnancy')
convertdiseases(780, 782, 'other')
convertdiseases(784, 785, 'other')
convertdiseases(790, 800, 'other')
convertdiseases(743, 745, 'other')
convertdiseases(757, 760, 'other')

In [461]:
'''This successfully converted our targets. Now we still have things to convert. All patients have diabetes, so the 250
classifications are not important insofar as they diagnose diabetes. We can, however, glean addition diabetic info from the 
decimal codes, where they exist. They would need to go to their own categories, however.'''

'''First, lets get rid of the EV codes, which the publication refers to as injuries or additional diagnosic information'''
diabetes01['diag_1'].value_counts()

circulatory        31293
respiratory        14423
digestive           9480
injury              6974
urogenital          5122
musculoskeletal     4972
neoplasm            3433
infection           2768
other               2544
mentaldis           2262
682                 2042
276                 1889
250.8               1680
V57                 1207
250.6               1183
blooddis            1103
250.7                871
250.13               851
pregnancy            687
250.02               675
250.11               625
789                  561
250.12               417
250.82               412
278                  379
nervous              376
250.1                313
250.4                267
707                  257
250                  235
                   ...  
245                    4
250.52                 4
261                    4
250.91                 4
240                    3
271                    3
262                    3
266                    3
706                    3


In [462]:
diabetes01['diag_1'] = diabetes01['diag_1'].replace('V[0-9]+', 'injury', regex=True)
diabetes01['diag_1'] = diabetes01['diag_1'].replace('E[0-9]+', 'injury', regex=True)
diabetes01['diag_2'] = diabetes01['diag_2'].replace('V[0-9]+', 'injury', regex=True)
diabetes01['diag_2'] = diabetes01['diag_2'].replace('E[0-9]+', 'injury', regex=True)
diabetes01['diag_3'] = diabetes01['diag_3'].replace('V[0-9]+', 'injury', regex=True)
diabetes01['diag_3'] = diabetes01['diag_3'].replace('E[0-9]+', 'injury', regex=True)

In [463]:
#This is looking better, but apprently, some ICD-9 codes weren't covered in our initial conversion:
diabetes01['diag_1'].value_counts()

circulatory        31293
respiratory        14423
digestive           9480
injury              8619
urogenital          5122
musculoskeletal     4972
neoplasm            3433
infection           2768
other               2544
mentaldis           2262
682                 2042
276                 1889
250.8               1680
250.6               1183
blooddis            1103
250.7                871
250.13               851
pregnancy            687
250.02               675
250.11               625
789                  561
250.12               417
250.82               412
278                  379
nervous              376
250.1                313
250.4                267
707                  257
250                  235
250.03               201
                   ...  
686                    8
709                    8
250.9                  7
694                    6
273                    6
685                    6
272                    6
696                    6
692                    5


In [464]:
convertdiseases(240, 250, 'metabolic')
convertdiseases(251, 280, 'metabolic')
convertdiseases(680, 710, 'skin')
convertdiseases(782, 783, 'skin')

In [465]:
#All that is missing now is 783, 789
diabetes01['diag_1'].value_counts()

circulatory        31293
respiratory        14423
digestive           9480
injury              8619
urogenital          5122
musculoskeletal     4972
neoplasm            3433
infection           2768
metabolic           2702
skin                2614
other               2544
mentaldis           2262
250.8               1680
250.6               1183
blooddis            1103
250.7                871
250.13               851
pregnancy            687
250.02               675
250.11               625
789                  561
250.12               417
250.82               412
nervous              376
250.1                313
250.4                267
250                  235
250.03               201
250.81               186
250.22               156
250.2                114
250.83                95
250.41                91
250.42                90
250.01                61
250.92                52
783                   29
250.23                29
250.43                25
?                     21


In [466]:
convertdiseases(789, 790, 'other')
convertdiseases(783, 784, 'metabolic')

In [467]:
'''We now have all values, other than ? and the diabetes codes, as a category. Let's create a new column with additional 
diabetes info (from the 250 codes), then revert all 250's and ? to 'NoDisease', a value which will go away after diag1, 2, 3
combination.'''

"We now have all values, other than ? and the diabetes codes, as a category. Let's create a new column with additional \ndiabetes info (from the 250 codes), then revert all 250's and ? to 'NoDisease', a value which will go away after diag1, 2, 3\ncombination."

In [468]:
#We can create 4 rows (eventually condensed to 2): diabetes_feature (1-3) and Type (1 or 2):

#Write a function to migrate data from diabetes codes to new columns, then revert them to "NoDisease":
'''This function returns a modified version of our DF, and creates four rows. diabfeature1, 2, 3 which are extra diabetic 
features of the patient, and diabtype which is a report of either Type1, Type2, or (typically) unknown for the patient, 
based on ICD code'''
def convertdiabetescodes(df):
    df2 = df.copy()
    d1 = list(df2['diag_1'])
    d2 = list(df2['diag_2'])
    d3 = list(df2['diag_3'])
    
    f1 = []
    f2 = []
    f3 = []
    diabtype1 = []
    diabtype2 = []
    diabtype3 = []
    
    for i in range(len(d1)):
        try:
            
            if 100*float(d1[i]) % 10 == 1 or 100*float(d1[i]) % 10 == 3:
                diabtype1.append('Type1')
            elif 100*float(d1[i]) % 10 == 2:
                diabtype1.append('Type2')
            elif 100*float(d1[i]) % 10 == 0:
                diabtype1.append('Unknown')
            else:
                diabtype1.append('Unknown')
                
            if 100*float(d1[i]) % 100 >= 10 and 100*float(d1[i]) % 100 < 20:
                f1.append('ketoacidosis')
            elif 100*float(d1[i]) % 100 >= 20 and 100*float(d1[i]) % 100 < 30:
                f1.append('hyperosmolarity')
            elif 100*float(d1[i]) % 100 >= 30 and 100*float(d1[i]) % 100 < 40:
                f1.append('coma')
            elif 100*float(d1[i]) % 100 >= 40 and 100*float(d1[i]) % 100 < 50:
                f1.append('renal')
            elif 100*float(d1[i]) % 100 >= 50 and 100*float(d1[i]) % 100 < 60:
                f1.append('ophthalmic')
            elif 100*float(d1[i]) % 100 >= 60 and 100*float(d1[i]) % 100 < 70:
                f1.append('neurologic')
            elif 100*float(d1[i]) % 100 >= 70 and 100*float(d1[i]) % 100 < 80:
                f1.append('circulatory')
            elif 100*float(d1[i]) % 100 >= 80:
                f1.append('other')
            elif 100*float(d1[i]) % 100 >=0 and 100*float(d1[i]) % 100 < 10:
                f1.append('No')
            else:
                f1.append('No')
                
        except:
            diabtype1.append('No')
            f1.append('No')
            
        try:
            if 100*float(d2[i]) % 10 == 1 or 100*float(d2[i]) % 10 == 3:
                diabtype2.append('Type1')
            elif 100*float(d2[i]) % 10 == 2:
                diabtype2.append('Type2')
            elif 100*float(d2[i]) % 10 == 0:
                diabtype2.append('Unknown')
            else:
                diabtype2.append('Unknown')
                
            if 100*float(d2[i]) % 100 >= 10 and 100*float(d2[i]) % 100 < 20:
                f2.append('ketoacidosis')
            elif 100*float(d2[i]) % 100 >= 20 and 100*float(d2[i]) % 100 < 30:
                f2.append('hyperosmolarity')
            elif 100*float(d2[i]) % 100 >= 30 and 100*float(d2[i]) % 100 < 40:
                f2.append('coma')
            elif 100*float(d2[i]) % 100 >= 40 and 100*float(d2[i]) % 100 < 50:
                f2.append('renal')
            elif 100*float(d2[i]) % 100 >= 50 and 100*float(d2[i]) % 100 < 60:
                f2.append('ophthalmic')
            elif 100*float(d2[i]) % 100 >= 60 and 100*float(d2[i]) % 100 < 70:
                f2.append('neurologic')
            elif 100*float(d2[i]) % 100 >= 70 and 100*float(d2[i]) % 100 < 80:
                f2.append('circulatory')
            elif 100*float(d2[i]) % 100 >= 80:
                f2.append('other')
            elif 100*float(d2[i]) % 100 >=0 and 100*float(d2[i]) % 100 < 10:
                f2.append('No')
            else:
                f2.append('No')
                
        except:
            diabtype2.append('No')
            f2.append('No')
            
        try:
            if 100*float(d3[i]) % 10 == 1 or 100*float(d3[i]) % 10 == 3:
                diabtype3.append('Type1')
            elif 100*float(d3[i]) % 10 == 2:
                diabtype3.append('Type2')
            elif 100*float(d3[i]) % 10 == 0:
                diabtype3.append('Unknown')
            else:
                diabtype3.append('Unknown')
                
            if 100*float(d3[i]) % 100 >= 10 and 100*float(d3[i]) % 100 < 20:
                f3.append('ketoacidosis')
            elif 100*float(d3[i]) % 100 >= 20 and 100*float(d3[i]) % 100 < 30:
                f3.append('hyperosmolarity')
            elif 100*float(d3[i]) % 100 >= 30 and 100*float(d3[i]) % 100 < 40:
                f3.append('coma')
            elif 100*float(d3[i]) % 100 >= 40 and 100*float(d3[i]) % 100 < 50:
                f3.append('renal')
            elif 100*float(d3[i]) % 100 >= 50 and 100*float(d3[i]) % 100 < 60:
                f3.append('ophthalmic')
            elif 100*float(d3[i]) % 100 >= 60 and 100*float(d3[i]) % 100 < 70:
                f3.append('neurologic')
            elif 100*float(d3[i]) % 100 >= 70 and 100*float(d3[i]) % 100 < 80:
                f3.append('circulatory')
            elif 100*float(d3[i]) % 100 >= 80:
                f3.append('other')
            elif 100*float(d3[i]) % 100 >=0 and 100*float(d3[i]) % 100 < 10:
                f3.append('No')
            else:
                f3.append('No')
                
        except:
            diabtype3.append('No')
            f3.append('No')
    
    finaltype = []
    
    for i in range(len(diabtype1)):
        if diabtype1 == 'Type1' or diabtype2 == 'Type1' or diabtype3 == 'Type1':
            finaltype.append('Type1')
        elif diabtype1 == 'Type2' or diabtype2 == 'Type2' or diabtype3 == 'Type2':
            finaltype.append('Type2')
        else:
            finaltype.append('Unknown')
   
    df2['diabfeature1'] = f1
    df2['diabfeature2'] = f2
    df2['diabfeature3'] = f3
    df2['diabtype'] = finaltype
    
    return df2

In [469]:
#We can create 4 rows (eventually condensed to 2): diabetes_feature (1-3) and Type (1 or 2):

#Write a function to migrate data from diabetes codes to new columns, then revert them to "NoDisease":
'''This function returns a modified version of our DF, and creates four rows. diabfeature1, 2, 3 which are extra diabetic 
features of the patient, and diabtype which is a report of either Type1, Type2, or (typically) unknown for the patient, 
based on ICD code'''
def convertdiabetescodes(df):
    df2 = df.copy()
    d1 = list(df2['diag_1'])
    d2 = list(df2['diag_2'])
    d3 = list(df2['diag_3'])
    
    f1 = []
    f2 = []
    f3 = []
    diabtype1 = []
    diabtype2 = []
    diabtype3 = []
    
    for i in range(len(d1)):
        try:
            
            if int(100*float(d1[i])) % 10 == 1 or int(100*float(d1[i])) % 10 == 3:
                diabtype1.append('Type1')
            elif int(100*float(d1[i])) % 10 == 2:
                diabtype1.append('Type2')
            elif int(100*float(d1[i])) % 10 == 0:
                diabtype1.append('Unknown')
            else:
                diabtype1.append('Unknown')
                
            if int(100*float(d1[i])) % 100 >= 10 and int(100*float(d1[i])) % 100 < 20:
                f1.append('ketoacidosis')
            elif int(100*float(d1[i])) % 100 >= 20 and int(100*float(d1[i])) % 100 < 30:
                f1.append('hyperosmolarity')
            elif int(100*float(d1[i])) % 100 >= 30 and int(100*float(d1[i])) % 100 < 40:
                f1.append('coma')
            elif int(100*float(d1[i])) % 100 >= 40 and int(100*float(d1[i])) % 100 < 50:
                f1.append('renal')
            elif int(100*float(d1[i])) % 100 >= 50 and int(100*float(d1[i])) % 100 < 60:
                f1.append('ophthalmic')
            elif int(100*float(d1[i])) % 100 >= 60 and int(100*float(d1[i])) % 100 < 70:
                f1.append('neurologic')
            elif int(100*float(d1[i])) % 100 >= 70 and int(100*float(d1[i])) % 100 < 80:
                f1.append('circulatory')
            elif int(100*float(d1[i])) % 100 >= 80:
                f1.append('other')
            elif int(100*float(d1[i])) % 100 >=0 and int(100*float(d1[i])) % 100 < 10:
                f1.append('No')
            else:
                f1.append('No')
                
        except:
            diabtype1.append('No')
            f1.append('No')
            
        try:
            if int(100*float(d2[i])) % 10 == 1 or int(100*float(d2[i])) % 10 == 3:
                diabtype2.append('Type1')
            elif int(100*float(d2[i])) % 10 == 2:
                diabtype2.append('Type2')
            elif int(100*float(d2[i])) % 10 == 0:
                diabtype2.append('Unknown')
            else:
                diabtype2.append('Unknown')
                
            if int(100*float(d2[i])) % 100 >= 10 and int(100*float(d2[i])) % 100 < 20:
                f2.append('ketoacidosis')
            elif int(100*float(d2[i])) % 100 >= 20 and int(100*float(d2[i])) % 100 < 30:
                f2.append('hyperosmolarity')
            elif int(100*float(d2[i])) % 100 >= 30 and int(100*float(d2[i])) % 100 < 40:
                f2.append('coma')
            elif int(100*float(d2[i])) % 100 >= 40 and int(100*float(d2[i])) % 100 < 50:
                f2.append('renal')
            elif int(100*float(d2[i])) % 100 >= 50 and int(100*float(d2[i])) % 100 < 60:
                f2.append('ophthalmic')
            elif int(100*float(d2[i])) % 100 >= 60 and int(100*float(d2[i])) % 100 < 70:
                f2.append('neurologic')
            elif int(100*float(d2[i])) % 100 >= 70 and int(100*float(d2[i])) % 100 < 80:
                f2.append('circulatory')
            elif int(100*float(d2[i])) % 100 >= 80:
                f2.append('other')
            elif int(100*float(d2[i])) % 100 >=0 and int(100*float(d2[i])) % 100 < 10:
                f2.append('No')
            else:
                f2.append('No')
                
        except:
            diabtype2.append('No')
            f2.append('No')
            
        try:
            if int(100*float(d3[i])) % 10 == 1 or int(100*float(d3[i])) % 10 == 3:
                diabtype3.append('Type1')
            elif int(100*float(d3[i])) % 10 == 2:
                diabtype3.append('Type2')
            elif int(100*float(d3[i])) % 10 == 0:
                diabtype3.append('Unknown')
            else:
                diabtype3.append('Unknown')
                
            if int(100*float(d3[i])) % 100 >= 10 and int(100*float(d3[i])) % 100 < 20:
                f3.append('ketoacidosis')
            elif int(100*float(d3[i])) % 100 >= 20 and int(100*float(d3[i])) % 100 < 30:
                f3.append('hyperosmolarity')
            elif int(100*float(d3[i])) % 100 >= 30 and int(100*float(d3[i])) % 100 < 40:
                f3.append('coma')
            elif int(100*float(d3[i])) % 100 >= 40 and int(100*float(d3[i])) % 100 < 50:
                f3.append('renal')
            elif int(100*float(d3[i])) % 100 >= 50 and int(100*float(d3[i])) % 100 < 60:
                f3.append('ophthalmic')
            elif int(100*float(d3[i])) % 100 >= 60 and int(100*float(d3[i])) % 100 < 70:
                f3.append('neurologic')
            elif int(100*float(d3[i])) % 100 >= 70 and int(100*float(d3[i])) % 100 < 80:
                f3.append('circulatory')
            elif int(100*float(d3[i])) % 100 >= 80:
                f3.append('other')
            elif int(100*float(d3[i])) % 100 >=0 and int(100*float(d3[i])) % 100 < 10:
                f3.append('No')
            else:
                f3.append('No')
                
        except:
            diabtype3.append('No')
            f3.append('No')
    
    finaltype = []
    
    for i in range(len(diabtype1)):
        if diabtype1[i] == 'Type1' or diabtype2[i] == 'Type1' or diabtype3[i] == 'Type1':
            finaltype.append('Type1')
        elif diabtype1[i] == 'Type2' or diabtype2[i] == 'Type2' or diabtype3[i] == 'Type2':
            finaltype.append('Type2')
        else:
            finaltype.append('Unknown')
   
    df2['diabfeature1'] = f1
    df2['diabfeature2'] = f2
    df2['diabfeature3'] = f3
    df2['diabtype'] = finaltype
    
    return df2

In [470]:
#Now, let's update diabetes01 to diabetes02 with these extra columns:
diabetes02 = convertdiabetescodes(diabetes01)

In [471]:
#Now, we can see that we have Type1/2 information for some of the patients.
diabetes02['diabtype'].value_counts()

Unknown    89142
Type2       6418
Type1       6206
Name: diabtype, dtype: int64

In [472]:
#We can also see supplementary diabeteic features for these patients based on the code presented:
diabetes02['diabfeature1'].value_counts()

No                 94181
other               2453
ketoacidosis        2206
neurologic          1183
circulatory          871
renal                473
hyperosmolarity      311
coma                  71
ophthalmic            17
Name: diabfeature1, dtype: int64

In [473]:
#Save file and reopen:

#diabetes02.to_csv('diabetes02.csv')
diabetes02 = pd.read_csv('diabetes02.csv', index_col=0)

In [474]:
#Let's see if we can impute additional columns for Type1/Type2 based on what we have here
diabetes00 = pd.read_csv('diabetic_data.csv')

In [475]:
diabetes00['diabtype'] = diabetes02['diabtype']

In [476]:
'''A visual inspection of some random samples of this data is discouraging, and shows differening type status (1 vs 2) for the same patient in different encounters. Based on this fact, and the low prevalence of this information to begin with, we should not include diabtype in our analysis or try to impute it within the same patient.'''
diabetes00[diabetes00['diabtype']!='Unknown'].sort_values('patient_nbr')[['patient_nbr', 'diabtype']].head(100)
diabetes00[diabetes00['diabtype']!='Unknown'].sort_values('patient_nbr')[['patient_nbr', 'diabtype']].iloc[300:400,]

Unnamed: 0,patient_nbr,diabtype
10159,401976,Type1
897,404289,Type1
7667,409959,Type2
25221,411552,Type1
22652,415674,Type2
35384,416835,Type1
23452,421155,Type1
17274,423387,Type1
13801,424053,Type1
2741,424053,Type2


In [477]:
#Drop diabtype from this dataframe:
diabetes02 = diabetes02.drop('diabtype', axis=1)

In [478]:
'''We also need to get rid of diabetes from the diag list (we know these patients have diabetes.) Anything still numeric (or ?)
in the diag_ features should be changed to "Nothing"'''

diabetes02['diag_1'] = diabetes02['diag_1'].replace('?', 'Nothing').astype('str')
diabetes02['diag_2'] = diabetes02['diag_2'].replace('?', 'Nothing').astype('str')
diabetes02['diag_3'] = diabetes02['diag_3'].replace('?', 'Nothing').astype('str')

d1 = list(diabetes02['diag_1'])
for i in range(len(d1)):
    try:
        pd.to_numeric(d1[i])
        d1[i] = 'Nothing'
    except:
        pass
diabetes02['diag_1'] = pd.Series(d1)
        
d1 = list(diabetes02['diag_2'])
for i in range(len(d1)):
    try:
        pd.to_numeric(d1[i])
        d1[i] = 'Nothing'
    except:
        pass
diabetes02['diag_2'] = pd.Series(d1)

d1 = list(diabetes02['diag_3'])
for i in range(len(d1)):
    try:
        pd.to_numeric(d1[i])
        d1[i] = 'Nothing'
    except:
        pass
diabetes02['diag_3'] = pd.Series(d1)
        

In [479]:
#Now, we have our data frame reduced to all string representations of diseases, or "Nothing." We can now combine diag 1-3:
diabetes02['diag_3'].value_counts()

circulatory        31576
Nothing            18580
metabolic           9208
respiratory         7358
injury              7002
urogenital          6700
digestive           3936
mentaldis           3136
other               2675
skin                2607
blooddis            2490
musculoskeletal     1923
infection           1861
neoplasm            1856
nervous              546
pregnancy            309
Name: diag_3, dtype: int64

In [480]:
#Let's combine diagnosis (diag_1, diag_2, diag_3) into dummy variables and add them. We will do this for diag and diabfeature:

#This is a function to do this
def createcombineddummies(df, c1, c2, c3=None, c4=None, c5=None, prefix=''):
    
    '''This would need to be modified if you have values not in every Series (we don't have that problem here)'''
    
    collist = [c1, c2]
    
    if c3 is not None:
        collist.append(c3)
    if c4 is not None:
        collist.append(c4)
    if c5 is not None:
        collist.append(c5)
        
    for i in range(len(collist)):
        
        if i == 0:
            tempDF = pd.get_dummies(df[collist[i]], prefix=prefix)
        
        if i > 0:
            tempDF1 = pd.get_dummies(df[collist[i]], prefix=prefix)
            tempDF = tempDF + tempDF1
        
    #Do we need this code? It's probably useful. Reduces everything to 1 (if a patien has 2 resporatory conditions, for example)
    tempDF = tempDF.clip(upper=1)
    tempDF = tempDF.drop(prefix + '_Nothing', axis=1)
    
    return tempDF

In [481]:
#Let's try creating this dummy DF and see how it looks:
diagDummy = createcombineddummies(diabetes02, 'diag_1', 'diag_2', 'diag_3', prefix='diag')

In [482]:
#This looks good. Let's combine it with diabetes02 and remove diag_1-3
diagDummy.head(10)

Unnamed: 0,diag_blooddis,diag_circulatory,diag_digestive,diag_infection,diag_injury,diag_mentaldis,diag_metabolic,diag_musculoskeletal,diag_neoplasm,diag_nervous,diag_other,diag_pregnancy,diag_respiratory,diag_skin,diag_urogenital
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
3,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
5,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
7,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
8,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
9,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0


In [483]:
# Drop diag_1-3 and add diagDummy to DF
diabetes03 = diabetes02.drop(['diag_1', 'diag_2', 'diag_3'], axis=1)
diabetes03 = pd.concat([diabetes03, diagDummy], axis=1)

In [484]:
diabetes03.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,...,diag_mentaldis,diag_metabolic,diag_musculoskeletal,diag_neoplasm,diag_nervous,diag_other,diag_pregnancy,diag_respiratory,diag_skin,diag_urogenital
0,Caucasian,Female,1,6,unknown,1,1,?,Pediatrics-Endocrinology,41,...,0,0,0,0,0,0,0,0,0,0
1,Caucasian,Female,2,1,home,7,3,?,?,59,...,0,1,0,0,0,0,0,0,0,0
2,AfricanAmerican,Female,3,1,home,7,2,?,?,11,...,0,0,0,0,0,0,1,0,0,0
3,Caucasian,Male,4,1,home,7,2,?,?,44,...,0,0,0,0,0,0,0,0,0,0
4,Caucasian,Male,5,1,home,7,1,?,?,51,...,0,0,0,1,0,0,0,0,0,0


In [485]:
#We need to modify this function (Annoying) because we have a different negative value for these columns:

#This is a function to do this
def createcombineddummiesF(df, c1, c2, c3=None, c4=None, c5=None, prefix=''):
    
    '''This would need to be modified if you have values not in every Series (we don't have that problem here)'''
    
    collist = [c1, c2]
    
    if c3 is not None:
        collist.append(c3)
    if c4 is not None:
        collist.append(c4)
    if c5 is not None:
        collist.append(c5)
        
    for i in range(len(collist)):
        
        if i == 0:
            tempDF = pd.get_dummies(df[collist[i]], prefix=prefix)
        
        if i > 0:
            tempDF1 = pd.get_dummies(df[collist[i]], prefix=prefix)
            tempDF = tempDF + tempDF1
        
    #Do we need this code? It's probably useful. Reduces everything to 1 (if a patien has 2 resporatory conditions, for example)
    tempDF = tempDF.clip(upper=1)
    tempDF = tempDF.drop(prefix + '_No', axis=1)
    
    return tempDF

In [486]:
#Now do a similar thing for diagfeature:
featureDummy = createcombineddummiesF(diabetes02, 'diabfeature1', 'diabfeature2', 'diabfeature3', prefix='diabfeat')

In [487]:
#Out of curiosity, let's see how many values are in each columns:
np.sum(featureDummy)

diabfeat_circulatory        1142
diabfeat_coma                 86
diabfeat_hyperosmolarity     396
diabfeat_ketoacidosis       2589
diabfeat_neurologic         3158
diabfeat_ophthalmic          577
diabfeat_other              3883
diabfeat_renal              1828
dtype: int64

In [488]:
'''This is a decent amount of information which could possibly help with predicting readmission rates (not sure about coma)'''

'This is a decent amount of information which could possibly help with predicting readmission rates (not sure about coma)'

In [489]:
diabetes03 = diabetes03.drop(['diabfeature1', 'diabfeature2', 'diabfeature3'], axis=1)
diabetes03 = pd.concat([diabetes03, featureDummy], axis=1)

In [490]:
diabetes03.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,...,diag_skin,diag_urogenital,diabfeat_circulatory,diabfeat_coma,diabfeat_hyperosmolarity,diabfeat_ketoacidosis,diabfeat_neurologic,diabfeat_ophthalmic,diabfeat_other,diabfeat_renal
0,Caucasian,Female,1,6,unknown,1,1,?,Pediatrics-Endocrinology,41,...,0,0,0,0,0,0,0,0,1,0
1,Caucasian,Female,2,1,home,7,3,?,?,59,...,0,0,0,0,0,0,0,0,0,0
2,AfricanAmerican,Female,3,1,home,7,2,?,?,11,...,0,0,0,0,0,0,0,0,0,0
3,Caucasian,Male,4,1,home,7,2,?,?,44,...,0,0,0,0,0,0,0,0,0,1
4,Caucasian,Male,5,1,home,7,1,?,?,51,...,0,0,0,0,0,0,0,0,0,0


In [491]:
#For now, we can drop medical_specialty and payer_code (the publication keeps specialty, but this should be quite correlated with diagnoses):
diabetes03 = diabetes03.drop('payer_code', axis=1)
diabetes03 = diabetes03.drop('medical_specialty', axis=1)

In [492]:
#diabetes03['medical_specialty'].value_counts()

In [493]:
'''#This is an attempt to succinctly reduce this list, partially influenced by the paper:
diabetes03['medical_specialty'] = diabetes03['medical_specialty'].replace('Emergency/Trauma', 'Emergency')
diabetes03['medical_specialty'] = diabetes03['medical_specialty'].replace('Family/GeneralPractice', 'FamilyGP')
diabetes03['medical_specialty'] = diabetes03['medical_specialty'].replace('Surgery-General', 'Surgery')
diabetes03['medical_specialty'] = diabetes03['medical_specialty'].replace('Orthopedics-Reconstructive', 'Orthopedics')
diabetes03['medical_specialty'] = diabetes03['medical_specialty'].replace('Surgery-Cardiovascular/Thoracic', 'Surgery')
diabetes03['medical_specialty'] = diabetes03['medical_specialty'].replace('Surgery-Vascular', 'Surgery')
diabetes03['medical_specialty'] = diabetes03['medical_specialty'].replace('Surgery-Neuro', 'Surgery')
diabetes03['medical_specialty'] = diabetes03['medical_specialty'].replace('PhysicalMedicineandRehabilitation', 'Rehab')'''

'''I'm kind of over it. There are so many and honestly, this data should correlate with other things'''

"I'm kind of over it. There are so many and honestly, this data should correlate with other things"

In [494]:
'''diabetes03['medical_specialty'] = diabetes03['medical_specialty'].replace('PhysicalMedicineandRehabilitation', 'Rehab')'''

"diabetes03['medical_specialty'] = diabetes03['medical_specialty'].replace('PhysicalMedicineandRehabilitation', 'Rehab')"

In [495]:
#Now, we need to adjust the values in referral. Every value above 7 (8 and above) are extremely small or missing. We should rewrite all things 8 and above as just 8:
diabetes03['admission_source_id'] = diabetes03['admission_source_id'].clip(upper=8)

In [496]:
#Now it is maxed out at 8; all values from 9 to 25 are converted to 8:
diabetes03['admission_source_id'].describe()

count    101763.00000
mean          5.13216
std           2.77094
min           1.00000
25%           1.00000
50%           7.00000
75%           7.00000
max           8.00000
Name: admission_source_id, dtype: float64

In [497]:
#At this point, we can fork the data into two DFs: One (AllDummy) will dummify all columns, including the medicine columns
#Another one, OrdMed, will make the medicinal columns ordinal:
#Not taking = 0, down=0.5, steady=1, up=1.5. Not sure where either of these models would work better

#A few last things to do are convert the ? in 'race' to Unknown:
diabetes03['race'] = diabetes03['race'].replace(['?'], ['unknown'])

#And for now, we should probably save the outcome (readmitted) as a un-dummified column with 3 digitized outputs:
#No=0, >30=1, <30=2 (this is completely subject to change)
diabetes03['readmitted'] = diabetes03['readmitted'].replace(['NO', '>30', '<30'], [0, 1, 2])

In [498]:
#One other thing: We should check value counts for all the med columns (some are VERY sparse):
for i in range(16, 39):
    colname = list(diabetes03.columns)[i]
    print(diabetes03[colname].value_counts())

No        81776
Steady    18345
Up         1067
Down        575
Name: med_metformin, dtype: int64
No        100224
Steady      1384
Up           110
Down          45
Name: med_repaglinide, dtype: int64
No        101060
Steady       668
Up            24
Down          11
Name: med_nateglinide, dtype: int64
No        101677
Steady        79
Up             6
Down           1
Name: med_chlorpropamide, dtype: int64
No        96572
Steady     4670
Up          327
Down        194
Name: med_glimepiride, dtype: int64
No        101762
Steady         1
Name: med_acetohexamide, dtype: int64
No        89078
Steady    11355
Up          770
Down        560
Name: med_glipizide, dtype: int64
No        91113
Steady     9274
Up          812
Down        564
Name: med_glyburide, dtype: int64
No        101740
Steady        23
Name: med_tolbutamide, dtype: int64
No        94436
Steady     6975
Up          234
Down        118
Name: med_pioglitazone, dtype: int64
No        95399
Steady     6099
Up          178


In [499]:
#Some of these drugs can clearly be removed from this DF:
removelist = ['med_metformin-pioglitazone', 'med_metformin-rosiglitazone', 'med_glimepiride-pioglitazone',
             'med_troglitazone', 'med_acetohexamide', 'med_citoglipton', 'med_examide']
diabetes03 = diabetes03.drop(removelist, axis=1)

## This is where the file is different from the original file. In the first one, we had 2 DF's, one with all dummies for the medications, and another one with columns converted to ordinal instead. Here, we are taking out the ordinal, and creating two additional DF's instead. One which says if the patient is taking that medication at all, and another one which says whether the patient has has any change in that medication. We are also adding another column (to all 3 DF's!) named diabchange, indicating if there was any change at all to any diabetic medicine (1 = yes, 0 = no)

In [500]:
#Create three DF's, each corresponding to one of the criteria listed above:
DiabetesAllDummy = diabetes03.copy()
DiabetesTakingMed = diabetes03.copy()
DiabetesAnyChange = diabetes03.copy()

In [501]:
#Function to replace a DF with dummy variables:
def ReplaceWithDummies(df, dummylist):
    df2 = df.copy()
    for var in dummylist:
        topindex = df2[var].value_counts().sort_values(ascending=False).index[0]
        dummies = pd.get_dummies(df2[var], prefix=var)
        dummies = dummies.drop(var + "_" + str(topindex), axis=1)
        df2 = pd.concat([df2, dummies], axis=1)
        df2 = df2.drop(var, axis=1)
    return df2

In [502]:
#Define columns to be dummified: (AllDummy vs OrdMed):
MedColumns = list(diabetes03.columns[16:32])

OtherDummyColumns = ['race', 'gender', 'discharge_disposition', 'max_glu_serum', 'A1Cresult', 'change', 'med_any',
                     'admission_type_id', 'admission_source_id']
AllDummyColumns = OtherDummyColumns.copy()
AllDummyColumns.extend(MedColumns)

In [503]:
#Make all dummy DFs (all columns for the "AlDummy", the non-Med Columns for TakingMed and AnyChange)
DiabetesAllDummy = ReplaceWithDummies(DiabetesAllDummy, AllDummyColumns)
DiabetesTakingMed = ReplaceWithDummies(DiabetesTakingMed, OtherDummyColumns)
DiabetesAnyChange = ReplaceWithDummies(DiabetesAnyChange, OtherDummyColumns)

In [504]:
#Now for the TakingMed DF, replace anything other than No with 1:
for med in MedColumns:
    DiabetesTakingMed[med] = DiabetesTakingMed[med].replace(['No', 'Steady', 'Down', 'Up'], [0, 1, 1, 1])

In [505]:
#Now for the AnyChange DF, replace No and Steady with 0 and Up or Down with 1:
for med in MedColumns:
    DiabetesAnyChange[med] = DiabetesAnyChange[med].replace(['No', 'Steady', 'Down', 'Up'], [0, 0, 1, 1])

In [506]:
#Next, create a new feature, "diabchange", which says if ANY diabeteic medication was changed at all:
DiabetesAnyChange['diabchange'] = 0
for med in MedColumns:
    DiabetesAnyChange['diabchange'] += DiabetesAnyChange[med]
DiabetesAnyChange['diabchange'] = DiabetesAnyChange['diabchange'].clip(upper=1)

In [507]:
#Add these diabchange columns to DiabetesAllDummy and DiabetesTakingMed:
DiabetesAllDummy['diabchange'] = DiabetesAnyChange['diabchange']
DiabetesTakingMed['diabchange'] = DiabetesAnyChange['diabchange']

In [508]:
#Check if all values are numeric by converting to numeric:
for var in list(DiabetesAllDummy.columns):
    DiabetesAllDummy[var]  = pd.to_numeric(DiabetesAllDummy[var])
#Yes

for var in list(DiabetesTakingMed.columns):
    DiabetesTakingMed[var]  = pd.to_numeric(DiabetesTakingMed[var])
#Yes
    
for var in list(DiabetesAnyChange.columns):
    DiabetesAnyChange[var]  = pd.to_numeric(DiabetesAnyChange[var])
#Yes

In [509]:
#Finally, remove columns where the patients died
DiabetesAllDummy = DiabetesAllDummy[DiabetesAllDummy['discharge_disposition_died']==0]
DiabetesTakingMed = DiabetesTakingMed[DiabetesTakingMed['discharge_disposition_died']==0]
DiabetesAnyChange = DiabetesAnyChange[DiabetesAnyChange['discharge_disposition_died']==0]

DiabetesAllDummy.index = list(range(len(DiabetesAllDummy)))
DiabetesTakingMed.index = list(range(len(DiabetesTakingMed)))
DiabetesAnyChange.index = list(range(len(DiabetesAnyChange)))

In [510]:
#Write these two DF's to CSV and use for further analysis:
DiabetesAllDummy.to_csv('DiabetesAllDummy.csv')
DiabetesTakingMed.to_csv('DiabetesTakingMed.csv')
DiabetesAnyChange.to_csv('DiabetesAnyChange.csv')

## Important Note: For now, I have readmitted in 3 columns: 0 (never), 1 (>30 days), and 2 (<30 days). Technically, '2' is the one we are looking for, so any analysis needs to convert 2 to 1 and 1 to 0. But I didn't want to remove this data yet because could it possibly be relevant in some way? 