In [84]:
import pandas as pd
import numpy as np
import math

In [85]:
#import the test and train csv files, then combine them to diabetes01 with a flaf indicating test or not:
diabetesTrain = pd.read_csv('train_multi.csv')
diabetesTest = pd.read_csv('test_multi.csv')

diabetesTrain['IsTrain']=1
diabetesTest['IsTrain']=0

In [86]:
#load the initial data file:
diabetes01 = pd.concat([diabetesTrain, diabetesTest], axis=0)
diabetes01.index = list(range(len(diabetes01)))

In [87]:
#Examine the initial data file:
diabetes01.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,insulin,glyburide.metformin,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change,diabetesMed,readmitted,IsTrain
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,...,No,No,No,No,No,No,No,No,NO,1
1,149190,55629189,Caucasian,Female,[10-20),,1,1,7,3,...,Up,No,No,No,No,No,Ch,Yes,>30,1
2,500364,82442376,Caucasian,Male,[30-40),,1,1,7,2,...,Up,No,No,No,No,No,Ch,Yes,NO,1
3,16680,42519267,Caucasian,Male,[40-50),,1,1,7,1,...,Steady,No,No,No,No,No,Ch,Yes,NO,1
4,35754,82637451,Caucasian,Male,[50-60),,2,1,2,3,...,Steady,No,No,No,No,No,No,Yes,>30,1


In [88]:
#Rewrite the 24 medication columns to make clearer that they are all comparing the same thing:

Dcolumns = list(diabetes01.columns)
for i in range(24, 47):
    Dcolumns[i] = "med_" + Dcolumns[i]
Dcolumns[48] = "med_any"

diabetes01.columns = Dcolumns

In [89]:
#age is strictly divided into decades of life. We should make this numeric for now:
diabetes01['age'].value_counts()
diabetes01['age'] = diabetes01['age'].replace(['[0-10)', '[10-20)', '[20-30)', '[30-40)', '[40-50)', '[50-60)', 
                                              '[60-70)', '[70-80)', '[80-90)', '[90-100)'], [1,2,3,4,5,6,7,8,9,10])

In [90]:
#Gender has an Unknown variable, with only three values. Let's see what these look like:
diabetes01['gender'].value_counts()

Female             54708
Male               47055
Unknown/Invalid        3
Name: gender, dtype: int64

In [91]:
#We could remove these, but to keep it from becoming complicated, let's just impute with "female" (more common) instead:

diabetes01['gender'] = diabetes01['gender'].replace(['Unknown/Invalid'], ['Female'])
diabetes01['gender'].value_counts()

Female    54711
Male      47055
Name: gender, dtype: int64

In [92]:
#97% of data on weight is missing. We can not do anything with this, particularly because it could be non-random. Remove this variable:
diabetes01 = diabetes01.drop(['weight'], axis=1)

In [93]:
#Admission types look OK, except that 5,  8 and 6 are the same. We should combine them all to 5:
diabetes01['admission_type_id'].value_counts()
diabetes01['admission_type_id'] = diabetes01['admission_type_id'].replace([8], [6])
diabetes01['admission_type_id'] = diabetes01['admission_type_id'].replace([6], [5])

In [94]:
'''These need some adjusting. 11, 19, 20, 21 mean the patient died. Clearly, readmission rates will be 0 here, and this could \
be written into an algorithm, but for now, they should certainly be rewritten as the same thing.

18, 25, and 26 are all the same thing also (unknown).'''
diabetes01['discharge_disposition_id'].value_counts()

1     60234
3     13954
6     12902
18     3691
2      2128
22     1993
11     1642
5      1184
25      989
4       815
7       623
23      412
13      399
14      372
28      139
8       108
15       63
24       48
9        21
17       14
16       11
19        8
10        6
27        5
12        3
20        2
Name: discharge_disposition_id, dtype: int64

In [95]:
#Write to csv to visualize in ggplot2, a far superior visualization tool:
#diabetes01.to_csv('diabetesmod.csv')

In [96]:
'''After visualization, and careful reading of the descriptions, I would rewrite the 30 discharge categories.'''
replacelist = ['home', 'hospital', 'nursing', 'nursing', 'hospice', 'hhealth', 'leftAMA', 'hhealth', 'hospital', 'hospital',
              'died', 'hospital', 'hospice', 'hospice', 'hospital', 'outpatient', 'outpatient', 'unknown', 'died', 'died', 
              'died', 'outpatient', 'hospital', 'nursing', 'unknown', 'unknown', 'nursing', 'psych', 'hospital', 'outpatient']

diabetes01['discharge_disposition_id'] = diabetes01['discharge_disposition_id'].replace(list(range(1,31)), replacelist)

In [97]:
diabetes01['discharge_disposition_id'].value_counts()

home          60234
nursing       14822
hhealth       13010
unknown        4680
hospital       2633
outpatient     2018
hospice        1955
died           1652
leftAMA         623
psych           139
Name: discharge_disposition_id, dtype: int64

In [98]:
#Rewrite the column as discharge disposition:
newcollist = list(diabetes01.columns)
newcollist[newcollist.index('discharge_disposition_id')]='discharge_disposition'
diabetes01.columns = newcollist

In [99]:
#There are no missing values in the "time in hospital"
print(sum(diabetes01['time_in_hospital'].isna()))
print(diabetes01['time_in_hospital'].describe())

0
count    101766.000000
mean          4.395987
std           2.985108
min           1.000000
25%           2.000000
50%           4.000000
75%           6.000000
max          14.000000
Name: time_in_hospital, dtype: float64


In [100]:
#This category is completely unwieldy, and we will have to figure out what to do with this too. It has 40% missing values:
diabetes01['medical_specialty'].value_counts()

InternalMedicine                     14635
Emergency/Trauma                      7565
Family/GeneralPractice                7440
Cardiology                            5352
Surgery-General                       3099
Nephrology                            1613
Orthopedics                           1400
Orthopedics-Reconstructive            1233
Radiologist                           1140
Pulmonology                            871
Psychiatry                             854
Urology                                685
ObstetricsandGynecology                671
Surgery-Cardiovascular/Thoracic        652
Gastroenterology                       564
Surgery-Vascular                       533
Surgery-Neuro                          468
PhysicalMedicineandRehabilitation      391
Oncology                               348
Pediatrics                             254
Hematology/Oncology                    207
Neurology                              203
Pediatrics-Endocrinology               159
Otolaryngol

In [101]:
diabetes01['medical_specialty'] = diabetes01['medical_specialty'].str.replace('-.*$', '', regex=True)

In [102]:
diabetes01['medical_specialty'] = diabetes01['medical_specialty'].replace('Hematology/Oncology', 'Oncology')
diabetes01['medical_specialty'] = diabetes01['medical_specialty'].replace('ObstetricsandGynecology', 'OBGYN')
diabetes01['medical_specialty'] = diabetes01['medical_specialty'].replace('Gynecology', 'OBGYN')
diabetes01['medical_specialty'] = diabetes01['medical_specialty'].replace('Obsterics&Gynecology', 'OBGYN')
diabetes01['medical_specialty'] = diabetes01['medical_specialty'].replace('Obsterics', 'OBGYN')
diabetes01['medical_specialty'] = diabetes01['medical_specialty'].replace('Neurophysiology', 'Neurology')
diabetes01['medical_specialty'] = diabetes01['medical_specialty'].replace('Surgeon', 'Surgery')

In [103]:
valueseries = diabetes01['medical_specialty'].value_counts().copy()

for i in range(len(valueseries)):
    if valueseries.index[i] in ['InternalMedicine', 'Emergency/Trauma', 'Family/GeneralPractice', 'Cardiology', 'Surgery',
                        'Orthopedics', 'Nephrology', 'Radiologist', 'Pulmonology', 'Psychiatry', 'OBGYN', 'Urology',
                        'Gastroenterology', 'Oncology', 'Pediatrics']:
        continue
    else:
        diabetes01['medical_specialty'] = diabetes01['medical_specialty'].replace(
            [valueseries.index[i]], ['Other'])

In [21]:
#We are left with <15 categories, which may add value to the analysis:
diabetes01['medical_specialty'].value_counts()

InternalMedicine          14635
Emergency/Trauma           7565
Family/GeneralPractice     7440
Cardiology                 5359
Surgery                    5076
Orthopedics                2633
Nephrology                 1613
Other                      1519
Radiologist                1140
Pulmonology                 871
Psychiatry                  862
OBGYN                       754
Urology                     685
Gastroenterology            564
Oncology                    555
Pediatrics                  546
Name: medical_specialty, dtype: int64

In [22]:
#Num_lab_procedures is clear
print(diabetes01['num_lab_procedures'].describe())
sum(diabetes01['num_lab_procedures'].isna())

count    101766.000000
mean         43.095641
std          19.674362
min           1.000000
25%          31.000000
50%          44.000000
75%          57.000000
max         132.000000
Name: num_lab_procedures, dtype: float64


0

In [23]:
#Num_procedures is also clear:
print(diabetes01['num_procedures'].describe())
sum(diabetes01['num_procedures'].isna())

count    101766.000000
mean          1.339730
std           1.705807
min           0.000000
25%           0.000000
50%           1.000000
75%           2.000000
max           6.000000
Name: num_procedures, dtype: float64


0

In [24]:
#Num_medications is also clear:
print(diabetes01['num_medications'].describe())
sum(diabetes01['num_medications'].isna())

count    101766.000000
mean         16.021844
std           8.127566
min           1.000000
25%          10.000000
50%          15.000000
75%          20.000000
max          81.000000
Name: num_medications, dtype: float64


0

In [25]:
#Num_outpatient is also clear:
print(diabetes01['number_outpatient'].describe())
sum(diabetes01['number_outpatient'].isna())

count    101766.000000
mean          0.369357
std           1.267265
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          42.000000
Name: number_outpatient, dtype: float64


0

In [26]:
#Num_emergency is also clear:
print(diabetes01['number_emergency'].describe())
sum(diabetes01['number_emergency'].isna())

count    101766.000000
mean          0.197836
std           0.930472
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          76.000000
Name: number_emergency, dtype: float64


0

In [27]:
#Num_inpatient is also clear:
print(diabetes01['number_inpatient'].describe())
sum(diabetes01['number_inpatient'].isna())

count    101766.000000
mean          0.635566
std           1.262863
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max          21.000000
Name: number_inpatient, dtype: float64


0

In [28]:
#Num_diagnoses is also clear:
print(diabetes01['number_diagnoses'].describe())
sum(diabetes01['number_diagnoses'].isna())

count    101766.000000
mean          7.422607
std           1.933600
min           1.000000
25%           6.000000
50%           8.000000
75%           9.000000
max          16.000000
Name: number_diagnoses, dtype: float64


0

In [29]:
'''Num_diagnoses and diag_1/diag_2/diag_3 are highly related to each other. If num_diagnoses are 3 or under, diag_3 (and
diag_2) will be empty. A "primary" diagnosis is essentially what the patient is in there for. Secondary diagnoses are other
things the patient has. I would recommend combining diag_1 thru diag_3 together'''

'''But after viewing other scripts from team members, we conclude that we should keep diag_1 (primary) as well'''

'But after viewing other scripts from team members, we conclude that we should keep diag_1 (primary) as well'

In [30]:
#Max glucose in serum is present, though many points were not measured. We should rewrite these as "NotTaken"
diabetes01['max_glu_serum'].value_counts()
diabetes01['max_glu_serum'] = diabetes01['max_glu_serum'].replace(['None'], ['NotTaken'])

In [31]:
#Same story for A1C measurement
diabetes01['A1Cresult'].value_counts()
diabetes01['A1Cresult'] = diabetes01['A1Cresult'].replace(['None'], ['NotTaken'])

In [32]:
#Write a function to rewrite the disease codes according to a modified version of the publication:
def convertdiseases(min, max, newname):
    d1 = diabetes01['diag_1'].tolist()
    d2 = diabetes01['diag_2'].tolist()
    d3 = diabetes01['diag_3'].tolist()
    
    for i in range(len(d1)):
        try:
            if float(d1[i]) >= min and float(d1[i]) < max:
                d1[i] = newname
        except:
            pass
        try:
            if float(d2[i]) >= min and float(d2[i]) < max:
                d2[i] = newname
        except:
            pass
        try:
            if float(d3[i]) >= min and float(d3[i]) < max:
                d3[i] = newname
        except:
            pass
    
    diabetes01['diag_1'] = pd.Series(d1)
    diabetes01['diag_2'] = pd.Series(d2)
    diabetes01['diag_3'] = pd.Series(d3)


In [33]:
convertdiseases(340, 459, 'circulatory')

In [34]:
#That worked pretty well. Let's do this for all additional values and combinations:
diabetes01['diag_1'].value_counts()

circulatory    31074
786             4016
486             3508
491             2275
715             2151
682             2042
780             2019
996             1967
276             1889
38              1688
250.8           1680
599             1595
584             1520
V57             1207
250.6           1183
518             1115
820             1082
577             1057
493             1056
562              989
574              965
296              896
560              876
250.7            871
250.13           851
998              784
722              771
250.02           675
578              663
250.11           625
               ...  
V51                1
957                1
84                 1
V67                1
57                 1
870                1
911                1
690                1
691                1
955                1
976                1
143                1
314                1
988                1
219                1
906                1
994          

In [35]:
convertdiseases(785, 786, 'circulatory')
convertdiseases(745, 748, 'circulatory')
convertdiseases(459, 460, 'circulatory')
convertdiseases(460, 520, 'respiratory')
convertdiseases(786, 787, 'respiratory')
convertdiseases(748, 749, 'respiratory')
convertdiseases(520, 580, 'digestive')
convertdiseases(787, 788, 'digestive')
convertdiseases(749, 752, 'digestive')
convertdiseases(800, 1000, 'injury')
convertdiseases(710, 740, 'musculoskeletal')
convertdiseases(754, 757, 'musculoskeletal')
convertdiseases(580, 630, 'urogenital')
convertdiseases(788, 789, 'urogenital')
convertdiseases(752, 754, 'urogenital')
convertdiseases(140, 240, 'neoplasm')
convertdiseases(1, 140, 'infection')
convertdiseases(290, 320, 'mentaldis')
convertdiseases(280, 290, 'blooddis')
convertdiseases(320, 360, 'nervous')
convertdiseases(360, 390, 'nervous')
convertdiseases(740, 743, 'nervous')
convertdiseases(630, 680, 'pregnancy')
convertdiseases(780, 782, 'other')
convertdiseases(784, 785, 'other')
convertdiseases(790, 800, 'other')
convertdiseases(743, 745, 'other')
convertdiseases(757, 760, 'other')

In [36]:
'''This successfully converted our targets. Now we still have things to convert. All patients have diabetes, so the 250
classifications are not important insofar as they diagnose diabetes. We can, however, glean addition diabetic info from the 
decimal codes, where they exist. They would need to go to their own categories, however.'''

'''First, lets get rid of the EV codes, which the publication refers to as injuries or additional diagnosic information'''
diabetes01['diag_1'].value_counts()

circulatory        31293
respiratory        14423
digestive           9480
injury              6974
urogenital          5122
musculoskeletal     4972
neoplasm            3433
infection           2768
other               2544
mentaldis           2262
682                 2042
276                 1889
250.8               1680
V57                 1207
250.6               1183
blooddis            1103
250.7                871
250.13               851
pregnancy            687
250.02               675
250.11               625
789                  561
250.12               417
250.82               412
278                  379
nervous              376
250.1                313
250.4                267
707                  257
250                  235
                   ...  
250.91                 4
250.52                 4
245                    4
250.53                 4
706                    3
271                    3
266                    3
262                    3
240                    3


In [37]:
diabetes01['diag_1'] = diabetes01['diag_1'].replace('V[0-9]+', 'injury', regex=True)
diabetes01['diag_1'] = diabetes01['diag_1'].replace('E[0-9]+', 'injury', regex=True)
diabetes01['diag_2'] = diabetes01['diag_2'].replace('V[0-9]+', 'injury', regex=True)
diabetes01['diag_2'] = diabetes01['diag_2'].replace('E[0-9]+', 'injury', regex=True)
diabetes01['diag_3'] = diabetes01['diag_3'].replace('V[0-9]+', 'injury', regex=True)
diabetes01['diag_3'] = diabetes01['diag_3'].replace('E[0-9]+', 'injury', regex=True)

In [38]:
#This is looking better, but apprently, some ICD-9 codes weren't covered in our initial conversion:
diabetes01['diag_1'].value_counts()

circulatory        31293
respiratory        14423
digestive           9480
injury              8619
urogenital          5122
musculoskeletal     4972
neoplasm            3433
infection           2768
other               2544
mentaldis           2262
682                 2042
276                 1889
250.8               1680
250.6               1183
blooddis            1103
250.7                871
250.13               851
pregnancy            687
250.02               675
250.11               625
789                  561
250.12               417
250.82               412
278                  379
nervous              376
250.1                313
250.4                267
707                  257
250                  235
250.03               201
                   ...  
709                    8
250.5                  8
250.9                  7
272                    6
694                    6
273                    6
696                    6
685                    6
692                    5


In [39]:
convertdiseases(240, 250, 'metabolic')
convertdiseases(251, 280, 'metabolic')
convertdiseases(680, 710, 'skin')
convertdiseases(782, 783, 'skin')

In [40]:
#All that is missing now is 783, 789
diabetes01['diag_1'].value_counts()

circulatory        31293
respiratory        14423
digestive           9480
injury              8619
urogenital          5122
musculoskeletal     4972
neoplasm            3433
infection           2768
metabolic           2702
skin                2614
other               2544
mentaldis           2262
250.8               1680
250.6               1183
blooddis            1103
250.7                871
250.13               851
pregnancy            687
250.02               675
250.11               625
789                  561
250.12               417
250.82               412
nervous              376
250.1                313
250.4                267
250                  235
250.03               201
250.81               186
250.22               156
250.2                114
250.83                95
250.41                91
250.42                90
250.01                61
250.92                52
250.23                29
783                   29
250.43                25
250.3                 20


In [41]:
convertdiseases(789, 790, 'other')
convertdiseases(783, 784, 'metabolic')

In [42]:
'''We now have all values, other than ? and the diabetes codes, as a category. Let's create a new column with additional 
diabetes info (from the 250 codes), then revert all 250's and ? to 'NoDisease', a value which will go away after diag1, 2, 3
combination.'''

"We now have all values, other than ? and the diabetes codes, as a category. Let's create a new column with additional \ndiabetes info (from the 250 codes), then revert all 250's and ? to 'NoDisease', a value which will go away after diag1, 2, 3\ncombination."

In [43]:
#We can create 4 rows (eventually condensed to 2): diabetes_feature (1-3) and Type (1 or 2):

#Write a function to migrate data from diabetes codes to new columns, then revert them to "NoDisease":
'''This function returns a modified version of our DF, and creates four rows. diabfeature1, 2, 3 which are extra diabetic 
features of the patient, and diabtype which is a report of either Type1, Type2, or (typically) unknown for the patient, 
based on ICD code'''
def convertdiabetescodes(df):
    df2 = df.copy()
    d1 = list(df2['diag_1'])
    d2 = list(df2['diag_2'])
    d3 = list(df2['diag_3'])
    
    f1 = []
    f2 = []
    f3 = []
    diabtype1 = []
    diabtype2 = []
    diabtype3 = []
    
    for i in range(len(d1)):
        try:
            
            if 100*float(d1[i]) % 10 == 1 or 100*float(d1[i]) % 10 == 3:
                diabtype1.append('Type1')
            elif 100*float(d1[i]) % 10 == 2:
                diabtype1.append('Type2')
            elif 100*float(d1[i]) % 10 == 0:
                diabtype1.append('Unknown')
            else:
                diabtype1.append('Unknown')
                
            if 100*float(d1[i]) % 100 >= 10 and 100*float(d1[i]) % 100 < 20:
                f1.append('ketoacidosis')
            elif 100*float(d1[i]) % 100 >= 20 and 100*float(d1[i]) % 100 < 30:
                f1.append('hyperosmolarity')
            elif 100*float(d1[i]) % 100 >= 30 and 100*float(d1[i]) % 100 < 40:
                f1.append('coma')
            elif 100*float(d1[i]) % 100 >= 40 and 100*float(d1[i]) % 100 < 50:
                f1.append('renal')
            elif 100*float(d1[i]) % 100 >= 50 and 100*float(d1[i]) % 100 < 60:
                f1.append('ophthalmic')
            elif 100*float(d1[i]) % 100 >= 60 and 100*float(d1[i]) % 100 < 70:
                f1.append('neurologic')
            elif 100*float(d1[i]) % 100 >= 70 and 100*float(d1[i]) % 100 < 80:
                f1.append('circulatory')
            elif 100*float(d1[i]) % 100 >= 80:
                f1.append('other')
            elif 100*float(d1[i]) % 100 >=0 and 100*float(d1[i]) % 100 < 10:
                f1.append('No')
            else:
                f1.append('No')
                
        except:
            diabtype1.append('No')
            f1.append('No')
            
        try:
            if 100*float(d2[i]) % 10 == 1 or 100*float(d2[i]) % 10 == 3:
                diabtype2.append('Type1')
            elif 100*float(d2[i]) % 10 == 2:
                diabtype2.append('Type2')
            elif 100*float(d2[i]) % 10 == 0:
                diabtype2.append('Unknown')
            else:
                diabtype2.append('Unknown')
                
            if 100*float(d2[i]) % 100 >= 10 and 100*float(d2[i]) % 100 < 20:
                f2.append('ketoacidosis')
            elif 100*float(d2[i]) % 100 >= 20 and 100*float(d2[i]) % 100 < 30:
                f2.append('hyperosmolarity')
            elif 100*float(d2[i]) % 100 >= 30 and 100*float(d2[i]) % 100 < 40:
                f2.append('coma')
            elif 100*float(d2[i]) % 100 >= 40 and 100*float(d2[i]) % 100 < 50:
                f2.append('renal')
            elif 100*float(d2[i]) % 100 >= 50 and 100*float(d2[i]) % 100 < 60:
                f2.append('ophthalmic')
            elif 100*float(d2[i]) % 100 >= 60 and 100*float(d2[i]) % 100 < 70:
                f2.append('neurologic')
            elif 100*float(d2[i]) % 100 >= 70 and 100*float(d2[i]) % 100 < 80:
                f2.append('circulatory')
            elif 100*float(d2[i]) % 100 >= 80:
                f2.append('other')
            elif 100*float(d2[i]) % 100 >=0 and 100*float(d2[i]) % 100 < 10:
                f2.append('No')
            else:
                f2.append('No')
                
        except:
            diabtype2.append('No')
            f2.append('No')
            
        try:
            if 100*float(d3[i]) % 10 == 1 or 100*float(d3[i]) % 10 == 3:
                diabtype3.append('Type1')
            elif 100*float(d3[i]) % 10 == 2:
                diabtype3.append('Type2')
            elif 100*float(d3[i]) % 10 == 0:
                diabtype3.append('Unknown')
            else:
                diabtype3.append('Unknown')
                
            if 100*float(d3[i]) % 100 >= 10 and 100*float(d3[i]) % 100 < 20:
                f3.append('ketoacidosis')
            elif 100*float(d3[i]) % 100 >= 20 and 100*float(d3[i]) % 100 < 30:
                f3.append('hyperosmolarity')
            elif 100*float(d3[i]) % 100 >= 30 and 100*float(d3[i]) % 100 < 40:
                f3.append('coma')
            elif 100*float(d3[i]) % 100 >= 40 and 100*float(d3[i]) % 100 < 50:
                f3.append('renal')
            elif 100*float(d3[i]) % 100 >= 50 and 100*float(d3[i]) % 100 < 60:
                f3.append('ophthalmic')
            elif 100*float(d3[i]) % 100 >= 60 and 100*float(d3[i]) % 100 < 70:
                f3.append('neurologic')
            elif 100*float(d3[i]) % 100 >= 70 and 100*float(d3[i]) % 100 < 80:
                f3.append('circulatory')
            elif 100*float(d3[i]) % 100 >= 80:
                f3.append('other')
            elif 100*float(d3[i]) % 100 >=0 and 100*float(d3[i]) % 100 < 10:
                f3.append('No')
            else:
                f3.append('No')
                
        except:
            diabtype3.append('No')
            f3.append('No')
    
    finaltype = []
    
    for i in range(len(diabtype1)):
        if diabtype1 == 'Type1' or diabtype2 == 'Type1' or diabtype3 == 'Type1':
            finaltype.append('Type1')
        elif diabtype1 == 'Type2' or diabtype2 == 'Type2' or diabtype3 == 'Type2':
            finaltype.append('Type2')
        else:
            finaltype.append('Unknown')
   
    df2['diabfeature1'] = f1
    df2['diabfeature2'] = f2
    df2['diabfeature3'] = f3
    df2['diabtype'] = finaltype
    
    return df2

In [44]:
#We can create 4 rows (eventually condensed to 2): diabetes_feature (1-3) and Type (1 or 2):

#Write a function to migrate data from diabetes codes to new columns, then revert them to "NoDisease":
'''This function returns a modified version of our DF, and creates four rows. diabfeature1, 2, 3 which are extra diabetic 
features of the patient, and diabtype which is a report of either Type1, Type2, or (typically) unknown for the patient, 
based on ICD code'''
def convertdiabetescodes(df):
    df2 = df.copy()
    d1 = list(df2['diag_1'])
    d2 = list(df2['diag_2'])
    d3 = list(df2['diag_3'])
    
    f1 = []
    f2 = []
    f3 = []
    diabtype1 = []
    diabtype2 = []
    diabtype3 = []
    
    for i in range(len(d1)):
        try:
            
            if int(100*float(d1[i])) % 10 == 1 or int(100*float(d1[i])) % 10 == 3:
                diabtype1.append('Type1')
            elif int(100*float(d1[i])) % 10 == 2:
                diabtype1.append('Type2')
            elif int(100*float(d1[i])) % 10 == 0:
                diabtype1.append('Unknown')
            else:
                diabtype1.append('Unknown')
                
            if int(100*float(d1[i])) % 100 >= 10 and int(100*float(d1[i])) % 100 < 20:
                f1.append('ketoacidosis')
            elif int(100*float(d1[i])) % 100 >= 20 and int(100*float(d1[i])) % 100 < 30:
                f1.append('hyperosmolarity')
            elif int(100*float(d1[i])) % 100 >= 30 and int(100*float(d1[i])) % 100 < 40:
                f1.append('coma')
            elif int(100*float(d1[i])) % 100 >= 40 and int(100*float(d1[i])) % 100 < 50:
                f1.append('renal')
            elif int(100*float(d1[i])) % 100 >= 50 and int(100*float(d1[i])) % 100 < 60:
                f1.append('ophthalmic')
            elif int(100*float(d1[i])) % 100 >= 60 and int(100*float(d1[i])) % 100 < 70:
                f1.append('neurologic')
            elif int(100*float(d1[i])) % 100 >= 70 and int(100*float(d1[i])) % 100 < 80:
                f1.append('circulatory')
            elif int(100*float(d1[i])) % 100 >= 80:
                f1.append('other')
            elif int(100*float(d1[i])) % 100 >=0 and int(100*float(d1[i])) % 100 < 10:
                f1.append('No')
            else:
                f1.append('No')
                
        except:
            diabtype1.append('No')
            f1.append('No')
            
        try:
            if int(100*float(d2[i])) % 10 == 1 or int(100*float(d2[i])) % 10 == 3:
                diabtype2.append('Type1')
            elif int(100*float(d2[i])) % 10 == 2:
                diabtype2.append('Type2')
            elif int(100*float(d2[i])) % 10 == 0:
                diabtype2.append('Unknown')
            else:
                diabtype2.append('Unknown')
                
            if int(100*float(d2[i])) % 100 >= 10 and int(100*float(d2[i])) % 100 < 20:
                f2.append('ketoacidosis')
            elif int(100*float(d2[i])) % 100 >= 20 and int(100*float(d2[i])) % 100 < 30:
                f2.append('hyperosmolarity')
            elif int(100*float(d2[i])) % 100 >= 30 and int(100*float(d2[i])) % 100 < 40:
                f2.append('coma')
            elif int(100*float(d2[i])) % 100 >= 40 and int(100*float(d2[i])) % 100 < 50:
                f2.append('renal')
            elif int(100*float(d2[i])) % 100 >= 50 and int(100*float(d2[i])) % 100 < 60:
                f2.append('ophthalmic')
            elif int(100*float(d2[i])) % 100 >= 60 and int(100*float(d2[i])) % 100 < 70:
                f2.append('neurologic')
            elif int(100*float(d2[i])) % 100 >= 70 and int(100*float(d2[i])) % 100 < 80:
                f2.append('circulatory')
            elif int(100*float(d2[i])) % 100 >= 80:
                f2.append('other')
            elif int(100*float(d2[i])) % 100 >=0 and int(100*float(d2[i])) % 100 < 10:
                f2.append('No')
            else:
                f2.append('No')
                
        except:
            diabtype2.append('No')
            f2.append('No')
            
        try:
            if int(100*float(d3[i])) % 10 == 1 or int(100*float(d3[i])) % 10 == 3:
                diabtype3.append('Type1')
            elif int(100*float(d3[i])) % 10 == 2:
                diabtype3.append('Type2')
            elif int(100*float(d3[i])) % 10 == 0:
                diabtype3.append('Unknown')
            else:
                diabtype3.append('Unknown')
                
            if int(100*float(d3[i])) % 100 >= 10 and int(100*float(d3[i])) % 100 < 20:
                f3.append('ketoacidosis')
            elif int(100*float(d3[i])) % 100 >= 20 and int(100*float(d3[i])) % 100 < 30:
                f3.append('hyperosmolarity')
            elif int(100*float(d3[i])) % 100 >= 30 and int(100*float(d3[i])) % 100 < 40:
                f3.append('coma')
            elif int(100*float(d3[i])) % 100 >= 40 and int(100*float(d3[i])) % 100 < 50:
                f3.append('renal')
            elif int(100*float(d3[i])) % 100 >= 50 and int(100*float(d3[i])) % 100 < 60:
                f3.append('ophthalmic')
            elif int(100*float(d3[i])) % 100 >= 60 and int(100*float(d3[i])) % 100 < 70:
                f3.append('neurologic')
            elif int(100*float(d3[i])) % 100 >= 70 and int(100*float(d3[i])) % 100 < 80:
                f3.append('circulatory')
            elif int(100*float(d3[i])) % 100 >= 80:
                f3.append('other')
            elif int(100*float(d3[i])) % 100 >=0 and int(100*float(d3[i])) % 100 < 10:
                f3.append('No')
            else:
                f3.append('No')
                
        except:
            diabtype3.append('No')
            f3.append('No')
    
    finaltype = []
    
    for i in range(len(diabtype1)):
        if diabtype1[i] == 'Type1' or diabtype2[i] == 'Type1' or diabtype3[i] == 'Type1':
            finaltype.append('Type1')
        elif diabtype1[i] == 'Type2' or diabtype2[i] == 'Type2' or diabtype3[i] == 'Type2':
            finaltype.append('Type2')
        else:
            finaltype.append('Unknown')
   
    df2['diabfeature1'] = f1
    df2['diabfeature2'] = f2
    df2['diabfeature3'] = f3
    df2['diabtype'] = finaltype
    
    return df2

In [45]:
#Now, let's update diabetes01 to diabetes02 with these extra columns:
diabetes02 = convertdiabetescodes(diabetes01)

In [46]:
#Now, we can see that we have Type1/2 information for some of the patients.
diabetes02['diabtype'].value_counts()

Unknown    89142
Type2       6418
Type1       6206
Name: diabtype, dtype: int64

In [47]:
#We can also see supplementary diabeteic features for these patients based on the code presented:
diabetes02['diabfeature1'].value_counts()

No                 94181
other               2453
ketoacidosis        2206
neurologic          1183
circulatory          871
renal                473
hyperosmolarity      311
coma                  71
ophthalmic            17
Name: diabfeature1, dtype: int64

In [48]:
#Save file and reopen:

#diabetes02.to_csv('diabetes02.csv')
#diabetes02 = pd.read_csv('diabetes02.csv', index_col=0)

In [49]:
#Let's see if we can impute additional columns for Type1/Type2 based on what we have here
diabetes00 = pd.read_csv('diabetic_data.csv')

In [50]:
diabetes00['diabtype'] = diabetes02['diabtype']

In [51]:
'''A visual inspection of some random samples of this data is discouraging, and shows differening type status (1 vs 2) for the same patient in different encounters. Based on this fact, and the low prevalence of this information to begin with, we should not include diabtype in our analysis or try to impute it within the same patient.'''
diabetes00[diabetes00['diabtype']!='Unknown'].sort_values('patient_nbr')[['patient_nbr', 'diabtype']].head(100)
diabetes00[diabetes00['diabtype']!='Unknown'].sort_values('patient_nbr')[['patient_nbr', 'diabtype']].iloc[300:400,]

Unnamed: 0,patient_nbr,diabtype
8637,437886,Type1
5189,438615,Type2
2967,439452,Type2
14203,439884,Type2
7110,439884,Type2
23097,440136,Type2
12448,440271,Type2
4685,440271,Type1
15932,443358,Type1
7746,444357,Type1


In [52]:
#Drop diabtype from this dataframe:
diabetes02 = diabetes02.drop('diabtype', axis=1)

In [53]:
'''We also need to get rid of diabetes from the diag list (we know these patients have diabetes.) Anything still numeric (or ?)
in the diag_ features should be changed to "Nothing"'''

diabetes02['diag_1'] = diabetes02['diag_1'].replace('?', 'Nothing').astype('str')
diabetes02['diag_2'] = diabetes02['diag_2'].replace('?', 'Nothing').astype('str')
diabetes02['diag_3'] = diabetes02['diag_3'].replace('?', 'Nothing').astype('str')

d1 = list(diabetes02['diag_1'])
for i in range(len(d1)):
    try:
        pd.to_numeric(d1[i])
        d1[i] = 'Nothing'
    except:
        pass
diabetes02['diag_1'] = pd.Series(d1)
        
d1 = list(diabetes02['diag_2'])
for i in range(len(d1)):
    try:
        pd.to_numeric(d1[i])
        d1[i] = 'Nothing'
    except:
        pass
diabetes02['diag_2'] = pd.Series(d1)

d1 = list(diabetes02['diag_3'])
for i in range(len(d1)):
    try:
        pd.to_numeric(d1[i])
        d1[i] = 'Nothing'
    except:
        pass
diabetes02['diag_3'] = pd.Series(d1)
        

In [54]:
#Now, we have our data frame reduced to all string representations of diseases, or "Nothing." We can now combine diag 1-3:
diabetes02['diag_3'].value_counts()
#We also had nan values (other mislabeled strings that meant no disease). Let's change these to Nothing.

circulatory        31577
Nothing            17157
metabolic           9208
respiratory         7358
injury              7004
urogenital          6700
digestive           3936
mentaldis           3136
other               2675
skin                2607
blooddis            2490
musculoskeletal     1923
infection           1861
neoplasm            1856
nan                 1423
nervous              546
pregnancy            309
Name: diag_3, dtype: int64

In [55]:
diabetes02['diag_1'] = diabetes02['diag_1'].replace(['nan'], ['Nothing'])
diabetes02['diag_2'] = diabetes02['diag_2'].replace(['nan'], ['Nothing'])
diabetes02['diag_3'] = diabetes02['diag_3'].replace(['nan'], ['Nothing'])

In [56]:
#We want to combine data for the 3 diag types, but ALSO keep diag_1 as the primary as well. So let's make a copy of it then dummify that copy separately:
diabetes02['primarydiag'] = diabetes02['diag_1'].copy()

In [57]:
#Fill in the Nan values from medical_specialty as unknown. Delete payer code:
diabetes02['medical_specialty'][diabetes02['medical_specialty'].isna()] = 'Unknown'
diabetes02 = diabetes02.drop('payer_code', axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [58]:
diabetes02.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition,admission_source_id,time_in_hospital,medical_specialty,...,med_metformin.rosiglitazone,med_metformin.pioglitazone,change,med_any,readmitted,IsTrain,diabfeature1,diabfeature2,diabfeature3,primarydiag
0,2278392,8222157,Caucasian,Female,1,5,unknown,1,1,Pediatrics,...,No,No,No,No,NO,1,other,No,No,Nothing
1,149190,55629189,Caucasian,Female,2,1,home,7,3,Unknown,...,No,No,Ch,Yes,>30,1,No,No,No,metabolic
2,500364,82442376,Caucasian,Male,4,1,home,7,2,Unknown,...,No,No,Ch,Yes,NO,1,No,renal,No,infection
3,16680,42519267,Caucasian,Male,5,1,home,7,1,Unknown,...,No,No,Ch,Yes,NO,1,No,No,No,neoplasm
4,35754,82637451,Caucasian,Male,6,2,home,2,3,Unknown,...,No,No,No,Yes,>30,1,No,No,No,circulatory


In [59]:
#Now, we need to adjust the values in referral. Every value above 7 (8 and above) are extremely small or missing. We should rewrite all things 8 and above as just 8:
diabetes02['admission_source_id'] = diabetes02['admission_source_id'].clip(upper=8)

In [60]:
#Now it is maxed out at 8; all values from 9 to 25 are converted to 8:
diabetes02['admission_source_id'].describe()

count    101766.000000
mean          5.132156
std           2.770942
min           1.000000
25%           1.000000
50%           7.000000
75%           7.000000
max           8.000000
Name: admission_source_id, dtype: float64

In [61]:
#Let's combine diagnosis (diag_1, diag_2, diag_3) into dummy variables and add them. We will do this for diag and diabfeature:

#This is a function to do this
def createcombineddummies(df, c1, c2, c3=None, c4=None, c5=None, prefix=''):
    
    '''This would need to be modified if you have values not in every Series (we don't have that problem here)'''
    
    collist = [c1, c2]
    
    if c3 is not None:
        collist.append(c3)
    if c4 is not None:
        collist.append(c4)
    if c5 is not None:
        collist.append(c5)
        
    for i in range(len(collist)):
        
        if i == 0:
            tempDF = pd.get_dummies(df[collist[i]], prefix=prefix)
        
        if i > 0:
            tempDF1 = pd.get_dummies(df[collist[i]], prefix=prefix)
            tempDF = tempDF + tempDF1
        
    #Do we need this code? It's probably useful. Reduces everything to 1 (if a patien has 2 resporatory conditions, for example)
    tempDF = tempDF.clip(upper=1)
    tempDF = tempDF.drop(prefix + '_Nothing', axis=1)
    
    return tempDF

In [62]:
#Let's try creating this dummy DF and see how it looks:
diagDummy = createcombineddummies(diabetes02, 'diag_1', 'diag_2', 'diag_3', prefix='diag')

In [63]:
#This looks good. Let's combine it with diabetes02 and remove diag_1-3
diagDummy.head(10)

Unnamed: 0,diag_blooddis,diag_circulatory,diag_digestive,diag_infection,diag_injury,diag_mentaldis,diag_metabolic,diag_musculoskeletal,diag_neoplasm,diag_nervous,diag_other,diag_pregnancy,diag_respiratory,diag_skin,diag_urogenital
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
7,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0
8,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
9,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [64]:
# Drop diag_1-3 and add diagDummy to DF
diabetes03 = diabetes02.drop(['diag_1', 'diag_2', 'diag_3'], axis=1)
diabetes03 = pd.concat([diabetes03, diagDummy], axis=1)

In [65]:
diabetes03.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition,admission_source_id,time_in_hospital,medical_specialty,...,diag_mentaldis,diag_metabolic,diag_musculoskeletal,diag_neoplasm,diag_nervous,diag_other,diag_pregnancy,diag_respiratory,diag_skin,diag_urogenital
0,2278392,8222157,Caucasian,Female,1,5,unknown,1,1,Pediatrics,...,0,0,0,0,0,0,0,0,0,0
1,149190,55629189,Caucasian,Female,2,1,home,7,3,Unknown,...,0,1,0,0,0,0,0,0,0,0
2,500364,82442376,Caucasian,Male,4,1,home,7,2,Unknown,...,0,0,0,0,0,0,0,0,0,0
3,16680,42519267,Caucasian,Male,5,1,home,7,1,Unknown,...,0,0,0,1,0,0,0,0,0,0
4,35754,82637451,Caucasian,Male,6,2,home,2,3,Unknown,...,0,0,0,0,0,0,0,0,0,0


In [66]:
#We need to modify this function (Annoying) because we have a different negative value for these columns:

#This is a function to do this
def createcombineddummiesF(df, c1, c2, c3=None, c4=None, c5=None, prefix=''):
    
    '''This would need to be modified if you have values not in every Series (we don't have that problem here)'''
    
    collist = [c1, c2]
    
    if c3 is not None:
        collist.append(c3)
    if c4 is not None:
        collist.append(c4)
    if c5 is not None:
        collist.append(c5)
        
    for i in range(len(collist)):
        
        if i == 0:
            tempDF = pd.get_dummies(df[collist[i]], prefix=prefix)
        
        if i > 0:
            tempDF1 = pd.get_dummies(df[collist[i]], prefix=prefix)
            tempDF = tempDF + tempDF1
        
    #Do we need this code? It's probably useful. Reduces everything to 1 (if a patien has 2 resporatory conditions, for example)
    tempDF = tempDF.clip(upper=1)
    tempDF = tempDF.drop(prefix + '_No', axis=1)
    
    return tempDF

In [67]:
#Now do a similar thing for diagfeature:
featureDummy = createcombineddummiesF(diabetes02, 'diabfeature1', 'diabfeature2', 'diabfeature3', prefix='diabfeat')

In [68]:
#Out of curiosity, let's see how many values are in each columns:
np.sum(featureDummy)

diabfeat_circulatory        1142
diabfeat_coma                 86
diabfeat_hyperosmolarity     396
diabfeat_ketoacidosis       2589
diabfeat_neurologic         3158
diabfeat_ophthalmic          577
diabfeat_other              3883
diabfeat_renal              1828
dtype: int64

In [69]:
'''This is a decent amount of information which could possibly help with predicting readmission rates (not sure about coma)'''

'This is a decent amount of information which could possibly help with predicting readmission rates (not sure about coma)'

In [70]:
diabetes03 = diabetes03.drop(['diabfeature1', 'diabfeature2', 'diabfeature3'], axis=1)
diabetes03 = pd.concat([diabetes03, featureDummy], axis=1)

In [71]:
diabetes03.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition,admission_source_id,time_in_hospital,medical_specialty,...,diag_skin,diag_urogenital,diabfeat_circulatory,diabfeat_coma,diabfeat_hyperosmolarity,diabfeat_ketoacidosis,diabfeat_neurologic,diabfeat_ophthalmic,diabfeat_other,diabfeat_renal
0,2278392,8222157,Caucasian,Female,1,5,unknown,1,1,Pediatrics,...,0,0,0,0,0,0,0,0,1,0
1,149190,55629189,Caucasian,Female,2,1,home,7,3,Unknown,...,0,0,0,0,0,0,0,0,0,0
2,500364,82442376,Caucasian,Male,4,1,home,7,2,Unknown,...,0,0,0,0,0,0,0,0,0,1
3,16680,42519267,Caucasian,Male,5,1,home,7,1,Unknown,...,0,0,0,0,0,0,0,0,0,0
4,35754,82637451,Caucasian,Male,6,2,home,2,3,Unknown,...,0,0,0,0,0,0,0,0,0,0


In [72]:
#At this point, we can fork the data into two DFs: One (AllDummy) will dummify all columns, including the medicine columns
#Another one, OrdMed, will make the medicinal columns ordinal:
#Not taking = 0, down=0.5, steady=1, up=1.5. Not sure where either of these models would work better

#A few last things to do are convert the ? in 'race' to Unknown:
diabetes03['race'][diabetes03['race'].isna()==True] = 'Unknown'

#And for now, we should probably save the outcome (readmitted) as a un-dummified column with 3 digitized outputs:
#No=0, >30=1, <30=2 (this is completely subject to change)
diabetes03['readmitted'] = diabetes03['readmitted'].replace(['NO', '>30', '<30'], ['No', 'No', 'Yes'])

In [73]:
#One other thing: We should check value counts for all the med columns (some are VERY sparse):
for i in range(18, 41):
    colname = list(diabetes03.columns)[i]
    print(diabetes03[colname].value_counts())

NotTaken    84748
>8           8216
Norm         4990
>7           3812
Name: A1Cresult, dtype: int64
No        81778
Steady    18346
Up         1067
Down        575
Name: med_metformin, dtype: int64
No        100227
Steady      1384
Up           110
Down          45
Name: med_repaglinide, dtype: int64
No        101063
Steady       668
Up            24
Down          11
Name: med_nateglinide, dtype: int64
No        101680
Steady        79
Up             6
Down           1
Name: med_chlorpropamide, dtype: int64
No        96575
Steady     4670
Up          327
Down        194
Name: med_glimepiride, dtype: int64
No        101765
Steady         1
Name: med_acetohexamide, dtype: int64
No        89080
Steady    11356
Up          770
Down        560
Name: med_glipizide, dtype: int64
No        91116
Steady     9274
Up          812
Down        564
Name: med_glyburide, dtype: int64
No        101743
Steady        23
Name: med_tolbutamide, dtype: int64
No        94438
Steady     6976
Up          234

## This is to produce a Shiny dataset for the group. So we are leaving all medicines and keeping everything as it originally was in terms of medicine

In [74]:
#Create three DF's, each corresponding to one of the criteria listed above:
DiabetesAllDummy = diabetes03.copy()
DiabetesAnyChange = diabetes03.copy()

In [75]:
diabetes03.columns[42]

'change'

In [76]:
#Modify AnyChange to show whather any diabetic medicine was changed:
MedColumns = list(diabetes03.columns[19:42])

for med in MedColumns:
    DiabetesAnyChange[med] = DiabetesAnyChange[med].replace(['No', 'Steady', 'Down', 'Up'], [0, 0, 1, 1])

In [77]:
#Next, create a new feature, "diabchange", which says if ANY diabeteic medication was changed at all:
DiabetesAnyChange['diabchange'] = 0
for med in MedColumns:
    DiabetesAnyChange['diabchange'] += DiabetesAnyChange[med]
DiabetesAnyChange['diabchange'] = DiabetesAnyChange['diabchange'].clip(upper=1)

In [78]:
#Now, import this column over to Diabetes AllDummy:
DiabetesAllDummy['diabchange'] = DiabetesAnyChange['diabchange']

In [79]:
#Finally, remove columns where the patients died
DiabetesAllDummy = DiabetesAllDummy[DiabetesAllDummy['discharge_disposition']!='died']

DiabetesAllDummy.index = list(range(len(DiabetesAllDummy)))

In [83]:
np.sum(DiabetesAllDummy.isna()).sort_values(ascending=False)

race                            2239
diabchange                         0
med_glipizide                      0
med_metformin                      0
med_repaglinide                    0
med_nateglinide                    0
med_chlorpropamide                 0
med_glimepiride                    0
med_acetohexamide                  0
med_glyburide                      0
max_glu_serum                      0
med_tolbutamide                    0
med_pioglitazone                   0
med_rosiglitazone                  0
med_acarbose                       0
med_miglitol                       0
med_troglitazone                   0
A1Cresult                          0
number_diagnoses                   0
med_examide                        0
number_inpatient                   0
number_emergency                   0
number_outpatient                  0
num_medications                    0
num_procedures                     0
num_lab_procedures                 0
medical_specialty                  0
t

In [334]:
#Write these two DF's to CSV and use for further analysis:
DiabetesAllDummy.to_csv('DiabetesTrainTestForShiny.csv')