In [1]:
import numpy as np
import pandas as pd
import random


In [2]:
df = pd.read_csv("diabetic_data.csv")
print(list(df))

['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'payer_code', 'medical_specialty', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted']


In [3]:
# The encounter_id and patient_nbr are used for identification and are not relevant to whether a patient will return.
# The weight and payer code attributes are dropped because of their high percentage of missing values.
# This means that neither of them are relevant to the readmission rate.
# We can find more information about this in the research paper.
df = df.drop(columns=['encounter_id', 'patient_nbr', 'weight', 'payer_code'])

In [None]:
# We remove all columns with patients that expired (died) as we know they cannot be readmitted.
# On a side note, theoretically if we had enough data and other features, we could predict whether or not a patient would die
# within the thirty days and use that to predict that that patient will not be readmitted.


In each of the following cells, we take a nominal (categorical) column and do the following:
- Dummify the column
- Remove any columns describing the missingness of that feature
- Drop the most common value of that feature if the missingness is not too deep
- Add the dummy columns back to the original dataframe and drop the original non-dummified column

In [4]:
races = pd.get_dummies(df['race'])
print(df['race'].value_counts())

# The most common race of the patients is Caucasian, so we drop that column.
# The missingness is sparse enough that we can drop it without losing very much information.
# We also drop the Asian column because it so uncommon that it will not factor into our model prediction if we included it.
races = races.drop(columns = ['?', 'Caucasian', 'Asian'])

# We add the the dummy columns back to the original dataframe
df = pd.concat([df, races], axis=1)
df = df.drop(columns = ['race'])

Caucasian          76099
AfricanAmerican    19210
?                   2273
Hispanic            2037
Other               1506
Asian                641
Name: race, dtype: int64


In [5]:
genders = pd.get_dummies(df['gender'])
print(df['gender'].value_counts())

# The most common gender of the patients is female, so we drop that column.
# The missingness is sparse enough that we can drop it without losing very much information.
genders = genders.drop(columns = ['Unknown/Invalid', 'Female'])

# We add the the dummy column back to the original dataframe
df = pd.concat([df, genders], axis=1)
df = df.drop(columns = ['gender'])

Female             54708
Male               47055
Unknown/Invalid        3
Name: gender, dtype: int64


In [6]:
# The Age nominal column need not be dummified because it can be easily turned into a numerical column as follows:
age = df["age"]

# We replace each age bin with its lower bound divided by 10
for i in range(10):
    age = age.replace("["+str(10*i)+"-"+str(10*(i+1))+")", i)

df['age'] = age

In [7]:
admission_type = pd.get_dummies(df['admission_type_id'])
print(df['admission_type_id'].value_counts())

# The most common admission type of the patients is Emergency (1), so we drop that column.
# The missingness is sparse enough (about 10%) that we can drop those columns (5, 6, and 8) without losing very much information.
admission_type = admission_type.drop(columns = [1, 5, 6, 8])

# We add the the dummy columns back to the original dataframe
df = pd.concat([df, admission_type], axis=1)
df = df.drop(columns = ['admission_type_id'])

1    53990
3    18869
2    18480
6     5291
5     4785
8      320
7       21
4       10
Name: admission_type_id, dtype: int64


In [8]:
discharge_disposition = pd.get_dummies(df['discharge_disposition_id'])
print(df['discharge_disposition_id'].value_counts())

# The most common discharge type of the patients is Discharged to Home (1), so we drop that column.
# The missingness is sparse enough (about 4.5%) that we can drop those columns (18 and 25) without losing very much information.
discharge_disposition = discharge_disposition.drop(columns = [1, 18, 25])

# We add the the dummy columns back to the original dataframe
df = pd.concat([df, discharge_disposition], axis=1)
df = df.drop(columns = ['discharge_disposition_id'])

1     60234
3     13954
6     12902
18     3691
2      2128
22     1993
11     1642
5      1184
25      989
4       815
7       623
23      412
13      399
14      372
28      139
8       108
15       63
24       48
9        21
17       14
16       11
19        8
10        6
27        5
12        3
20        2
Name: discharge_disposition_id, dtype: int64


In [9]:
admission_source = pd.get_dummies(df['admission_source_id'])
print(df['admission_source_id'].value_counts())

# The most common route of admission is from the emergency room (7), so we drop that column.
# The missingness is sparse enough (about 7%) that we can drop it without losing very much information.
admission_source = admission_source.drop(columns = [7, 17, 9, 20])

# We add the the dummy columns back to the original dataframe
df = pd.concat([df, admission_source], axis=1)
df = df.drop(columns = ['admission_source_id'])

7     57494
1     29565
17     6781
4      3187
6      2264
2      1104
5       855
3       187
20      161
9       125
8        16
22       12
10        8
11        2
14        2
25        2
13        1
Name: admission_source_id, dtype: int64


In [10]:
medical_specialty = pd.get_dummies(df['medical_specialty'])
print(df['medical_specialty'].value_counts())

# The missingness is rampant in this column (53%). Though we will still drop this column, we cannot drop the most common
# non-missing column, Internal Medicine. If we did do this, our model would assume that a missing specialty of the doctor
# is really internal medicine, which we do not want. As an aside, sometimes it is okay to impute by the most common class, but
# we will not be doing this.

# Here we drop the column displaying the missingness.
medical_specialty = medical_specialty.drop(columns = ['?'])

# As there are 84 medical specialties, we do not wish to try to model every specialty, because there are just too many.
# We would require more data, lots of computing power, and potentially a lot of time.
# Instead we only model the specialties of doctors that treated at least 1% of patients in our data set.
# Of course, 1% is arbitrary.
treated_onepercent = list(df['medical_specialty'].value_counts()[1:10].index)
medical_specialty = medical_specialty.drop(columns = treated_onepercent)

# We add the the dummy columns back to the original dataframe
df = pd.concat([df, medical_specialty], axis=1)
df = df.drop(columns = ['medical_specialty'])

?                                    49949
InternalMedicine                     14635
Emergency/Trauma                      7565
Family/GeneralPractice                7440
Cardiology                            5352
Surgery-General                       3099
Nephrology                            1613
Orthopedics                           1400
Orthopedics-Reconstructive            1233
Radiologist                           1140
Pulmonology                            871
Psychiatry                             854
Urology                                685
ObstetricsandGynecology                671
Surgery-Cardiovascular/Thoracic        652
Gastroenterology                       564
Surgery-Vascular                       533
Surgery-Neuro                          468
PhysicalMedicineandRehabilitation      391
Oncology                               348
Pediatrics                             254
Hematology/Oncology                    207
Neurology                              203
Pediatrics-

In [11]:
# For the diagonosis columns we need to perform a more detailed dummification of the variables becuase all three columns need to
# be distributed into the various dummified variables.

# For the three diagnosis columns, we first change the icd9 codes into the group name of the disease for which the patient was
# admitted. The types of diseases are circulatory, respiratory, digestive, diabetes, injury, musculoskeletal, genitourinary,
# neoplasms, and other.
diagnoses = df[['diag_1', 'diag_2', 'diag_3']]

# These are lists of the icd9 codes for illness contained within the nine illness groups
circulatory_icd9_codes = list(map(str, range(390, 460))) + ['785']
respiratory_icd9_codes = list(map(str, range(460, 520))) + ['786']
digestive_icd9_codes = list(map(str, range(520, 580))) + ['787']
diabetes_icd9_codes = list(map(str, list(np.linspace(250, 251, 100, endpoint = False)))) + ['250']
injury_icd9_codes = list(map(str, range(800, 1000)))
musculoskeletal_icd9_codes = list(map(str, range(710, 740)))
genitourinary_icd9_codes = list(map(str, range(580, 630))) + ['788']
neoplasms_icd9_codes = list(map(str, range(140, 240)))
other_codes = (list(map(str, range(1, 800))) + 
               list(map(lambda x: 'V0' + str(x), range(0, 10))) +
               list(map(lambda x: 'V' + str(x), range(10, 100))) + 
               list(map(lambda x: 'E' + str(x), range(800, 1000))) +
               ['365.44']
              )
# We change the icd9 codes to the respective illness group
diagnoses = diagnoses.replace(circulatory_icd9_codes, 'circulatory')
diagnoses = diagnoses.replace(respiratory_icd9_codes, 'respiratory')
diagnoses = diagnoses.replace(digestive_icd9_codes, 'digestive')
diagnoses = diagnoses.replace(diabetes_icd9_codes, 'diabetes')
diagnoses = diagnoses.replace(injury_icd9_codes, 'injury')
diagnoses = diagnoses.replace(musculoskeletal_icd9_codes, 'musculoskeletal')
diagnoses = diagnoses.replace(genitourinary_icd9_codes, 'genitourinary')
diagnoses = diagnoses.replace(neoplasms_icd9_codes, 'neoplasms')
diagnoses = diagnoses.replace(other_codes, 'other')

# Next we dummify and then add the dummifed columns together
diag_1 = pd.get_dummies(diagnoses['diag_1'])
diag_2 = pd.get_dummies(diagnoses['diag_2'])
diag_3 = pd.get_dummies(diagnoses['diag_3'])
dummy_diag = diag_1 + diag_2 + diag_3

# The missingness is sparse enough that we can drop it without losing very much information.
# Since we cannot deduce the illness group of a patient with the other collumns, there is not that much multicollinearity, so
# we need not drop the most common column.
dummy_diag = dummy_diag.drop(columns = ['?'])

# We add the the dummy columns back to the original dataframe
df = pd.concat([df, dummy_diag], axis=1)
df = df.drop(columns = ['diag_1', 'diag_2', 'diag_3'])

In [12]:
glu_serum = pd.get_dummies(df['max_glu_serum'])
print(df['max_glu_serum'].value_counts())

# There is so much missingness (96%) that we need not drop the non-missing column with the most counts because we do not risk
# causing multicollinearity.
glu_serum = glu_serum.drop(columns = ['None'])

# We add the the dummy columns back to the original dataframe
df = pd.concat([df, glu_serum], axis=1)
df = df.drop(columns = ['max_glu_serum'])

None    96420
Norm     2597
>200     1485
>300     1264
Name: max_glu_serum, dtype: int64


In [13]:
A1Cresult = pd.get_dummies(df['A1Cresult'])
print(df['A1Cresult'].value_counts())

# The most common A1C test value is None, indicating that it was not taken. Ordinarily, we would drop the dummified column of
# None because we usually can be sure it isn't statistically significant. However, in this case, the conclusion of the research
# paper clearly states that whether or not Hemoglobin A1C was measured is very important in predicting readission. Thus it is
# imperative that we retain the None column.

# We add the the dummy columns back to the original dataframe
df = pd.concat([df, A1Cresult], axis=1)
df = df.drop(columns = ['A1Cresult'])

None    84748
>8       8216
Norm     4990
>7       3812
Name: A1Cresult, dtype: int64


In [14]:
# When we examine the 23 different medication features, we find that a bunch of the medications are hardly ever prescribed.

print(df['metformin'].value_counts())
print(df['repaglinide'].value_counts())
print(df['nateglinide'].value_counts())
print(df['chlorpropamide'].value_counts())
print(df['glimepiride'].value_counts())
print(df['acetohexamide'].value_counts())
print(df['glipizide'].value_counts())
print(df['glyburide'].value_counts())
print(df['tolbutamide'].value_counts())
print(df['pioglitazone'].value_counts())
print(df['rosiglitazone'].value_counts())
print(df['acarbose'].value_counts())
print(df['miglitol'].value_counts())
print(df['troglitazone'].value_counts())
print(df['tolazamide'].value_counts())
print(df['examide'].value_counts())
print(df['citoglipton'].value_counts())
print(df['insulin'].value_counts())
print(df['glyburide-metformin'].value_counts())
print(df['glipizide-metformin'].value_counts())
print(df['glimepiride-pioglitazone'].value_counts())
print(df['metformin-rosiglitazone'].value_counts())
print(df['metformin-pioglitazone'].value_counts())


No        81778
Steady    18346
Up         1067
Down        575
Name: metformin, dtype: int64
No        100227
Steady      1384
Up           110
Down          45
Name: repaglinide, dtype: int64
No        101063
Steady       668
Up            24
Down          11
Name: nateglinide, dtype: int64
No        101680
Steady        79
Up             6
Down           1
Name: chlorpropamide, dtype: int64
No        96575
Steady     4670
Up          327
Down        194
Name: glimepiride, dtype: int64
No        101765
Steady         1
Name: acetohexamide, dtype: int64
No        89080
Steady    11356
Up          770
Down        560
Name: glipizide, dtype: int64
No        91116
Steady     9274
Up          812
Down        564
Name: glyburide, dtype: int64
No        101743
Steady        23
Name: tolbutamide, dtype: int64
No        94438
Steady     6976
Up          234
Down        118
Name: pioglitazone, dtype: int64
No        95401
Steady     6100
Up          178
Down         87
Name: rosiglitazone, dty

In [15]:
# We drop the 15 medications that are barely taken or prescribed. We use the criteria that we only include the ones that are taken
# or prescribed by at least 1% of patients.
barely_used = ['nateglinide',
               'chlorpropamide',
               'acetohexamide',
               'tolbutamide',
               'acarbose',
               'miglitol',
               'troglitazone',
               'tolazamide',
               'examide',
               'citoglipton',
               'glyburide-metformin',
               'glipizide-metformin',
               'glimepiride-pioglitazone',
               'metformin-rosiglitazone',
               'metformin-pioglitazone'               
              ]
df = df.drop(columns = barely_used)

In [16]:
# We dummify each of the remaining eight medicine columns:
used_medicine_strings = ['metformin',
                         'repaglinide',
                         'glimepiride',
                         'glipizide',
                         'glyburide',
                         'pioglitazone',
                         'rosiglitazone',
                         'insulin']
used_medicine_dfs = [df[medicine] for medicine in used_medicine_strings]

for i in range(len(used_medicine_strings)):
    used_medicine_dfs[i] = pd.get_dummies(df[used_medicine_strings[i]])

In [17]:
# For each of the remaining medicines, the most common value is No, which stands for the medication not being taken or
# prescribed. Since, we do not have any missingness for these columns, we can drop this most common value from the dummified
# data frames.

for medicine in used_medicine_dfs:
    medicine = medicine.drop(columns = ['No'])

In [18]:
# Now we add the dummy columns back to the original dataframe and delete the original columns.
df = pd.concat([df] + used_medicine_dfs, axis = 1)
df = df.drop(columns = used_medicine_strings)

In [19]:
# Lastly for the data cleaning process is to drop the two extraneous columns that record whether there was a change in diabetic
# medicatin or if any diabetic medication was prescribed. These two columns are extraneous because the medicine columns that
# we just added to the main df contain all the information from the extraneous ones.
df = df.drop(columns=['change', 'diabetesMed'])


In [20]:
list(df)

['encounter_id',
 'patient_nbr',
 'age',
 'time_in_hospital',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'number_diagnoses',
 'readmitted',
 'AfricanAmerican',
 'Asian',
 'Hispanic',
 'Other',
 'Male',
 2,
 3,
 4,
 7,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 19,
 20,
 22,
 23,
 24,
 27,
 28,
 1,
 2,
 3,
 4,
 5,
 6,
 8,
 10,
 11,
 13,
 14,
 22,
 25,
 'AllergyandImmunology',
 'Anesthesiology',
 'Anesthesiology-Pediatric',
 'Cardiology-Pediatric',
 'DCPTEAM',
 'Dentistry',
 'Dermatology',
 'Endocrinology',
 'Endocrinology-Metabolism',
 'Gastroenterology',
 'Gynecology',
 'Hematology',
 'Hematology/Oncology',
 'Hospitalist',
 'InfectiousDiseases',
 'Neurology',
 'Neurophysiology',
 'Obsterics&Gynecology-GynecologicOnco',
 'Obstetrics',
 'ObstetricsandGynecology',
 'Oncology',
 'Ophthalmology',
 'Osteopath',
 'Otolaryngology',
 'OutreachServices',
 'Pathology',
 'Pediatrics',


In [29]:
df.iloc[1000]
#df.readmitted.value_counts()

encounter_id          7556418
patient_nbr           4282317
age                         5
time_in_hospital            6
num_lab_procedures         57
num_procedures              2
num_medications            14
number_outpatient           0
number_emergency            0
number_inpatient            0
number_diagnoses            4
readmitted                 NO
AfricanAmerican             0
Asian                       0
Hispanic                    0
Other                       0
Male                        1
2                           0
3                           0
4                           0
7                           0
2                           0
3                           0
4                           0
5                           0
6                           0
7                           0
8                           0
9                           0
10                          0
                       ...   
Steady                      0
Up                          0
Down      

In [None]:
### Put my new df into a csv