In [4]:
import numpy as np
import pandas as pd
import random


In [5]:
df = pd.read_csv("diabetic_data.csv")
#print(df.head())
#print(df.describe())
print(list(df))


['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'payer_code', 'medical_specialty', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted']


In [6]:
# The weight and payer code attributes are dropped because of their high percentage of missing values.
# This means that neither of them are relevant to the readmission rate.
# We can find more information about this in the research paper.
df = df.drop(columns=['weight', 'payer_code'])


In [7]:
races = pd.get_dummies(df['race'])
print(df['race'].value_counts())

# The most common race of the patients is Caucasian, so we drop that column.
# The missingness is sparse enough that we can drop it without losing very much information.
# There are so few Asian patients that we drop that column as well.
races = races.drop(columns = ['?', 'Caucasian', 'Asian'])
# races


Caucasian          76099
AfricanAmerican    19210
?                   2273
Hispanic            2037
Other               1506
Asian                641
Name: race, dtype: int64


In [8]:
gender = df['gender']
print(gender.value_counts())

# Three rows have do not supply a gender, so we drop them
# gender.describe()
UnknownorInvalidgenders = gender.loc[gender == 'Unknown/Invalid']
gender = gender.drop([30506, 75551, 82573])

gender = pd.get_dummies(gender)

# The most common gender of the patients is female, so we drop that column.
# gender.describe()
gender = gender.drop(columns=['Female'])

# gender


Female             54708
Male               47055
Unknown/Invalid        3
Name: gender, dtype: int64


In [9]:
age = df["age"]

# We replace each age bin with its lower bound
for i in range(10):
    age = age.replace("["+str(10*i)+"-"+str(10*(i+1))+")", i)

# print(age)


In [10]:
admission = df['admission_type_id']
# print(admission.value_counts())
# print(df.shape)

# About 10% of this column has values of Not Availible (5), NULL (6), or Not Mapped (8)
# We convert these values to NaN in order to impute these values at random
admission = admission.replace({5:np.nan, 6:np.nan, 8:np.nan})

# We check that the values were indeed replaced with Nans
# print(admission.value_counts())

# print(type(np.float64(np.nan)), type(admission[0]))

# We tested to see how many non nans there were
# print(~np.isnan(admission))
# print(admission[0] == np.float64(np.nan)) # Why are these two not equal considering that there of the same type and value???

#print(admission[~np.isnan(admission)])
#print(len(admission[~np.isnan(admission)]))
# This series is of the right length

# We now fill the NAs at random
admission.fillna(random.choice(admission[~np.isnan(admission)]),
                 inplace = True)
print(admission.value_counts())
#print(admission[admission != np.nan])

admission = pd.get_dummies(admission)

# print(admission)
# The most common admission type is emergency, so we drop that column. We also drop the newborn and Trauma_Center columns
# because of their sparcity.
admission = admission.drop(columns=[1.0, 4.0, 7.0])
#print(admission)

admission.columns = ['Urgent', 'Elective']
# print(admission)

1.0    53990
3.0    29265
2.0    18480
7.0       21
4.0       10
Name: admission_type_id, dtype: int64


In [14]:
discharge_disposition = df['discharge_disposition_id']
discharge_disposition.value_counts()

# We remove all columns with patients that expired (died) as we know they cannot be readmitted.
# On a side note, theoretically if we had enough data and other features, we could predict whether or not a patient would die
# within the thirty days and use that to predict that that patient will not be readmitted.

Deceased_patients = discharge_disposition[discharge_disposition.isin([11, 19, 20, 21])].index.tolist()
discharge_disposition = discharge_disposition.drop(Deceased_patients)
Deceased_patients
discharge_disposition.value_counts()
discharge_disposition = pd.get_dummies(discharge_disposition)
# discharge_disposition

In [21]:
admission_source = df['admission_source_id']
admission_source.value_counts()

admission_source = pd.get_dummies(admission_source)
# We now drop all of the null columns

# We drop all of the columns that indicate a null value. These columns provide no information about our patients.
#admission_source = admission_source.drop(columns=[9, 17, 20])

# admission_source

In [28]:
medical_specialty = df['medical_specialty']
medical_specialty.value_counts()[medical_specialty.value_counts() < 1000]


Pulmonology                          871
Psychiatry                           854
Urology                              685
ObstetricsandGynecology              671
Surgery-Cardiovascular/Thoracic      652
Gastroenterology                     564
Surgery-Vascular                     533
Surgery-Neuro                        468
PhysicalMedicineandRehabilitation    391
Oncology                             348
Pediatrics                           254
Hematology/Oncology                  207
Neurology                            203
Pediatrics-Endocrinology             159
Otolaryngology                       125
Endocrinology                        120
Surgery-Thoracic                     109
Psychology                           101
Podiatry                             100
Surgery-Cardiovascular                98
Pediatrics-CriticalCare               87
Hematology                            82
Gynecology                            58
Hospitalist                           57
Radiology       