# EDA

In [1]:
import os

import pandas as pd
from sqlalchemy import create_engine

In [2]:
# Connection URL for the Postgres database. It can be overridden with the
# DIABETES_DB_URL environment variable so that real credentials never need to
# be committed with the notebook; the fallback is the local development URL.
engine = create_engine(
    os.environ.get(
        'DIABETES_DB_URL',
        'postgresql://willnobles:localhost@localhost:5432/diabetes',
    )
)

# Read the raw data from the CSV files
ids_mapping = pd.read_csv('./dataset_diabetes/dataset_diabetes/IDs_mapping.csv')
diabetic_data = pd.read_csv('./dataset_diabetes/dataset_diabetes/diabetic_data.csv')

# Load each frame into its own table. if_exists='replace' drops and recreates
# the table, so re-running this cell does not append duplicate rows.
ids_mapping.to_sql('IDs_mapping', engine, index=False, if_exists='replace')
diabetic_data.to_sql('Diabetes', engine, index=False, if_exists='replace')

In [3]:
# Sanity check: read the IDs_mapping table back from the database
query = 'SELECT * FROM "IDs_mapping";'
df_id = pd.read_sql(sql=query, con=engine)

# Show the first five rows
df_id.head(5)

Unnamed: 0,admission_type_id,description
0,1,Emergency
1,2,Urgent
2,3,Elective
3,4,Newborn
4,5,Not Available


In [4]:
# Sanity check: read the Diabetes table back from the database
query = 'SELECT * FROM "Diabetes";'
df_diabetes = pd.read_sql(sql=query, con=engine)

# Show the first five rows
df_diabetes.head(5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [5]:
# List every column name (the head() preview above elides the middle columns)
df_diabetes.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [6]:
# (rows, columns) of the raw table, as a baseline before any filtering
df_diabetes.shape

(101766, 50)

We can see that features like `race`, `weight`, and `payer_code` contain missing values, which are encoded as "?". Rows with a missing `race` make up only about 2% of the data and could be removed with little impact. Rows with a missing `payer_code` make up roughly 40% of the data; we remove them below, which is a substantial reduction. Removing rows with a missing `weight`, however, would have an even bigger impact since that would involve removing almost 97% of the data, so we leave that column as it is.

In [7]:
# Frequency of each race value; "?" is the placeholder for a missing entry
df_diabetes['race'].value_counts()

Caucasian          76099
AfricanAmerican    19210
?                   2273
Hispanic            2037
Other               1506
Asian                641
Name: race, dtype: int64

In [8]:
# Frequency of each weight bucket; "?" is the placeholder for a missing entry
df_diabetes['weight'].value_counts()

?            98569
[75-100)      1336
[50-75)        897
[100-125)      625
[125-150)      145
[25-50)         97
[0-25)          48
[150-175)       35
[175-200)       11
>200             3
Name: weight, dtype: int64

In [9]:
# Frequency of each payer code; "?" is the placeholder for a missing entry
df_diabetes['payer_code'].value_counts()

?     40256
MC    32439
HM     6274
SP     5007
BC     4655
MD     3532
CP     2533
UN     2448
CM     1937
OG     1033
PO      592
DM      549
CH      146
WC      135
OT       95
MP       79
SI       55
FR        1
Name: payer_code, dtype: int64

In [10]:
# Row labels where payer_code is missing (encoded as "?").
# Only payer_code is filtered here; rows with a missing race are kept.
pc_indices = df_diabetes.index[df_diabetes['payer_code'].eq('?')]

# Drop those rows from the DataFrame in place
df_diabetes.drop(index=pc_indices, inplace=True)

In [11]:
# Row count after dropping the rows with a missing payer_code
df_diabetes.shape

(61510, 50)

Note that each patient has a unique identifier called `patient_nbr`, but some patients have multiple encounters. We should ensure that our data includes only one encounter per patient, specifically the first encounter.

In [12]:
# Flag repeated patients: True marks a row whose patient_nbr already appeared in
# an earlier row (duplicated() treats the first occurrence as the original).
df_diabetes.duplicated(subset=['patient_nbr'])

20446     False
20737     False
20824     False
21083     False
23668     False
          ...  
101760     True
101761     True
101762     True
101763     True
101764     True
Length: 61510, dtype: bool

We'll first sort by `encounter_id` in descending order so that the duplicates we remove will be encounters with a higher identification number since they're likely a subsequent encounter.

In [13]:
# Preview only: the sorted/grouped result is displayed but not assigned, so
# df_diabetes itself keeps its current row order. head(50) keeps up to 50 rows
# per patient_nbr, with the highest encounter_id values listed first.
df_diabetes.sort_values(['encounter_id'], ascending=False).groupby('patient_nbr').head(50)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
101764,443857166,31693671,Caucasian,Female,[80-90),?,2,3,7,10,...,No,Up,No,No,No,No,No,Ch,Yes,NO
101763,443854148,41088789,Caucasian,Male,[70-80),?,1,1,7,1,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101762,443847782,74694222,AfricanAmerican,Female,[80-90),?,1,4,5,5,...,No,Steady,No,No,No,No,No,No,Yes,NO
101761,443847548,100162476,AfricanAmerican,Male,[70-80),?,1,3,7,3,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101760,443847176,50375628,AfricanAmerican,Female,[60-70),?,1,1,7,6,...,No,Down,No,No,No,No,No,Ch,Yes,>30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23668,80820942,20514150,Caucasian,Female,[60-70),?,2,1,1,4,...,No,Up,No,No,No,No,No,Ch,Yes,<30
21083,73731852,20542797,Caucasian,Male,[70-80),?,1,2,7,10,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
20824,73062156,20408121,Caucasian,Female,[90-100),?,1,1,7,4,...,No,No,No,No,No,No,No,No,Yes,NO
20737,72848634,20377854,Caucasian,Female,[60-70),?,2,1,1,3,...,No,Steady,No,No,No,No,No,No,Yes,NO


In [14]:
# Keep exactly one row per patient: the encounter with the lowest encounter_id,
# which is presumably the patient's first encounter (assumes encounter_id grows
# over time -- TODO confirm).
# The sort has to happen here: sort_values() returns a new frame, so a sort that
# is only displayed in an earlier cell never changes the row order of
# df_diabetes. Without it, keep='last' retains whichever encounter happens to
# appear last in the frame, which is not guaranteed to be the earliest one.
df_diabetes = (
    df_diabetes
    .sort_values('encounter_id', ascending=False)
    .drop_duplicates(subset='patient_nbr', keep='last')
)

Let's now check that we've removed all duplicate patient records by comparing the number of unique patient values with the number of rows in the dataframe to make sure they're the same.

In [15]:
# Number of distinct patients; should equal the row count if no duplicates remain
df_diabetes['patient_nbr'].nunique()

41533

In [16]:
# Row count after de-duplication, for comparison with the distinct patient count
df_diabetes.shape

(41533, 50)

Lastly, let's look at the `discharge_disposition_id` column. There are 28 unique codes that the [Centers for Medicare & Medicaid Services](https://www.cms.gov/medicare/medicare-contracting/contractorlearningresources/downloads/ja0801.pdf) (CMS) uses to classify how a patient was discharged.

In [17]:
# List the distinct discharge disposition codes present, in ascending order
df_diabetes.discharge_disposition_id.sort_values().unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 11, 13, 14, 15, 16, 17, 18, 19,
       20, 22, 23, 24, 25, 27, 28])

Since we want to predict whether a patient will be readmitted within 30 days after their first encounter, we should remove rows with a `discharge_disposition_id` equal to 20 since that code is used when a patient dies or is classified as "Expired."

In [21]:
# Row labels whose discharge_disposition_id is 20, the code this analysis
# treats as "Expired".
# NOTE(review): confirm against the IDs_mapping table that 20 is the only code
# describing a deceased patient in this dataset.
expired_indices = df_diabetes.index[df_diabetes['discharge_disposition_id'].eq(20)]

# Remove those rows from the DataFrame in place
df_diabetes.drop(index=expired_indices, inplace=True)

In [24]:
# Re-list the distinct codes to confirm that 20 no longer appears
df_diabetes.discharge_disposition_id.sort_values().unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 11, 13, 14, 15, 16, 17, 18, 19,
       22, 23, 24, 25, 27, 28])

We'll save the cleaned data to a file so that it can be opened in another notebook for data visualization and regression analysis.

In [25]:
# Persist the cleaned frame for the downstream notebook. to_csv writes the row
# index by default, so the file's first column holds the original (now
# non-contiguous) row labels; read it back with index_col=0 or drop that column.
df_diabetes.to_csv('diabetes.csv')