##### Phase 1: Data Ingestion & Clinical Sanitation

Load the diabetic_data.csv file into a pandas DataFrame

In [None]:
import pandas as pd
import numpy as np
import csv

# Read the raw dataset from the working directory and echo it for a first look.
df = pd.read_csv(filepath_or_buffer='./diabetic_data.csv')
print(df)

Perform an initial audit

Check dataframe length, datatypes of the columns and non-null value count of columns

In [None]:
# DataFrame.info() writes its report (row count, per-column dtype and non-null count)
# to stdout itself and returns None, so it is called directly; wrapping it in print()
# would append a stray "None" line after the report.
df.info()

Identify the numeric columns and get their summary statistics, including count, mean, standard deviation, minimum, quartiles (25%, 50%, 75%), and maximum

In [None]:
# Summary statistics with the quartiles spelled out (these are describe()'s defaults).
print(df.describe(percentiles=[0.25, 0.5, 0.75]))

Retrieve the default first 5 rows from the dataframe to understand the overall structure of the diabetic data.

In [None]:
# Peek at the first five rows (5 is head()'s default, stated here explicitly).
print(df.head(n=5))

Get dataframe columns

In [None]:
# List the column labels of the frame.
print(df.columns)

Replace the '?' character with the standard NumPy NaN value; '?' is used by hospitals to represent empty or null values

In [None]:
# Swap every cell equal to '?' for NaN so pandas' missing-value tooling can see it.
df = df.replace(to_replace='?', value=np.nan)
print(df)

# Checkpoint the current frame to disk; the row index is not written.
df.to_csv(path_or_buf='df.csv', index=False)

Drop a column entirely if more than 90% of its values are missing

In [None]:
# Share of missing values per column, expressed as a percentage.
missing_percent_of_each_columns = df.isna().mean().mul(100)
print(missing_percent_of_each_columns)

# A column is kept only if at least 10% of its rows are non-missing,
# i.e. columns that are more than 90% missing are removed.
df = df.dropna(axis='columns', thresh=len(df) * 0.1)
print(df)

# Checkpoint the current frame to disk; the row index is not written.
df.to_csv(path_or_buf='df.csv', index=False)

Load IDs_mapping.csv file into a pandas dataframe

In [None]:
# Read the ID-to-description lookup table from the working directory and echo it.
id_df = pd.read_csv(filepath_or_buffer='./IDs_mapping.csv')
print(id_df)

Get the discharge_disposition_id codes corresponding to 'Expired'

In [None]:
# Rows whose description mentions 'Expired'; missing descriptions count as non-matches.
expired_mask = id_df['description'].str.contains('Expired', na=False)

# .copy() makes expired_rows an independent frame, so the column assignment below does
# not write into a filtered slice of id_df (which raises SettingWithCopyWarning on
# pandas < 3 and leaves it ambiguous which object is modified).
expired_rows = id_df[expired_mask].copy()

# NOTE(review): the codes are read from the 'admission_type_id' column although they are
# used as discharge_disposition_id values -- presumably IDs_mapping.csv stacks several
# mapping tables under the first table's header. Confirm against the file, and confirm
# that no admission-type/source description also contains 'Expired'.
expired_rows['admission_type_id'] = expired_rows['admission_type_id'].astype(int)
print(expired_rows)

# Plain Python list of integer codes, used to filter the encounters.
expired_ids = expired_rows['admission_type_id'].to_list()
print(expired_ids)

Remove the records of patients who died, keeping only encounters where a readmission is possible

In [None]:
# Keep only the rows whose discharge_disposition_id is not one of the expired codes.
df = df.loc[~df['discharge_disposition_id'].isin(expired_ids)]
print(df)

# Checkpoint the current frame to disk; the row index is not written.
df.to_csv(path_or_buf='df.csv', index=False)

Remove duplicate entries

In [None]:
# Discard rows that repeat an earlier row across all columns; the first occurrence stays.
df = df.drop_duplicates(keep='first')
print(df)

# Checkpoint the current frame to disk; the row index is not written.
df.to_csv(path_or_buf='df.csv', index=False)