##### Phase 1: Data Ingestion & Clinical Sanitation

Load diabetic_data.csv file into a pandas dataframe

In [None]:
import pandas as pd
import numpy as np
import csv

# Read the raw encounter records from the CSV that sits next to the notebook.
raw_csv_path = './diabetic_data.csv'
df = pd.read_csv(raw_csv_path)
print(df)

Perform an initial audit

Check dataframe length, datatypes of the columns and non-null value count of columns

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would only append a stray "None" line.
df.info()

Identify the numeric columns and get their summary statistics: count, mean, standard deviation, minimum, quartiles (25%, 50%, 75%), and maximum

In [None]:
# Summary statistics for the numeric columns.
numeric_summary = df.describe()
print(numeric_summary)

Retrieve the default first 5 rows from the dataframe to understand the overall structure of the diabetic data

In [None]:
# Preview the first five rows (the default for head()).
first_rows = df.head(n=5)
print(first_rows)

Get dataframe columns

In [None]:
# List the column labels of the dataframe.
column_labels = df.columns
print(column_labels)

Replace the '?' character with the standard NumPy NaN value. In this dataset, '?' is used by hospitals to represent empty or null values

In [None]:
# Turn every '?' placeholder into a real missing value (NaN).
df = df.replace(to_replace='?', value=np.nan)
print(df)

# Save the current state of the dataframe (without the index column).
df.to_csv('df.csv', index=False)

Drop the column entirely if the missingness exceeds 90%

In [None]:
# Percentage of missing values in each column.
missing_percent_of_each_columns = df.isna().mean() * 100
print(missing_percent_of_each_columns)

# Keep a column only if at least 10% of its rows hold a value,
# i.e. drop it when more than 90% of the values are missing.
minimum_non_null_rows = len(df) * 0.1
df = df.dropna(axis='columns', thresh=minimum_non_null_rows)
print(df)

# Save the current state of the dataframe (without the index column).
df.to_csv('df.csv', index=False)

Load IDs_mapping.csv file into a pandas dataframe

In [None]:
# Read the ID-to-description lookup file that sits next to the notebook.
ids_mapping_path = './IDs_mapping.csv'
id_df = pd.read_csv(ids_mapping_path)
print(id_df)

Get the discharge_disposition_id codes whose description contains 'Expired'

In [None]:
# Select the mapping rows whose description mentions 'Expired'.
# .copy() makes this an independent frame, so the column assignment below
# modifies it directly instead of a slice of id_df (avoids SettingWithCopyWarning).
expired_rows = id_df[id_df['description'].str.contains('Expired', na=False)].copy()

# NOTE(review): the ids are read from the 'admission_type_id' column even though
# they are used as discharge_disposition_id codes - presumably IDs_mapping.csv
# stacks several lookup tables under the first table's header; confirm.
expired_rows['admission_type_id'] = expired_rows['admission_type_id'].astype(int)
print(expired_rows)

expired_ids = expired_rows['admission_type_id'].to_list()
print(expired_ids)

Remove dead records and filter only possible readmissions

In [None]:
# Flag encounters whose discharge disposition is one of the 'Expired' codes,
# then keep only the remaining rows.
is_expired = df['discharge_disposition_id'].isin(expired_ids)
df = df[~is_expired]
print(df)

# Save the current state of the dataframe (without the index column).
df.to_csv('df.csv', index=False)

Remove duplicate entries

In [None]:
# Drop rows that are exact duplicates of an earlier row.
df = df.drop_duplicates()
print(df)

# Save the current state of the dataframe (without the index column).
df.to_csv('df.csv', index=False)

##### Phase 2 – Web Scraping & Data Enrichment

Calculate the frequency of all codes in diag_1

In [None]:
# Group the rows by primary diagnosis code and count the rows in each group.
group_by_obj = df.groupby('diag_1')
frequency = group_by_obj['diag_1'].count()
print(frequency)

Sort the frequencies in descending order to identify the most frequent disease codes

In [None]:
# Order the diag_1 codes from most to least frequent.
sored_frequency = frequency.sort_values(ascending=False)
print(sored_frequency)

Find the 20 most frequent disease codes

In [None]:
# Keep the first 20 entries of the frequency-sorted series.
top_disease_code = sored_frequency.head(20)
print(top_disease_code)

Choose ICD-9 lookup resource as https://www.icd9data.com

The https://www.icd9data.com website does not provide a direct URL for each individual code. Code pages are stored inside nested folders based on the volume and the code ranges, so lists of the main ranges and the sub-ranges are defined below. To look up a code, find the main range and sub-range that contain it, build the page URL dynamically from them, and send a request to retrieve the web page

In [None]:
# Top-level ICD-9 code ranges used as the first folder of a lookup URL.
# Numeric ranges come first, followed by the V and E supplementary ranges.
main_ranges = [
    "001-139", "140-239", "240-279", "280-289", "290-319", "320-389",
    "390-459", "460-519", "520-579", "580-629", "630-679", "680-709",
    "710-739", "740-759", "760-779", "780-799", "800-999",
    "V01-V91",
    "E000-E999",
]

In [None]:
# Second-level ICD-9 code ranges, grouped by the main range they belong to.
# The order of the entries is kept exactly as originally listed.
sub_range = [
    # Symptoms, Signs, And Ill-Defined Conditions 780-799
    "780-789", "790-796", "797-799",
    # Mental Disorders 290-319
    "290-294", "295-299", "300-316", "317-319",
    # Diseases Of The Genitourinary System 580-629
    "580-589", "590-599", "600-608", "610-612", "614-616", "617-629",
    # Complications Of Pregnancy, Childbirth, And The Puerperium 630-679
    "630-639", "640-649", "650-659", "660-669", "670-677", "678-679",
    # Certain Conditions Originating In The Perinatal Period 760-779
    "760-763", "764-779",
    # Diseases Of The Skin And Subcutaneous Tissue 680-709
    "680-686", "690-698", "700-709",
    # Diseases Of The Circulatory System 390-459
    "390-392", "393-398", "401-405", "410-414", "415-417",
    "420-429", "430-438", "440-449", "451-459",
    # Endocrine, Nutritional And Metabolic Diseases, And Immunity Disorders 240-279
    "240-246", "249-259", "260-269", "270-279",
    # Diseases Of The Musculoskeletal System And Connective Tissue 710-739
    "710-719", "720-724", "725-729", "730-739",
    # Diseases Of The Digestive System 520-579
    "520-529", "530-539", "540-543", "550-553", "555-558", "560-569", "570-579",
    # Diseases Of The Nervous System And Sense Organs 320-389
    "320-327", "330-337", "338-338", "339-339",
    "340-349", "350-359", "360-379", "380-389",
    # Injury And Poisoning 800-999 (main chapter ranges shown)
    "800-804", "805-809", "810-819", "820-829", "830-839", "840-848",
    "850-854", "860-869", "870-879", "880-887", "890-897", "900-904",
    "905-909", "910-919", "920-924", "925-929", "930-939", "940-949",
    "950-957", "958-959", "960-979", "980-989", "990-995", "996-999",
    # Neoplasms 140-239
    "140-149", "150-159", "160-165", "170-176", "179-189", "190-199",
    "200-209", "210-229", "230-234", "235-238", "239-239",
    # Supplementary Classification Of Factors Influencing Health Status... V01-V91
    "V01-V09", "V10-V19", "V20-V29", "V30-V39", "V40-V49", "V50-V59",
    "V60-V69", "V70-V82", "V83-V84", "V85-V85", "V86-V86", "V87-V87",
    "V88-V88", "V89-V89", "V90-V90", "V91-V91",
    # Diseases Of The Respiratory System 460-519
    "460-466", "470-478", "480-488", "490-496", "500-508", "510-519",
    # Infectious And Parasitic Diseases 001-139
    "001-009", "010-018", "020-027", "030-041", "042-042", "045-049",
    "050-059", "060-066", "070-079", "080-088", "090-099", "100-104",
    "110-118", "120-129", "130-136", "137-139",
    # Supplementary Classification Of External Causes Of Injury And Poisoning E000-E999
    "E000-E000", "E001-E030", "E800-E807", "E810-E819", "E820-E825",
    "E826-E829", "E830-E838", "E840-E845", "E846-E849", "E850-E858",
    "E860-E869", "E870-E876", "E878-E879", "E880-E888", "E890-E899",
    "E900-E909", "E910-E915", "E916-E928", "E929-E929", "E930-E949",
    "E950-E959", "E960-E969", "E970-E979", "E980-E989", "E990-E999",
]

Function that sends a request for the web page of each code and extracts the disease name from it

In [None]:
import requests
from bs4 import BeautifulSoup
import time

# Maps each diag_1 code (exactly as it appears in the data) to the disease
# name scraped from its icd9data.com page.
disease_code_name_map = {}

# Root folder of the 2015 Volume 1 code pages.
ICD9_BASE_URL = 'https://www.icd9data.com/2015/Volume1'


def _code_family(label):
    """Return 'V' or 'E' when ``label`` starts with that letter, else ''."""
    return label[0] if label[:1] in ('V', 'E') else ''


def _code_number(label):
    """Return the integer part of a code or range bound, ignoring a leading V/E."""
    return int(label[1:]) if _code_family(label) else int(label)


def _normalize_code(raw_code):
    """Split a code into (category, full_code), zero-padding numeric categories to 3 digits."""
    category, dot, detail = str(raw_code).strip().partition('.')
    if not _code_family(category):
        category = category.zfill(3)
    return category, category + dot + detail


def _find_range(category, ranges):
    """Return the first 'low-high' entry of ``ranges`` that contains ``category``, or ''.

    A range only matches a code of the same family (numeric, V or E).
    """
    family = _code_family(category)
    number = _code_number(category)
    for candidate in ranges:
        low, high = candidate.split('-')
        if _code_family(low) != family or _code_family(high) != family:
            continue
        if _code_number(low) <= number <= _code_number(high):
            return candidate
    return ''


def _build_url(m_range, s_range, category, code):
    """Build the page URL; the sub-range folder is left out when there is none."""
    folders = [ICD9_BASE_URL, m_range]
    if s_range:
        folders.append(s_range)
    folders.extend([category, f'{code}.htm'])
    return '/'.join(folders)


def _fetch_disease_name(url):
    """Request ``url`` and return the text between the first and second ':' of its <title>.

    Returns None when the request fails, the status is not 200, or the page
    title is missing or contains no ':'.
    """
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as error:
        print(f'Request failed for {url}: {error}')
        return None
    if response.status_code != 200:
        print(f'Unexpected status {response.status_code} for {url}')
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    title_tag = soup.find('title')
    if title_tag is None:
        return None
    title_parts = title_tag.text.split(':')
    if len(title_parts) < 2:
        return None
    return title_parts[1].strip()


def get_deasese_names():
    """Look up the disease name for every code in ``top_disease_code``.

    For each code the matching entry of ``main_ranges`` and ``sub_range`` is
    located, the page URL is built from them, the page is requested and the
    name taken from its title is stored in ``disease_code_name_map`` under the
    original code. Codes that cannot be parsed, match no main range, or whose
    page yields no name are reported and left out of the map.
    """
    for x in top_disease_code.keys():
        try:
            category, code = _normalize_code(x)
            # The matched main range is kept even when no sub-range matches,
            # so the URL of such codes still contains its chapter folder.
            m_range = _find_range(category, main_ranges)
            s_range = _find_range(category, sub_range) if m_range else ''
        except ValueError:
            print(f'Skipping {x!r}: not a recognisable code')
            continue

        print('code', x)
        print('main range', m_range)
        print('sub range', s_range)
        print('------')

        if m_range == '':
            print(f'Skipping {x!r}: no main range contains it')
            continue

        url = _build_url(m_range, s_range, category, code)
        print(url)

        disease = _fetch_disease_name(url)
        if disease is None:
            print(f'No disease name found for {x!r}')
        else:
            print(disease)
            disease_code_name_map[x] = disease

        # Pause between requests so the site is not hit in a tight loop.
        time.sleep(1)


get_deasese_names()

print(disease_code_name_map)

Create new column named "Primary_Diagnosis_Desc" by mapping disease name with "diag_1" code

In [None]:
# Translate each diag_1 code into its scraped disease name; codes that are
# not keys of disease_code_name_map become NaN.
primary_diagnosis_desc = df["diag_1"].map(disease_code_name_map)
df["Primary_Diagnosis_Desc"] = primary_diagnosis_desc
print(df)

Replace "Primary_Diagnosis_Desc" column NaN values with "Other"

In [None]:
# Assign the filled column back instead of calling an inplace method on the
# column selection: the chained inplace form is deprecated and does not
# update df when pandas Copy-on-Write is active.
df["Primary_Diagnosis_Desc"] = df["Primary_Diagnosis_Desc"].fillna('Other')
print(df)

# Save the current state of the dataframe (without the index column).
df.to_csv('df.csv', index=False)

##### Phase 4: Feature Engineering - The "Vitality Complexity Index"

Create a new column "VCI_Score" by calculating the score for each patient based on the columns "time_in_hospital", "admission_type_id", "number_diagnoses", and "number_emergency"

In [None]:
def calculate_length_of_stay_score(days_count):
    """Return the length-of-stay score for a number of days in hospital.

    Scores: missing or below 1 day -> 0, 1 up to (but not including) 5 days -> 1,
    5 up to (but not including) 14 days -> 4, 14 days or more -> 7.
    The bins are contiguous, so non-integer values such as 4.5 or 13.5 fall
    into the lower neighbouring bin instead of scoring 0.
    """
    if pd.isna(days_count) or days_count < 1:
        return 0
    if days_count < 5:
        return 1
    if days_count < 14:
        return 4
    return 7

def calculate_acuity_of_admission_score(id):
    """Return the admission acuity score for an admission type id.

    Ids 1 and 7 score 3; every other id, and a missing id, scores 0.
    """
    if pd.isna(id):
        return 0
    return 3 if id in (1, 7) else 0

def calculate_comorbidity_burden_score(diagnoses_count):
    """Return the comorbidity burden score for a number of diagnoses.

    Scores: missing or fewer than 4 -> 0, 4 up to (but not including) 8 -> 3,
    8 or more -> 5. The bins are contiguous, so a non-integer value between
    7 and 8 scores 3 instead of falling through to 0.
    """
    if pd.isna(diagnoses_count) or diagnoses_count < 4:
        return 0
    if diagnoses_count < 8:
        return 3
    return 5

def calculate_emergency_visit_intensity_score(visits_count):
    """Return the emergency visit intensity score for a number of visits.

    Scores: missing or below 1 visit -> 0, 1 to 4 visits -> 3,
    more than 4 visits -> 5.
    """
    if pd.isna(visits_count):
        return 0
    if visits_count > 4:
        return 5
    return 3 if visits_count >= 1 else 0


# Score each component column separately, then add the four component
# scores row by row to obtain the VCI score.
length_of_stay_scores = df['time_in_hospital'].apply(calculate_length_of_stay_score)
acuity_scores = df['admission_type_id'].apply(calculate_acuity_of_admission_score)
comorbidity_scores = df['number_diagnoses'].apply(calculate_comorbidity_burden_score)
emergency_scores = df['number_emergency'].apply(calculate_emergency_visit_intensity_score)

df['VCI_Score'] = length_of_stay_scores + acuity_scores + comorbidity_scores + emergency_scores
print(df)

# Save the current state of the dataframe (without the index column).
df.to_csv('df.csv', index=False)

Create a new column "Risk_Category" based on the "VCI_Score"

In [None]:
def get_risk_category(score):
    """Return the risk label for a VCI score.

    Below 7 -> 'Low Risk', 7 to 10 -> 'Medium Risk', above 10 -> 'High Risk'.
    A score that satisfies none of the comparisons (NaN) yields None.
    """
    if score < 7:
        return 'Low Risk'
    if score <= 10:
        return 'Medium Risk'
    if score > 10:
        return 'High Risk'
    return None


# Label every row with the risk category that matches its VCI score.
risk_categories = df['VCI_Score'].apply(get_risk_category)
df['Risk_Category'] = risk_categories
print(df)

# Save the current state of the dataframe (without the index column).
df.to_csv('df.csv', index=False)

Calculate the "Low Risk" patients readmission rate

In [None]:
# Rows labelled "Low Risk".
low_risk_patients = df[df['Risk_Category'] == 'Low Risk']
print(low_risk_patients)

# Of those rows, the ones whose 'readmitted' value is '<30'.
low_risk_patients_readmitted = low_risk_patients[low_risk_patients['readmitted'] == '<30']
print(low_risk_patients_readmitted)

# Readmission rate in percent; an empty category gives 0.0 instead of
# raising ZeroDivisionError.
if len(low_risk_patients) > 0:
    low_risk_patient_readmisson_rate = len(low_risk_patients_readmitted) / len(low_risk_patients)*100
else:
    low_risk_patient_readmisson_rate = 0.0
print(low_risk_patient_readmisson_rate)

Calculate the "Medium Risk" patients readmission rate

In [None]:
# Rows labelled "Medium Risk".
medium_risk_patients = df[df['Risk_Category'] == 'Medium Risk']
print(medium_risk_patients)

# Of those rows, the ones whose 'readmitted' value is '<30'.
medium_risk_patients_readmitted = medium_risk_patients[medium_risk_patients['readmitted'] == '<30']
print(medium_risk_patients_readmitted)

# Readmission rate in percent; an empty category gives 0.0 instead of
# raising ZeroDivisionError.
if len(medium_risk_patients) > 0:
    medium_risk_patient_readmisson_rate = len(medium_risk_patients_readmitted) / len(medium_risk_patients)*100
else:
    medium_risk_patient_readmisson_rate = 0.0
print(medium_risk_patient_readmisson_rate)

Calculate the "High Risk" patients readmission rate

In [None]:
# Rows labelled "High Risk".
high_risk_patients = df[df['Risk_Category'] == 'High Risk']
print(high_risk_patients)

# Of those rows, the ones whose 'readmitted' value is '<30'.
high_risk_patients_readmitted = high_risk_patients[high_risk_patients['readmitted'] == '<30']
print(high_risk_patients_readmitted)

# Readmission rate in percent; an empty category gives 0.0 instead of
# raising ZeroDivisionError.
if len(high_risk_patients) > 0:
    high_risk_patient_readmisson_rate = len(high_risk_patients_readmitted) / len(high_risk_patients)*100
else:
    high_risk_patient_readmisson_rate = 0.0
print(high_risk_patient_readmisson_rate)

Create a chart to visualize the readmission rate (<30 days) for each of the three risk categories

In [None]:
import matplotlib.pyplot as plt

# One bar per risk category, in the order Low, Medium, High.
categories = ['Low', 'Medium', 'High']
values = [
    low_risk_patient_readmisson_rate,
    medium_risk_patient_readmisson_rate,
    high_risk_patient_readmisson_rate,
]

fig, ax = plt.subplots()
ax.bar(categories, values, color='#C7CEEA')

# Write each rate just above its bar; the offset is 1% of the tallest bar.
label_offset = 0.01 * max(values)
for position, rate in enumerate(values):
    ax.text(position, rate + label_offset, f"{rate:.2f}%", ha='center', fontsize=10)

ax.set_title('Readmission Rate Of Each Risk Category')
ax.set_xlabel('Risk Category')
ax.set_ylabel('Readmission Rate %')

fig.tight_layout()
plt.show()
