In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
# Load the raw encounter table from the notebook-relative data folder.
# Forward slashes are used so the same relative path resolves on Windows, Linux
# and macOS; a backslash-separated path is only understood on Windows.
data = pd.read_csv('./data/diabetic_data.csv')

In [3]:
## Filtering and classifying
# Restrict the raw table to the columns used below, turn the '?' placeholder
# into NaN, and discard every row that has a NaN in any of those columns.
filtered_data = (
    data.loc[:, [
        'encounter_id', 'patient_nbr', 'race', 'gender', 'age',
        'admission_type_id', 'admission_source_id', 'payer_code',
        'number_diagnoses', 'change', 'diabetesMed', 'readmitted',
    ]]
    .replace('?', np.nan)
    .dropna(axis=0, how='any')
)

In [4]:
# Age groups: collapse the ten-year age brackets into five integer codes.
age_groups = {
    '[0-10)': 0,
    '[10-20)': 1,
    '[20-30)': 2, '[30-40)': 2,
    '[40-50)': 3, '[50-60)': 3,
    '[60-70)': 4, '[70-80)': 4, '[80-90)': 4, '[90-100)': 4,
}

# Overwrite each bracket label in place with its code, one bracket at a time.
for bracket in age_groups:
    in_bracket = filtered_data['age'].eq(bracket)
    filtered_data.loc[in_bracket, 'age'] = age_groups[bracket]

# The column now holds group codes rather than age ranges, so rename it to match.
filtered_data = filtered_data.rename(columns={'age': 'age_group'})
filtered_data

Unnamed: 0,encounter_id,patient_nbr,race,gender,age_group,admission_type_id,admission_source_id,payer_code,number_diagnoses,change,diabetesMed,readmitted
20446,72091308,20123568,Caucasian,Female,4,1,7,MC,9,Ch,Yes,NO
20737,72848634,20377854,Caucasian,Female,4,2,1,MC,6,No,Yes,NO
20824,73062156,20408121,Caucasian,Female,4,1,7,MC,6,No,Yes,NO
21083,73731852,20542797,Caucasian,Male,4,1,7,MC,6,Ch,Yes,NO
23668,80820942,20514150,Caucasian,Female,4,2,1,MC,4,Ch,Yes,<30
...,...,...,...,...,...,...,...,...,...,...,...,...
101760,443847176,50375628,AfricanAmerican,Female,4,1,7,DM,9,Ch,Yes,>30
101761,443847548,100162476,AfricanAmerican,Male,4,1,7,MC,9,Ch,Yes,>30
101762,443847782,74694222,AfricanAmerican,Female,4,1,5,MC,9,No,Yes,NO
101763,443854148,41088789,Caucasian,Male,4,1,7,MC,13,Ch,Yes,NO


In [5]:
# Recode the two text flags to booleans:
#   "diabetesMed": "Yes" -> True, "No" -> False
#   "change":      "Ch"  -> True, "No" -> False
# Any other value has no entry in the mapping and therefore becomes NaN.
for flag_column, true_label in (('diabetesMed', 'Yes'), ('change', 'Ch')):
    filtered_data[flag_column] = filtered_data[flag_column].map({true_label: True, 'No': False})


filtered_data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age_group,admission_type_id,admission_source_id,payer_code,number_diagnoses,change,diabetesMed,readmitted
20446,72091308,20123568,Caucasian,Female,4,1,7,MC,9,True,True,NO
20737,72848634,20377854,Caucasian,Female,4,2,1,MC,6,False,True,NO
20824,73062156,20408121,Caucasian,Female,4,1,7,MC,6,False,True,NO
21083,73731852,20542797,Caucasian,Male,4,1,7,MC,6,True,True,NO
23668,80820942,20514150,Caucasian,Female,4,2,1,MC,4,True,True,<30


In [6]:
# Payer code

# Distinct payer codes present in the column before it is recoded below.
payer_codes = filtered_data['payer_code'].unique()

# Define the payer code categories: 1 = self pay, 2 = mid class insurance, 3 = expensive/premium
# The categories are stored as the strings '1', '2' and '3'.
payer_code_categories = {
    'MC': '2',
    'MD': '2',
    'HM': '2',
    'UN': '2',
    'BC': '2',
    'SP': '1',
    'CP': '2',
    'SI': '2',
    'DM': '3',
    'CM': '3',
    'CH': '3',
    'PO': '2',
    'WC': '2',
    'OT': '2',
    'OG': '2',
    'MP': '3',
    'FR': '2'
}

# Recode the whole column in one vectorized pass. Codes that are not keys of the
# mapping are left unchanged. Doing this without a loop also keeps
# `payer_code_categories` bound to the mapping: the earlier loop reused that
# name as its loop variable, which left it holding the last key ('FR').
filtered_data['payer_code'] = filtered_data['payer_code'].replace(payer_code_categories)


In [7]:

# Create the collapsed_data DataFrame: every encounter of the patients who appear
# more than once in filtered_data, plus a flag derived from each such patient's
# second encounter. Patients with a single encounter are left out.

# Position of each encounter among its patient's encounters (0 = first,
# 1 = second, ...), following the row order of filtered_data.
visit_rank = filtered_data.groupby('patient_nbr').cumcount()

# `readmitted` value of the second encounter, indexed by patient_nbr. A patient
# has an entry here exactly when they have at least two encounters.
second_visit_readmitted = (
    filtered_data.loc[visit_rank == 1]
    .set_index('patient_nbr')['readmitted']
)

# Keep all encounters of the repeat patients. `.copy()` makes the result an
# independent frame, so adding a column below does not touch filtered_data.
is_repeat_patient = filtered_data['patient_nbr'].isin(second_visit_readmitted.index)
collapsed_data = filtered_data.loc[is_repeat_patient].copy()

# readmitted_less_than_30? (based on the second visit)
# The flag name says "less than 30", so it has to match the '<30' label. The
# earlier comparison against '>30' set the flag for the opposite label.
collapsed_data['readmitted_less_than_30'] = (
    collapsed_data['patient_nbr'].map(second_visit_readmitted).eq('<30')
)

# NOTE(review): the file name is spelled 'collased'; it is kept unchanged in case
# other code reads this exact path. Confirm before renaming.
collapsed_data.to_csv('./data/collased_data.csv')

# Last expression of the cell, so the preview is actually rendered.
collapsed_data.head()