In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the three raw CSV inputs in one pass:
#   data         - the encounter table that the rest of the notebook cleans
#   mapping      - lookup table used later via its 'mapping_type', 'id' and
#                  'description' columns
#   ICD9_mapping - loaded here but not referenced by any cell visible in this notebook
data, mapping, ICD9_mapping = map(
    pd.read_csv,
    (
        'uncleaned_data/diabetic_data.csv',
        'uncleaned_data/Full_ID_Mappings.csv',
        'uncleaned_data/ICD9_Encounter_Data.csv',
    ),
)

In [3]:
# Treat the literal "?" as a missing-value marker and turn it into a real NaN
# so the fillna calls that follow can see it.
data = data.replace(to_replace="?", value=np.nan)

In [4]:
# Per-column replacement for missing values:
#   race                       -> most frequent observed value
#   payer / specialty / diag_* -> the explicit label 'unknown'
#   lab results                -> 'Not Taken'
fill_values = {
    'race': data['race'].mode()[0],
    'payer_code': 'unknown',
    'medical_specialty': 'unknown',
    'diag_1': 'unknown',
    'diag_2': 'unknown',
    'diag_3': 'unknown',
    'max_glu_serum': 'Not Taken',
    'A1Cresult': 'Not Taken',
}
data = data.fillna(value=fill_values)

# Remove the two identifier columns and the 'weight' column.
data = data.drop(columns=['encounter_id', 'patient_nbr', 'weight'])

In [None]:
# For every column, find the Python type that occurs most often among its
# values and cast the whole column to that type.
# NOTE(review): this cell carries no execution count (`In [None]`) in the saved
# notebook, so it may never have been run -- confirm it is meant to be part of
# the pipeline.
dominant_types = {
    name: data[name].apply(type).mode()[0]
    for name in data.columns
}
data = data.astype(dominant_types)

In [6]:
# Snapshot the cleaned (not yet encoded) table, without the index column.
# The 'EDA_Data' directory must already exist.
data.to_csv(path_or_buf='EDA_Data/cleaned_data.csv', index=False)

In [7]:
# Detach the label column: `pop` returns it and removes it from `data` in one
# step, so the feature transforms below never touch the target.
target = data.pop('readmitted')

In [8]:
# Columns whose name appears as a 'mapping_type' in the lookup table hold ids;
# replace each id with its textual description. Ids that have no entry in the
# lookup become NaN (standard Series.map behaviour).
known_types = mapping['mapping_type'].unique()

for col in [name for name in data.columns if name in known_types]:
    rows_for_col = mapping[mapping['mapping_type'].eq(col)]
    id_to_description = rows_for_col.set_index('id')['description']
    data[col] = data[col].map(id_to_description)

In [9]:
def diagnosis_entry(code):
    """Collapse a raw diagnosis code into a coarse diagnosis group.

    The code is converted to its 3-digit numeric category (the integer part,
    so sub-classification digits such as the ``.83`` in ``250.83`` are
    ignored) and then bucketed by range.

    Parameters
    ----------
    code : str or number
        Raw diagnosis code. Surrounding whitespace is ignored.

    Returns
    -------
    str
        One of "Circulatory", "Respiratory", "Digestive", "Diabetes",
        "Injury", "Musculoskeletal", "Genitourinary", "Neoplasms" or "Other".
        Anything that is not a finite number (alphabetic codes, the
        'unknown' placeholder, NaN, inf) is reported as "Other".
    """
    s = str(code).strip()

    try:
        # Truncate to the integer category so that decimal sub-codes such as
        # "459.9" or "785.59" stay in the same group as their parent code
        # instead of slipping past the range boundaries. int() also rejects
        # nan (ValueError) and inf (OverflowError).
        category = int(float(s))
    except (ValueError, OverflowError):
        return "Other"

    if (390 <= category <= 459) or category == 785:
        return "Circulatory"
    elif (460 <= category <= 519) or category == 786:
        return "Respiratory"
    elif (520 <= category <= 579) or category == 787:
        return "Digestive"
    elif category == 250:
        # Covers "250" and every 250.xx sub-code.
        return "Diabetes"
    elif 800 <= category <= 999:
        return "Injury"
    elif 710 <= category <= 739:
        return "Musculoskeletal"
    elif (580 <= category <= 629) or category == 788:
        return "Genitourinary"
    elif 140 <= category <= 239:
        return "Neoplasms"
    else:
        return "Other"

# Replace the raw codes in each of the three diagnosis columns with their
# coarse diagnosis group, one column at a time.
diag_cols = ['diag_1', 'diag_2', 'diag_3']
for diag_col in diag_cols:
    data[diag_col] = data[diag_col].map(diagnosis_entry)

In [10]:
# After the id -> description mapping, several placeholder values all mean
# "no usable information". Fold them into a single label per column.
# NOTE(review): the label is spelled 'Unkonwn' (probably intended 'Unknown').
# It is kept as-is because it ends up in the one-hot column names written to
# the output CSVs; rename only together with whatever consumes those files.
placeholders_by_column = {
    'admission_type_id': (np.nan, 'Not Mapped', 'Not Available'),
    'discharge_disposition_id': (np.nan, 'Not Mapped'),
    'admission_source_id': (np.nan, 'Not Mapped', 'Not Available'),
}

for id_col, placeholders in placeholders_by_column.items():
    for placeholder in placeholders:
        data[id_col] = data[id_col].replace(placeholder, 'Unkonwn')

In [11]:
# One-hot encoding for categorical variables from scratch
#
# Every object-dtype column is replaced by one 0/1 indicator column per
# distinct value, named "<column>_<value>" with spaces and slashes in the
# value turned into underscores.

cat_slice = data.select_dtypes(include=['object']).copy()
orig_cols = cat_slice.columns.tolist()

dummy_data = {}
for col in orig_cols:
    values = cat_slice[col]
    for val in values.unique():
        safe_val = str(val).replace(' ', '_').replace('/', '_')
        new_col = f"{col}_{safe_val}"

        # Vectorised comparison instead of a Python-level pass over every row
        # for every category. NaN never compares equal to itself, so a missing
        # category has to be matched with isna(), otherwise its indicator
        # column would be all zeros.
        matches = values.isna() if pd.isna(val) else values.eq(val)
        dummy_data[new_col] = matches.astype('int64')

dummy_df = pd.DataFrame(dummy_data, index=cat_slice.index)

# The categorical copy is no longer needed once the indicators exist.
cat_slice.drop(columns=orig_cols, inplace=True)

# Swap the original categorical columns for their indicator columns.
data = pd.concat(
    [data.drop(columns=orig_cols), dummy_df],
    axis=1
)

In [12]:
# Put the label column back next to the encoded features (aligned on the index).
data = pd.concat(objs=[data, target], axis="columns")

In [13]:
# Positional split: the first `n` rows form the training set, everything after
# them the test set. Both are independent copies.
# NOTE(review): rows are not shuffled, so the split depends on the row order
# of the input file -- confirm that is intended.
n = 70000
training_data, testing_data = data.iloc[:n].copy(), data.iloc[n:].copy()

In [14]:
# Write both splits without the index column. The 'model_ready_data'
# directory must already exist.
for frame, out_path in (
    (training_data, 'model_ready_data/training_data.csv'),
    (testing_data, 'model_ready_data/testing_data.csv'),
):
    frame.to_csv(out_path, index=False)