In [1]:
# 01_build_uci_cohort.ipynb
# Build processed tabular cohort from diabetic_data.csv

import os
import numpy as np
import pandas as pd

RAW_PATH = "../data_raw/diabetic_data.csv"
OUT_PATH = "../data_processed/admissions_features.csv"

print("Loading raw data from:", RAW_PATH)
df_raw = pd.read_csv(RAW_PATH)
print("Raw shape:", df_raw.shape)
df_raw.head()

Loading raw data from: ../data_raw/diabetic_data.csv
Raw shape: (25000, 17)


Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,diag_2,diag_3,glucose_test,A1Ctest,change,diabetes_med,readmitted
0,[70-80),8,72,1,18,2,0,0,Missing,Circulatory,Respiratory,Other,no,no,no,yes,no
1,[70-80),3,34,2,13,0,0,0,Other,Other,Other,Other,no,no,no,yes,no
2,[50-60),5,45,0,18,0,0,0,Missing,Circulatory,Circulatory,Circulatory,no,no,yes,yes,yes
3,[70-80),2,36,0,12,1,0,0,Missing,Circulatory,Other,Diabetes,no,no,yes,yes,yes
4,[60-70),1,42,0,7,0,0,0,InternalMedicine,Other,Circulatory,Respiratory,no,no,no,yes,no


In [2]:
# Basic cleaning and label creation

df = df_raw.copy()

# Sometimes discharge_disposition_id codes for death/hospice are removed.
# If you want to keep it simple, we keep all for now. If you want to match
# common practice, uncomment the next block:

# codes_death_hospice = [11, 13, 14, 19, 20, 21]
# df = df[~df["discharge_disposition_id"].isin(codes_death_hospice)]

# 2. Create binary label: any readmission
df["label"] = (df["readmitted"].astype(str).str.lower() == "yes").astype(int)

print("Label positive rate:", df["label"].mean())
print("Cleaned shape:", df.shape)
df[["readmitted", "label"]].head()

Label positive rate: 0.47016
Cleaned shape: (25000, 18)


Unnamed: 0,readmitted,label
0,no,0
1,no,0
2,yes,1
3,yes,1
4,no,0


In [3]:
# Helper functions for feature engineering

def map_age_bucket(age_str: str) -> int:
    """
    Map age ranges like '[40-50)' to an integer bucket 0..9.
    If unknown, return -1.
    """
    if isinstance(age_str, str) and age_str.startswith("[") and age_str.endswith(")"):
        # age ranges in this dataset are like [0-10), [10-20), ..., [90-100)
        try:
            low = int(age_str[1:].split("-")[0])
            # each bin is 10 years wide
            return low // 10
        except Exception:
            return -1
    return -1


def map_diag_to_group(code) -> str:
    """
    Map ICD-9 diagnosis code (as string) to a coarse group.
    This is a simplified version of groupings used in prior work.
    """
    if pd.isna(code) or code == "?":
        return "missing"

    # codes in this dataset can be '250.83', '401', 'V57', 'E879', etc.
    s = str(code)
    # handle V and E codes as separate groups
    if s.startswith("V"):
        return "supplemental"
    if s.startswith("E"):
        return "injury_external"

    try:
        val = float(s)
    except ValueError:
        return "other"

    # diabetes
    if 250 <= val < 251:
        return "diabetes"
    # circulatory
    if (390 <= val <= 459) or (val == 785):
        return "circulatory"
    # respiratory
    if (460 <= val <= 519) or (val == 786):
        return "respiratory"
    # digestive
    if (520 <= val <= 579) or (val == 787):
        return "digestive"
    # injury/poisoning
    if 800 <= val <= 999:
        return "injury"
    # musculoskeletal
    if 710 <= val <= 739:
        return "musculoskeletal"
    # genitourinary
    if 580 <= val <= 629:
        return "genitourinary"
    # neoplasms
    if 140 <= val <= 239:
        return "neoplasm"

    return "other"


def encode_a1c(result: str) -> int:
    """
    Ordinal encoding for A1Cresult.
    Values in dataset: 'None', 'Norm', '>7', '>8' (sometimes slightly different).
    We'll map to 0..3
    """
    if result == "None":
        return 0
    if result == "Norm":
        return 1
    if result == ">7":
        return 2
    if result == ">8":
        return 3
    return 0  # default for '?', etc.

In [4]:
# Medication feature engineering

def build_medication_features(df):
    """
    Build features from diabetic_data.csv medication fields:
    - diabetes_med (Yes/No)
    - change (Up, Down, Steady, Ch, No)
    """
    df_feat = df.copy()

    # Binary: is the patient on diabetes medication?
    df_feat["on_diabetes_med"] = (df["diabetes_med"] == "Yes").astype(int)

    # Handle change column categories
    change_col = df["change"].fillna("No")

    df_feat["med_up"] = (change_col == "Up").astype(int)
    df_feat["med_down"] = (change_col == "Down").astype(int)
    df_feat["med_steady"] = (change_col == "Steady").astype(int)
    df_feat["med_changed"] = (change_col == "Ch").astype(int)
    df_feat["med_no_change"] = (change_col == "No").astype(int)

    return df_feat

In [5]:
# Apply medication features

df_feat = build_medication_features(df)

print("Shape after medication features:", df_feat.shape)
df_feat.head()

Shape after medication features: (25000, 24)


Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,...,change,diabetes_med,readmitted,label,on_diabetes_med,med_up,med_down,med_steady,med_changed,med_no_change
0,[70-80),8,72,1,18,2,0,0,Missing,Circulatory,...,no,yes,no,0,0,0,0,0,0,0
1,[70-80),3,34,2,13,0,0,0,Other,Other,...,no,yes,no,0,0,0,0,0,0,0
2,[50-60),5,45,0,18,0,0,0,Missing,Circulatory,...,yes,yes,yes,1,0,0,0,0,0,0
3,[70-80),2,36,0,12,1,0,0,Missing,Circulatory,...,yes,yes,yes,1,0,0,0,0,0,0
4,[60-70),1,42,0,7,0,0,0,InternalMedicine,Other,...,no,yes,no,0,0,0,0,0,0,0


In [6]:
# Choose columns for the modeling table

label_col = "label"

num_cols = [
    "time_in_hospital",
    "n_lab_procedures",
    "n_procedures",
    "n_medications",
    "n_outpatient",
    "n_inpatient",
    "n_emergency",
    "on_diabetes_med",
    "med_up",
    "med_down",
    "med_steady",
    "med_changed",
    "med_no_change",
]

cat_cols = [
    "age",
    "medical_specialty",
    "diag_1",
    "diag_2",
    "diag_3",
    "glucose_test",
    "A1Ctest",
]

# Filter to columns that actually exist (in case some are missing)
num_cols = [c for c in num_cols if c in df_feat.columns]
cat_cols = [c for c in cat_cols if c in df_feat.columns]

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)

Numeric columns: ['time_in_hospital', 'n_lab_procedures', 'n_procedures', 'n_medications', 'n_outpatient', 'n_inpatient', 'n_emergency', 'on_diabetes_med', 'med_up', 'med_down', 'med_steady', 'med_changed', 'med_no_change']
Categorical columns: ['age', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'glucose_test', 'A1Ctest']


In [7]:
# One-hot encode categoricals and assemble final feature table

df_num = df_feat[num_cols + [label_col]].copy()

# Convert categoricals to string and one-hot encode
df_cat = pd.get_dummies(
    df_feat[cat_cols].astype("category"),
    drop_first=False,
    dummy_na=True,
)

df_processed = pd.concat([df_num, df_cat], axis=1)

print("Processed shape:", df_processed.shape)
df_processed.head()

Processed shape: (25000, 64)


Unnamed: 0,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,on_diabetes_med,med_up,med_down,...,diag_3_Respiratory,diag_3_nan,glucose_test_high,glucose_test_no,glucose_test_normal,glucose_test_nan,A1Ctest_high,A1Ctest_no,A1Ctest_normal,A1Ctest_nan
0,8,72,1,18,2,0,0,0,0,0,...,False,False,False,True,False,False,False,True,False,False
1,3,34,2,13,0,0,0,0,0,0,...,False,False,False,True,False,False,False,True,False,False
2,5,45,0,18,0,0,0,0,0,0,...,False,False,False,True,False,False,False,True,False,False
3,2,36,0,12,1,0,0,0,0,0,...,False,False,False,True,False,False,False,True,False,False
4,1,42,0,7,0,0,0,0,0,0,...,True,False,False,True,False,False,False,True,False,False


In [8]:
# Save processed cohort

out_path = "../data_processed/admissions_features.csv"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
df_processed.to_csv(out_path, index=False)

print("Saved processed cohort to:", out_path)

Saved processed cohort to: ../data_processed/admissions_features.csv
