In [8]:
import os
import random
import torch
import pickle
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from dnamite.models import DNAMiteBinaryClassifier, DNAMiteRegressor
from sklearn.metrics import roc_auc_score, confusion_matrix,  ConfusionMatrixDisplay, mean_squared_error, r2_score
from collections import defaultdict
import cleaning

df = pd.read_csv("./data/diabetic_data.csv")
ccs_mapping = pd.read_csv("./data/ccs_multi_dx_tool_2015.csv", dtype = str)
df = cleaning.load_and_clean(df)

In [9]:
# Map CCS groups
ccs_mapping = pd.read_csv("./data/ccs_multi_dx_tool_2015.csv", dtype = str)
# Clean CCS mapping column names and values
ccs_mapping.columns = ccs_mapping.columns.str.strip("'").str.strip()
ccs_mapping = ccs_mapping.rename(columns={
    "ICD-9-CM CODE": "icd9_code",
    "CCS LVL 1": "ccs_level_1",
    "CCS LVL 1 LABEL": "ccs_description"
})
ccs_mapping["icd9_code"] = ccs_mapping["icd9_code"].str.strip("'").str.strip()
# SRemove dots from ICD-9 codes
for col in ["diag_1", "diag_2", "diag_3"]:
    df[f"{col}_clean"] = df[col].str.replace(".", "", regex=False)

# Build prefix-based lookup table for fallback matching
prefix_map = defaultdict(lambda: None)
for code, desc in zip(ccs_mapping["icd9_code"], ccs_mapping["ccs_description"]):
    for i in range(3, len(code) + 1):  # Use prefixes starting at 3 digits
        prefix = code[:i]
        if prefix not in prefix_map:
            prefix_map[prefix] = desc

# Matching function using longest prefix available
def map_icd_to_ccs(code):
    if pd.isna(code):
        return None
    for i in range(len(code), 2, -1):
        prefix = code[:i]
        if prefix in prefix_map:
            return prefix_map[prefix]
    return None

# Map each diagnosis column to CCS group using prefix match
for col in ["diag_1_clean", "diag_2_clean", "diag_3_clean"]:
    df[col.replace("_clean", "_ccs")] = df[col].map(map_icd_to_ccs)

df = df.drop(['diag_1', 'diag_2', 'diag_3', 'diag_1_clean', 'diag_2_clean', 'diag_3_clean'], axis = 1)

In [34]:
# Automatically determine categorical and numerical features
categorical_features = df.select_dtypes(include=["object", "category"]).columns.tolist()
numerical_features = df.select_dtypes(include=["number"]).columns.tolist()

print("Categorical Features:", categorical_features)
print("Numerical Features:", numerical_features)


# Remove the target variable (if included in the dataset)
target_variable = "readmit30"
# categorical_features.remove(target_variable)
    
# Define the ColumnTransformer
one_hot_encoder = OneHotEncoder(sparse_output=False)
scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", scaler, numerical_features),   # Scale numerical features
        ("cat", one_hot_encoder, categorical_features)  # One-hot encode categorical features
    ]
)

print(df.columns)

# Separate features and target
X = df.drop([target_variable], axis=1)
y = df[target_variable]

X = df[['race', 'age', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 
        'num_lab_procedures', 'num_procedures', 'num_medications', 'number_emergency', 'number_inpatient', 'number_diagnoses',
        'max_glu_serum', 'A1Cresult', 'change', 'diabetesMed', 'total_previous_visits', 'female', 'diag_1_ccs', 'diag_2_ccs', 'diag_3_ccs']]
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

Categorical Features: ['race', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'diag_1_ccs', 'diag_2_ccs', 'diag_3_ccs']
Numerical Features: ['age', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'total_previous_visits', 'readmit30', 'female']
Index(['race', 'age', 'admission_type_id', 'discharge_disposition_id',
       'admission_source_id', 'time_in_hospital', 'num_lab_procedures',
       'num_procedures', 'num_medica

In [35]:
print(X_train.shape)

(80091, 21)


In [36]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_bc = DNAMiteBinaryClassifier(
    device=device,
    random_state=34
)
model_bc.fit(X_train, y_train)

In [None]:
#Predict outputs
outputs_bc = model_bc.predict_proba(X_test)
predicted_bc = outputs_bc > 0.5

#Calculate accuracy
total = y_test.shape[0]
correct_bc = (predicted_bc == y_test).sum().item()
accuracy_bc = correct_bc / total

print('Accuracy for simple binary classification: ', accuracy_bc)

print('\n')
#Calculate AUC score
roc_auc_bc = roc_auc_score(y_test, outputs_bc)
print('ROC AUC for simple binary classification:', roc_auc_bc)

conf_mat_bc = confusion_matrix(y_test, predicted_bc)

# Plot confusion matrix
fig, ax = plt.subplots(1, 2)
ax[0].set_title("Simple Binary Classification")
ConfusionMatrixDisplay(confusion_matrix=conf_mat_bc).plot(cmap='Blues', values_format='d', ax=ax[0])
plt.tight_layout()
plt.show()