In [7]:
%load_ext autoreload
%autoreload 2 

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from cleaning import load_and_clean

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# load data
data = pd.read_csv('data/diabetic_data.csv')

# clean data
data_clean = load_and_clean(data)

# set X and y
X = data_clean.drop(['readmit30', 'diag_1', 'diag_2', 'diag_3', 'diag_1_clean', 'diag_2_clean', 'diag_3_clean'], axis=1)  # Features
y = data_clean['readmit30']  # Target variable

X

Unnamed: 0,race,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,...,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,total_previous_visits,female,diag_1_ccs,diag_2_ccs,diag_3_ccs
0,Caucasian,5,6,25,1,1,41,0,1,0,...,No,No,No,No,No,0,1,Endocrine; nutritional; and metabolic diseases...,,
1,Caucasian,15,1,1,7,3,59,0,18,0,...,No,No,No,Ch,Yes,0,1,Endocrine; nutritional; and metabolic diseases...,Endocrine; nutritional; and metabolic diseases...,Endocrine; nutritional; and metabolic diseases...
2,AfricanAmerican,25,1,1,7,2,11,5,13,2,...,No,No,No,No,Yes,1,1,Mental Illness,Endocrine; nutritional; and metabolic diseases...,Complications of pregnancy; childbirth; and th...
3,Caucasian,35,1,1,7,2,44,1,16,0,...,No,No,No,Ch,Yes,0,0,,Endocrine; nutritional; and metabolic diseases...,Diseases of the circulatory system
4,Caucasian,45,1,1,7,1,51,0,8,0,...,No,No,No,Ch,Yes,0,0,Neoplasms,Neoplasms,Endocrine; nutritional; and metabolic diseases...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,AfricanAmerican,75,1,3,7,3,51,0,16,0,...,No,No,No,Ch,Yes,0,0,Endocrine; nutritional; and metabolic diseases...,Mental Illness,Diseases of the circulatory system
101762,AfricanAmerican,85,1,4,5,5,33,3,18,0,...,No,No,No,No,Yes,1,1,Diseases of the digestive system,Endocrine; nutritional; and metabolic diseases...,Diseases of the digestive system
101763,Caucasian,75,1,1,7,1,53,0,9,1,...,No,No,No,Ch,Yes,0,0,,Diseases of the genitourinary system,Mental Illness
101764,Caucasian,85,2,3,7,10,45,2,21,0,...,No,No,No,Ch,Yes,1,1,Injury and poisoning,Diseases of the blood and blood-forming organs,Injury and poisoning


In [19]:
X = data_clean[['num_inpatient', 'diag_1_ccs', 'discharge_disposition_id', 'num_lab_procedures', 'time_in_hospital', 'number_diagnoses', 'age', 'race']]
y = data_clean['readmit30']

for col in X.select_dtypes(include=['object']).columns:
    X[col] = X[col].astype('category')

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create DMatrix objects (XGBoost's optimized data structure)
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

KeyError: "['num_inpatient'] not in index"

In [18]:
params = {
    'objective': 'binary:logistic',  # for logistic regression
    'eval_metric': 'auc',        # log loss for binary classification
    'eta': 0.1,                      # learning rate
    'max_depth': 3,                  # shallow trees for more linear model
    'subsample': 0.8,                # use 80% of data per tree
    'colsample_bytree': 0.8,         # use 80% of features per tree
    'lambda': 1.0,                   # L2 regularization
    'alpha': 0.0                     # L1 regularization
}

# Train the model
num_rounds = 100
model = xgb.train(
    params,
    dtrain,
    num_rounds,
    evals=[(dtrain, 'train'), (dtest, 'test')],
    early_stopping_rounds=10,
    verbose_eval=10
)

# Make predictions
preds_proba = model.predict(dtest)
preds = [1 if p > 0.5 else 0 for p in preds_proba]

# Evaluate the model
accuracy = accuracy_score(y_test, preds)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, preds))

# Feature importance
importance = model.get_score(importance_type='gain')
importance = sorted(importance.items(), key=lambda x: x[1], reverse=True)
print("Feature importance:")
for feature, score in importance:
    print(f"{feature}: {score}")

[0]	train-auc:0.58502	test-auc:0.59351
[10]	train-auc:0.65470	test-auc:0.65505
[20]	train-auc:0.66141	test-auc:0.65890
[30]	train-auc:0.66308	test-auc:0.65949
[40]	train-auc:0.66779	test-auc:0.66324
[50]	train-auc:0.67081	test-auc:0.66568
[60]	train-auc:0.67476	test-auc:0.66843
[70]	train-auc:0.67713	test-auc:0.67033
[80]	train-auc:0.67920	test-auc:0.67137
[90]	train-auc:0.68057	test-auc:0.67215
[99]	train-auc:0.68147	test-auc:0.67228
Accuracy: 0.8879
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     18069
           1       0.61      0.00      0.01      2285

    accuracy                           0.89     20354
   macro avg       0.75      0.50      0.48     20354
weighted avg       0.86      0.89      0.84     20354

Feature importance:
total_previous_visits: 75.20158386230469
discharge_disposition_id: 25.652450561523438
diag_1_ccs: 10.274768829345703
time_in_hospital: 10.13017463684082
number_diagnoses: 10.097326278686523
age: 7.