In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import os

from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

from sklearn.model_selection import train_test_split

import sys
sys.path.append(os.path.abspath('..'))

from util import evaluate_model_performance

In [2]:
data_path = Path(os.getcwd()).parent.parent / "data" / "dataset_diabetes"
df = pd.read_csv(data_path / "diabetic_preprocessed.csv")

In [3]:
df["age"] = df["age_all"]

columns_to_remove = ['encounter_id', 'patient_nbr', 'readmitted', 'readmit_binary', 'diabetes_type', \
    'had_emergency', 'had_inpatient_days', 'had_outpatient_days', 'race_all', 'age_all']

df_for_experimenting = df.drop(columns=columns_to_remove)

In [4]:
target_variable = "readmit_30_days"
Y= df_for_experimenting.loc[:, target_variable]
X = pd.get_dummies(df_for_experimenting.drop(columns=["readmit_30_days"]))

In [5]:
X.head() # sanity check

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_AfricanAmerican,race_Caucasian,...,glimepiride-pioglitazone_No,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_No,metformin-rosiglitazone_Steady,metformin-pioglitazone_No,metformin-pioglitazone_Steady,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes
0,1,41,0,1,0,0,0,1,0,1,...,1,0,1,0,1,0,0,1,1,0
1,3,59,0,18,0,0,0,9,0,1,...,1,0,1,0,1,0,1,0,0,1
2,2,11,5,13,2,0,1,6,1,0,...,1,0,1,0,1,0,0,1,0,1
3,2,44,1,16,0,0,0,7,0,1,...,1,0,1,0,1,0,1,0,0,1
4,1,51,0,8,0,0,0,5,0,1,...,1,0,1,0,1,0,1,0,0,1


In [6]:
random_seed = 445
np.random.seed(random_seed)

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    stratify=Y,
    random_state=random_seed
)

# Auto-balanced classifiers

### Logistic regression

In [7]:
lr_auto_balanced = LogisticRegression(solver='newton-cg', class_weight="balanced")
lr_auto_balanced.fit(X_train, Y_train)

# Predicting on the test data
pred_test_lr = lr_auto_balanced.predict(X_test)
evaluate_model_performance(Y_test, pred_test_lr)

The accuracy score for the testing data: 0.6449172112219329
The balanced accuracy score for the testing data: 0.6010993341113342
The precision score for the testing data: 0.1664872139973082
The recall score for the testing data: 0.5446939674152356
The F1 score for the testing data: 0.2550252551283373
The F2 score for the testing data: 0.3745307012232045
The G mean score for the testing data: 0.5984470269597536


### Decision tree

In [8]:
tree_auto_balanced = DecisionTreeClassifier(class_weight="balanced")
tree_auto_balanced.fit(X_train, Y_train)

# Predicting on the test data
pred_test_tree = tree_auto_balanced.predict(X_test)
evaluate_model_performance(Y_test, pred_test_tree)

The accuracy score for the testing data: 0.7996364172357884
The balanced accuracy score for the testing data: 0.5253067499976013
The precision score for the testing data: 0.15102356122054847
The recall score for the testing data: 0.17217084984588288
The F1 score for the testing data: 0.1609053497942387
The F2 score for the testing data: 0.16748051057997088
The G mean score for the testing data: 0.38889872411346116


### Perceptron

In [9]:
perceptron_auto_balanced = Perceptron(class_weight="balanced")
perceptron_auto_balanced.fit(X_train, Y_train)

# Predicting on the test data
pred_test_perceptron = perceptron_auto_balanced.predict(X_test)
evaluate_model_performance(Y_test, pred_test_perceptron)

The accuracy score for the testing data: 0.44018080872598636
The balanced accuracy score for the testing data: 0.5859823181357241
The precision score for the testing data: 0.13903616364643506
The recall score for the testing data: 0.7736679876706297
The F1 score for the testing data: 0.23571236919774616
The F2 score for the testing data: 0.4044473090557525
The G mean score for the testing data: 0.5551120306919597


### SVM (linear kernel)

In [10]:
svm_auto_balanced = LinearSVC(class_weight="balanced")
svm_auto_balanced.fit(X_train, Y_train)

# Predicting on the test data
pred_test_svm = svm_auto_balanced.predict(X_test)
evaluate_model_performance(Y_test, pred_test_svm)



The accuracy score for the testing data: 0.8868962806465878
The balanced accuracy score for the testing data: 0.5076134767633002
The precision score for the testing data: 0.3697478991596639
The recall score for the testing data: 0.019374724790841038
The F1 score for the testing data: 0.03682008368200837
The F2 score for the testing data: 0.023905248288601542
The G mean score for the testing data: 0.1389041499167718
