In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import os

from imblearn.over_sampling import SMOTENC
from collections import Counter

from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB

import umap

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
import hypertools as hyp

import sys
sys.path.append(os.path.abspath('..'))

from util import evaluate_model_performance, evaluate_model_fairness

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [2]:
data_path = Path(os.getcwd()).parent.parent / "data" / "dataset_diabetes"
df = pd.read_csv(data_path / "diabetic_preprocessed.csv")

In [3]:
df["age"] = df["age_all"]

columns_to_remove = ['encounter_id', 'patient_nbr', 'readmitted', 'readmit_binary', 'diabetes_type', \
    'had_emergency', 'had_inpatient_days', 'had_outpatient_days', 'race_all', 'age_all']

df_for_experimenting = df.drop(columns=columns_to_remove)

In [4]:
data_path_write = Path(os.getcwd()).parent / "fawos" / "FAWOS" / "datasets" / "diabetes"
df_for_experimenting.to_csv(data_path_write / "raw_dataset.csv")

In [5]:
target_variable = "readmit_30_days"
Y = df_for_experimenting.loc[:, target_variable]
X = df_for_experimenting.drop(columns=[target_variable])

In [6]:
X.head() # sanity check

Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
0,Caucasian,Female,[0-10),Unknown,Other,Other,Referral,1,Unknown,Other,...,No,No,No,No,No,No,No,No,No,No
1,Caucasian,Female,[10-20),Unknown,Emergency,Discharged to Home,Emergency,3,Unknown,Missing,...,No,No,Up,No,No,No,No,No,Ch,Yes
2,AfricanAmerican,Female,[20-30),Unknown,Emergency,Discharged to Home,Emergency,2,Unknown,Missing,...,No,No,No,No,No,No,No,No,No,Yes
3,Caucasian,Male,[30-40),Unknown,Emergency,Discharged to Home,Emergency,2,Unknown,Missing,...,No,No,Up,No,No,No,No,No,Ch,Yes
4,Caucasian,Male,[40-50),Unknown,Emergency,Discharged to Home,Emergency,1,Unknown,Missing,...,No,No,Steady,No,No,No,No,No,Ch,Yes


## Oversampling - SMOTENC

In [7]:
categorical_features = ['race', 'gender', 'weight', 'age', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', \
'payer_code', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', \
'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', \
'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone','tolazamide', 'examide', 'citoglipton', 'insulin','glyburide-metformin', \
'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed']

col_idx_mapping = zip(df_for_experimenting.columns, range(len(df_for_experimenting.columns)))
col_idx_filtered = list(filter(lambda x: x[0] in categorical_features, col_idx_mapping))
idx_filtered = list(map(lambda x: x[1], col_idx_filtered))

In [8]:
sm = SMOTENC(random_state=42, categorical_features=list(idx_filtered))
X_res_before_onehot, Y_res = sm.fit_resample(X, Y)
print(f'Resampled dataset samples per class {Counter(Y_res)}')

Resampled dataset samples per class Counter({False: 90406, True: 90406})


In [9]:
#one-hot encoding for the categorical features
X_res = pd.get_dummies(X_res_before_onehot)

In [10]:
random_seed = 445
np.random.seed(random_seed)

X_train_res_before_onehot, X_test_res_before_onehot, X_train_res, X_test_res, Y_train_res, Y_test_res = train_test_split(
    X_res_before_onehot,
    X_res,
    Y_res,
    test_size=0.20,
    stratify=Y_res,
    random_state=random_seed
)

### Logistic regression

In [11]:
lr_res = LogisticRegression(solver='newton-cg')
lr_res.fit(X_train_res, Y_train_res)

# Predicting on the test data
lr_pred_test_res = lr_res.predict(X_test_res)
evaluate_model_performance(Y_test_res, lr_pred_test_res)
evaluate_model_fairness(Y_test_res, lr_pred_test_res, X_test_res_before_onehot['race'])

The balanced accuracy score for the testing data: 0.8063764645081966
The precision score for the testing data: 0.8016114982578397
The recall score for the testing data: 0.8142904545957306
The F1 score for the testing data: 0.8079012345679012
The F2 score for the testing data: 0.8117226779571316
The G mean score for the testing data: 0.8063376285859623
The Demographic parity difference score for the testing data: 0.5564906431178059
The Equalized odds difference score for the testing data: 0.800861784595085
The Equal opportunity difference score for the testing data: 0.8142904545957306


  _warn_prf(average, modifier, msg_start, len(result))


### Decision tree

In [12]:
tree_auto_balanced_res = DecisionTreeClassifier()
tree_auto_balanced_res.fit(X_train_res, Y_train_res)

# Predicting on the test data
tree_pred_test_res = tree_auto_balanced_res.predict(X_test_res)
evaluate_model_performance(Y_test_res, tree_pred_test_res)
evaluate_model_fairness(Y_test_res, tree_pred_test_res, X_test_res_before_onehot['race'])

The balanced accuracy score for the testing data: 0.82354802190668
The precision score for the testing data: 0.8035908878625915
The recall score for the testing data: 0.8564318106404158
The F1 score for the testing data: 0.829170347763232
The F2 score for the testing data: 0.8453149051845543
The G mean score for the testing data: 0.8228912448342858
The Demographic parity difference score for the testing data: 0.40160133531249487
The Equalized odds difference score for the testing data: 0.5630566551212272
The Equal opportunity difference score for the testing data: 0.8564318106404158


  _warn_prf(average, modifier, msg_start, len(result))


### Perceptron

In [13]:
perceptron_res = Perceptron()
perceptron_res.fit(X_train_res, Y_train_res)

# Predicting on the test data
perceptron_pred_test_res = perceptron_res.predict(X_test_res)
evaluate_model_performance(Y_test_res, perceptron_pred_test_res)
evaluate_model_fairness(Y_test_res, perceptron_pred_test_res, X_test_res_before_onehot['race'])

The balanced accuracy score for the testing data: 0.7650408801118094
The precision score for the testing data: 0.9056971133496995
The recall score for the testing data: 0.5916933967481474
The F1 score for the testing data: 0.7157718682053855
The F2 score for the testing data: 0.6357780392436507
The G mean score for the testing data: 0.7451430723382838
The Demographic parity difference score for the testing data: 0.3750042142881225
The Equalized odds difference score for the testing data: 0.6238248391885205
The Equal opportunity difference score for the testing data: 0.5916933967481474


  _warn_prf(average, modifier, msg_start, len(result))


### SVM (linear kernel)

In [14]:
svm_res = LinearSVC()
svm_res.fit(X_train_res, Y_train_res)

# Predicting on the test data
svm_pred_test_res = svm_res.predict(X_test_res)
evaluate_model_performance(Y_test_res, svm_pred_test_res)
evaluate_model_fairness(Y_test_res, svm_pred_test_res, X_test_res_before_onehot['race'])

The balanced accuracy score for the testing data: 0.6904225538285937
The precision score for the testing data: 0.6270992507289705
The recall score for the testing data: 0.9396084503926557
The F1 score for the testing data: 0.7521859435528501
The F2 score for the testing data: 0.8544472495750394
The G mean score for the testing data: 0.6438863966483235
The Demographic parity difference score for the testing data: 0.5483657281301697
The Equalized odds difference score for the testing data: 0.6200931881906646
The Equal opportunity difference score for the testing data: 0.9396084503926557


  _warn_prf(average, modifier, msg_start, len(result))


### Gaussian Naive Bayes

In [15]:
nbc = GaussianNB()
nbc.fit(X_train_res, Y_train_res)

# Predicting on the test data
nbc_pred_test_res = nbc.predict(X_test_res)
evaluate_model_performance(Y_test_res, nbc_pred_test_res)
evaluate_model_fairness(Y_test_res, nbc_pred_test_res, X_test_res_before_onehot['race'])

The balanced accuracy score for the testing data: 0.6232243909889918
The precision score for the testing data: 0.5736816002645065
The recall score for the testing data: 0.9595730560778675
The F1 score for the testing data: 0.7180665052662072
The F2 score for the testing data: 0.8457878779015922
The G mean score for the testing data: 0.5246696265427713
The Demographic parity difference score for the testing data: 0.8608610633491791
The Equalized odds difference score for the testing data: 0.9672810489856507
The Equal opportunity difference score for the testing data: 0.9595730560778675


  _warn_prf(average, modifier, msg_start, len(result))


### t-SNE visualization

In [16]:
tsne_embedding = TSNE(n_components=2, learning_rate=50, init='random', perplexity=50).fit_transform(pd.get_dummies(X))

In [17]:
tsne_embedding_oversampled_smotenc = TSNE(n_components=2, learning_rate=50, init='random', perplexity=50).fit_transform(X_res)

In [18]:
colors = ['#F3DC1B', '#F37A3F', '#EC1DF3', '#27C4F1', '#00F37A', "red", "blue", "purple", 'green', 'yellow', 'pink', 'cyan', 'magenta', 'orange', 'grey', 'black']

sns.set()
_, axes = plt.subplots(1, 2, figsize=(15, 8))

sns.scatterplot(
    ax=axes[0],
    x=tsne_embedding[:, 0], y=tsne_embedding[:, 1],
    hue=list(Y),
    data=tsne_embedding,
    legend="full",
    alpha=0.2,
    palette=colors
)

sns.scatterplot(
    ax=axes[1],
    x=tsne_embedding_oversampled_smotenc[:, 0], y=tsne_embedding_oversampled_smotenc[:, 1],
    hue=list(Y_res),
    data=tsne_embedding_oversampled_smotenc,
    legend="full",
    alpha=0.2,
    palette=colors
)

  sns.scatterplot(
  sns.scatterplot(


<Axes: >

### UMAP

In [19]:
reducer = umap.UMAP()

In [20]:
embedding_umap = reducer.fit_transform(pd.get_dummies(X))

In [21]:
embedding_umap_resampled = reducer.fit_transform(X_res)

In [22]:
sns.set()
_, axes = plt.subplots(1, 2, figsize=(15, 8))

sns.scatterplot(
    ax=axes[0],
    x=embedding_umap[:, 0], y=embedding_umap[:, 1],
    hue=list(Y),
    data=embedding_umap,
    legend="full",
    alpha=0.2
)

sns.scatterplot(
    ax=axes[1],
    x=embedding_umap_resampled[:, 0], y=embedding_umap_resampled[:, 1],
    hue=list(Y_res),
    data=embedding_umap_resampled,
    legend="full",
    alpha=0.2
)

<Axes: >

### PCA - 2d

In [23]:
pca = PCA(n_components=2, whiten=True) 
X_pca = pca.fit_transform(pd.get_dummies(X))
X_resampled_pca = pca.fit_transform(X_res)

In [24]:
sns.set()
_, axes = plt.subplots(1, 2, figsize=(15, 8))

sns.scatterplot(
    ax=axes[0],
    x=X_pca[:, 0], y=X_pca[:, 1],
    hue=list(Y),
    data=X_pca,
    legend="full",
    alpha=0.2
)

sns.scatterplot(
    ax=axes[1],
    x=X_resampled_pca[:, 0], y=X_resampled_pca[:, 1],
    hue=list(Y_res),
    data=X_resampled_pca,
    legend="full",
    alpha=0.2
)

<Axes: >

### PCA - 3d

In [25]:
pca = PCA(n_components=3, whiten=True) 
X_pca_3d = pca.fit_transform(pd.get_dummies(X))
X_resampled_pca_3d = pca.fit_transform(X_res)

In [26]:
fig = plt.figure(figsize = (10, 7))
ax = plt.axes(projection ="3d")
 
# Creating plot
ax.scatter3D(X_pca_3d[:, 0], X_pca_3d[:, 1], X_pca_3d[:, 2])
plt.title("simple 3D scatter plot")
 
# show plot
plt.show()

<Figure size 1500x800 with 2 Axes>

<Figure size 1500x800 with 2 Axes>

<Figure size 1500x800 with 2 Axes>

<Figure size 1000x700 with 1 Axes>

### Hypertools (should be cited)

In [27]:
X_reduced = hyp.reduce(x=X, reduce='IncrementalPCA', ndims=10)

In [28]:
X_tsne_double_reduced = hyp.reduce(x=X_reduced, reduce='TSNE', ndims=3)

In [29]:
hyp.plot(X_tsne_double_reduced, '.', hue=Y, save_path='.')

  kwargs[kwarg]=np.array(kwargs[kwarg])


<hypertools.datageometry.DataGeometry at 0x1fd98876110>

In [30]:
hyp.plot(X_res, '.', hue=Y_res, reduce='FastICA')



<hypertools.datageometry.DataGeometry at 0x1fd23269780>