In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import os
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

import sys
sys.path.append(os.path.abspath('..'))

In [2]:
# The dataset lives in ../data/dataset_diabetes relative to the current working directory.
data_path = Path.cwd().parent.joinpath("data", "dataset_diabetes")
df_init = pd.read_csv(data_path.joinpath("data_analyzed.csv"))

In [3]:
# Preview the first five rows of the freshly loaded frame.
df_init.head(n=5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,readmitted,readmit_30_days,readmit_binary,diabetes_type,had_emergency,had_inpatient_days,had_outpatient_days,race_all,age_all,age_numeric
0,2278392,8222157,Caucasian,Female,30 years or younger,Unknown,Other,Other,Referral,1,...,NO,False,False,Type 1,False,False,False,Caucasian,[0-10),0
1,149190,55629189,Caucasian,Female,30 years or younger,Unknown,Emergency,Discharged to Home,Emergency,3,...,>30,False,True,Type 1,False,False,False,Caucasian,[10-20),10
2,64410,86047875,AfricanAmerican,Female,30 years or younger,Unknown,Emergency,Discharged to Home,Emergency,2,...,NO,False,False,,False,True,True,AfricanAmerican,[20-30),20
3,500364,82442376,Caucasian,Male,30-60 years,Unknown,Emergency,Discharged to Home,Emergency,2,...,NO,False,False,Type 1,False,False,False,Caucasian,[30-40),30
4,16680,42519267,Caucasian,Male,30-60 years,Unknown,Emergency,Discharged to Home,Emergency,1,...,NO,False,False,,False,False,False,Caucasian,[40-50),40


In [4]:
# Remove ids, columns used for the analysis, and the ones with most missing
# values (weight, payer_code, medical_specialty attributes).
columns_to_remove = [
    'encounter_id', 'patient_nbr',
    'readmitted', 'readmit_binary', 'diabetes_type',
    'had_emergency', 'had_inpatient_days', 'had_outpatient_days',
    'race_all', 'age_all', 'age_numeric',
    'weight', 'payer_code', 'medical_specialty',
]

# The loaded frame also carries an 'Unnamed: 0' column (a row index that was
# written into the CSV). It is not a real feature, so drop any such leftover
# index column as well; otherwise it ends up in every feature matrix.
df = df_init.drop(
    columns=columns_to_remove
    + [col for col in df_init.columns if str(col).startswith('Unnamed:')]
)

In [5]:
target_variable = "readmit_30_days"
sensitive_attribute = "race"

# Target labels (Y) and sensitive attribute values (A) as plain Series.
Y = df[target_variable]
A = df[sensitive_attribute]

# One-hot encoded feature matrices. The suffix tells which of the sensitive
# attribute (A) and the target (Y) are kept next to the remaining features.
X_A_Y = pd.get_dummies(df)
X_Y = pd.get_dummies(df.drop(columns=[sensitive_attribute]))
X_A = pd.get_dummies(df.drop(columns=[target_variable]))
X = pd.get_dummies(df.drop(columns=[target_variable, sensitive_attribute]))

# Combined "<target>_<sensitive>" label, one string per row.
Y_A = Y.astype(str) + "_" + A.astype(str)

Train-test split

In [6]:
random_seed = 445
np.random.seed(random_seed)

# Split every view of the data in one call so that all train/test pieces are
# cut along the same rows; the split is stratified on the target Y.
(
    X_train, X_test,
    Y_train, Y_test,
    A_train, A_test,
    X_A_train, X_A_test,
    X_Y_train, X_Y_test,
    X_A_Y_train, X_A_Y_test,
    Y_A_train, Y_A_test,
) = train_test_split(
    X, Y, A, X_A, X_Y, X_A_Y, Y_A,
    test_size=0.20,
    stratify=Y,
    random_state=random_seed,
)

Random undersampling - target attribute, without sensitive attribute

In [12]:
def drop_dummy_sensitive_attribute(df, attribute=None):
    """Return ``df`` without the columns that encode the sensitive attribute.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain the raw attribute column and/or its one-hot
        columns named ``"<attribute>_<value>"``.
    attribute : str, optional
        Name of the sensitive attribute. Defaults to the notebook-level
        ``sensitive_attribute``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    if attribute is None:
        attribute = sensitive_attribute
    prefix = f"{attribute}_"
    # Match the exact name or the "<attribute>_" prefix only, so an unrelated
    # column that merely starts with the same letters is left in place.
    encoded_columns = [
        col for col in df.columns
        if str(col) == attribute or str(col).startswith(prefix)
    ]
    return df.drop(columns=encoded_columns)

In [8]:
def undummyfy_sensitive_attribute(df, attribute=None):
    """Rebuild the sensitive attribute labels from their one-hot columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing one-hot columns named ``"<attribute>_<value>"``.
    attribute : str, optional
        Name of the sensitive attribute. Defaults to the notebook-level
        ``sensitive_attribute``.

    Returns
    -------
    pandas.Series
        For every row of ``df`` (same index), the ``<value>`` part of the
        indicator column that equals 1; NaN for rows where no indicator is set.

    Raises
    ------
    ValueError
        If ``df`` has no column starting with ``"<attribute>_"``.
    """
    if attribute is None:
        attribute = sensitive_attribute
    # Derive the prefix from the attribute name so column selection and
    # prefix stripping always agree.
    prefix = f"{attribute}_"
    indicator_columns = [col for col in df.columns if str(col).startswith(prefix)]
    if not indicator_columns:
        raise ValueError(f"no one-hot columns found with prefix {prefix!r}")

    # Compare against 1 instead of NaN-masking the frame: this works for both
    # integer and boolean indicator columns, whereas masking upcasts boolean
    # columns to object dtype, which idxmax may reject.
    active = df[indicator_columns] == 1
    labels = active.idxmax(axis=1).map(lambda col: col.removeprefix(prefix))
    # idxmax falls back to the first column when nothing is set; blank those rows.
    return labels.where(active.any(axis=1))

In [21]:
# Undersample on the target: every class except the minority one is reduced.
rus_target_wos = RandomUnderSampler(sampling_strategy='not minority', random_state=123)
X_A_train_res_target_wos, Y_train_res_target_wos = rus_target_wos.fit_resample(
    X_A_train,
    Y_train,
)
print(f'Resampled dataset samples per class {Counter(Y_train_res_target_wos)}')

# Recover the sensitive labels from the one-hot columns, then build the
# feature matrix without those columns.
A_train_res_target_wos = undummyfy_sensitive_attribute(X_A_train_res_target_wos)
X_train_res_target_wos = drop_dummy_sensitive_attribute(X_A_train_res_target_wos)

Resampled dataset samples per class Counter({False: 9086, True: 9086})


Random undersampling - target attribute, with sensitive attribute

In [22]:
# Undersample on the target; the one-hot sensitive columns stay in the features.
rus_target_ws = RandomUnderSampler(sampling_strategy='not minority', random_state=123)
X_A_train_res_target_ws, Y_train_res_target_ws = rus_target_ws.fit_resample(
    X_A_train,
    Y_train,
)
print(f'Resampled dataset samples per class {Counter(Y_train_res_target_ws)}')

# Sensitive labels of the resampled rows, rebuilt from the one-hot columns.
A_train_res_target_ws = undummyfy_sensitive_attribute(X_A_train_res_target_ws)

Resampled dataset samples per class Counter({False: 9086, True: 9086})


Random undersampling - sensitive attribute, without sensitive attribute

In [24]:
# Undersample on the sensitive attribute. The target travels inside the
# feature frame (X_Y_train) so it is resampled together with the rows.
rus_sensitive_wos = RandomUnderSampler(sampling_strategy='not minority', random_state=123)
X_Y_train_res_sensitive_wos, A_train_res_sensitive_wos = rus_sensitive_wos.fit_resample(
    X_Y_train,
    A_train,
)
print(f'Resampled dataset samples per sensitive attribute {Counter(A_train_res_sensitive_wos)}')

# Split the resampled frame back into target and features.
Y_train_res_sensitive_wos = X_Y_train_res_sensitive_wos.loc[:, target_variable]
X_train_res_sensitive_wos = X_Y_train_res_sensitive_wos.drop(target_variable, axis=1)

Resampled dataset samples per sensitive attribute Counter({'AfricanAmerican': 1846, 'Caucasian': 1846, 'Other': 1846, 'Unknown': 1846})


Random undersampling - sensitive attribute, with sensitive attribute

In [31]:
# Undersample on the sensitive attribute. The frame holds both the one-hot
# sensitive columns and the target, so both are resampled with the rows.
rus_sensitive_ws = RandomUnderSampler(sampling_strategy='not minority', random_state=123)
X_A_Y_train_res_sensitive_ws, A_train_res_sensitive_ws = rus_sensitive_ws.fit_resample(
    X_A_Y_train,
    A_train,
)
print(f'Resampled dataset samples per class {Counter(A_train_res_sensitive_ws)}')

# Split the resampled frame back into target and features.
Y_train_res_sensitive_ws = X_A_Y_train_res_sensitive_ws.loc[:, target_variable]
X_A_train_res_sensitive_ws = X_A_Y_train_res_sensitive_ws.drop(target_variable, axis=1)

Resampled dataset samples per class Counter({'AfricanAmerican': 1846, 'Caucasian': 1846, 'Other': 1846, 'Unknown': 1846})


Random undersampling - multivariate, without sensitive attribute

In [43]:
# Undersample on the combined "<target>_<sensitive>" label so that every
# (target, sensitive) combination ends up with the same number of rows.
rus_multiv_wos = RandomUnderSampler(random_state=123, sampling_strategy='not minority')
X_Y_train_res_multiv_wos, Y_A_res_wos = rus_multiv_wos.fit_resample(X_Y_train, Y_A_train)
print(f'Resampled dataset samples per class {Counter(Y_A_res_wos)}')

# Split the resampled frame back into features and target.
X_train_res_multiv_wos = X_Y_train_res_multiv_wos.drop(columns=[target_variable])
Y_train_res_multiv_wos = X_Y_train_res_multiv_wos[target_variable]
# Split on the first underscore only: everything after it is the sensitive
# value, so a value that itself contains '_' is kept intact.
A_train_res_multiv_wos = Y_A_res_wos.apply(lambda label: label.split('_', 1)[1])

Resampled dataset samples per class Counter({'False_AfricanAmerican': 154, 'False_Caucasian': 154, 'False_Other': 154, 'False_Unknown': 154, 'True_AfricanAmerican': 154, 'True_Caucasian': 154, 'True_Other': 154, 'True_Unknown': 154})


Random undersampling - multivariate, with sensitive attribute

In [51]:
# Undersample on the combined "<target>_<sensitive>" label; the one-hot
# sensitive columns stay in the feature frame.
rus_multiv_ws = RandomUnderSampler(random_state=123, sampling_strategy='not minority')
X_A_Y_train_res_multiv_ws, Y_A_res_ws = rus_multiv_ws.fit_resample(X_A_Y_train, Y_A_train)
print(f'Resampled dataset samples per class {Counter(Y_A_res_ws)}')

# Split the resampled frame back into features and target.
X_A_train_res_multiv_ws = X_A_Y_train_res_multiv_ws.drop(columns=[target_variable])
Y_train_res_multiv_ws = X_A_Y_train_res_multiv_ws[target_variable]
# Split on the first underscore only: everything after it is the sensitive
# value, so a value that itself contains '_' is kept intact.
A_train_res_multiv_ws = Y_A_res_ws.apply(lambda label: label.split('_', 1)[1])

Resampled dataset samples per class Counter({'False_AfricanAmerican': 154, 'False_Caucasian': 154, 'False_Other': 154, 'False_Unknown': 154, 'True_AfricanAmerican': 154, 'True_Caucasian': 154, 'True_Other': 154, 'True_Unknown': 154})


Save datasets to CSV files

In [53]:
# Directory receiving every exported dataset. Create it up front: to_csv does
# not create missing parent directories and would fail on a fresh checkout.
clsf_data_path = data_path / "clsf_data"
clsf_data_path.mkdir(parents=True, exist_ok=True)

# File stem -> dataset. Each entry is written to "<stem>.csv" without the index.
datasets_to_save = {
    # plain train/test split
    "X_test_split": X_test,
    "Y_test_split": Y_test,
    "A_test_split": A_test,
    "X_A_test_split": X_A_test,
    "X_train_split": X_train,
    "Y_train_split": Y_train,
    "A_train_split": A_train,
    "X_A_train_split": X_A_train,
    # undersampled on the target, features without / with the sensitive attribute
    "X_train_res_target_wos": X_train_res_target_wos,
    "Y_train_res_target_wos": Y_train_res_target_wos,
    "A_train_res_target_wos": A_train_res_target_wos,
    "X_A_train_res_target_ws": X_A_train_res_target_ws,
    "Y_train_res_target_ws": Y_train_res_target_ws,
    "A_train_res_target_ws": A_train_res_target_ws,
    # undersampled on the sensitive attribute
    "X_train_res_sensitive_wos": X_train_res_sensitive_wos,
    "Y_train_res_sensitive_wos": Y_train_res_sensitive_wos,
    "A_train_res_sensitive_wos": A_train_res_sensitive_wos,
    "X_A_train_res_sensitive_ws": X_A_train_res_sensitive_ws,
    "Y_train_res_sensitive_ws": Y_train_res_sensitive_ws,
    "A_train_res_sensitive_ws": A_train_res_sensitive_ws,
    # undersampled on the combined target/sensitive label
    "X_train_res_multiv_wos": X_train_res_multiv_wos,
    "Y_train_res_multiv_wos": Y_train_res_multiv_wos,
    "A_train_res_multiv_wos": A_train_res_multiv_wos,
    "X_A_train_res_multiv_ws": X_A_train_res_multiv_ws,
    "Y_train_res_multiv_ws": Y_train_res_multiv_ws,
    "A_train_res_multiv_ws": A_train_res_multiv_ws,
}

for file_stem, dataset in datasets_to_save.items():
    dataset.to_csv(clsf_data_path / f"{file_stem}.csv", index=False)