In [3]:
import pandas as pd

# Read the raw diabetes encounter table from its path relative to this notebook.
raw_csv_path = '../datasets/diabetes/diabetic_data.csv'
df = pd.read_csv(raw_csv_path)
# Quick sanity checks, currently disabled:
# print(df.shape)
# print(df.head())


(101766, 50)
   encounter_id  patient_nbr             race  gender      age weight  \
0       2278392      8222157        Caucasian  Female   [0-10)      ?   
1        149190     55629189        Caucasian  Female  [10-20)      ?   
2         64410     86047875  AfricanAmerican  Female  [20-30)      ?   
3        500364     82442376        Caucasian    Male  [30-40)      ?   
4         16680     42519267        Caucasian    Male  [40-50)      ?   

   admission_type_id  discharge_disposition_id  admission_source_id  \
0                  6                        25                    1   
1                  1                         1                    7   
2                  1                         1                    7   
3                  1                         1                    7   
4                  1                         1                    7   

   time_in_hospital  ... citoglipton insulin  glyburide-metformin  \
0                 1  ...          No      No        

In [4]:
# Clean the raw frame as one re-runnable chain.
# `df` is rebound to a new frame instead of being mutated with inplace=True,
# and already-removed columns are tolerated, so executing this cell a second
# time on its own output is a no-op rather than a KeyError.
columns_to_drop = ['weight', 'payer_code', 'medical_specialty', 'encounter_id', 'patient_nbr']

df = (
    df
    # Replace '?' with a proper missing-value marker
    .replace('?', pd.NA)
    # Drop columns with too much missing or no variance
    # (errors='ignore' keeps the cell idempotent once they are gone)
    .drop(columns=columns_to_drop, errors='ignore')
    # Drop rows with missing gender/race.
    # NOTE(review): only values that were '?' count as missing here; any other
    # placeholder the dataset may use for these columns is kept - TODO confirm.
    .dropna(subset=['race', 'gender'])
)


In [10]:
# Print the frame's (rows, columns) tuple, then a plain-text view of its first rows.
for summary in (df.shape, df.head()):
    print(summary)

(99493, 45)
   race  gender  age  admission_type_id  discharge_disposition_id  \
0     2       0    0                  6                        25   
1     2       0    1                  1                         1   
2     0       0    2                  1                         1   
3     2       1    3                  1                         1   
4     2       1    4                  1                         1   

   admission_source_id  time_in_hospital  num_lab_procedures  num_procedures  \
0                    1                 1                  41               0   
1                    7                 3                  59               0   
2                    7                 2                  11               5   
3                    7                 2                  44               1   
4                    7                 1                  51               0   

   num_medications  ...  citoglipton  insulin  glyburide-metformin  \
0                1  ..

In [11]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object-dtype column of `df`.
#
# Each fitted encoder is kept in `label_encoders` so the code -> label mapping
# stays recoverable (label_encoders[col].classes_, .inverse_transform(...)).
# Without it, the integer codes that later filters rely on cannot be traced
# back to the original categories.
#
# Values are cast with astype(str) before fitting, so missing entries become a
# literal string and receive their own integer code instead of staying null.
cat_cols = df.select_dtypes(include='object').columns

# Re-running this cell finds no object columns left; keep the encoders from
# the first run instead of replacing them with an empty dict.
if 'label_encoders' not in globals():
    label_encoders = {}

for col in cat_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col].astype(str))
    label_encoders[col] = encoder

In [13]:
# Fraction of rows falling in each encoded `readmitted` class.
readmitted_share = df['readmitted'].value_counts(normalize=True)
readmitted_share


readmitted
2    0.535887
1    0.351854
0    0.112259
Name: proportion, dtype: float64

In [15]:
# Persist the current state of `df`; the row index is not written to the file.
cleaned_csv_path = '../datasets/diabetes/cleaned_data.csv'
df.to_csv(cleaned_csv_path, index=False)


In [3]:
import pandas as pd
import os

# Build attribute-based "hospital" silos from the training split.
# Each silo is the subset of rows matching one filter, down-sampled to at most
# SILO_SIZE rows and written to its own CSV under `output_dir`.
df = pd.read_csv('../datasets/diabetes/train_only.csv')
output_dir = '../datasets/diabetes/silos'
os.makedirs(output_dir, exist_ok=True)

# Upper bound on rows per silo; a filter matching fewer rows keeps all of them.
SILO_SIZE = 6000

# Integer code of the "not readmitted" class.
# NOTE(review): the codes come from LabelEncoder, which numbers classes in
# sorted label order, and the class shares printed earlier in this notebook
# show code 2 as the majority class - presumably the "no readmission" label.
# The previous filter (`!= 0`) therefore kept the non-readmitted rows and
# dropped one of the readmitted classes, contradicting "Any readmission".
# Assumes train_only.csv uses the same encoding as cleaned_data.csv - TODO confirm.
NOT_READMITTED_CODE = 2

# Define silo filtering criteria (output filename -> matching rows)
criteria = {
    "hospital_1.csv": df[df['age'] <= 3],                      # Younger patients (≤ 40)
    "hospital_2.csv": df[df['gender'] == 1],                   # Male patients
    "hospital_3.csv": df[df['diabetesMed'] == 1],              # Patients on medication
    # NOTE(review): `> 0` only excludes whichever label sorts first under the
    # label encoding; verify against the LabelEncoder class order that code 0
    # really is the "no test" value before relying on this silo.
    "hospital_4.csv": df[df['max_glu_serum'] > 0],             # Had glucose test
    "hospital_5.csv": df[df['readmitted'] != NOT_READMITTED_CODE],  # Any readmission
}

# Create and save each silo, remembering how many rows each one actually got
silo_sizes = {}
for filename, group_df in criteria.items():
    # Never request more rows than the filter produced (sample() would raise)
    actual_size = min(len(group_df), SILO_SIZE)
    sampled_df = group_df.sample(n=actual_size, random_state=42)
    sampled_df.to_csv(os.path.join(output_dir, filename), index=False)
    silo_sizes[filename] = actual_size
    print(f"{filename} created with {actual_size} rows.")

# Summary printout: report the target size and call out any silo that fell
# short, instead of claiming every silo is uniformly SILO_SIZE rows.
undersized = [name for name, size in silo_sizes.items() if size < SILO_SIZE]
print(f"\nSilo composition completed. Target sample size: {SILO_SIZE} rows each.")
if undersized:
    print(f"Silos below target size: {', '.join(undersized)}")


hospital_1.csv created with 4918 rows.
hospital_2.csv created with 6000 rows.
hospital_3.csv created with 6000 rows.
hospital_4.csv created with 6000 rows.
hospital_5.csv created with 6000 rows.

Silo composition completed. Uniform sample size: 6000 rows each.


In [8]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

# Read the cleaned dataset written earlier in this notebook.
df = pd.read_csv("../datasets/diabetes/cleaned_data.csv")

# Attach a simulated hospital label (one of five) to every row, drawn from
# NumPy's global RNG after seeding it for repeatability.
np.random.seed(42)
hospital_ids = [1, 2, 3, 4, 5]
df["hospital_id"] = np.random.choice(hospital_ids, size=len(df))

# Make sure the folder for the per-hospital files exists.
silo_dir = "../datasets/diabetes/processed_silos"
os.makedirs(silo_dir, exist_ok=True)

# Hold out 20% of the rows, stratified on `readmitted`, for central evaluation.
df_train, df_central_eval = train_test_split(
    df,
    test_size=0.2,
    stratify=df["readmitted"],
    random_state=42,
)

# Write the held-out evaluation rows.
central_eval_path = "../datasets/diabetes/central_eval.csv"
df_central_eval.to_csv(central_eval_path, index=False)
print(f"Central evaluation set saved: {len(df_central_eval)} samples")

# Write one CSV per hospital, using only the training portion so the
# central evaluation rows never appear in a silo.
for hospital_id in hospital_ids:
    in_hospital = df_train["hospital_id"] == hospital_id
    silo_df = df_train.loc[in_hospital].copy()
    path = f"{silo_dir}/hospital_{hospital_id}.csv"
    silo_df.to_csv(path, index=False)
    print(f"Hospital {hospital_id} silo saved: {len(silo_df)} samples")


Central evaluation set saved: 19899 samples
Hospital 1 silo saved: 15896 samples
Hospital 2 silo saved: 16026 samples
Hospital 3 silo saved: 15718 samples
Hospital 4 silo saved: 15917 samples
Hospital 5 silo saved: 16037 samples
