# Preprocessing & Train/Val/Test Split

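This notebook:
1. Loads the raw dataset
2. Applies basic cleaning if needed (none is applied at present; the raw data is used as-is)
3. Creates reproducible, stratified **train/val/test** splits (70/15/15, fixed random seed)
4. Saves the processed splits to `data/processed/`
5. Saves the row indices of each split to `splits/` for reproducibility
6. Saves per-feature means to `models/feature_means.json`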


In [4]:
# 📌 1. Imports
import json
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split


# Make sure both output folders exist before any cell writes into them.
# parents=True builds missing intermediate folders; exist_ok=True makes re-runs harmless.
for output_dir in ("../data/processed", "../splits"):
    Path(output_dir).mkdir(parents=True, exist_ok=True)


In [5]:
# 📌 2. Load Raw Dataset
# NOTE(review): this path is relative to the current working directory ("./"),
# whereas the output cells write to "../data/processed" and "../splits".
# Confirm which directory the notebook is meant to be run from.
raw_csv_path = "./data/raw/dataset-uci.csv"
df = pd.read_csv(raw_csv_path)

# Quick sanity check: (rows, columns), then a preview of the first rows
print("Shape:", df.shape)
df.head()


Shape: (319, 39)


Unnamed: 0,Gallstone Status,Age,Gender,Comorbidity,Coronary Artery Disease (CAD),Hypothyroidism,Hyperlipidemia,Diabetes Mellitus (DM),Height,Weight,...,High Density Lipoprotein (HDL),Triglyceride,Aspartat Aminotransferaz (AST),Alanin Aminotransferaz (ALT),Alkaline Phosphatase (ALP),Creatinine,Glomerular Filtration Rate (GFR),C-Reactive Protein (CRP),Hemoglobin (HGB),Vitamin D
0,0,50,0,0,0,0,0,0,185,92.8,...,40.0,134.0,20.0,22.0,87.0,0.82,112.47,0.0,16.0,33.0
1,0,47,0,1,0,0,0,0,176,94.5,...,43.0,103.0,14.0,13.0,46.0,0.87,107.1,0.0,14.4,25.0
2,0,61,0,0,0,0,0,0,171,91.1,...,43.0,69.0,18.0,14.0,66.0,1.25,65.51,0.0,16.2,30.2
3,0,41,0,0,0,0,0,0,168,67.7,...,59.0,53.0,20.0,12.0,34.0,1.02,94.1,0.0,15.4,35.4
4,0,42,0,0,0,0,0,0,178,89.6,...,30.0,326.0,27.0,54.0,71.0,0.82,112.47,0.0,16.8,40.6


In [6]:
# 📌 3. Define Target Column
TARGET = "Gallstone Status"

# Relative frequency of each class in the full dataset,
# shown as a percentage rounded to one decimal place.
class_share = df[TARGET].value_counts(normalize=True)
class_share.mul(100).round(1)


Gallstone Status
0    50.5
1    49.5
Name: proportion, dtype: float64

In [7]:
# 📌 4. Create Splits (70/15/15 stratified)

# One seed for both stages so the whole split is reproducible.
SPLIT_SEED = 42

# Stage 1: keep 70% for training and hold out 30% (validation + test together).
# Stratifying on the target keeps the class ratio the same on both sides.
train_df, temp_df = train_test_split(
    df, test_size=0.30, stratify=df[TARGET], random_state=SPLIT_SEED
)

# Stage 2: cut the held-out 30% in half -> 15% validation, 15% test.
val_df, test_df = train_test_split(
    temp_df, test_size=0.50, stratify=temp_df[TARGET], random_state=SPLIT_SEED
)

# Report (rows, columns) of every split on a single line.
split_shapes = ("Train:", train_df.shape, "Val:", val_df.shape, "Test:", test_df.shape)
print(*split_shapes)


Train: (223, 39) Val: (48, 39) Test: (48, 39)


In [8]:
# 📌 5. Save Full Splits as CSVs
# Destination file -> split written to it.
csv_targets = {
    "../data/processed/train.csv": train_df,
    "../data/processed/val.csv": val_df,
    "../data/processed/test.csv": test_df,
}

# index=False: the row index is not written into these files.
for csv_path, split_df in csv_targets.items():
    split_df.to_csv(csv_path, index=False)

print("✅ CSVs saved to data/processed/")


✅ CSVs saved to data/processed/


In [9]:
# 📌 6. Saving Indices Only (for reproducibility)
# Destination file -> split whose row index is recorded there.
index_targets = {
    "../splits/train_idx.csv": train_df,
    "../splits/val_idx.csv": val_df,
    "../splits/test_idx.csv": test_df,
}

# One index label per line, with no header row and no extra index column.
for idx_path, split_df in index_targets.items():
    pd.Series(split_df.index).to_csv(idx_path, index=False, header=False)

print("✅ Indices saved to splits/")


✅ Indices saved to splits/


In [10]:
# 📌 7. Verify Class Balance
# Heading to print, paired with the split it describes.
balance_report = (
    ("Train balance:", train_df),
    ("\nValidation balance:", val_df),
    ("\nTest balance:", test_df),
)

for heading, split_df in balance_report:
    print(heading)
    # Class share in percent, rounded to one decimal place
    print(split_df[TARGET].value_counts(normalize=True).mul(100).round(1))


Train balance:
Gallstone Status
0    50.7
1    49.3
Name: proportion, dtype: float64

Validation balance:
Gallstone Status
0    50.0
1    50.0
Name: proportion, dtype: float64

Test balance:
Gallstone Status
1    50.0
0    50.0
Name: proportion, dtype: float64


In [11]:
# 📌 8. Save Feature Means
# The means are computed on the training split only, so no information from
# the validation/test rows ends up in a statistic that is persisted for later use.
# The target column is a label rather than a feature, so it is left out.
# NOTE(review): presumably these means are consumed downstream (e.g. as default /
# imputation values) — confirm that no consumer expects the target key.
feature_means = (
    train_df
    .drop(columns=[TARGET])
    .mean(numeric_only=True)  # non-numeric columns are skipped
    .to_dict()
)

# NOTE(review): this path is relative to the current working directory, while the
# split outputs above use "../" — confirm the intended location.
means_path = Path("models/feature_means.json")

# open(..., "w") does not create missing folders, so create the parent first
# (same pattern as the other output directories in this notebook).
means_path.parent.mkdir(parents=True, exist_ok=True)

with means_path.open("w", encoding="utf-8") as f:
    json.dump(feature_means, f)
