In [9]:
# Install the notebook's extra dependencies from within the notebook.
# NOTE(review): versions are unpinned; pin them if reproducibility matters.
# The output below notes that a kernel restart may be needed afterwards.
%pip install kagglehub
%pip install ipywidgets

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Import Libraries

In [3]:
import kagglehub
from pathlib import Path
import shutil


### Download datasets

In [4]:
def download_dataset(path, dest="datasets"):
	"""Copy the contents of a dataset directory into a local folder.

	Despite its name, this function performs no network access: it mirrors
	every immediate entry of ``path`` into ``dest``. Sub-directories are copied
	recursively and merged into an existing destination; files are copied with
	their metadata and overwrite files of the same name.

	Args:
		path: Directory whose entries should be copied.
		dest: Destination directory, created together with its parents when
			missing. Defaults to ``"datasets"`` relative to the working directory.

	Returns:
		None.

	Raises:
		OSError: If ``path`` cannot be listed or an entry cannot be copied.
	"""
	src = Path(path)
	dest = Path(dest)
	dest.mkdir(parents=True, exist_ok=True)

	for entry in src.iterdir():
		target = dest / entry.name
		if entry.is_dir():
			# dirs_exist_ok lets a repeated call refresh an earlier copy.
			shutil.copytree(entry, target, dirs_exist_ok=True)
		else:
			shutil.copy2(entry, target)

In [5]:
# Fetch the dataset through kagglehub, then copy its files into the local
# datasets/ folder. The printed path is the location kagglehub returned (a
# cache directory in the output below), not the local copy.
path = kagglehub.dataset_download("yasserhessein/multiclass-diabetes-dataset")

download_dataset(path)

print("Path to dataset files:", path)

Path to dataset files: /home/tr3p0l3m/.cache/kagglehub/datasets/yasserhessein/multiclass-diabetes-dataset/versions/1


In [6]:
# Fetch the dataset through kagglehub, then copy its files into the local
# datasets/ folder. The printed path is the location kagglehub returned (a
# cache directory in the output below), not the local copy.
path = kagglehub.dataset_download("alexteboul/diabetes-health-indicators-dataset")

download_dataset(path)

print("Path to dataset files:", path)

Path to dataset files: /home/tr3p0l3m/.cache/kagglehub/datasets/alexteboul/diabetes-health-indicators-dataset/versions/1


In [7]:
# Fetch the dataset through kagglehub, then copy its files into the local
# datasets/ folder. The printed path is the location kagglehub returned (a
# cache directory in the output below), not the local copy.
path = kagglehub.dataset_download(
    "ishandutta/early-stage-diabetes-risk-prediction-dataset"
)

download_dataset(path)

print("Path to dataset files:", path)

Path to dataset files: /home/tr3p0l3m/.cache/kagglehub/datasets/ishandutta/early-stage-diabetes-risk-prediction-dataset/versions/1


### Dataset selection

In [None]:
# Optional UI dependencies. If either import fails for any reason, both names
# are set to None so the selection code further down can detect that the
# dropdown is unavailable and use the text prompt instead.
try:
	from IPython.display import display  # type: ignore
	import ipywidgets as widgets
except Exception:
	display = None
	widgets = None


def list_available_datasets(dataset_root: Path) -> list[Path]:
	"""Print and return the selectable dataset entries directly under a folder.

	Sub-directories come first, followed by ``.csv`` files (extension matched
	case-insensitively); each group is sorted. Every entry is printed as
	``"<number>. <name>"`` with numbering starting at 1.

	Args:
		dataset_root: Folder to inspect (anything ``Path`` accepts).

	Returns:
		The entries in the same order as they were printed.

	Raises:
		ValueError: If the folder does not exist or holds no directory or CSV file.
	"""
	root = Path(dataset_root)
	# A missing folder is treated exactly like one without usable entries.
	entries = list(root.iterdir()) if root.exists() else []

	folders = sorted(entry for entry in entries if entry.is_dir())
	csv_entries = sorted(
		entry for entry in entries if entry.is_file() and entry.suffix.lower() == ".csv"
	)
	found = [*folders, *csv_entries]
	if not found:
		raise ValueError(f"No datasets found under {root.resolve()}")

	for number, item in enumerate(found, start=1):
		print(f"{number}. {item.name}")
	return found


def select_dataset_by_number(datasets: list[Path], selection: int | None = None) -> Path:
	if not datasets:
		raise ValueError("No datasets available to select from.")
	if selection is None:
		if len(datasets) == 1:
			return datasets[0]
		raise ValueError("Multiple datasets available. Please provide a selection.")
	if not 1 <= selection <= len(datasets):
		raise ValueError(f"Selection {selection} is out of range 1..{len(datasets)}.")
	return datasets[selection - 1]


def list_csv_files(dataset_path: Path, *, silent: bool = False) -> list[Path]:
	"""Return the CSV files a dataset entry provides.

	``dataset_path`` may be a single CSV file or a directory that is searched
	recursively. The ``.csv`` extension is matched case-insensitively in both
	cases, and only regular files are returned (a directory that merely happens
	to be named ``something.csv`` is not).

	Args:
		dataset_path: CSV file or directory to inspect.
		silent: When False, print each result: the file name for a single file,
			or the path relative to ``dataset_path`` for a directory.

	Returns:
		Sorted list of CSV file paths (a one-element list for a single file).

	Raises:
		FileNotFoundError: If the path does not exist, is not a CSV file, or is
			a directory without any CSV file.
	"""
	dataset_path = Path(dataset_path)

	def _is_csv(candidate: Path) -> bool:
		# Same rule for the single-file and directory branches.
		return candidate.is_file() and candidate.suffix.lower() == ".csv"

	if dataset_path.is_file():
		csv_files = [dataset_path] if _is_csv(dataset_path) else []
	elif dataset_path.is_dir():
		# Filter in Python instead of globbing "*.csv": the glob pattern is
		# case-sensitive and also matches directories.
		csv_files = sorted(entry for entry in dataset_path.rglob("*") if _is_csv(entry))
	else:
		csv_files = []

	if not csv_files:
		raise FileNotFoundError(f"No CSV files found in {dataset_path.resolve()}")

	if not silent:
		for csv_file in csv_files:
			if csv_file == dataset_path:
				print(dataset_path.name)
			else:
				print(csv_file.relative_to(dataset_path))
	return csv_files


def _in_notebook() -> bool:
	try:
		from IPython import get_ipython
		ip = get_ipython()
	except Exception:
		return False
	if ip is None:
		return False
	return "IPKernelApp" in getattr(ip, "config", {})


def prompt_dataset_selection(datasets: list[Path]) -> Path:
	"""Ask on standard input which dataset to use.

	A single candidate is returned without prompting. Otherwise the prompt is
	repeated until the answer is usable: an empty answer picks the first
	candidate, a number is resolved through ``select_dataset_by_number`` (an
	out-of-range number prints the error and asks again), and anything else
	prints a hint and asks again.

	Args:
		datasets: Candidate paths, in the order they were presented.

	Returns:
		The chosen path.

	Raises:
		ValueError: If ``datasets`` is empty. Without this guard the prompt
			would offer the range "1-0" and the default answer would fail.
	"""
	if not datasets:
		raise ValueError("No datasets available to select from.")
	if len(datasets) == 1:
		return datasets[0]

	prompt = f"Select dataset by number (1-{len(datasets)}) [default 1]: "
	while True:
		selection_input = input(prompt).strip()
		if not selection_input:
			return datasets[0]
		if not selection_input.isdigit():
			print("Please enter a valid integer.")
			continue
		try:
			return select_dataset_by_number(datasets, int(selection_input))
		except ValueError as exc:
			# Out-of-range (or unparsable) number: report it and ask again.
			print(exc)


# Folder that the download cells copy their files into.
dataset_root = Path("datasets")

# Prints a numbered list of candidates and returns them in that order.
available_datasets = list_available_datasets(dataset_root)

# Use the dropdown only when both optional imports succeeded and
# _in_notebook() reports a kernel; otherwise fall back to the text prompt.
use_widgets = display is not None and widgets is not None and _in_notebook()

if use_widgets:
	# Each option is (label shown to the user, Path used as the widget value).
	selector = widgets.Dropdown(
		options=[(path.name, path) for path in available_datasets],
		description="Dataset:",
		value=available_datasets[0],
	)

	# Output area that update_selection rewrites on every change.
	output = widgets.Output()

	def update_selection(dataset_path: Path) -> None:
		"""Rebind the notebook-level selection and list its CSV files in ``output``."""
		# Later cells read these two globals, so they reflect whatever is
		# selected at the time those cells are executed.
		global selected_dataset, csv_files
		selected_dataset = dataset_path
		# silent=True: the listing is printed below, inside the output widget.
		csv_files = list_csv_files(selected_dataset, silent=True)
		output.clear_output()
		with output:
			print(f"Selected dataset: {selected_dataset}")
			if selected_dataset.is_file():
				print(selected_dataset.name)
			else:
				for csv_file in csv_files:
					print(csv_file.relative_to(selected_dataset))

	# Initialise the globals with the dropdown's starting value.
	update_selection(selector.value)

	def on_dataset_change(change: dict) -> None:
		"""Forward a new dropdown value to ``update_selection``."""
		if change["name"] == "value" and change["new"]:
			update_selection(change["new"])

	selector.observe(on_dataset_change, names="value")

	display(selector, output)
else:
	# Text fallback: ask on standard input, then print the CSV listing.
	selected_dataset = prompt_dataset_selection(available_datasets)
	print(f"Selected dataset: {selected_dataset}")
	csv_files = list_csv_files(selected_dataset)

1. Multiclass Diabetes Dataset
2. Dataset of Diabetes .csv
3. Multiclass Diabetes Dataset.csv
4. diabetes_012_health_indicators_BRFSS2015.csv
5. diabetes_binary_5050split_health_indicators_BRFSS2015.csv
6. diabetes_binary_health_indicators_BRFSS2015.csv
7. diabetes_data_upload.csv


Dropdown(description='Dataset:', options=(('Multiclass Diabetes Dataset', PosixPath('datasets/Multiclass Diabe…

Output()

### Data Preprocessing

In [10]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

def _strip_text_values(column: pd.Series) -> pd.Series:
	"""Return ``column`` as stripped text, with missing or blank cells as NaN."""
	was_missing = column.isna()
	text = column.astype(str).str.strip()
	# astype(str) renders missing cells as literal text such as "nan"; mask them
	# (and blank strings) back to NaN so they are still treated as missing and
	# get imputed instead of becoming a category of their own.
	return text.mask(was_missing | (text == ""))


# Load the first CSV of the selected dataset. df_raw stays untouched so the
# cleaning below always starts from the data as read.
df_raw = pd.read_csv(csv_files[0])
df = df_raw.copy()
# Normalise headers to snake_case: trim, lower-case, collapse every run of
# characters outside [0-9a-z] into "_", then drop leading/trailing "_".
df.columns = (
	df.columns.str.strip()
	.str.lower()
	.str.replace(r"[^0-9a-z]+", "_", regex=True)
	.str.strip("_")
)

df = df.drop_duplicates().reset_index(drop=True)

# Target = first column carrying a known label name (headers are already
# lower-case here), otherwise the last column.
target_candidates = [
	col for col in df.columns if col in {"class", "outcome", "diabetes", "diabetes_binary", "diabetes_status"}
]
target_column = target_candidates[0] if target_candidates else df.columns[-1]
feature_columns = [col for col in df.columns if col != target_column]

X = df[feature_columns].copy()
y = df[target_column]
if not pd.api.types.is_numeric_dtype(y):
	# Give a text target the same whitespace normalisation as the text features,
	# so labels that differ only by padding collapse into one class.
	y = _strip_text_values(y)

categorical_columns = X.select_dtypes(exclude="number").columns.tolist()
numeric_columns = X.select_dtypes(include="number").columns.tolist()

if categorical_columns:
	# Missing markers are NaN (not pd.NA), matching SimpleImputer's default
	# missing_values sentinel.
	X[categorical_columns] = X[categorical_columns].apply(_strip_text_values)

# OneHotEncoder's dense-output flag is named "sparse_output" or "sparse"
# depending on the installed scikit-learn; use whichever __init__ accepts.
one_hot_kwargs: dict[str, object] = {"handle_unknown": "ignore"}
if "sparse_output" in OneHotEncoder.__init__.__code__.co_varnames:
	one_hot_kwargs["sparse_output"] = False
else:
	one_hot_kwargs["sparse"] = False

numeric_pipeline = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
categorical_pipeline = Pipeline(
	steps=[
		("imputer", SimpleImputer(strategy="most_frequent")),
		("encoder", OneHotEncoder(**one_hot_kwargs)),
	]
)

preprocessor = ColumnTransformer(
	transformers=[
		("num", numeric_pipeline, numeric_columns),
		("cat", categorical_pipeline, categorical_columns),
	],
	remainder="drop",
)

# NOTE(review): the imputers and encoder are fitted on every row here; refit
# them on training rows only once a train/test split is introduced.
X_preprocessed_array = preprocessor.fit_transform(X)
feature_names = preprocessor.get_feature_names_out()
X_preprocessed = pd.DataFrame(X_preprocessed_array, columns=feature_names, index=df.index)

print(
	f"Loaded {selected_dataset.name} with {df.shape[0]} rows and {df.shape[1]} columns. "
	f"Target: {target_column}. Numeric features: {len(numeric_columns)}. "
	f"Categorical features: {len(categorical_columns)}."
)

Loaded Dataset of Diabetes .csv with 1000 rows and 14 columns. Target: class. Numeric features: 12. Categorical features: 1.


### Feature Engineering

In [None]:
import numpy as np

def engineer_features(features: pd.DataFrame) -> pd.DataFrame:
	"""Add derived ratio, interaction and banding columns to a copy of ``features``.

	Added columns, in order: ``chol_hdl_ratio``, ``ldl_hdl_ratio``,
	``tg_hdl_ratio``, ``urea_creatinine_ratio``, ``age_bmi_interaction``,
	``metabolic_score``, ``lipid_density``, ``is_obese`` and ``age_band``.
	Ratios whose denominator is 0 are NaN rather than infinite.

	Args:
		features: Frame containing at least the columns ``age``, ``bmi``,
			``chol``, ``cr``, ``hba1c``, ``hdl``, ``ldl``, ``tg``, ``urea``
			and ``vldl``. It is not modified.

	Returns:
		A new frame with the original columns followed by the derived ones.

	Raises:
		KeyError: If any required column is absent; the message lists all of
			them instead of only the first one encountered.
	"""
	required_columns = ("age", "bmi", "chol", "cr", "hba1c", "hdl", "ldl", "tg", "urea", "vldl")
	missing_columns = [col for col in required_columns if col not in features.columns]
	if missing_columns:
		raise KeyError(
			"engineer_features requires columns missing from the input: "
			+ ", ".join(missing_columns)
		)

	engineered = features.copy()

	# Zero denominators become NaN so the divisions below cannot yield +/-inf.
	hdl_safe = engineered["hdl"].replace(0, np.nan)
	cr_safe = engineered["cr"].replace(0, np.nan)
	chol_safe = engineered["chol"].replace(0, np.nan)

	engineered["chol_hdl_ratio"] = engineered["chol"] / hdl_safe
	engineered["ldl_hdl_ratio"] = engineered["ldl"] / hdl_safe
	engineered["tg_hdl_ratio"] = engineered["tg"] / hdl_safe
	engineered["urea_creatinine_ratio"] = engineered["urea"] / cr_safe
	engineered["age_bmi_interaction"] = engineered["age"] * engineered["bmi"]
	# Mean of the per-column percentile ranks; the value of a row therefore
	# depends on the other rows present in ``features``.
	engineered["metabolic_score"] = (
		engineered[["hba1c", "chol", "tg", "bmi"]].rank(pct=True).mean(axis=1)
	)
	engineered["lipid_density"] = (engineered["ldl"] + engineered["vldl"]) / chol_safe
	engineered["is_obese"] = (engineered["bmi"] >= 30).astype(int)

	# Right-closed bins: [0, 29], (29, 39], (39, 49], (49, 59], (59, 69], (69, inf).
	age_band = pd.cut(
		engineered["age"],
		bins=[0, 29, 39, 49, 59, 69, np.inf],
		labels=["<30", "30-39", "40-49", "50-59", "60-69", "70+"],
		right=True,
		include_lowest=True,
	)
	engineered["age_band"] = age_band.astype("string")

	ratio_cols = [
		"chol_hdl_ratio",
		"ldl_hdl_ratio",
		"tg_hdl_ratio",
		"urea_creatinine_ratio",
		"lipid_density",
	]
	# Infinite numerators would still produce +/-inf; treat those as missing too.
	engineered[ratio_cols] = engineered[ratio_cols].replace([np.inf, -np.inf], np.nan)

	return engineered

# Take the baseline from df rather than from feature_columns: this cell rebinds
# feature_columns to include the engineered columns, so a re-run would
# otherwise report that nothing new was engineered.
base_feature_columns = [col for col in df.columns if col != target_column]
X = engineer_features(X)
new_feature_names = sorted(set(X.columns) - set(base_feature_columns))

# Recompute the column groups: engineering added numeric and text columns.
categorical_columns = X.select_dtypes(exclude="number").columns.tolist()
numeric_columns = X.select_dtypes(include="number").columns.tolist()
feature_columns = X.columns.tolist()

# Same preprocessing recipe as before, rebuilt for the widened column lists.
numeric_pipeline = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
categorical_pipeline = Pipeline(
	steps=[
		("imputer", SimpleImputer(strategy="most_frequent")),
		("encoder", OneHotEncoder(**one_hot_kwargs)),
	]
)

preprocessor = ColumnTransformer(
	transformers=[
		("num", numeric_pipeline, numeric_columns),
		("cat", categorical_pipeline, categorical_columns),
	],
	remainder="drop",
)

X_preprocessed_array = preprocessor.fit_transform(X)
feature_names = preprocessor.get_feature_names_out()
X_preprocessed = pd.DataFrame(X_preprocessed_array, columns=feature_names, index=df.index)

print(
	f"Engineered {len(new_feature_names)} new features; "
	f"{len(numeric_columns)} numeric and {len(categorical_columns)} categorical columns after transformation."
)
if new_feature_names:
	print("New features:", ", ".join(new_feature_names))
print(f"Preprocessed design matrix shape: {X_preprocessed.shape}")

### Model Selection

### Training

### Evaluation

### Exploratory Data Analysis