In [70]:
import torch
import pandas as pd
import os
import numpy as np
# Directory holding the raw CSV; also reused later as the output location.
root = "../data/us-health"
csv_path = os.path.join(root, "insurance.csv")

# Load the raw insurance table and preview the first rows.
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [71]:
# Column roles: numeric features, categorical features, regression target.
num_cols = ["age", "children", "bmi"]
cat_cols = ["sex", "smoker", "region"]
target = "charges"

# Shuffle all rows with a fixed seed so the split is reproducible.
df_shuf = df.sample(frac=1, random_state=42).reset_index(drop=True)
n = len(df_shuf)

# Positional boundaries of the 80% / 10% / 10% train / validation / test split.
tr_end, val_end = int(n * 0.8), int(0.9 * n)
df_tr = df_shuf.iloc[:tr_end].copy()
df_val = df_shuf.iloc[tr_end:val_end].copy()
df_te = df_shuf.iloc[val_end:].copy()

# Category levels are taken from the training split only, in sorted order.
cat_levels = {col: sorted(df_tr[col].unique().tolist()) for col in cat_cols}

# Sanity checks: missing values per column and the discovered levels.
print(df.isna().sum())
print(cat_levels)

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64
{'sex': ['female', 'male'], 'smoker': ['no', 'yes'], 'region': ['northeast', 'northwest', 'southeast', 'southwest']}


In [72]:
def one_hot(df_part: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode the categorical columns of ``df_part``.

    Uses the module-level ``cat_cols`` (columns to encode) and ``cat_levels``
    (the fixed list of levels per column), so every split produces the same
    columns in the same order regardless of which levels it contains.

    Parameters
    ----------
    df_part : pd.DataFrame
        Frame containing every column named in ``cat_cols``.

    Returns
    -------
    pd.DataFrame
        float32 indicator columns named ``<column>_<level>``, one per level in
        ``cat_levels``, carrying the same index as ``df_part`` so the result
        stays row-aligned with the frame it was built from.

    Notes
    -----
    A value that is not listed in ``cat_levels[c]`` becomes a missing category
    and is therefore encoded as all zeros for that column's indicators.
    """
    outs = []
    for c in cat_cols:
        # Fix the category set so unobserved levels still get a column.
        cat = pd.Categorical(df_part[c], categories=cat_levels[c])
        # Wrap in a Series to keep the caller's row labels; get_dummies on a
        # bare Categorical would return a fresh 0..n-1 index instead.
        labelled = pd.Series(cat, index=df_part.index)
        outs.append(pd.get_dummies(labelled, prefix=c, drop_first=False).astype(np.float32))
    return pd.concat(outs, axis=1)

# Encode the categorical columns of each split with the same fixed levels.
Xtr_cat, Xval_cat, Xte_cat = (one_hot(part) for part in (df_tr, df_val, df_te))

Xtr_cat.head()

Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


In [73]:
def fit_standard_scaler(X):
    """Compute per-column statistics for standardisation.

    Parameters
    ----------
    X : array-like
        Data convertible to a float32 tensor; statistics are taken along
        dimension 0.

    Returns
    -------
    (torch.Tensor, torch.Tensor)
        ``(mean, std)`` where ``std`` is the population standard deviation
        (ddof=0), floored at 1e-8 so that a later division by it cannot
        divide by zero.
    """
    data = torch.as_tensor(X, dtype=torch.float32)
    col_mean = torch.mean(data, dim=0)
    col_std = torch.std(data, dim=0, unbiased=False)
    # Constant columns have zero spread; floor them at a tiny positive value.
    col_std = torch.clamp(col_std, min=1e-8)
    return col_mean, col_std

def transform_standard_scaler(X, mean, std):
    """Standardise ``X`` with previously fitted statistics.

    Parameters
    ----------
    X : array-like
        Data convertible to a float32 tensor.
    mean, std : torch.Tensor
        Statistics to subtract and divide by; they are combined with ``X``
        through ordinary tensor arithmetic.

    Returns
    -------
    torch.Tensor
        ``(X - mean) / std`` as a float32 tensor.
    """
    data = torch.as_tensor(X, dtype=torch.float32)
    centered = data - mean
    return centered / std

# Fit the scaler on the training split only, so no validation or test rows
# contribute to the statistics.
train_numeric = df_tr[num_cols].to_numpy(np.float32)
mu, sigma = fit_standard_scaler(train_numeric)
print(mu)
print(sigma)


tensor([39.3514,  1.1019, 30.7335])
tensor([13.9638,  1.1903,  6.1342])


In [74]:
# Standardise the numeric columns of every split with the training statistics
# and convert the results back to NumPy arrays.
Xtr_num, Xval_num, Xte_num = (
    transform_standard_scaler(part[num_cols].to_numpy(np.float32), mu, sigma).numpy()
    for part in (df_tr, df_val, df_te)
)

In [75]:
# Final feature matrices: scaled numeric columns first, then the one-hot
# columns, stacked side by side and wrapped as torch tensors.
X_train, X_val, X_test = (
    torch.from_numpy(np.hstack([num_part, cat_part.values.astype(np.float32)]))
    for num_part, cat_part in (
        (Xtr_num, Xtr_cat),
        (Xval_num, Xval_cat),
        (Xte_num, Xte_cat),
    )
)

X_train.shape, X_val.shape, X_test.shape

(torch.Size([1070, 11]), torch.Size([134, 11]), torch.Size([134, 11]))

In [76]:
# Target vectors: the `target` column of each split as a float32 tensor.
y_train, y_val, y_test = (
    torch.from_numpy(part[target].to_numpy(np.float32))
    for part in (df_tr, df_val, df_te)
)

y_train.shape, y_val.shape, y_test.shape

(torch.Size([1070]), torch.Size([134]), torch.Size([134]))

In [77]:
# Collect the prepared tensors together with the preprocessing metadata
# (scaler statistics, one-hot column order, column roles) in one dictionary.
bundle = dict(
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    X_test=X_test, y_test=y_test,
    mu=mu, sigma=sigma,
    onehot_template=Xtr_cat.columns.tolist(),
    num_cols=num_cols,
    cat_cols=cat_cols,
    target=target,
)

# Write the bundle next to the source data.
bundle_path = os.path.join(root, "health_knn.pt")
torch.save(bundle, bundle_path)