In [1]:
# Smoke-test cell: prints a fixed greeting (presumably only to confirm the kernel executes code).
print("hello world")

hello world


In [2]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [3]:
class MLP(nn.Module):
    """Small fully connected network: input_dim -> 32 -> 16 -> num_classes.

    A ReLU sits between consecutive linear layers; the last linear layer's
    output is returned as-is (no softmax is applied here).

    Args:
        input_dim: Number of input features of the first linear layer.
        num_classes: Number of outputs of the last linear layer (default 2).
    """

    def __init__(self, input_dim, num_classes=2):
        super().__init__()
        # Layer widths in order; consecutive pairs become Linear layers.
        widths = [input_dim, 32, 16, num_classes]
        layers = []
        for position, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            if position > 0:
                layers.append(nn.ReLU())
            layers.append(nn.Linear(fan_in, fan_out))
        # Resulting order is Linear, ReLU, Linear, ReLU, Linear, so the
        # state-dict keys remain net.0.*, net.2.* and net.4.*.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return its output."""
        logits = self.net(x)
        return logits

In [4]:
### SYNTHESIS FUNCTIONS ###

def synthesize_data(X_df, numeric_cols, categorical_cols,
                            n_samples=5000, gamma=0.1, low_pct=1.0, high_pct=99.0,
                            random_state=None):
    """Draw a synthetic feature table shaped like ``X_df``.

    Numeric columns are sampled uniformly between percentile-based bounds of
    the corresponding column in ``X_df``; categorical columns are sampled from
    the values observed in ``X_df`` with their empirical frequencies.

    Args:
        X_df: pandas DataFrame of raw (unscaled) reference data.
        numeric_cols: Names of the columns to sample uniformly.
        categorical_cols: Names of the columns to sample by frequency.
        n_samples: Number of synthetic rows to draw.
        gamma: Fraction of the percentile span added below the lower and
            above the upper percentile to widen the sampling range.
        low_pct: Lower percentile (0-100) used as the base lower bound.
        high_pct: Upper percentile (0-100) used as the base upper bound.
        random_state: Seed forwarded to ``np.random.default_rng``.

    Returns:
        DataFrame with ``n_samples`` rows and columns ordered as
        ``numeric_cols`` followed by ``categorical_cols``.
    """
    generator = np.random.default_rng(random_state)

    # Percentile range of every numeric column, widened by gamma * span per side.
    numeric_frame = X_df[numeric_cols]
    lower_q = numeric_frame.quantile(low_pct / 100.0).values
    upper_q = numeric_frame.quantile(high_pct / 100.0).values
    widths = upper_q - lower_q
    # Constant columns have zero span; substitute a tiny positive one.
    widths[widths == 0] = 1e-6
    lower_bound = lower_q - gamma * widths
    upper_bound = upper_q + gamma * widths

    numeric_draws = generator.uniform(
        low=lower_bound, high=upper_bound, size=(n_samples, len(numeric_cols))
    )

    # Categorical columns: pick observed levels with their empirical frequencies.
    categorical_draws = {}
    for col in categorical_cols:
        levels, freq = np.unique(X_df[col].values, return_counts=True)
        chosen = generator.choice(len(levels), size=n_samples, p=freq / freq.sum())
        categorical_draws[col] = levels[chosen]

    # Stitch both parts together and fix the column order.
    column_order = list(numeric_cols) + list(categorical_cols)
    numeric_part = pd.DataFrame(numeric_draws, columns=numeric_cols)
    categorical_part = pd.DataFrame(categorical_draws).reset_index(drop=True)
    combined = pd.concat([numeric_part, categorical_part], axis=1)
    return combined[column_order]


# Generate synthetic data according to established distribution
def synthesize_to_distribution(X_df, model, scaler, target_dist, conf_threshold=0.8, n_total=1000):
    """Label candidate rows with ``model`` and keep the most confident ones per class.

    Every row of ``X_df`` is scaled, passed through ``model``, and assigned the
    class with the highest softmax probability. Rows whose top probability is
    below ``conf_threshold`` are dropped. For each class in ``target_dist`` the
    ``round(fraction * n_total)`` most confident remaining rows are selected.

    Args:
        X_df: pandas DataFrame of candidate (unscaled) feature rows.
        model: torch module mapping a float32 tensor of scaled features to
            per-class logits of shape (rows, classes).
        scaler: Object with ``fit_transform``/``transform``. If it already
            carries fitted attributes (public names ending in ``_``, the
            scikit-learn convention) it is applied with ``transform`` and left
            unmodified; otherwise it is fitted on ``X_df`` via ``fit_transform``.
        target_dist: Mapping ``class index -> fraction`` of ``n_total`` wanted
            for that class.
        conf_threshold: Minimum top-class probability for a row to be eligible.
        n_total: Total number of rows requested across all classes.

    Returns:
        Tuple ``(X_selected, y_selected, probs_selected)``: the selected rows
        (DataFrame with the columns and dtypes of ``X_df`` and a fresh
        0..n-1 index), their predicted class indices, and their full softmax
        probability rows. Rows are grouped by class in the iteration order of
        ``target_dist`` and sorted by decreasing confidence within a class.
        Fewer rows than requested are returned when not enough rows pass the
        threshold; a message is printed in that case.
    """
    model.eval()

    # Do not clobber a scaler the caller has already fitted (e.g. on the real
    # training data); only fit here when the scaler is still unfitted.
    fitted_attrs = getattr(scaler, "__dict__", {})
    scaler_is_fitted = any(
        name.endswith("_") and not name.startswith("__") for name in fitted_attrs
    )
    if scaler_is_fitted:
        X_scale = scaler.transform(X_df)
    else:
        X_scale = scaler.fit_transform(X_df)
    X_tensor = torch.tensor(X_scale, dtype=torch.float32)

    # Run through model
    with torch.no_grad():
        logits = model(X_tensor)
        probs = torch.softmax(logits, dim=1)
        confs, preds = torch.max(probs, dim=1)

    probs_np = probs.cpu().numpy()
    confs_np = confs.cpu().numpy()
    preds_np = preds.cpu().numpy()

    # Filter by confidence
    conf_mask = confs_np >= conf_threshold
    X_conf = X_df[conf_mask]
    probs_conf = probs_np[conf_mask]
    confs_conf = confs_np[conf_mask]
    preds_conf = preds_np[conf_mask]

    # Select top confident samples, class by class
    selected_idx = []
    for c, frac in target_dist.items():
        count = int(round(frac * n_total))
        idx_c = np.where(preds_conf == c)[0]
        if len(idx_c) == 0:
            print("something went wrong; no samples are above confidence threshold for class:", c)
            continue

        order = np.argsort(-confs_conf[idx_c])
        top_idx = idx_c[order[:count]]
        if len(top_idx) < count:
            print(f"only {len(top_idx)} of {count} requested samples are above confidence threshold for class: {c}")
        selected_idx.extend(top_idx.tolist())

    # Positional selection on the DataFrame keeps each column's dtype; going
    # through a single NumPy array would coerce mixed columns to object.
    sel = np.asarray(selected_idx, dtype=np.intp)
    X_selected = X_conf.iloc[sel].reset_index(drop=True)
    y_selected = preds_conf[sel]
    probs_selected = probs_conf[sel]

    return X_selected, y_selected, probs_selected

def output_dataset(X, y, X_synth, y_synth, class_names, use_baseline, filename):
    """Write the synthetic rows (optionally appended to the baseline rows) to CSV.

    Args:
        X: Baseline feature DataFrame; defines the feature columns. It is not
            modified.
        y: Baseline labels (DataFrame or Series) sharing ``X``'s index.
        X_synth: Synthetic features, array-like or DataFrame, interpreted with
            the columns of ``X``.
        y_synth: 1-D array-like of synthetic class labels. They are one-hot
            encoded; the encoded columns are named after ``class_names`` in
            sorted label order. If some classes are absent, the labels must be
            integer indices into ``class_names`` and the missing classes get
            all-zero columns.
        class_names: Names of the one-hot label columns, one per class.
        use_baseline: If true, the baseline rows are written first, followed
            by the synthetic rows; otherwise only synthetic rows are written.
        filename: Destination passed to ``DataFrame.to_csv``.

    Raises:
        ValueError: If the number of distinct labels differs from
            ``len(class_names)`` and the labels are not valid class indices.
    """
    class_names = list(class_names)

    # Work on copies so the caller's frames are left untouched.
    X_base = X.copy()
    # Fresh 0..n-1 index so features and labels are paired by position below.
    X_synth = pd.DataFrame(X_synth, columns=X_base.columns).reset_index(drop=True)

    # One-hot encode with integer dtype: newer pandas versions default to bool,
    # which would mix True/False with 0/1 in the written label columns.
    labels = np.asarray(y_synth).ravel()
    onehot = pd.get_dummies(labels, dtype=int)
    if onehot.shape[1] != len(class_names):
        # Some class never occurs among the synthetic labels: align on the
        # class index and fill the absent classes with zeros.
        unexpected = set(onehot.columns) - set(range(len(class_names)))
        if unexpected:
            raise ValueError(
                f"y_synth contains labels {sorted(unexpected, key=str)} that are not "
                f"indices into class_names (expected 0..{len(class_names) - 1})"
            )
        onehot = onehot.reindex(columns=list(range(len(class_names))), fill_value=0)
    onehot.columns = class_names

    # Match the baseline label dtype (e.g. int vs. bool) so both halves of the
    # output file use the same representation.
    y_base = y.to_frame() if isinstance(y, pd.Series) else y
    matching = {}
    for col in class_names:
        if col in y_base.columns:
            col_dtype = y_base[col].dtype
            if pd.api.types.is_bool_dtype(col_dtype) or pd.api.types.is_numeric_dtype(col_dtype):
                matching[col] = col_dtype
    onehot = onehot.astype(matching)

    # Store boolean feature columns as 0/1 in both baseline and synthetic data.
    bool_cols = X_base.select_dtypes(include='bool').columns
    for col in bool_cols:
        X_synth[col] = X_synth[col].astype(int)
        X_base[col] = X_base[col].astype(int)

    synth_data = pd.concat([X_synth, onehot], axis=1)

    if use_baseline:
        data = pd.concat([X_base, y], axis=1)
        final_data = pd.concat([data, synth_data], ignore_index=True)
    else:
        final_data = synth_data

    final_data.to_csv(filename, index=False)

In [5]:
# Synthetic data generator: loads a dataset and trained model weights, then generates
# synthetic data within the given distribution.
# We shouldn't be doing extrapolated data generation.

##### 1. LOAD DATA AND MODEL WEIGHTS #####

data_filename = "../data/kidney/datasets/baseline/99.8_percent/in_distribution.csv"
weights_filename = "../data/kidney/model_weights/baseline/99.8_percent.pth"
uci_function = None

# Label-column sets used with other datasets (kept for quick switching):
# class_names = ["malignant", "benign"]
# class_names = ['class_0', 'class_1', 'class_2']
# class_names = ["HeartDisease_0", "HeartDisease_1"]
class_names = ["CKD_Status_0", "CKD_Status_1"]

# Option 1: UCI Dataset
if uci_function:
    dataset = uci_function()
    class_names = dataset.target_names
    X = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    y = pd.DataFrame(dataset.target, columns=['target'])

# Option 2: Load data from file
else:
    # The file is expected to hold one-hot encoded labels: one column per entry of class_names.
    dataset = pd.read_csv(data_filename)
    X = dataset.drop(columns=class_names)
    y = dataset[class_names]


input_dim = X.shape[1]

if y.shape[1] > 1:
    # One-hot label columns: the class index is the position of the hot column.
    # (No squeeze, so a single-row file keeps its 2-D shape for argmax.)
    num_classes = y.shape[1]
    y_tensor = torch.tensor(y.values, dtype=torch.long).argmax(dim=1)
else:
    # Single label column (Option 1): values are used directly as class indices and
    # the class count comes from class_names. NOTE(review): assumes integer labels
    # 0..len(class_names)-1 -- confirm for the dataset in use.
    num_classes = len(class_names)
    y_tensor = torch.tensor(y.values.ravel(), dtype=torch.long)

X_tensor = torch.tensor(StandardScaler().fit_transform(X), dtype=torch.float32)

categorical_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()


if weights_filename:
    model = MLP(input_dim, num_classes)
    # map_location="cpu" lets weights saved on a GPU load on a CPU-only machine.
    # NOTE(review): torch.load unpickles the file; only load weights from a trusted source.
    model.load_state_dict(torch.load(weights_filename, map_location="cpu"))
    model.eval()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        predictions = model(X_tensor)

    _, predicted_classes = torch.max(predictions, 1)
    accuracy = (predicted_classes == y_tensor).float().mean()
    print(f"Test Accuracy: {accuracy.item()*100:.2f}%")
else:
    print("model weights not loaded")


Test Accuracy: 98.33%


In [6]:
##### 2. CALCULATE TARGET DISTRIBUTION #####

# Samples per class index; minlength pads classes that never occur with a zero count.
counts = torch.bincount(y_tensor, minlength=num_classes)
# Turn the counts into float32 fractions of the total number of samples.
dist = counts.to(torch.float32) / counts.sum()
# class index -> fraction, stored as plain Python floats.
distribution_map = dict(zip(range(num_classes), dist.tolist()))

print(distribution_map)

{0: 0.8001030683517456, 1: 0.19989696145057678}


In [7]:
##### 3. SYNTHESIZE DATA #####

num_samples = 300
multiplier = 1000 # oversampling factor: draw extra candidates so that enough of them fall above the confidence threshold
conf_threshold = 0.7
random_state = 42 # fixed seed so re-running this cell reproduces the same synthetic sample and output file
filename = "../data/kidney/datasets/mix/99.8_percent/300_samples.csv"

use_baseline = True


# gamma=0.0: sample strictly inside the percentile range of X (no extrapolation).
X_synth_raw = synthesize_data(X, numeric_cols, categorical_cols, num_samples * multiplier, gamma=0.0, random_state=random_state)
# NOTE(review): the scaler passed here is unfitted, so the standardization statistics are not
# the ones computed from X in the loading cell -- confirm this matches how the model was trained.
# The third return value holds the per-class probabilities of the selected rows.
X_synth, y_synth, preds = synthesize_to_distribution(X_synth_raw, model, StandardScaler(), distribution_map, conf_threshold, num_samples)
if len(X_synth) < num_samples:
    print(f"warning: only {len(X_synth)} of {num_samples} requested synthetic samples were selected")
output_dataset(X, y, X_synth, y_synth, class_names, use_baseline, filename)

means_original = X.mean()
means_synthetic = X_synth.mean()

# print("Means of original data:\n", means_original)
# print("Means of synthetic data:\n", means_synthetic)

# Per-feature gap between original and synthetic means (original minus synthetic).
diff = means_original - means_synthetic
print("Means difference:\n", diff)



Means difference:
 Creatinine                   -0.715767
BUN                         -14.779775
GFR                          10.765822
Urine_Output               -199.982905
Diabetes                     -0.378305
Hypertension                  0.002968
Age                         -16.244711
Protein_in_Urine           -765.333002
Water_Intake                 -0.286709
Medication_ACE Inhibitor     -0.286034
Medication_ARB               -0.126019
Medication_Diuretic          -0.512768
dtype: float64
