In [18]:
print("hello world")

hello world


In [19]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

class MLP(nn.Module):
    def __init__(self, input_dim, num_classes=2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, num_classes)
        )

    def forward(self, x):
        return self.net(x)


data = datasets.load_wine()
path = "wine_mlp_weights.pth"

df = pd.DataFrame(data.data, columns=data.feature_names)
targets = data.target
input_dim = df.shape[1]
num_classes = len(np.unique(targets))

# Ready data
X_train, X_test, y_train, y_test = train_test_split(df, targets, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # "fit" learns the mean/variance of each feature
X_test = scaler.transform(X_test)  # fit isn't necessary because the scaler already learned the features

# Convert to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.long)



# Run model
loaded_model = MLP(input_dim, num_classes)
loaded_model.load_state_dict(torch.load(path))
loaded_model.eval()
print("Model weights reloaded successfully.")

predictions = loaded_model(X_test)
_, predicted_classes = torch.max(predictions, 1)
accuracy = (predicted_classes == y_test).float().mean()
print(f"Test Accuracy: {accuracy.item()*100:.2f}%")

Model weights reloaded successfully.
Test Accuracy: 97.22%


In [20]:
def compute_bounds(X_df, numeric_cols, low_pct=1.0, high_pct=99.0, gamma=0.1):
    # X_df: pandas DataFrame of training data (raw/unscaled)
    mins = X_df[numeric_cols].quantile(low_pct/100.0).values
    maxs = X_df[numeric_cols].quantile(high_pct/100.0).values
    spans = maxs - mins
    # avoid zero spans
    spans[spans == 0] = 1e-6
    low = mins - gamma * spans
    high = maxs + gamma * spans
    return low, high

def synthesize_extrapolated(X_df, numeric_cols, categorical_cols,
                            n_samples=5000, gamma=0.1, low_pct=1.0, high_pct=99.0,
                            random_state=None):
    rng = np.random.default_rng(random_state)
    low, high = compute_bounds(X_df, numeric_cols, low_pct=low_pct, high_pct=high_pct, gamma=gamma)
    n_num = len(numeric_cols)
    Xs_num = rng.uniform(low=low, high=high, size=(n_samples, n_num))

    # categorical: sample by empirical frequencies
    Xs_cat = {}
    for c in categorical_cols:
        vals, counts = np.unique(X_df[c].values, return_counts=True)
        probs = counts / counts.sum()
        picks = rng.choice(len(vals), size=n_samples, p=probs)
        Xs_cat[c] = vals[picks]

    # assemble DataFrame
    df_num = pd.DataFrame(Xs_num, columns=numeric_cols)
    df_cat = pd.DataFrame(Xs_cat)
    Xs = pd.concat([df_num, df_cat.reset_index(drop=True)], axis=1)[list(numeric_cols) + list(categorical_cols)]
    return Xs

def run_through_model(X, model):
    X_tensor = torch.tensor(X.values, dtype=torch.float32)
    with torch.no_grad():
        scaler = StandardScaler()
        X_tensor = torch.tensor(scaler.fit_transform(X), dtype=torch.float32)
        predictions = model(X_tensor)
        _, predicted_classes = torch.max(predictions, 1)

    return predicted_classes


# Generate synthetic data according to established distribution
def synthesize_to_distribution(X_df, model, scaler, target_dist, conf_threshold=0.8, n_total=1000):
    model.eval()

    X_scale = scaler.fit_transform(X_df)
    X_tensor = torch.tensor(X_scale, dtype=torch.float32)

    # Run through model
    with torch.no_grad():
        logits = model(X_tensor)
        probs = torch.softmax(logits, dim=1)
        confs, preds = torch.max(probs, dim=1)

    probs_np = probs.numpy()
    confs_np = confs.numpy()
    preds_np = preds.numpy()

    # Filter by confidence
    conf_mask = confs_np >= conf_threshold
    X_conf = X_df[conf_mask]
    probs_conf = probs_np[conf_mask]
    confs_conf = confs_np[conf_mask]
    preds_conf = preds_np[conf_mask]

    class_counts = {
        c: int(round(frac * n_total)) for c, frac in target_dist.items()
    }

    # Select top confident samples
    selected_idx = []
    for c, count in class_counts.items():
        idx_c = np.where(preds_conf == c)[0]
        if len(idx_c) == 0:
            print("something went wrong")

        order = np.argsort(-confs_conf[idx_c])
        top_idx = idx_c[order[:count]]
        selected_idx.extend(top_idx)
    
    # Filter out records
    X_selected = np.array(X_conf)[selected_idx]
    y_selected = preds_conf[selected_idx]
    probs_selected = probs_conf[selected_idx]

    return X_selected, y_selected, probs_selected


In [21]:
categorical_features = df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
continuous_features = df.select_dtypes(include=["int64", "float64"]).columns.tolist()

X_full = df.copy()

Xs_synth = synthesize_extrapolated(X_full, continuous_features, categorical_features,
                                   n_samples=150, gamma=0.05)

print(Xs_synth.columns)

y_synth = run_through_model(Xs_synth, model=loaded_model)

# probe: how many are high-confidence?
print("High-confidence synthetic samples (conf >= .9)")

Index(['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium',
       'total_phenols', 'flavanoids', 'nonflavanoid_phenols',
       'proanthocyanins', 'color_intensity', 'hue',
       'od280/od315_of_diluted_wines', 'proline'],
      dtype='object')
High-confidence synthetic samples (conf >= .9)


In [22]:
X_tensor = torch.tensor(Xs_synth.values, dtype=torch.float32)
y_tensor = torch.tensor(y_synth, dtype=torch.long)

scaler = StandardScaler()
X_tensor = torch.tensor(scaler.fit_transform(Xs_synth), dtype=torch.float32)
predictions = loaded_model(X_tensor)
_, predicted_classes = torch.max(predictions, 1)
print(len(predicted_classes))
print(y_tensor.shape)
accuracy = (predicted_classes == y_tensor).float().mean()
print(f"Test Accuracy: {accuracy.item()*100:.2f}%")

150
torch.Size([150])
Test Accuracy: 100.00%


  y_tensor = torch.tensor(y_synth, dtype=torch.long)


In [23]:
# Define class names
class_names = ['class_0', 'class_1', 'class_2']

# One-hot encode
ys_onehot = pd.get_dummies(y_synth)
ys_onehot.columns = class_names

print(ys_onehot.head())

s_dataset = pd.concat([Xs_synth, ys_onehot], axis=1)

X = df
y = data.target
print(y)
print((y==0).sum(), (y==1).sum(), (y==2).sum())
y_oneshot = pd.get_dummies(y)
y_oneshot.columns = class_names

dataset = pd.concat([X, y_oneshot], axis=1)

final_dataset = pd.concat([s_dataset, dataset], ignore_index=True)

final_dataset.to_csv("wine_synthetic_data.csv", index=False)

   class_0  class_1  class_2
0     True    False    False
1     True    False    False
2     True    False    False
3    False     True    False
4     True    False    False
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
59 71 48


In [24]:
means_synthetic = Xs_synth.mean()
means_original = df.mean()

print("Means of original data:\n", means_original)
print("Means of synthetic data:\n", means_synthetic)

Means of original data:
 alcohol                          13.000618
malic_acid                        2.336348
ash                               2.366517
alcalinity_of_ash                19.494944
magnesium                        99.741573
total_phenols                     2.295112
flavanoids                        2.029270
nonflavanoid_phenols              0.361854
proanthocyanins                   1.590899
color_intensity                   5.058090
hue                               0.957449
od280/od315_of_diluted_wines      2.611685
proline                         746.893258
dtype: float64
Means of synthetic data:
 alcohol                          12.906584
malic_acid                        3.120987
ash                               2.344812
alcalinity_of_ash                19.433797
magnesium                       110.391580
total_phenols                     2.355541
flavanoids                        2.164532
nonflavanoid_phenols              0.359864
proanthocyanins                

In [28]:
### DISTRIBUTION MATCHING

num_classes = 3
class_names = [0, 1, 2]
counts = [59, 71, 48]
target_dist = {
    class_names[i]: counts[i] / sum(counts) for i in range(num_classes)
}


num_samples = 150
multiplier = 5
categorical_features = df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
continuous_features = df.select_dtypes(include=["int64", "float64"]).columns.tolist()

X_full = df.copy()

X_synth_full = synthesize_extrapolated(X_full, continuous_features, categorical_features,
                                   n_samples=num_samples * multiplier, gamma=0.05)

X_synth, y_synth, probs_synth = synthesize_to_distribution(X_synth_full, loaded_model, StandardScaler(), target_dist, 0.9, num_samples)

print("High-confidence synthetic samples (conf >= .9):" + str(np.mean(np.max(probs_synth, axis=1))))


# Save data

# Define class names
class_names = ['class_0', 'class_1', 'class_2']

# One-hot encode
ys_onehot = pd.get_dummies(y_synth)
ys_onehot.columns = class_names

X_synth = pd.DataFrame(X_synth, columns=df.columns)

s_dataset = pd.concat([X_synth, ys_onehot], axis=1)

X = df
y = data.target
# print(y)
# print((y==0).sum(), (y==1).sum(), (y==2).sum())
y_oneshot = pd.get_dummies(y)
y_oneshot.columns = class_names

dataset = pd.concat([X, y_oneshot], axis=1)

final_dataset = pd.concat([s_dataset, dataset], ignore_index=True)

final_dataset.to_csv("wine_synthetic_data.csv", index=False)


High-confidence synthetic samples (conf >= .9):0.9130295
