In [13]:
from Samplers.sampler_test import *
from Samplers.plotting import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal, invwishart, dirichlet
from tqdm import tqdm
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import seaborn as sns

# Load the CSV and keep a random subsample of 400 rows. The fixed
# `random_state` pins which rows are drawn, so restarting the kernel and
# re-running the notebook analyses the same subsample every time.
data = pd.read_csv('data/NHANES_adults_data_preprocessed.csv').sample(400, random_state=0)


In [None]:
# Columns kept for the analysis; everything else in the frame is discarded.
features = [
    'height', 'bmi', 'WHtR', 'sbp', 'dbp',
    'eGFR', 'hba1c', 'hdl', 'non_hdl', 'pulse'
]

# Keep only those columns and discard any row that has a missing value in them.
data = data[features].dropna()

# Standardize every column independently: subtract the column mean and divide
# by the column standard deviation (pandas' default `Series.std`).
for name in features:
    column = data[name]
    center, scale = column.mean(), column.std()
    data[name] = (column - center) / scale
    

In [15]:
from scipy.spatial.distance import pdist

def h1(mu):
    """Return the smallest pairwise distance among the rows of ``mu``.

    ``pdist`` (default Euclidean metric) produces one distance per unordered
    pair of rows; the minimum of those distances is returned. When ``mu`` has
    fewer than two rows there are no pairs, and the reduction over the empty
    result raises ``ValueError``.
    """
    pairwise = pdist(mu)
    return pairwise.min()


In [16]:
# Settings for the repulsive sampler run.
sig = 0.05              # forwarded to the sampler as `sig`
num_iterations = 10000  # forwarded as the iteration count
burn_in = 2000          # forwarded as `burn_in`; named here instead of buried in the call
K = 10                  # forwarded as the second positional argument; presumably the number of clusters — confirm
X = data.to_numpy()

# Seed NumPy's global RNG so that re-running this cell can reproduce the chain.
# NOTE(review): this only pins the run if the sampler draws from NumPy's
# global random state — confirm against the sampler implementation.
np.random.seed(0)

run_samples_rep = bayesian_repulsive_randomwalk(X, K, num_iterations, h1, burn_in=burn_in, sig=sig)

Sampling:  12%|█▏        | 1236/10000 [00:05<00:38, 227.04it/s]


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Extract mu samples from the repulsive sampler run: element 1 of every stored
# sample is stacked into one array, unpacked below as (T, K, p).
mu_samples = np.array([s[1] for s in run_samples_rep])
T, K, p = mu_samples.shape
feature_names = data.columns  # used to label feature index d; assumes it has p entries

# === Choose a few (k, d) pairs to visualize ===
plot_targets = [
    (0, 0),  # Cluster 1, height
    (1, 0),  # Cluster 2, height
    (0, 1),  # Cluster 1, bmi
    (1, 1),  # Cluster 2, bmi
]

# === Plot trace plots ===
for (k, d) in plot_targets:
    plt.figure(figsize=(10, 3))
    plt.plot(mu_samples[:, k, d], alpha=0.8)
    # Raw f-string: the backslash of the mathtext command "\mu" is kept
    # literally instead of relying on the invalid "\m" string escape, which
    # makes Python emit an invalid-escape warning.
    plt.title(rf'Trace Plot: $\mu_{{Cluster={k+1}, Feature={feature_names[d]}}}$')
    plt.xlabel('Iteration')
    plt.ylabel('Value')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
# Stack element 1 (mu) of every stored draw from the repulsive run.
# The resulting array is unpacked as (T, K, p):
#   T = number of samples, K = number of clusters, p = number of features.
mu_samples = np.array([draw[1] for draw in run_samples_rep])
T, K, p = mu_samples.shape
feature_names = data.columns

# One figure per feature, overlaying one density curve per cluster.
for d in range(p):
    fig, ax = plt.subplots(figsize=(10, 6))

    for k in range(K):
        # The T sampled values of mu_k along feature d.
        sns.kdeplot(mu_samples[:, k, d], label=f'Cluster {k+1}', fill=True, alpha=0.3, ax=ax)

    ax.set_title(f'Posterior Means for {feature_names[d]}')
    ax.set_xlabel(feature_names[d])
    ax.set_ylabel('Density')
    ax.legend(title='Cluster')
    fig.tight_layout()
    plt.show()

In [None]:
# Settings for the non-repulsive Gibbs sampler run.
num_iterations = 10000  # forwarded as the iteration count
burn_in = 2000          # forwarded as `burn_in`; named here instead of buried in the call
K = 10                  # forwarded as the second positional argument; presumably the number of clusters — confirm
X = data.to_numpy()

# Seed NumPy's global RNG so that re-running this cell can reproduce the chain.
# NOTE(review): this only pins the run if the sampler draws from NumPy's
# global random state — confirm against the sampler implementation.
np.random.seed(0)

run_samples_nonrep = gibbs_sampler_gmm_multivariate(X, K, num_iterations, burn_in=burn_in)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Extract mu samples from the non-repulsive sampler: element 1 of every stored
# sample is stacked into one array, unpacked below as (T, K, p).
mu_samples = np.array([s[1] for s in run_samples_nonrep])
T, K, p = mu_samples.shape
feature_names = data.columns  # used to label feature index d; assumes it has p entries

# === Choose a few (k, d) pairs to visualize ===
plot_targets = [
    (0, 0),  # Cluster 1, height
    (1, 0),  # Cluster 2, height
    (0, 1),  # Cluster 1, bmi
    (1, 1),  # Cluster 2, bmi
]

# === Plot trace plots ===
for (k, d) in plot_targets:
    plt.figure(figsize=(10, 3))
    plt.plot(mu_samples[:, k, d], alpha=0.8)
    # Raw f-string: the backslash of the mathtext command "\mu" is kept
    # literally instead of relying on the invalid "\m" string escape, which
    # makes Python emit an invalid-escape warning.
    plt.title(rf'Trace Plot: $\mu_{{Cluster={k+1}, Feature={feature_names[d]}}}$')
    plt.xlabel('Iteration')
    plt.ylabel('Value')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
# Derive the non-repulsive samples inside this cell rather than reusing
# whatever `mu_samples`, `K`, `p` and `feature_names` currently hold in the
# kernel. The same names are also assigned from the repulsive run elsewhere in
# this notebook, so relying on leftover state can silently plot the wrong chain.
mu_samples = np.array([s[1] for s in run_samples_nonrep])  # unpacked as (T, K, p)
T, K, p = mu_samples.shape
feature_names = data.columns

# One figure per feature, overlaying one density curve per cluster.
for d in range(p):
    plt.figure(figsize=(10, 6))
    
    # Plot the distribution of mu_k[:, d] for each cluster
    for k in range(K):
        values = mu_samples[:, k, d]  # T samples for mu_k, feature d
        sns.kdeplot(values, label=f'Cluster {k+1}', fill=True, alpha=0.3)

    plt.title(f'Posterior Means for {feature_names[d]}')
    plt.xlabel(feature_names[d])
    plt.ylabel('Density')
    plt.legend(title='Cluster')
    plt.tight_layout()
    plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist

def compute_distance_trace(mu_samples, mode='min'):
    """
    Computes a trace of one distance statistic between cluster means.

    For every sample, all pairwise Euclidean distances between its rows
    (cluster means) are computed and reduced to a single number.

    Parameters:
        mu_samples: ndarray of shape (T, K, p), or any iterable of (K, p)
            array-likes
        mode: str, 'min', 'mean', or 'max' — the statistic taken over the
            pairwise distances of each sample

    Returns:
        trace: ndarray of shape (T,)

    Raises:
        ValueError: if mode is not 'min', 'mean', or 'max'. The check runs
            before any sample is processed, so an invalid mode is reported
            even when mu_samples is empty.
    """
    reducers = {'min': np.min, 'mean': np.mean, 'max': np.max}
    # Non-string modes are rejected the same way as unknown names.
    statistic = reducers.get(mode) if isinstance(mode, str) else None
    if statistic is None:
        raise ValueError("mode must be 'min', 'mean', or 'max'")

    # pdist returns one distance per unordered pair of cluster means.
    return np.array([statistic(pdist(mu)) for mu in mu_samples])

# === Extract mu samples ===
# Element 1 of every stored draw is stacked; each array is (T, K, p).
mu_rep = np.array([draw[1] for draw in run_samples_rep])
mu_norep = np.array([draw[1] for draw in run_samples_nonrep])

# === Compute distance traces ===
dist_min_rep, dist_mean_rep = (
    compute_distance_trace(mu_rep, mode=stat) for stat in ('min', 'mean')
)
dist_min_norep, dist_mean_norep = (
    compute_distance_trace(mu_norep, mode=stat) for stat in ('min', 'mean')
)

# === Plotting ===
# Two side-by-side panels: minimum distance on the left, mean on the right.
fig, (ax_min, ax_mean) = plt.subplots(1, 2, figsize=(14, 5))
panels = [
    (ax_min, dist_min_norep, dist_min_rep,
     'Minimum Distance Between Cluster Means', 'Min Distance'),
    (ax_mean, dist_mean_norep, dist_mean_rep,
     'Mean Distance Between Cluster Means', 'Mean Distance'),
]

for ax, trace_norep, trace_rep, panel_title, y_label in panels:
    ax.plot(trace_norep, label='No Repulsion', color='tab:blue', alpha=0.7)
    ax.plot(trace_rep, label='With Repulsion (h1)', color='tab:red', alpha=0.7)
    ax.set_title(panel_title)
    ax.set_xlabel('Iteration')
    ax.set_ylabel(y_label)
    ax.legend()
    ax.grid(True)

fig.tight_layout()
plt.show()