In [7]:
import numpy as np
SEED = 45
np.random.seed(SEED)
from scipy.special import expit
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import mean_squared_error
from models import GibbsSamplerLLFM
from evals import latent_features


In [9]:
def generate_synthetic(T=150, S=4, K_true=2):
    """
    Simulate binary observations from a logistic latent-feature model with
    two planted features.

    Layout of the planted structure (fixed row/column indices):
      * feature 0 is active on rows 0-69 and has weight 6 on dimensions 1 and 3;
      * feature 1 is active on rows 110 onward and has weight 6 on dimensions 0 and 2;
      * every dimension has bias -6, so an inactive dimension fires with
        probability expit(-6) and an active one with probability expit(0) = 0.5.
    Features with index >= 2 and dimensions with index >= 4 carry no signal.

    Parameters
    ----------
    T : int
        Number of observations (rows). The active-row ranges above are fixed
        indices and are not rescaled with T.
    S : int
        Number of observed binary dimensions (columns); must be at least 4.
    K_true : int
        Number of latent feature columns; must be at least 2.

    Returns
    -------
    Y : ndarray, shape (T, S)
        Bernoulli draws with success probabilities ``P_true``.
    Z_true : ndarray, shape (T, K_true)
        Binary feature-assignment matrix.
    W_true : ndarray, shape (K_true, S)
        Feature-to-dimension weights.
    b_true : ndarray, shape (S,)
        Per-dimension bias.
    P_true : ndarray, shape (T, S)
        ``expit(Z_true @ W_true + b_true)``.

    Raises
    ------
    ValueError
        If ``K_true < 2`` or ``S < 4``, because the planted structure indexes
        feature rows 0-1 and dimensions 0-3.

    Notes
    -----
    Sampling uses NumPy's global random state (``np.random.binomial``), so
    seed it beforehand for reproducible output.
    """
    # The planted pattern below addresses features 0-1 and dimensions 0-3;
    # fail early with a clear message instead of an IndexError further down.
    if K_true < 2:
        raise ValueError(f"K_true must be at least 2, got {K_true}")
    if S < 4:
        raise ValueError(f"S must be at least 4, got {S}")

    # ----- True latent features -----
    Z_true = np.zeros((T, K_true))
    Z_true[:70, 0] = 1
    Z_true[110:, 1] = 1
    # Uncomment to switch on every feature for rows 50-74:
    #Z_true[50:75, :] = 1

    # ----- True weights -----
    W_true = np.zeros((K_true, S))
    W_true[0, [1, 3]] = 6
    W_true[1, [0, 2]] = 6

    # ----- True bias -----
    # Sized from S (previously a fixed length-4 array, which broke S != 4).
    b_true = np.full(S, -6)

    # ----- Generate observations -----
    logits = Z_true @ W_true + b_true
    P_true = expit(logits)
    Y = np.random.binomial(1, P_true)

    return Y, Z_true, W_true, b_true, P_true

# Draw one synthetic data set with the default sizes. The draw goes through
# NumPy's global random state, which the first cell seeds with SEED.
Y, Z_true, W_true, b_true, P_true = generate_synthetic()
#print("Generated synthetic data Y:", Y)
# NOTE(review): this prints the full P_true array (one row per observation);
# a slice such as P_true[:5] would keep the cell output short.
print("Probabilities P_true:", P_true)



Probabilities P_true: [[0.00247262 0.5        0.00247262 0.5       ]
 [0.00247262 0.5        0.00247262 0.5       ]
 [0.00247262 0.5        0.00247262 0.5       ]
 [0.00247262 0.5        0.00247262 0.5       ]
 [0.00247262 0.5        0.00247262 0.5       ]
 [0.00247262 0.5        0.00247262 0.5       ]
 [0.00247262 0.5        0.00247262 0.5       ]
 [0.00247262 0.5        0.00247262 0.5       ]
 [0.00247262 0.5        0.00247262 0.5       ]
 [0.00247262 0.5        0.00247262 0.5       ]
 [0.00247262 0.5        0.00247262 0.5       ]
 [0.00247262 0.5        0.00247262 0.5       ]
 [0.00247262 0.5        0.00247262 0.5       ]
 [0.00247262 0.5        0.00247262 0.5       ]
 [0.00247262 0.5        0.00247262 0.5       ]
 [0.00247262 0.5        0.00247262 0.5       ]
 [0.00247262 0.5        0.00247262 0.5       ]
 [0.00247262 0.5        0.00247262 0.5       ]
 [0.00247262 0.5        0.00247262 0.5       ]
 [0.00247262 0.5        0.00247262 0.5       ]
 [0.00247262 0.5        0.00247262 0.5

In [10]:
# ---- Instantiate your sampler ----
# GibbsSamplerLLFM comes from the project-local `models` module; the meaning of
# each keyword below is inferred from its name only — confirm against that class.
sampler = GibbsSamplerLLFM(
    Data=Y,            # binary observation matrix produced by generate_synthetic()
    K=10,              # presumably the maximum/truncation number of latent features (truth uses 2) — TODO confirm
    alpha=0.1,
    sigma_w=3.0,
    # NOTE(review): the synthetic data is generated with b_true = -6 on every
    # dimension. If mu_b / sigma_b are the prior mean / std of the bias, the
    # truth lies about 5 sd from the prior mean — confirm this is intended.
    mu_b=-1,
    sigma_b=1,
    #fixed_bias=[-0.5,-0.5,-0.5,-0.5],
    n_iter=1000,
    burn=200,
    n_subsample=500
)

# ---- Run MCMC ----
sampler.run()
# NOTE(review): as the last expression of the cell, the return value is echoed
# as a large array dump; assign it to a name or end the line with ';' to
# suppress the output.
sampler.get_posterior_samples()


(array([[[-1.71105412e+00, -1.06495274e+00, -1.09316925e+00,
           2.62083581e+00],
         [-7.00756428e-01, -2.30741267e-01,  6.00361284e-01,
          -1.73657979e+00],
         [ 1.87637300e+00, -1.39228406e-01, -3.10339330e+00,
           8.17162042e+00],
         ...,
         [ 7.72232778e-01,  3.11617124e+00, -1.99447777e+00,
          -3.19709408e-01],
         [-1.06581143e-01, -2.63626899e+00, -6.36258432e+00,
          -7.86990551e-01],
         [-5.64027760e+00, -1.22205159e-01, -2.23867108e+00,
          -2.48619272e+00]],
 
        [[-6.59632427e+00,  2.11717433e+00, -8.15181360e+00,
           2.20473665e+00],
         [-5.02331281e-01,  2.14396987e+00,  2.74467777e+00,
           6.11911946e+00],
         [ 4.27501508e+00, -1.05397775e+00,  2.07603887e+00,
           2.92050684e+00],
         ...,
         [ 2.04065420e-01,  2.90940798e+00,  1.59275718e-01,
          -1.09404539e+00],
         [ 1.36079035e-01, -2.94124868e+00, -6.91230112e-01,
          -1.70934

In [11]:
# Summarise the retained posterior samples with the project-local
# evals.latent_features helper, passing the sampler's good_samples_Z / _W / _b.
latent_features( Z_post=sampler.good_samples_Z, W_post=sampler.good_samples_W, b_post=sampler.good_samples_b)
# Query the sampler's posterior predictive for the conditioning vector [1, 0, 0].
# NOTE(review): presumably this conditions on three dimensions to predict the
# remaining one (the data has 4 columns by default) — confirm against
# GibbsSamplerLLFM.posterior_predictive.
p1given0 = sampler.posterior_predictive([1, 0, 0])
print("P(pred=1 | conds=[1,0,0]):", p1given0)

Posterior grouping by number of active features

Number of samples with zero active features: 329/500

Group with 1 active features:
  Number of posterior samples: 152/500
  Average usage per feature (size-biased order):
[23.04605263]
  Average weights:
[[-9.64667617 -6.77906104 -9.82822426 -7.38930992]]
  Average Bias:
[-1.73822439 -0.96459792 -1.75646363 -1.13505011]
--------------------------------------------------
Group with 2 active features:
  Number of posterior samples: 16/500
  Average usage per feature (size-biased order):
[17.375   9.5625]
  Average weights:
[[-7.38958803 -6.53161199 -8.62997512 -6.35982414]
 [-5.50005081 -4.46646521 -6.41673919 -3.07870014]]
  Average Bias:
[-1.6518161  -0.91043558 -1.63861712 -1.01216274]
--------------------------------------------------
Group with 3 active features:
  Number of posterior samples: 3/500
  Average usage per feature (size-biased order):
[26.66666667 17.33333333  9.33333333]
  Average weights:
[[ -9.15754325  -7.90706651 -1

In [5]:
def write_true_parameters_to_file(
    filename,
    Y,
    Z_true,
    W_true,
    b_true
):
    """
    Append a plain-text report of the ground-truth synthetic parameters.

    The report lists the data dimensions, how often each true feature is
    active (column sums of ``Z_true``), the weight matrix, the bias vector and
    the mean activation of ``Y`` per dimension and overall.

    Parameters
    ----------
    filename : path of the report file; opened in append mode so that further
        sections (e.g. posterior summaries) can follow in the same file.
    Y : 2-D array of observations, shape (T, S).
    Z_true : 2-D array of true feature assignments, shape (T, K_true).
    W_true : array of true weights, rendered with 3-digit precision.
    b_true : array of true biases, rendered with 3-digit precision.
    """
    n_obs, n_dims = Y.shape
    n_features = Z_true.shape[1]
    rule = "=" * 60

    with open(filename, "a") as out:
        # Header and problem sizes.
        report = [
            rule + "\n",
            "TRUE SYNTHETIC PARAMETERS\n",
            rule + "\n\n",
            f"T (observations): {n_obs}\n",
            f"S (dimensions):   {n_dims}\n",
            f"K_true (features): {n_features}\n\n",
        ]

        # Per-feature activation counts.
        counts = Z_true.sum(axis=0)
        report.append("True latent feature usage counts:\n")
        report.extend(
            f"  Feature {idx}: active {counts[idx]} times\n"
            for idx in range(n_features)
        )
        report.append("\n")

        # Weights and bias, each followed by a blank line.
        for heading, values in (
            ("True weight matrix W_true (K x S):\n", W_true),
            ("True bias vector b_true:\n", b_true),
        ):
            report.append(heading)
            report.append(np.array2string(values, precision=3))
            report.append("\n\n")

        # Summary statistics of the observations.
        per_dim_mean = Y.mean(axis=0)
        overall_mean = Y.mean()
        report += [
            "Observed data summary (Y):\n",
            f"  Mean activation per dimension: {per_dim_mean}\n",
            f"  Overall mean activation: {overall_mean:.4f}\n\n",
            rule + "\n\n",
        ]

        out.writelines(report)

# Append the ground-truth summary to the validation report.
# NOTE(review): the file is opened in append mode, so every re-run of this cell
# adds another copy of the section; delete the file first for a clean report.
write_true_parameters_to_file('validation1_overlapping_feature_weight.txt', Y, Z_true, W_true, b_true)

In [6]:
# NOTE(review): this import sits mid-notebook; moving it next to the other
# `evals` import in the first cell would make the dependencies visible up front.
# NOTE(review): this cell's execution count (6) is lower than that of earlier
# cells (up to 11), so the saved outputs come from out-of-order runs — verify
# with Restart Kernel -> Run All.
from evals import latent_features_to_file
# Write the posterior summary to the same report file as the true parameters
# (presumably appended after them — confirm in evals.latent_features_to_file).
latent_features_to_file(filename='validation1_overlapping_feature_weight.txt', Z_post=sampler.good_samples_Z, W_post=sampler.good_samples_W, b_post=sampler.good_samples_b)
# Same posterior-predictive query as in the earlier summary cell.
p1given0 = sampler.posterior_predictive([1, 0, 0])
print("P(pred=1 | conds=[1,0,0]):", p1given0)

log_numerator: 0.3267140295466753
log_denom: 2.3261272741329133
P(pred=1 | conds=[1,0,0]): 0.13541471524802723
