In [32]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.neighbors import KernelDensity
from scipy import stats as st
from joblib import Parallel, delayed
from tqdm import tqdm
import pickle

In [2]:
# Root directory of the fig2 code. Every data file in this notebook is addressed as
# "<script_dir>data/<name>", built by plain string concatenation, so the trailing
# slash is required.
# NOTE(review): absolute, machine-specific path — must be edited before running on
# another machine.
script_dir = "/Users/ivyzhang/Documents/GitHub/prioritizing_data_collection/figures_code/fig2/"

In [3]:
# Number of trials stored on disk and number of source files per trial.
num_trials = 1000
num_sources = 16

# One entry per trial, in trial order (trial 1 is stored at index 0).
P1_train_list, P1_test_list, source_list_all = [], [], []

for t in range(1, num_trials + 1):
    # P1 train / test frames of this trial.
    P1_train_list.append(
        pd.read_parquet(script_dir + f"data/P1_train_trial{t}.parquet")
    )
    P1_test_list.append(
        pd.read_parquet(script_dir + f"data/P1_test_trial{t}.parquet")
    )

    # Sources of this trial, ordered source_1 .. source_<num_sources>.
    sources = []
    for i in range(1, num_sources + 1):
        sources.append(
            pd.read_parquet(script_dir + f"data/source_{i}_trial{t}.parquet")
        )
    source_list_all.append(sources)


The cells below compute Score X and (when enabled) the KL divergence, following Shen et al. (2024).


In [40]:
# Small constant added to each KDE density before the log is taken in
# compute_kl_and_score, so the log-density is bounded below by log(EPS) when the
# estimated density underflows to 0.
EPS = 1e-6

def compute_pca(P1_train, P2_train, Pk_train, random_state=None, n_components=3):
    """Standardise the pooled features and project them onto shared principal components.

    The first column of every input frame is dropped before any processing
    (presumably an outcome/label column — TODO confirm). A single
    ``StandardScaler`` and a single ``PCA`` are fitted on the row-wise
    concatenation of all three frames, so both returned projections live in the
    same coordinate system.

    Parameters
    ----------
    P1_train, P2_train, Pk_train : pandas.DataFrame
        Frames whose columns from the second one onward are the features.
    random_state : int or None, optional
        Forwarded to ``PCA``.
    n_components : int, optional
        Number of principal components to keep. Defaults to 3, the value that
        was previously hard-coded.

    Returns
    -------
    dict
        ``"X1X2_proj"``: projection of the stacked P1 and P2 features
        (P1 rows first, then P2 rows);
        ``"Xk_proj"``: projection of the Pk features.
    """
    # Drop the first column of each frame; the rest are treated as features.
    X1 = P1_train.iloc[:, 1:]
    X2 = P2_train.iloc[:, 1:]
    Xk = Pk_train.iloc[:, 1:]

    # Fit the scaler on the pooled data so every group is standardised identically.
    combined = pd.concat([X1, X2, Xk], axis=0)
    scaler = StandardScaler().fit(combined)
    combined_scaled = scaler.transform(combined)

    # Fit PCA on the pooled, scaled data.
    pca = PCA(n_components=n_components, random_state=random_state)
    pca.fit(combined_scaled)

    # Project P1+P2 and Pk with the same fitted scaler and PCA.
    X1X2 = pd.concat([X1, X2], axis=0)
    X1X2_proj = pca.transform(scaler.transform(X1X2))
    Xk_proj = pca.transform(scaler.transform(Xk))

    return {"X1X2_proj": X1X2_proj, "Xk_proj": Xk_proj}
    
def fit_density(X, bandwidths=np.logspace(-0.5, 0.5, 5), random_state=None):
    """Fit a Gaussian KDE to ``X`` with the bandwidth chosen by 5-fold cross-validation.

    Each candidate bandwidth is scored by the mean, over folds, of the total
    held-out log-likelihood (``KernelDensity.score``). The first bandwidth with
    the highest mean wins, and a final KDE with that bandwidth is fitted on all
    of ``X``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Points to model; converted with ``np.asarray``. Needs at least 5 rows
        for the 5-fold split.
    bandwidths : iterable of float, optional
        Candidate bandwidths.
    random_state : int, RandomState instance or None, optional
        Seed for the shuffled ``KFold`` split.

    Returns
    -------
    sklearn.neighbors.KernelDensity
        KDE fitted on the full ``X`` with the selected bandwidth.

    Raises
    ------
    ValueError
        If no candidate bandwidth produced a usable score (e.g. ``bandwidths``
        is empty).
    """
    X = np.asarray(X)

    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    # Materialise the folds once. Calling kf.split(X) again for every bandwidth
    # reshuffles when random_state is None (or a RandomState instance), so the
    # candidates would be compared on different folds. With an integer seed the
    # folds are the same as before, so results are unchanged.
    folds = list(kf.split(X))

    best_bw, best_ll = None, -np.inf
    for bw in bandwidths:
        fold_lls = [
            KernelDensity(kernel="gaussian", bandwidth=bw)
            .fit(X[train_idx])
            .score(X[val_idx])
            for train_idx, val_idx in folds
        ]
        mean_ll = np.mean(fold_lls)
        # Strict '>' keeps the first bandwidth on ties and ignores NaN / -inf scores.
        if mean_ll > best_ll:
            best_ll, best_bw = mean_ll, bw

    if best_bw is None:
        raise ValueError(
            "fit_density: no candidate bandwidth produced a finite "
            "cross-validated log-likelihood"
        )

    return KernelDensity(kernel="gaussian", bandwidth=best_bw).fit(X)

def _dump_pickle_atomic(obj, path):
    """Pickle ``obj`` to ``path`` through a temporary file and an atomic rename."""
    import os
    import tempfile

    # The temporary file lives in the destination directory so that os.replace
    # stays on one filesystem and is atomic.
    fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(path) or ".", suffix=".tmp")
    try:
        with os.fdopen(fd, "wb") as f:
            pickle.dump(obj, f)
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave a stray temporary file behind on failure.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise


def compute_kl_and_score(P1_train, P2_train, Pk_train, random_state=None):
    """Compute Score X for one candidate source from KDE estimates in PCA space.

    Steps:
      1. Project P1+P2 and Pk onto shared principal components (``compute_pca``).
      2. Fit one KDE on the P1+P2 projection (``kde_t``) and one on the Pk
         projection (``kde_s``) with ``fit_density``.
      3. Pickle both KDEs to ``<script_dir>data/kde_t_<random_state>.pkl`` and
         ``<script_dir>data/kde_s_<random_state>.pkl``.
      4. Evaluate both densities on the P1+P2 points and average
         ``log(p_t + EPS) - log(p_s + EPS)``.

    The KL(Target||Source) estimate is currently disabled, so the ``"kl"``
    entry of the result is always ``None``.

    Parameters
    ----------
    P1_train, P2_train, Pk_train : pandas.DataFrame
        Frames forwarded unchanged to ``compute_pca``.
    random_state : int or None, optional
        Seed for PCA and the KDE cross-validation. It is also the only varying
        part of the pickle file names.

    Returns
    -------
    dict
        ``{"kl": None, "score_x": <mean log-density ratio>}``.

    Notes
    -----
    The pickle paths depend only on ``random_state``. Calls that share a
    ``random_state`` (e.g. every source of one trial run in parallel) therefore
    write the same two files and the last writer wins. The files are written
    atomically so that concurrent writers cannot leave a corrupted pickle, but
    which source's KDEs end up on disk is not deterministic.
    """
    # PCA projection shared by target and source.
    pcs = compute_pca(P1_train, P2_train, Pk_train, random_state)
    X1X2_pca = pcs["X1X2_proj"]
    Xk_pca = pcs["Xk_proj"]

    kde_t = fit_density(X1X2_pca, random_state=random_state)
    kde_s = fit_density(Xk_pca, random_state=random_state)

    _dump_pickle_atomic(kde_t, f'{script_dir}data/kde_t_{random_state}.pkl')
    _dump_pickle_atomic(kde_s, f'{script_dir}data/kde_s_{random_state}.pkl')

    # Disabled KL estimate, kept for reference:
    # t = np.exp(kde_t.score_samples(X1X2_pca)) + EPS
    # s = np.exp(kde_s.score_samples(X1X2_pca)) + EPS
    # t /= t.sum()
    # s /= s.sum()
    # kl = st.entropy(t, s)  # KL(t || s)

    # Score X: mean log-density ratio on the P1+P2 points. EPS floors each
    # density so the log stays finite if a density underflows to 0.
    log_t = np.log(np.exp(kde_t.score_samples(X1X2_pca)) + EPS)
    log_s = np.log(np.exp(kde_s.score_samples(X1X2_pca)) + EPS)
    score_x = np.mean(log_t - log_s)

    return {"kl": None, "score_x": score_x}

def process_one_pk(P1_train, P2_train, Pk_train, random_state):
    """Run ``compute_kl_and_score`` for one source and return ``(kl, score_x)`` as a tuple."""
    outcome = compute_kl_and_score(
        P1_train, P2_train, Pk_train, random_state=random_state
    )
    kl_value = outcome["kl"]
    score_value = outcome["score_x"]
    return kl_value, score_value

In [41]:
# score_results[t] holds the Score X values of trial t, one per candidate source
# (sources 2..num_sources of that trial, in order). KL collection is disabled.
score_results = []

for t in tqdm(range(len(P1_train_list))):
    P1_train = P1_train_list[t]
    trial_sources = source_list_all[t]
    # The first source is passed as P2; every remaining source is a candidate Pk.
    P2_train = trial_sources[0]
    Pk_list = trial_sources[1:]

    # Evaluate all candidates of this trial in parallel. Every job receives the
    # same random_state (the 0-based trial index).
    jobs = [
        delayed(process_one_pk)(P1_train, P2_train, Pk, random_state=t)
        for Pk in Pk_list
    ]
    results = Parallel(n_jobs=-1)(jobs)

    # results is a list of (kl, score_x) tuples; only the scores are kept.
    kls, scores = zip(*results)
    score_results.append(list(scores))

  0%|          | 0/1000 [00:00<?, ?it/s]Python(89815) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(89816) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(89817) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(89818) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(89819) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(89820) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(89821) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(89822) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
100%|██████████| 1000/1000 [16:21:47<00:00, 58.91s/it] 


In [43]:
# Persist the per-trial Score X lists to the current working directory.
# The matching dump of kl_results to 'kl_results.pkl' is disabled:
# with open('kl_results.pkl', 'wb') as kl_file:
#     pickle.dump(kl_results, kl_file)
with open('score_x.pkl', 'wb') as score_file:
    pickle.dump(score_results, score_file)


In [42]:
# Average Score X over trials: one value per candidate source.
avg_score_x = np.mean(score_results, axis=0)

# KL is not produced by the current run, so `avg_kl` would be undefined on a
# fresh kernel. Reuse KL results saved by an earlier run when the file exists;
# otherwise keep the "kl" column (so the CSV schema is unchanged) and fill it
# with NaN.
# NOTE(review): assumes kl_results.pkl, if present, matches the current trials
# and sources. A length mismatch makes the DataFrame constructor fail.
try:
    with open('kl_results.pkl', 'rb') as f:
        kl_results = pickle.load(f)
    avg_kl = np.mean(kl_results, axis=0)
except FileNotFoundError:
    print("kl_results.pkl not found; writing NaN for the 'kl' column.")
    avg_kl = np.full(len(avg_score_x), np.nan)

results_df = pd.DataFrame({
    "kl": avg_kl,
    "score_x": avg_score_x
})
results_df.to_csv(f"{script_dir}data/kl_score_x_avg_v3.csv", index=False)
results_df

Unnamed: 0,kl,score_x
0,0.439699,1.045079
1,0.297684,0.923316
2,0.281259,0.907727
3,0.248743,0.854621
4,0.223369,0.814483
5,0.198516,0.792912
6,0.192761,0.787079
7,0.187875,0.788172
8,0.181064,0.783262
9,0.161771,0.747496
