In [None]:
# Import libraries
import folktables
import pandas as pd
import numpy as np
from matplotlib.ticker import FormatStrFormatter
import random
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from joblib import Parallel, delayed
import multiprocessing as mp
from tqdm import tqdm
import pickle
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
from scipy import stats as st

In [None]:
# Small additive floor applied to KDE density values in this file before they
# are normalized or log-transformed, so that exp() underflow to 0 cannot
# produce division by zero or log(0).
EPS = 1e-6

def compute_pca(X1, X2, random_state=None, n_components=3):
    """Project two samples into one shared, standardized PCA space.

    A single ``StandardScaler`` + ``PCA`` pipeline is fitted on the rows of
    both inputs stacked together, and each input is then transformed with
    that same fitted pipeline, so the two projections are directly comparable.

    Args:
        X1: First sample. Must be a pandas object accepted by ``pd.concat``.
        X2: Second sample, concatenated row-wise below ``X1``.
            NOTE(review): presumably shares its columns with ``X1`` -- confirm
            against callers.
        random_state: Forwarded to ``PCA`` as its ``random_state``.
        n_components: Number of principal components to keep. Defaults to 3,
            the value that was previously hard-coded.

    Returns:
        A tuple ``(X1_proj, X2_proj)`` holding the pipeline's transform of
        ``X1`` and ``X2`` respectively.
    """
    reducer = make_pipeline(
        StandardScaler(),
        PCA(n_components=n_components, random_state=random_state),
    )

    # Rows are ordered by index before fitting; presumably this keeps the fit
    # independent of which sample is passed first -- TODO confirm intent.
    stacked = pd.concat([X1, X2], axis=0).sort_index()
    reducer.fit(stacked)

    return reducer.transform(X1), reducer.transform(X2)
    
def fit_density(X, bandwidths=None, random_state=None):
    """Fit a Gaussian KDE to ``X`` with cross-validated bandwidth selection.

    Args:
        X: Samples to fit the density on; passed straight to
            ``GridSearchCV.fit``.
        bandwidths: Candidate bandwidth values to search over. When ``None``
            (the default) a grid of 20 log-spaced values between 1e-1 and 1e1
            is used. The grid is built per call instead of as a default
            argument so that no array object is shared between calls.
        random_state: Accepted for signature parity with the other helpers in
            this file; it is not read by this function.

    Returns:
        The ``KernelDensity`` estimator exposed by the grid search as
        ``best_estimator_``.
    """
    if bandwidths is None:
        bandwidths = np.logspace(-1, 1, 20)

    search = GridSearchCV(
        KernelDensity(kernel="gaussian"),
        {"bandwidth": bandwidths},
        cv=5,
        n_jobs=1,  # Force single-threaded for reproducibility
    )
    search.fit(X)
    return search.best_estimator_

def compute_kl(X1, X2, random_state=None):
    """
    Estimate KL(Target||Source) from KDE fits in a shared PCA space.

    X1: samples from Target
    X2: samples from Source
    random_state: forwarded to compute_pca and fit_density

    Returns the value of scipy's ``entropy(p, q)``, where ``p`` and ``q`` are
    the Target and Source KDE densities evaluated at the projected Target
    points, floored by EPS and normalized to sum to one.
    """
    # Shared low-dimensional representation of both samples
    target_proj, source_proj = compute_pca(X1, X2, random_state=random_state)

    # One density model per sample
    target_kde = fit_density(target_proj, random_state=random_state)
    source_kde = fit_density(source_proj, random_state=random_state)

    # Both models are scored on the Target points only; EPS keeps every
    # entry strictly positive before normalization.
    p = EPS + np.exp(target_kde.score_samples(target_proj))
    q = EPS + np.exp(source_kde.score_samples(target_proj))

    p = p / p.sum()
    q = q / q.sum()

    # KL(Target||Source)
    return st.entropy(p, q)

def compute_score_x(X1, X2, random_state=None):
    """
    Compute the standardized mean log-density ratio of X1 relative to X2.

    Both samples are projected with compute_pca, a KDE is fitted to each
    projection, and both KDEs are evaluated at the projected X1 points.
    The returned value is mean(log_ratio) / std(log_ratio), where
    log_ratio = log(p_X1 + EPS) - log(p_X2 + EPS).
    """
    # Shared PCA space for the two samples
    proj_1, proj_2 = compute_pca(X1, X2, random_state=random_state)

    # One KDE per sample
    density_1 = fit_density(proj_1, random_state=random_state)
    density_2 = fit_density(proj_2, random_state=random_state)

    def _floored_log_density(kde):
        # Log-density at the X1 points, with EPS added to the density first
        return np.log(np.exp(kde.score_samples(proj_1)) + EPS)

    ratio = _floored_log_density(density_1) - _floored_log_density(density_2)
    return np.mean(ratio) / np.std(ratio)