In [None]:
%load_ext autoreload
%autoreload 2
import sys, os
sys.path.insert(1, '../')
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patheffects as pe
import seaborn as sns
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pdb
import xgboost as xgb
from scipy.stats import norm
from scipy.stats import multivariate_normal
from scipy.stats import bernoulli

## Implementation of classical inference, PPI with data splitting, and cross-prediction

In [None]:
def trial(X, Y, n, alpha):
    """Run one simulated trial and return the three competing intervals.

    The data are randomly split into ``n`` labeled rows and the remaining
    unlabeled rows; the labels of the unlabeled rows are discarded.

    Parameters
    ----------
    X, Y : features and outcomes for the full sample.
    n : number of rows to treat as labeled.
    alpha : miscoverage level passed to every interval constructor.

    Returns
    -------
    tuple
        ``(classical_interval, pp_interval, cpp_interval)``, each a
        ``[lower, upper]`` list.
    """
    # The fourth element (labels of the unlabeled rows) is never used.
    X_lab, X_unlab, Y_lab, _ = train_test_split(X, Y, train_size=n)

    # Baseline: labeled outcomes only.
    classical = classical_mean_interval(Y_lab, alpha)

    # PPI with data splitting: half of the labeled rows train the model.
    n_train = int(0.5*n)
    split_ppi = pp_mean_interval(X_lab, X_unlab, Y_lab, alpha, n_train)

    # Cross-prediction.
    cross_ppi = cross_prediction_mean_interval(X_lab, X_unlab, Y_lab, alpha)

    return classical, split_ppi, cross_ppi

In [None]:
def pp_mean_interval(X_labeled, X_unlabeled, Y_labeled, alpha, n_tr):
    """Prediction-powered interval for the mean using one train/validation split.

    ``n_tr`` labeled rows are used to fit a gradient-boosted tree model; the
    remaining labeled rows are used to estimate the model's bias (the
    "rectifier"), which is added to the mean prediction on the unlabeled data.

    Parameters
    ----------
    X_labeled, Y_labeled : labeled features and outcomes.
    X_unlabeled : unlabeled features.
    alpha : miscoverage level; the interval uses the ``1 - alpha/2`` normal quantile.
    n_tr : number of labeled rows reserved for model training.

    Returns
    -------
    list
        ``[lower, upper]`` endpoints of the interval.
    """
    n_labeled = X_labeled.shape[0]
    n_unlabeled = X_unlabeled.shape[0]

    # First split: rows for fitting the model vs. rows for the rectifier.
    X_fit, X_rect, Y_fit, Y_rect = train_test_split(X_labeled, Y_labeled, train_size=n_tr)

    # Second split: 10% of the fitting rows become xgboost's evaluation set.
    X_core, X_eval, Y_core, Y_eval = train_test_split(X_fit, Y_fit, test_size=0.1)
    fit_matrix = xgb.DMatrix(X_core, label=Y_core)
    eval_matrix = xgb.DMatrix(X_eval, label=Y_eval)
    booster = xgb.train(
        {'max_depth': 7, 'eta': 0.1, 'objective': 'reg:squarederror', 'eval_metric': ['error', 'mae']},
        fit_matrix,
        500,
        [(eval_matrix, 'eval'), (fit_matrix, 'train')],
        verbose_eval=False,
    )

    pred_unlabeled = booster.predict(xgb.DMatrix(X_unlabeled))
    pred_rect = booster.predict(xgb.DMatrix(X_rect))
    rectifier = Y_rect - pred_rect

    # Point estimate: mean prediction plus mean out-of-sample residual.
    center = np.mean(pred_unlabeled) + np.mean(rectifier)

    # The rectifier is averaged over the n - n_tr rows not used for training.
    std_err = np.sqrt(np.var(pred_unlabeled)/n_unlabeled + np.var(rectifier)/(n_labeled - n_tr))
    margin = norm.ppf(1-alpha/2) * std_err

    return [center - margin, center + margin]

In [None]:
def cross_prediction_mean_interval(X_labeled, X_unlabeled, Y_labeled, alpha, K = 10):
    """Cross-prediction interval for the mean.

    The labeled rows are partitioned into ``K`` contiguous folds. For each
    fold a gradient-boosted tree model is fit on the other folds and used to
    predict (a) the held-out fold and (b) the unlabeled data. The point
    estimate is the mean of the fold-averaged unlabeled predictions plus the
    mean out-of-fold residual on the labeled data. The variance comes from
    ``bootstrap_variance``.

    Parameters
    ----------
    X_labeled, Y_labeled : labeled features (2-D, indexed by row) and outcomes.
    X_unlabeled : unlabeled features.
    alpha : miscoverage level; the interval uses the ``1 - alpha/2`` normal quantile.
    K : number of folds (default 10).

    Returns
    -------
    list
        ``[lower, upper]`` endpoints of the interval.

    Raises
    ------
    ValueError
        If ``K`` is smaller than 2 or larger than the number of labeled rows.
    """
    n = X_labeled.shape[0]
    N = X_unlabeled.shape[0]

    if K < 2 or K > n:
        raise ValueError(f"K must satisfy 2 <= K <= n (got K={K}, n={n})")

    # Partition ALL n labeled rows into K contiguous folds. Slicing with a
    # fixed fold size of int(n/K) leaves the trailing n % K rows without an
    # out-of-fold prediction (their Yhat stays 0), which biases the residual
    # mean below. When K divides n these folds equal the fixed-size slices.
    all_ind = np.arange(n)
    folds = np.array_split(all_ind, K)

    Yhat_labeled = np.zeros(n)
    Yhat_unlabeled = np.zeros(N)

    param = {'max_depth': 7, 'eta': 0.1, 'objective': 'reg:squarederror', 'eval_metric': ['error', 'mae']}
    num_round = 500

    for val_ind in folds:

        train_ind = np.delete(all_ind, val_ind)
        X_train = X_labeled[train_ind,:]
        Y_train = Y_labeled[train_ind]

        # use train data to train a tree; 10% is held out as xgboost's eval set
        X_train1, X_train2, y_train1, y_train2 = train_test_split(X_train, Y_train, test_size=0.1)
        dtrain = xgb.DMatrix(X_train1, label=y_train1)
        dtest = xgb.DMatrix(X_train2, label=y_train2)
        evallist = [(dtest, 'eval'), (dtrain, 'train')]
        tree = xgb.train(param, dtrain, num_round, evallist, verbose_eval=False)

        # average the K models' predictions on the unlabeled data
        Yhat_unlabeled += (tree.predict(xgb.DMatrix(X_unlabeled)))/K
        # out-of-fold predictions for the held-out labeled rows
        Yhat_labeled[val_ind] = tree.predict(xgb.DMatrix(X_labeled[val_ind,:]))

    thetaPP = np.mean(Yhat_unlabeled) + np.mean(Y_labeled - Yhat_labeled)

    # bootstrap models are trained on (approximately) the per-fold training size
    train_n = n - n // K
    var_hat = bootstrap_variance(X_labeled, X_unlabeled, Y_labeled, train_n, thetaPP)

    halfwidth = norm.ppf(1-alpha/2) * np.sqrt(var_hat)

    return [thetaPP - halfwidth, thetaPP + halfwidth]

In [None]:
def bootstrap_variance(X_labeled, X_unlabeled, Y_labeled, train_n, thetaPP, B = 30):
    """Estimate the asymptotic variance of cross-prediction by bootstrapping.

    For each of ``B`` rounds, ``train_n`` labeled rows are drawn with
    replacement, a gradient-boosted tree model is fit on them, and the model
    is evaluated on labeled rows that were not drawn.

    Parameters
    ----------
    X_labeled, Y_labeled : labeled features (2-D, indexed by row) and outcomes.
    X_unlabeled : unlabeled features.
    train_n : number of rows drawn (with replacement) for each bootstrap model.
    thetaPP : accepted for interface compatibility; not used in the computation.
    B : number of bootstrap rounds (default 30).

    Returns
    -------
    float
        ``var(mean unlabeled prediction)/N + var(pooled residuals)/n``.
    """
    n_labeled = X_labeled.shape[0]
    n_unlabeled = X_unlabeled.shape[0]
    holdout_n = n_labeled - train_n

    # Running average of the B models' predictions on the unlabeled data.
    avg_pred_unlabeled = np.zeros(n_unlabeled)
    # Pooled (prediction - outcome) residuals, holdout_n per round.
    pooled_resid = np.zeros(int(holdout_n*B))

    booster_params = {'max_depth': 7, 'eta': 0.1, 'objective': 'reg:squarederror', 'eval_metric': ['error', 'mae']}
    n_rounds = 500

    for b in range(B):

        # np.random.choice samples with replacement by default
        boot_ind = np.random.choice(range(n_labeled), train_n)
        X_boot = X_labeled[boot_ind,:]
        Y_boot = Y_labeled[boot_ind]

        # 10% of the bootstrap sample is held out as xgboost's eval set
        X_core, X_eval, Y_core, Y_eval = train_test_split(X_boot, Y_boot, test_size=0.1)
        fit_matrix = xgb.DMatrix(X_core, label=Y_core)
        eval_matrix = xgb.DMatrix(X_eval, label=Y_eval)
        watchlist = [(eval_matrix, 'eval'), (fit_matrix, 'train')]
        booster = xgb.train(booster_params, fit_matrix, n_rounds, watchlist, verbose_eval=False)

        avg_pred_unlabeled += (booster.predict(xgb.DMatrix(X_unlabeled)))/B

        # Rows never drawn this round; at least holdout_n exist because the
        # draw has at most train_n distinct indices. Keep the first holdout_n.
        unseen_ind = np.delete(range(n_labeled), boot_ind)[:holdout_n]
        pred_unseen = booster.predict(xgb.DMatrix(X_labeled[unseen_ind, :]))

        start = b*holdout_n
        pooled_resid[start:start + holdout_n] = pred_unseen - Y_labeled[unseen_ind]

    return np.var(avg_pred_unlabeled)/n_unlabeled + np.var(pooled_resid)/n_labeled

In [None]:
def classical_mean_interval(Y, alpha):
    """Classical CLT-based confidence interval for the mean of ``Y``.

    Parameters
    ----------
    Y : sequence of outcomes.
    alpha : miscoverage level; the interval uses the ``1 - alpha/2`` normal quantile.

    Returns
    -------
    list
        ``[lower, upper]`` endpoints, centred on the sample mean.
    """
    sample_size = len(Y)
    center = np.mean(Y)

    # np.var uses its default (population, ddof=0) variance.
    z_crit = norm.ppf(1-alpha/2)
    std_err = np.sqrt(np.var(Y)/sample_size)
    margin = z_crit * std_err

    return [center - margin, center + margin]

## Main cell: generate data and form intervals

In [None]:
# Simulation driver: for every R^2 and every labeled-sample size n, repeat
# num_trials independent trials, record the three intervals, and cache the
# per-R^2 results as CSV so an interrupted run can be resumed.
N = 10000 # size of unlabeled data
ns = np.array([100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]) # size of labeled data
num_trials = 100
alpha = 0.1

# parameters of data generating process:
d = 2
Rsqs = [0, 0.5, 1]
var_y = 4
mu = 4
beta = np.zeros(d)

df_list = []

# store results
columns = ["lb","ub","coverage","estimator","n", r'$R^2$']

std_errs = np.zeros((len(ns), num_trials))

# create the output directory up front so a long run cannot fail at save time
os.makedirs('./mean_results/', exist_ok=True)

for Rsq in Rsqs:

    # for saving data
    filename = "mean_results/" + f"Rsq_{Rsq}".replace(".", "_") + ".csv"
    if os.path.exists(filename):
        # Reuse cached results instead of silently dropping this R^2 from
        # final_df. The file was written by df.to_csv below, whose first
        # column is the DataFrame index.
        df_list += [pd.read_csv(filename, index_col=0)]
        continue

    # target: the mean of y is mu for every R^2
    beta[:] = np.sqrt(Rsq * var_y / d)
    theta_true = mu

    # one record (dict) per interval; the DataFrame is built once at the end
    records = []
    for n in tqdm(ns):

        for _ in range(num_trials):

            # generate data
            X = multivariate_normal.rvs(mean=np.zeros(d), cov=np.eye(d), size=(n+N)) # feature matrix
            y = X @ beta + np.sqrt(var_y * (1-Rsq))*np.random.randn(n+N) + mu # outcomes

            ci, ppi, cppi = trial(X, y, n, alpha)

            for estimator, (lb, ub) in (("cross-prediction", cppi), ("classical", ci), ("PPI", ppi)):
                covered = bool((lb <= theta_true) & (theta_true <= ub))
                records.append(dict(zip(columns, (lb, ub, covered, estimator, int(n), Rsq))))

    df = pd.DataFrame(records, columns=columns)
    df["width"] = df["ub"] - df["lb"]
    df_list += [df]

    # save data
    df.to_csv(filename)

final_df = pd.concat(df_list, ignore_index=True)
# cached and freshly computed rows may carry different coverage dtypes;
# a float 0/1 column averages cleanly when plotting
final_df["coverage"] = final_df["coverage"].astype(float)

















100%|████████████████████████████████████████| 7/7 [15:32:34<00:00, 7993.45s/it]




























100%|████████████████████████████████████████| 7/7 [10:52:04<00:00, 5589.24s/it]


## Plot results

In [None]:
# Coverage (left column) and interval width (right column) as a function of n,
# with one row of panels per R^2 value.
Rsqs = [0, 0.5, 1]
alpha=0.1

set2 = sns.color_palette("Set2")
col = np.array([set2[1], set2[2], set2[0]])
sns.set_theme(font_scale=1.4, style='white', palette=col, rc={'lines.linewidth': 3})
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(10,10))

for row, rsq in enumerate(Rsqs):
    subset = final_df[(final_df[r'$R^2$'] == rsq)]
    # coverage is drawn without error bars; width keeps seaborn's default
    sns.lineplot(ax=axs[row,0], data=subset, x='n', y='coverage', hue='estimator', alpha=0.9, errorbar=None)
    sns.lineplot(ax=axs[row,1], data=subset, x='n', y='width', hue='estimator', alpha=0.9)

# An invisible full-width subplot per row carries that row's shared title.
grid = plt.GridSpec(3, 1)
for i in range(3):
    fake = fig.add_subplot(grid[i])
    # the trailing '\n' is important
    fake.set_title(f'R² = {Rsqs[i]}\n', size=18)
    fake.set_axis_off()

n_rows, n_cols = axs.shape
for r in range(n_rows):
    # nominal coverage level
    axs[r,0].axhline(1-alpha, color="#888888", linestyle='dashed', zorder=1, alpha=0.9)
    for c in range(n_cols):
        keep_legend = (r == 0) and (c == 1)
        if not keep_legend:
            # remove the legend
            axs[r,c].get_legend().remove()
            axs[r,0].set_ylim([0.5,1])
            continue
        # only the top-right panel shows a legend
        handles, labels = axs[r,c].get_legend_handles_labels()
        axs[r,c].legend(handles=handles, labels=labels)

sns.despine(top=True, right=True)
plt.tight_layout()

# save plot
plt.savefig('./mean_results/mean_comparison.pdf')
plt.show()

In [None]:
# for reading data after it has been saved
datadir = './mean_results/'
# sort for a deterministic row order (os.listdir order is arbitrary) and skip
# non-CSV files such as the saved figure
filenames = sorted(fn for fn in os.listdir(datadir) if 'Rsq' in fn and fn.endswith('.csv'))
# the CSVs are written with df.to_csv(filename), so the first column is the
# saved index; read it as such instead of keeping a stray "Unnamed: 0" column
data = [ pd.read_csv(os.path.join(datadir, fn), index_col=0) for fn in filenames ]
final_df = pd.concat(data, axis=0, ignore_index=True)